This notebook performs eQTL identification.

General workflow: for each marker, determine whether it is a QTL for an expressed gene. To do so, split the gene's expression values across strains into two groups according to the marker's inheritance pattern, run a statistical test on the two groups, and then apply FDR correction to the resulting p-values.

TODO: clean the notebook up before writing more code

In [1]:
%matplotlib inline

import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import cm
import numpy as np
import time
from scipy import stats
import networkx as nx
import multiprocessing as mp
from statsmodels.sandbox.stats.multicomp import multipletests 

%autosave 15

Autosaving every 15 seconds


In [2]:
# BY and RM strains values must be averaged and placed into two separate columns

# Both tables are tab-separated despite the .csv extension; the
# "Unnamed: 0" column present in each file is dropped right after loading.
expression_df = pd.read_table('./data/rna_expression_avg.csv', sep='\t').drop("Unnamed: 0", axis=1)
genotypes_df = pd.read_table('./data/genotypes.csv', sep='\t').drop("Unnamed: 0", axis=1)

markers_n = genotypes_df.shape[0]
rna_n = expression_df.shape[0]

# The first two expression columns are not strain values, so the
# strain columns start at position 2.
strain_names = expression_df.columns.tolist()[2:]

# DataFrame.as_matrix() was removed in pandas 1.0. Positional slicing plus
# .values yields the same 2-D array and works on old and new pandas alike.
expression_matrix = expression_df.iloc[:, 2:].values

# The first genotype column is skipped; the rest are used as the
# per-strain inheritance patterns (compared against 0 and 1 later on).
genotypes_matrix = genotypes_df.iloc[:, 1:].values



In [3]:
# Name -> row-position lookups: marker names index rows of
# genotypes_matrix, RNA identifiers index rows of expression_matrix.
marker_to_loc = {
    name: row
    for name, row in zip(genotypes_df["RQTL_name"], np.arange(markers_n))
}
RNA_to_loc = {
    name: row
    for name, row in zip(expression_df["IDENTIFIER"], np.arange(rna_n))
}

In [4]:
# Divide all progeny into groups by their inheritance pattern
# for a given genetic marker, and then plot the data clouds
# to visually observe if there is any correlation between marker
# and RNA expression 

# Divide expression data for a given gene in two groups,
# based on inheritance pattern of a given marker


def expression_by_RNA_and_marker(RNA_name, marker_name):
    """Split one gene's expression values by the allele inherited at a marker.

    Parameters
    ----------
    RNA_name : key of ``RNA_to_loc`` selecting a row of ``expression_matrix``.
    marker_name : key of ``marker_to_loc`` selecting a row of ``genotypes_matrix``.

    Returns
    -------
    (from_BY, from_RM) : expression values at the positions where the
        marker row equals 0 and where it equals 1, respectively.

    Raises
    ------
    KeyError if either name is missing from its lookup table.
    """
    expression_row = expression_matrix[RNA_to_loc[RNA_name]]
    genotype_row = genotypes_matrix[marker_to_loc[marker_name]]

    # Boolean masks over strains: 0 marks the BY allele, 1 the RM allele.
    by_mask = genotype_row == 0
    rm_mask = genotype_row == 1

    return expression_row[by_mask], expression_row[rm_mask]


# For the given pair (expressed gene, marker) test the hypothesis
# that inherited variant of a marker influences gene expression significantly 
def test_linkage(RNA_name, marker_name, eps=1e-5):
    global ftime
    fstart_time = time.time()
    from_BY, from_RM = expression_by_RNA_and_marker(RNA_name, marker_name)
    fend_time = time.time()
    ftime += fend_time - fstart_time
    statistic, pvalue = stats.mannwhitneyu(x=from_BY, y=from_RM)
    return (pvalue <= eps, pvalue)


# Divide expression data by inherited marker 
# variant and then plot the resulting groups 
def plot_expression_to_marker_correlation(RNA_name, marker_name):
    """Scatter a gene's expression values grouped by inherited marker allele.

    The BY group is drawn around x=1 and the RM group around x=2, with a
    little horizontal jitter so overlapping points stay visible. The figure
    is titled with the Mann-Whitney U p-value for the two groups and saved
    to ``./img/<RNA_name>_to_<marker_name>.png``; nothing is returned.

    Parameters
    ----------
    RNA_name : identifier of the expressed gene.
    marker_name : name of the genetic marker.
    """
    from_BY, from_RM = expression_by_RNA_and_marker(RNA_name, marker_name)

    # The title needs a p-value; the original referenced an undefined name.
    _, pvalue = stats.mannwhitneyu(x=from_BY, y=from_RM)

    class_labels = np.concatenate([
        np.full(len(from_BY), 1.0),
        np.full(len(from_RM), 2.0),
    ])
    xlabels = class_labels + np.random.normal(0, 0.01, len(class_labels))

    # Concatenate the groups; `+` on NumPy arrays would add them
    # element-wise (or fail to broadcast) rather than join them.
    ylabels = np.concatenate([from_BY, from_RM])

    plt.figure(figsize=(20, 10))
    plt.rcParams["axes.facecolor"] = 'white'
    plt.title("p-value: {}".format(pvalue))
    plt.xlabel("class label")
    plt.ylabel("expression value")
    plt.scatter(
        x=xlabels, y=ylabels,
        c=ylabels, cmap=cm.jet
    )
    plt.savefig("./img/" + RNA_name + "_to_" + marker_name + ".png")
    plt.close()

In [77]:
# Series.as_matrix() was removed in pandas 1.0; .values gives the same array.
marker_list = genotypes_df["RQTL_name"].values
RNA_list = expression_df["IDENTIFIER"].values

CHUNKS_N = 4

# array_split tolerates a marker count that is not a multiple of CHUNKS_N
# (np.split raises in that case), producing chunks that differ by at most one.
marker_chunks = np.array_split(marker_list, CHUNKS_N)

# chunk_lens[i] is the row offset of chunk i inside the genotype matrix,
# i.e. the total length of all preceding chunks.
chunk_sizes = [len(chunk) for chunk in marker_chunks]
chunk_lens = np.concatenate(([0], np.cumsum(chunk_sizes)[:-1]))

# Each worker receives (markers in the chunk, offset of the chunk's first row).
marker_samples = list(zip(marker_chunks, chunk_lens))

In [78]:
# Current runtime — 20 minutes
# Goal: < 10 minutes

# How to get rid of two nested for-loops?
# Is it possible to write a vectorized version
# of the function that will run faster?
# And if it is, how much faster will it run?

inh_by = (genotypes_matrix == 0)
inh_rm = (genotypes_matrix == 1)

def calculate_pvalues(sample_pair):
    marker_sample, offset = sample_pair
    pvalues = np.zeros(len(marker_sample) * rna_n, dtype=np.float32)
    iter = 0
    for marker_rownum in range(offset, len(marker_sample) + offset):
        BY_allele_pos = inh_by[marker_rownum] #(inheritance_patterns == 0)
        RM_allele_pos = inh_rm[marker_rownum] #inheritance_patterns == 1)
        for expression_row in expression_matrix:
            from_BY = expression_row[BY_allele_pos]
            from_RM = expression_row[RM_allele_pos]
            # CPU hog
            statistics, pvalue = stats.mannwhitneyu(x=from_BY, y=from_RM)
            pvalues[iter] = pvalue
            iter += 1        
    return pvalues

In [82]:
# Use a fixed seed so that everything drawn from NumPy's global RNG
# (e.g. the plotting jitter) is reproducible across kernel restarts.
# A time-based seed would give a different stream on every run.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

# One worker per marker chunk; marker_samples holds CHUNKS_N work items.
pool = mp.Pool(processes=CHUNKS_N)
start_time = time.time()
try:
    results = pool.map(calculate_pvalues, marker_samples)
    end_time = time.time()
finally:
    # Always release the worker processes, even if a worker raised.
    pool.close()
    pool.join()

# pool.map preserves input order, so concatenation restores the
# marker-major layout over the full marker list.
pvalues = np.concatenate(results)
adjusted_results = multipletests(pvalues, method="fdr_bh")

print("Calculation of pvalues: {}".format(end_time - start_time))

In [84]:
# Build linkage graph from qvalues

reject, qvalues = adjusted_results[:2]
linkage_graph = nx.Graph()

# `reject` is flat and marker-major, so walk the (marker, RNA) pairs in
# that same order while advancing one running index.
idx = 0
for marker_name in marker_list:
    for RNA_name in RNA_list:
        is_linked = reject[idx] == True
        idx += 1
        if not is_linked:
            continue
        # Tag each node with its side of the bipartite graph on first
        # sight only: 0 for expressed genes, 1 for markers.
        if RNA_name not in linkage_graph:
            linkage_graph.add_node(RNA_name, bipartite=0)
        if marker_name not in linkage_graph:
            linkage_graph.add_node(marker_name, bipartite=1)
        linkage_graph.add_edge(RNA_name, marker_name)


In [86]:
# The built-in bipartite.sets() gives unexpected results here;
# this may be specific to undirected graphs.
# TODO: verify this on a toy example.

# Split nodes by the "bipartite" attribute set at construction time:
# 0 -> expressed genes (bottom), anything else -> markers (top).
top_v, bottom_v = [], []
for node, data in linkage_graph.nodes(data=True):
    if data["bipartite"] == 0:
        bottom_v.append(node)
    else:
        top_v.append(node)

# To plot a bipartite graph correctly, the positions
# of the vertices must be written down explicitly:
# markers in the column x=1, genes in the column x=2.
pos = dict()
pos.update((n, (1, 3*i)) for i, n in enumerate(top_v))
pos.update((n, (2, 3*i)) for i, n in enumerate(bottom_v))

# dict(G.degree()) works on NetworkX 1.x (plain dict) and 2.x+ (DegreeView);
# the colours are listed in node order so they line up with the drawn nodes.
node_degrees = dict(linkage_graph.degree())
node_colors = [node_degrees[n] for n in linkage_graph.nodes()]

plt.figure(figsize=(20, 100))
nx.draw(
    linkage_graph,
    with_labels=True,
    node_size=50,
    # The edge-width keyword is `width`; `edge_width` is not a draw option.
    width=3.0,
    pos=pos,
    node_color=node_colors,
    edge_color='b',
    cmap=plt.cm.Blues,
    alpha=0.5,
    font_size=8
)
plt.savefig("./img/graph.png")
plt.close()

    Future behavior will be consistent with the long-time default:
    plot commands add elements without first clearing the
    Axes and/or Figure.
  b = plt.ishold()


    Future behavior will be consistent with the long-time default:
    plot commands add elements without first clearing the
    Axes and/or Figure.
  plt.hold(b)


In [87]:
# Extract the marker-nodes and number of linkages to them
# preserving their order based on genome location

# (marker name, degree) pairs ordered by the marker's row position.
# dict(...) makes the lookup work on NetworkX 1.x (dict) and 2.x+ (view).
marker_nodes = sorted(
    dict(linkage_graph.degree(top_v)).items(),
    key=lambda p: marker_to_loc[p[0]]
)

# Unzip the list of pairs into two parallel lists. Guard the empty case:
# zip(*[]) yields nothing to unpack when no marker has a significant linkage.
if marker_nodes:
    m_names, m_degrees = map(list, zip(*marker_nodes))
else:
    m_names, m_degrees = [], []

plt.figure(figsize=(40, 20))
plt.plot(m_degrees)
plt.xticks(
    range(len(marker_nodes)),
    m_names,
    rotation="vertical"
)
plt.savefig("./img/linkage_map.png")
plt.close()

In [89]:
# Dump the graph as text: for every marker node one "<marker>: <degree>"
# header line, followed by one line per neighbouring node.
# The context manager closes the file even if a write fails part-way.
with open("./data/linkage_graph.txt", "w+") as graph_file:
    for u in top_v:
        graph_file.write("{}: {}\n".format(u, linkage_graph.degree(u)))
        for v in linkage_graph[u]:
            graph_file.write("{}\n".format(v))