This notebook performs eQTL identification.

General workflow: for each marker, determine whether it is a QTL for some expressed gene. To do so, the expression values of each gene are split into two groups according to the allele each strain inherited at that marker, a statistical test is run on the two groups, and FDR correction is applied afterwards.

In [3]:
%matplotlib inline

import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import cm
import numpy as np
import time
from scipy import stats
import networkx as nx

%autosave 15

Autosaving every 15 seconds


In [4]:
# BY and RM strain values must be averaged and placed into two separate columns
# (presumably already done when rna_expression_avg.csv was produced -- TODO confirm).

expression_df = pd.read_table('./data/rna_expression_avg.csv', sep='\t').drop("Unnamed: 0", axis=1)
genotypes_df = pd.read_table('./data/genotypes.csv', sep='\t').drop("Unnamed: 0", axis=1)

markers_n = genotypes_df.shape[0]
rna_n = expression_df.shape[0]

# Data columns of the expression table: everything after the first two
# columns (presumably identifier/annotation columns -- verify against the file).
strain_names = expression_df.columns.tolist()[2:]

# DataFrame.as_matrix() was removed in pandas 1.0; selecting the columns and
# taking .values yields the same 2-D array on both old and new pandas.
# Rows follow the row order of the source frame, columns follow the selection.
expression_matrix = expression_df[strain_names].values

# Genotype data columns: everything after the first column of the genotype table.
genotypes_matrix = genotypes_df[genotypes_df.columns.tolist()[1:]].values

In [10]:
# Lookup tables: marker name / RNA identifier -> row index (0 .. n-1)
# in the order the names appear in the corresponding data frame.
marker_to_loc = {
    name: loc
    for name, loc in zip(genotypes_df["RQTL_name"], np.arange(markers_n))
}
RNA_to_loc = {
    name: loc
    for name, loc in zip(expression_df["IDENTIFIER"], np.arange(rna_n))
}

In [7]:
# Helpers for a single (gene, marker) pair:
#   * split the gene's expression values into two groups according to the
#     inheritance pattern (allele 0 or 1) observed at the marker;
#   * test whether the two groups differ significantly;
#   * plot the two groups as point clouds to check visually whether the
#     marker correlates with RNA expression.

# Known limitation: there is no FDR correction yet.
# At the very least a permutation test should be added, but it has to run
# after the linkage analysis: performing it for every tested pair would
# slow the program down by roughly an order of magnitude.

# Accumulated wall-clock time (in seconds, from time.time() differences) spent
# extracting and splitting expression values; it is incremented by
# test_linkage and by the graph-construction loop.
ftime = 0.0

def expression_by_RNA_and_marker(RNA_name, marker_name):
    """Split one gene's expression values by the allele seen at a marker.

    Parameters
    ----------
    RNA_name : key of ``RNA_to_loc`` selecting a row of ``expression_matrix``.
    marker_name : key of ``marker_to_loc`` selecting a row of ``genotypes_matrix``.

    Returns
    -------
    (from_BY, from_RM) : tuple of two arrays
        Expression values at the positions where the marker row equals 0
        (first array) and 1 (second array). Positions holding any other
        genotype value end up in neither group.

    Raises
    ------
    KeyError if either name is missing from its lookup table.
    """
    expression_row = expression_matrix[RNA_to_loc[RNA_name]]
    genotype_row = genotypes_matrix[marker_to_loc[marker_name]]

    # Boolean-mask selection, once per allele code (0 -> BY, 1 -> RM)
    by_group, rm_group = (
        expression_row[genotype_row == allele] for allele in (0, 1)
    )
    return by_group, rm_group


# For the given pair (expressed gene, marker) test the hypothesis that the
# inherited variant of the marker significantly influences gene expression
def test_linkage(RNA_name, marker_name, eps=1e-5):
    """Mann-Whitney U test of a gene's expression split by a marker's allele.

    Parameters
    ----------
    RNA_name : gene identifier understood by ``expression_by_RNA_and_marker``.
    marker_name : marker identifier understood by ``expression_by_RNA_and_marker``.
    eps : significance threshold applied to the p-value (default 1e-5).

    Returns
    -------
    (is_linked, pvalue) : ``pvalue <= eps`` and the p-value itself.

    Side effects
    ------------
    Adds the time spent splitting the expression values to the global ``ftime``.
    """
    global ftime
    fstart_time = time.time()
    from_BY, from_RM = expression_by_RNA_and_marker(RNA_name, marker_name)
    ftime += time.time() - fstart_time

    # The alternative is stated explicitly: the default of mannwhitneyu has
    # differed between SciPy releases (one-sided vs. two-sided), which would
    # make the p-value -- and therefore the eps cut-off -- version-dependent.
    _, pvalue = stats.mannwhitneyu(
        x=from_BY, y=from_RM, alternative="two-sided"
    )
    return (pvalue <= eps, pvalue)


# Divide expression data by inherited marker
# variant and then plot the resulting groups
def plot_expression_to_marker_correlation(RNA_name, marker_name):
    """Scatter-plot a gene's expression values grouped by a marker's allele.

    Values from the allele-0 group are drawn around x = 1 and values from the
    allele-1 group around x = 2 (with a small random horizontal jitter so
    that points do not overlap). The figure title shows the p-value returned
    by ``test_linkage`` for the same pair, and the figure is saved to
    ``./img/<RNA_name>_to_<marker_name>.png`` and then closed.

    Parameters
    ----------
    RNA_name : gene identifier understood by ``expression_by_RNA_and_marker``.
    marker_name : marker identifier understood by ``expression_by_RNA_and_marker``.

    Side effects
    ------------
    Writes a PNG file, sets ``plt.rcParams["axes.facecolor"]``, draws from
    NumPy's global RNG, and (through ``test_linkage``) increments ``ftime``.
    """
    from_BY, from_RM = expression_by_RNA_and_marker(RNA_name, marker_name)

    # The p-value must be computed for THIS pair; the previous code read an
    # undefined local name and could only pick up a stale global value.
    _, pvalue = test_linkage(RNA_name, marker_name)

    points_n = len(from_BY) + len(from_RM)
    xlabels = (
        np.repeat([1.0, 2.0], [len(from_BY), len(from_RM)])
        + np.random.normal(0, 0.01, points_n)
    )
    # Concatenate the groups: `+` on ndarrays adds element-wise instead
    # of joining them, which produced wrong (or mis-sized) y values.
    ylabels = np.concatenate([from_BY, from_RM])

    plt.figure(figsize=(20, 10))
    plt.rcParams["axes.facecolor"] = 'white'
    plt.title("p-value: {}".format(pvalue))
    plt.xlabel("class label")
    plt.ylabel("expression value")
    plt.scatter(
        x=xlabels, y=ylabels,
        c=ylabels, cmap=cm.jet
    )
    plt.savefig("./img/" + RNA_name + "_to_" + marker_name + ".png")
    plt.close()

In [18]:
# Seed NumPy's global RNG from the current time. Note that this makes
# random draws differ between runs; use a fixed integer instead if
# reproducibility is required.
np.random.seed(int(time.time()))

marker_list = genotypes_df["RQTL_name"].tolist()
RNA_list = expression_df["IDENTIFIER"].tolist()

# A (gene, marker) pair is linked when its p-value does not exceed this value
PVALUE_THRESHOLD = 1e-5

start_time = time.time()
linkage_graph = nx.Graph()
# Construct a bipartite graph of interactions:
# gene nodes carry bipartite=0, marker nodes carry bipartite=1,
# and an edge connects every significantly linked (gene, marker) pair.
for marker_pos, marker_name in enumerate(marker_list):
    fstart_time = time.time()
    inheritance_patterns = genotypes_matrix[marker_pos]
    ftime += time.time() - fstart_time

    # The allele masks depend on the marker only, so build them once per marker
    BY_allele_pos = (inheritance_patterns == 0)
    RM_allele_pos = (inheritance_patterns == 1)

    # A marker for which one of the groups is empty cannot be tested; skip it
    # rather than let the test fail in the middle of this long-running loop.
    if not (BY_allele_pos.any() and RM_allele_pos.any()):
        continue

    for RNA_pos, RNA_name in enumerate(RNA_list):
        fstart_time = time.time()
        expression_values = expression_matrix[RNA_pos]
        from_BY = expression_values[BY_allele_pos]
        from_RM = expression_values[RM_allele_pos]
        ftime += time.time() - fstart_time

        # Explicit two-sided alternative: the default of mannwhitneyu has
        # differed between SciPy releases, which would change the p-values.
        _, pvalue = stats.mannwhitneyu(
            x=from_BY, y=from_RM, alternative="two-sided"
        )

        if pvalue <= PVALUE_THRESHOLD:
            # add_node on an existing node only re-sets the same attribute,
            # so no has_node() check is needed. The nodes are added before the
            # edge so that both always carry their "bipartite" attribute.
            linkage_graph.add_node(RNA_name, bipartite=0)
            linkage_graph.add_node(marker_name, bipartite=1)
            linkage_graph.add_edge(RNA_name, marker_name)
end_time = time.time()

print("Graph construction: {}".format(end_time - start_time))
print("ftime: {}".format(ftime))

Graph construction: 4337.505954504013
ftime: 87.03302097320557


**TODO**:
    1.  Find QTLs for every gene using a 5% p-value threshold
    2.  Validate the QTLs using a permutation test and calculate the FDR
    3.  Construct and visualize a bipartite graph of the
        linkages found during the experiments
    4.  Also, it is worth counting the linkages for every
        marker and plotting them as a bar chart, placing the markers
        according to their position on the chromosome.
**Problems**:
    1. Graph construction is really slow: 6.68 s for 2000 test pairs.
       The bottleneck still needs to be identified.
    2. An unexplained division by zero occurs _every time_ since
       the function was rewritten to work with numpy.ndarray

In [20]:
# The built-in bipartite.sets() behaved unexpectedly here (possibly only for
# undirected graphs -- still to be checked on a toy example), so the two
# node sets are rebuilt from the "bipartite" node attribute instead.

top_v, bottom_v = [], []
for node, data in linkage_graph.nodes(data=True):
    # bipartite == 0 marks gene nodes, anything else marks marker nodes
    if data["bipartite"] == 0:
        bottom_v.append(node)
    else:
        top_v.append(node)

# To plot a bipartite graph correctly, the positions
# of the vertices must be written down explicitly:
# markers in the column x = 1, genes in the column x = 2.

pos = dict()
pos.update((n, (1, 3*i)) for i, n in enumerate(top_v))
pos.update((n, (2, 3*i)) for i, n in enumerate(bottom_v))

# One degree per node, in the graph's own node order (the order nx.draw uses
# by default). Querying degree(node) works on networkx 1.x and 2.x alike,
# whereas degree().values() exists only on 1.x.
node_degrees = [linkage_graph.degree(node) for node in linkage_graph.nodes()]

plt.figure(figsize=(20, 100))
nx.draw(
    linkage_graph,
    with_labels=True,
    node_size=50,
    width=3.0,  # edge line width; "edge_width" is not a draw keyword
    pos=pos,
    node_color=node_degrees,
    edge_color='b',
    cmap=plt.cm.Blues,
    alpha=0.5,
    font_size=8
)
plt.savefig("./img/graph.png")
plt.close()

    Future behavior will be consistent with the long-time default:
    plot commands add elements without first clearing the
    Axes and/or Figure.
  b = plt.ishold()


    Future behavior will be consistent with the long-time default:
    plot commands add elements without first clearing the
    Axes and/or Figure.
  plt.hold(b)


In [21]:
# Extract the marker nodes together with the number of linkages to each,
# ordered by the marker's row index in the genotype table
# (assumed to follow genome location -- TODO confirm).

# marker_to_loc is the name -> row-index lookup built earlier in the notebook;
# the previously used name `marker_loc` is not defined anywhere.
# degree(node) is queried per node so that this works on networkx 1.x and 2.x.
marker_nodes = sorted(
    ((node, linkage_graph.degree(node)) for node in top_v),
    key=lambda p: marker_to_loc[p[0]]
)

# Split the (name, degree) pairs into two parallel lists;
# unlike zip(*pairs) this also works when there are no marker nodes.
m_names = [name for name, _ in marker_nodes]
m_degrees = [degree for _, degree in marker_nodes]

plt.figure(figsize=(40, 20))
plt.plot(m_degrees)
plt.xticks(
    range(len(marker_nodes)),
    m_names,
    rotation="vertical"
)
plt.savefig("./img/linkage_map.png")
plt.close()

In [34]:
# Dump the linkage graph as text: for every marker node one header line
# "<marker>: <degree>", followed by one line per gene linked to it.
# The context manager guarantees the file is closed even if a write fails.
with open("./data/linkage_graph.txt", "w+") as graph_file:
    for u in top_v:
        graph_file.write("{}: {}\n".format(u, linkage_graph.degree(u)))
        for v in linkage_graph[u]:
            graph_file.write("{}\n".format(v))