This notebook performs eQTL (expression quantitative trait locus) identification.

General workflow: for each marker, determine whether it is a QTL for some expressed gene. The strains are divided into groups by the marker variant they inherited, a statistical test compares the gene's expression values between these groups, and an FDR correction is applied afterwards.

In [125]:
%matplotlib inline

import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import cm
import numpy as np
import time
from scipy import stats
import networkx as nx

In [None]:
# BY and RM strains values must be averaged and placed into two separate columns

# Both files are parsed with pd.read_table, i.e. as tab-separated text.
expression_df = pd.read_table('./data/brem2005_RNA_expression.csv')
genotypes_df = pd.read_table('./data/jbloom_strains_genotyped.tab')

markers_n = genotypes_df.shape[0]
rna_n = expression_df.shape[0]

# Add columns with the parental genotypes
# to use their averaged expression data:
# "BY" is coded 0 and "RM" is coded 1 at every marker.
# The arrays are assigned directly; wrapping them in a DataFrame would make
# pandas align on index labels and could silently introduce NaNs.
genotypes_df["BY"] = np.zeros(markers_n, dtype="int")
genotypes_df["RM"] = np.ones(markers_n, dtype="int")

In [147]:
# Lookup tables: name -> row number in the corresponding dataframe.
# If a name occurs more than once, the last occurrence wins.
marker_loc = {
    marker: row
    for marker, row in zip(genotypes_df["RQTL_name"], np.arange(markers_n))
}
RNA_loc = {
    identifier: row
    for identifier, row in zip(expression_df["IDENTIFIER"], np.arange(rna_n))
}

In [127]:
# Average the expression values over all parental strains.
# Add averaged columns called "BY" and "RM" to expression dataframe
# and remove the parental strains' columns

# Positional column ranges: columns 2..7 go into BY_expr_df,
# columns 8..19 go into RM_expr_df.
BY_expr_df, RM_expr_df = (
    expression_df.iloc[:, first:last]
    for first, last in ((2, 8), (8, 20))
)


def avg_expr_df(df):
    """Return the row-wise mean of a dataframe, ignoring NaN entries.

    Parameters
    ----------
    df : pandas.DataFrame
        Numeric table; each row is averaged over its columns.

    Returns
    -------
    numpy.ndarray
        One float per row of ``df``. A row that contains no non-NaN
        value (or a frame without columns) yields NaN for that row.
    """
    values = np.asarray(df, dtype=float)
    present = ~np.isnan(values)

    # NaNs are the reason the columns cannot simply be added together:
    # each row is divided by the number of values it actually contains.
    counts = present.sum(axis=1)
    totals = np.where(present, values, 0.0).sum(axis=1)

    averaged = np.full(shape=values.shape[0], fill_value=np.nan)
    # Rows with zero valid entries keep their NaN placeholder.
    np.divide(totals, counts, out=averaged, where=counts > 0)
    return averaged


# Attach the per-gene parental averages as plain arrays (one value per row).
# Wrapping them in a DataFrame would trigger index alignment, which is
# unnecessary here and fragile if the frame ever gets a non-default index.
expression_df["BY"] = avg_expr_df(BY_expr_df)
expression_df["RM"] = avg_expr_df(RM_expr_df)

# Non-averaged expression data of parental strains is no longer necessary,
# so the appropriate columns are dropped from the dataframe.
# NOTE: this cell is not idempotent -- re-running it without reloading the
# data would slice and drop the wrong columns.
expression_df = expression_df.drop(
    BY_expr_df.columns.tolist() + RM_expr_df.columns.tolist(),
    axis=1
)

In [153]:
# Strain columns of the expression table: everything after the two leading
# annotation columns (ID_REF, IDENTIFIER).
strain_names = expression_df.columns.tolist()[2:]

# The parental replicate columns have already been dropped from
# expression_df, so no positional offset is needed any more. The former
# `strain_names[18:]` slice dated from reading the names out of the strains
# file and silently discarded 18 strains of real data.
# Only strains that also have a genotype column are kept, because both
# tables are indexed by strain name downstream; this includes the averaged
# "BY" and "RM" columns.
genotyped_strains = set(genotypes_df.columns)
progeny_names = [name for name in strain_names if name in genotyped_strains]

In [129]:
# Preview: first five rows, restricted to the first 110 columns.
expression_df.head().iloc[:, :110]

Unnamed: 0,ID_REF,IDENTIFIER,1_1_d,1_3_d,1_4_d,1_5_c,2_2_d,2_3_d,2_4_a,2_5_d,...,21_4_d,21_5_c,22_2_d,22_3_b,22_4_d,22_5_d,23_3_d,23_5_d,24_1_d,25_1_d
0,0,TRS120,0.1235,0.079,-0.023,0.105,0.1205,0.1425,0.015,-0.276,...,-0.033,0.2555,0.3165,0.1775,0.0615,0.064,-0.4475,0.109,0.1445,0.313
1,1,SCC2,0.2955,0.3965,-0.0795,0.053,0.3795,0.205,0.1655,-0.2225,...,0.2935,0.54,0.423,0.2575,0.077,0.228,0.3815,0.239,0.1045,0.3535
2,2,FLO1,0.718,-0.009,0.903,0.1855,0.904,0.2455,-1.6495,-0.036,...,-0.155,-0.137,0.4565,0.552,0.0735,0.261,-0.0545,0.074,0.269,0.437
3,3,MYO3,-0.022,0.2745,0.3285,0.3425,0.1615,0.3145,-0.14,0.116,...,-0.088,-0.134,0.08,0.0725,-0.0875,-0.0995,-0.5545,0.132,0.222,0.243
4,4,PDR10,-0.013,0.21,0.254,0.2925,0.604,0.4395,-0.137,-0.0465,...,-0.061,-0.1505,-0.237,-0.0255,0.036,0.113,0.1615,0.011,0.2985,-0.076


In [130]:
# Preview: first five rows, restricted to the first 128 columns.
genotypes_df.head().iloc[:, :128]

Unnamed: 0,RQTL_name,2_7_a,2_7_b,2_7_c,2_7_d,21_5_c,22_1_d,22_2_d,19_2_c,19_3_c,...,9_7_d,26_2_d,19_1_c,21_3_d,21_1_d,15_6_c,17_1_a,name,chromosome,position
0,YAL069W_1,0,1,1,0,1,0,0,0,0,...,0,1,1,2,0,0,1,YAL069W,1,483
1,YAL069W_2,0,1,1,0,1,2,0,0,0,...,0,1,1,2,2,0,1,YAL069W,1,484
2,NAL013C_3,0,2,1,0,1,0,0,0,0,...,0,1,1,0,0,0,1,NAL013C,1,3220
3,NAL013C_4,0,2,1,0,1,0,0,0,0,...,0,1,1,0,0,0,1,NAL013C,1,3223
4,NAL013C_5,0,2,1,0,1,0,0,0,0,...,0,1,1,0,0,0,1,NAL013C,1,3232


In [131]:
# Divide all progeny into groups by their inheritance pattern
# for a given genetic marker, and then plot the data clouds
# to visually observe if there is any correlation between marker
# and RNA expression 

# What is the asymptotics of this function?

# Divide expression data for a given gene in two groups,
# based on inheritance pattern of a given marker
def expression_by_RNA_and_marker(RNA_name, marker_name):
    """Split a gene's expression values by the marker variant of each strain.

    Parameters
    ----------
    RNA_name
        Value looked up in ``expression_df["IDENTIFIER"]``; the first
        matching row is used.
    marker_name
        Value looked up in ``genotypes_df["RQTL_name"]``; the first
        matching row is used.

    Returns
    -------
    (from_BY, from_RM) : tuple of two lists
        Expression values, in the order of ``progeny_names``, of the strains
        whose genotype at the marker is 0 and 1 respectively. Strains with a
        NaN expression value or any other genotype code are left out.

    Raises
    ------
    IndexError
        If ``RNA_name`` or ``marker_name`` is not present in its table.
    """
    # Work with row *positions* throughout, so the lookup stays correct
    # even when a dataframe does not carry the default 0..n-1 index.
    rna_hits = np.flatnonzero((expression_df["IDENTIFIER"] == RNA_name).values)
    if rna_hits.size == 0:
        raise IndexError("unknown RNA identifier: {!r}".format(RNA_name))
    marker_hits = np.flatnonzero((genotypes_df["RQTL_name"] == marker_name).values)
    if marker_hits.size == 0:
        raise IndexError("unknown marker: {!r}".format(marker_name))

    # Extract both rows once instead of doing two column lookups per strain.
    strains = list(progeny_names)
    expression = np.asarray(
        expression_df.iloc[rna_hits[0]][strains], dtype=float
    )
    genotype = genotypes_df.iloc[marker_hits[0]][strains]

    measured = ~np.isnan(expression)
    from_BY = expression[measured & (genotype == 0).values].tolist()
    from_RM = expression[measured & (genotype == 1).values].tolist()
    return from_BY, from_RM


# For the given pair (expressed gene, marker) test the hypothesis
# that inherited variant of a marker influences gene expression significantly
def test_linkage(RNA_name, marker_name, eps=1e-5):
    """Mann-Whitney U test of a gene's expression against a marker variant.

    Parameters
    ----------
    RNA_name, marker_name
        Passed on to ``expression_by_RNA_and_marker``.
    eps : float
        Significance threshold applied to the p-value.

    Returns
    -------
    (is_linked, pvalue) : tuple
        ``is_linked`` is true when ``pvalue <= eps``. If either group has no
        values the test is undefined and ``(False, nan)`` is returned.
    """
    from_BY, from_RM = expression_by_RNA_and_marker(RNA_name, marker_name)
    if not from_BY or not from_RM:
        # Nothing to compare: report "no linkage" instead of crashing a
        # long batch of tests on a single degenerate pair.
        return (False, np.nan)
    # The hypothesis has no direction, so the two-sided alternative is
    # requested explicitly; the default has differed between SciPy versions.
    statistic, pvalue = stats.mannwhitneyu(
        x=from_BY, y=from_RM, alternative="two-sided"
    )
    return (pvalue <= eps, pvalue)


# Divide expression data by inherited marker
# variant and then plot the resulting groups
def plot_expression_to_marker_correlation(RNA_name, marker_name):
    """Scatter-plot a gene's expression values grouped by marker variant.

    Values of strains with genotype 0 are drawn around x = 1 and values of
    strains with genotype 1 around x = 2, with a small random horizontal
    jitter. The figure title shows the p-value returned by ``test_linkage``
    for this pair. The figure is written to
    ``./img/<RNA_name>_to_<marker_name>.png`` and then closed.

    Parameters
    ----------
    RNA_name, marker_name
        Passed on to ``expression_by_RNA_and_marker`` and ``test_linkage``.
    """
    from_BY, from_RM = expression_by_RNA_and_marker(RNA_name, marker_name)
    # Compute the p-value for *this* pair. The function used to read a
    # global `pvalue` left over from whichever test happened to run last.
    _, pvalue = test_linkage(RNA_name, marker_name)

    class_labels = np.append(
        np.full(len(from_BY), 1),
        np.full(len(from_RM), 2)
    )
    # Jitter keeps points with equal class label from lying on one line.
    jitter = np.random.normal(0, 0.01, len(from_BY) + len(from_RM))
    xlabels = class_labels + jitter
    ylabels = np.array(from_BY + from_RM)

    fig, ax = plt.subplots(figsize=(20, 10))
    # Set the background on this axes only, not through global rcParams.
    ax.set_facecolor('white')
    ax.set_title("p-value: {}".format(pvalue))
    ax.set_xlabel("class label")
    ax.set_ylabel("expression value")
    ax.scatter(
        x=xlabels, y=ylabels,
        c=ylabels, cmap=cm.jet
    )
    fig.savefig("./img/" + RNA_name + "_to_" + marker_name + ".png")
    plt.close(fig)

**TODO**:
    1.  Find QTLs for every gene using a 5% p-value threshold
    2.  Validate QTLs using a permutation test and calculate the FDR
    3.  Construct and visualize a bipartite graph of
        the linkages found during the experiments
    4.  Also, it's worth counting the linkages for every
        marker gene and plotting them as a bar chart, placing the markers
        according to their position on the chromosome.

In [192]:
# When and how to apply multiple-testing correction (FDR, q-value)?

# A fixed seed makes the random subsample -- and the graph and plots derived
# from it -- reproducible. Seeding from the clock gave a different sample on
# every run. Set RANDOM_SEED to None to draw a fresh sample.
RANDOM_SEED = 42
SAMPLE_SIZE = 2000        # number of random (gene, marker) pairs to test
PVALUE_THRESHOLD = 0.05   # a pair is linked when its p-value is <= this
np.random.seed(RANDOM_SEED)

# Extract subsample: genes and markers are drawn independently,
# with replacement, and paired up position by position.
test_pairs = list(
    zip(
        np.random.choice(
            expression_df["IDENTIFIER"].tolist(), SAMPLE_SIZE, replace=True
        ),
        np.random.choice(
            genotypes_df["RQTL_name"].tolist(), SAMPLE_SIZE, replace=True
        )
    )
)

# Check linkage for all entries of this subsample
# and construct a bipartite graph of interactions:
# genes carry bipartite=0, markers carry bipartite=1.
linkage_graph = nx.Graph()
for RNA_name, marker_name in test_pairs:
    result, pvalue = test_linkage(RNA_name, marker_name, eps=PVALUE_THRESHOLD)
    if not result:
        continue
    # A node that already exists keeps the attributes it was created with.
    if not linkage_graph.has_node(RNA_name):
        linkage_graph.add_node(RNA_name, bipartite=0)
    if not linkage_graph.has_node(marker_name):
        linkage_graph.add_node(marker_name, bipartite=1)
    linkage_graph.add_edge(RNA_name, marker_name)

In [193]:
# Built-in bipartite.sets() works strangely
# NOTE(review): possibly because the graph is disconnected -- check on a toy
# example. The two node sets are therefore read from the "bipartite"
# attribute that was stored when the nodes were added.

top_v = [
    node for node, data in linkage_graph.nodes(data=True)
    if data["bipartite"] != 0
]
bottom_v = [
    node for node, data in linkage_graph.nodes(data=True)
    if data["bipartite"] == 0
]

# To plot a bipartite graph correctly, the positions
# of the vertices must be written down explicitly:
# one column at x=1 for top_v and one at x=2 for bottom_v.

pos = dict()
pos.update((n, (1, 2*i)) for i, n in enumerate(top_v))
pos.update((n, (2, 2*i)) for i, n in enumerate(bottom_v))

# Colour every node by its degree. Going through dict() works both with
# networkx 1.x (degree() returns a dict) and 2.x+ (degree() returns a view),
# and the list is built in node order, as the drawing function expects.
degree_of = dict(linkage_graph.degree())
node_degrees = [degree_of[node] for node in linkage_graph.nodes()]

# The labels overlap significantly on a small canvas, hence the tall figure.

plt.figure(figsize=(20, 30))
nx.draw(
    linkage_graph,
    with_labels=True,
    node_size=50,
    # `width` is the edge line width; `edge_width` is not a networkx
    # drawing keyword and was ignored or rejected depending on the version.
    width=3.0,
    pos=pos,
    node_color=node_degrees,
    edge_color='b',
    cmap=plt.cm.Blues,
    alpha=0.5
)
plt.savefig("./img/graph.png")
plt.close()

    Future behavior will be consistent with the long-time default:
    plot commands add elements without first clearing the
    Axes and/or Figure.
  b = plt.ishold()
    Future behavior will be consistent with the long-time default:
    plot commands add elements without first clearing the
    Axes and/or Figure.
  plt.hold(b)


In [194]:
# Extract the marker-nodes and number of linkages to them
# preserving their order based on genome location
# (marker_loc maps a marker name to its row number in genotypes_df).

# dict(...) accepts both the dict returned by networkx 1.x and the
# degree view returned by networkx 2.x+.
marker_degrees = dict(linkage_graph.degree(top_v))
marker_nodes = sorted(
    marker_degrees.items(),
    key=lambda p: marker_loc[p[0]]
)

# Pythonic way of unzipping a list of tuples
# into two separate lists of their coordinates.
# With no linkages at all there is nothing to unzip.

if marker_nodes:
    m_names, m_degrees = map(list, zip(*marker_nodes))
else:
    m_names, m_degrees = [], []

plt.figure(figsize=(40, 20))
plt.plot(m_degrees)
plt.xticks(
    range(len(marker_nodes)),
    m_names,
    rotation="vertical"
)
plt.xlabel("marker")
plt.ylabel("number of linked genes")
plt.savefig("./img/linkage_map.png")
plt.close()