In [24]:
% autosave 15

import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import cm
import numpy as np
import random
import time
import seaborn as sns

Autosaving every 15 seconds


In [25]:
expression_df = pd.read_table('./data/brem2005_RNA_expression.csv')
genotypes_df = pd.read_table('./data/jbloom_strains_genotyped.tab')

In [26]:
strain_data = pd.read_csv('./data/brem2005_strains.csv')
strain_names = strain_data['Strain name'].tolist()
progeny_names = strain_names[18:]

In [27]:
expression_df.head()

Unnamed: 0,ID_REF,IDENTIFIER,By1,By2,BY3,BY4,BY5,BY6,RM1_1,RM2_1,...,22_4_d,22_5_d,23_3_d,23_5_d,24_1_d,25_1_d,25_3_d,25_4_d,26_1_d,26_2_d
0,0,TRS120,-0.062,-0.094,0.0925,-0.1135,-0.289,-0.042,-0.057,-0.014,...,0.0615,0.064,-0.4475,0.109,0.1445,0.313,-0.0185,-0.036,0.121,0.0575
1,1,SCC2,0.134,0.066,,-0.046,-0.409,-0.41,0.4235,0.2045,...,0.077,0.228,0.3815,0.239,0.1045,0.3535,-0.03,0.0775,0.4845,0.115
2,2,FLO1,-0.1165,-0.209,,-0.005,0.078,,0.897,0.646,...,0.0735,0.261,-0.0545,0.074,0.269,0.437,0.2475,0.084,-0.157,0.004
3,3,MYO3,-0.1595,-0.228,,0.183,-0.102,0.1815,-0.1555,0.0445,...,-0.0875,-0.0995,-0.5545,0.132,0.222,0.243,0.183,-0.004,-0.1085,0.222
4,4,PDR10,0.0395,-0.001,,0.043,-0.162,0.156,0.2075,0.1125,...,0.036,0.113,0.1615,0.011,0.2985,-0.076,-0.2335,0.1015,-0.201,-0.3375


In [28]:
genotypes_df[genotypes_df.columns.tolist()[0:128]].head()

Unnamed: 0,RQTL_name,2_7_a,2_7_b,2_7_c,2_7_d,21_5_c,22_1_d,22_2_d,19_2_c,19_3_c,...,9_7_d,26_2_d,19_1_c,21_3_d,21_1_d,15_6_c,17_1_a,name,chromosome,position
0,YAL069W_1,0,1,1,0,1,0,0,0,0,...,0,1,1,2,0,0,1,YAL069W,1,483
1,YAL069W_2,0,1,1,0,1,2,0,0,0,...,0,1,1,2,2,0,1,YAL069W,1,484
2,NAL013C_3,0,2,1,0,1,0,0,0,0,...,0,1,1,0,0,0,1,NAL013C,1,3220
3,NAL013C_4,0,2,1,0,1,0,0,0,0,...,0,1,1,0,0,0,1,NAL013C,1,3223
4,NAL013C_5,0,2,1,0,1,0,0,0,0,...,0,1,1,0,0,0,1,NAL013C,1,3232


In [57]:
# Divide all progeny into groups by their inheritance pattern
# for a given genetic marker, and then plot the data clouds
# to visually observe if there is any correlation between marker
# and RNA expression 


def plot_expression_to_marker_correlation(RNA_name, marker_name):
    marker_pos = genotypes_df[genotypes_df.RQTL_name == marker_name].index.tolist()[0]
    RNA_pos = expression_df[expression_df.IDENTIFIER == RNA_name].index.tolist()[0]
    from_BY, from_RM = [], []
    for strain in progeny_names:
        inheritance_pattern = genotypes_df[strain].iloc[marker_pos]
        expression_value = expression_df[strain].iloc[RNA_pos]
        if inheritance_pattern == 0:
            from_BY.append(expression_value)
        elif inheritance_pattern == 1:
            from_RM.append(expression_value)
    statistic, pvalue = scipy.stats.mannwhitneyu(
        x=from_BY,
        y=from_RM
    )
    if pvalue <= 1e-7:
        xlabels = np.append(
                        np.full((1, len(from_BY)), 1), 
                        np.full((1, len(from_RM)), 2))\
                        + np.random.normal(0, 0.01, len(from_BY) + len(from_RM)
                )  
        ylabels = np.array(from_BY + from_RM)
        plt.figure(figsize=(20, 10))
        plt.rcParams["axes.facecolor"] = 'white'
        plt.title("p-value: {}".format(pvalue))
        plt.xlabel("class label")
        plt.ylabel("expression value")
        plt.scatter(
            x=xlabels, y=ylabels,
            c=ylabels, cmap=cm.jet
        )
        plt.savefig("./img/" + RNA_name + "_to_" + marker_name + ".png")
        plt.close()
    return pvalue

In [58]:
# It's quite hard to identify QTLs just by looking at the point clouds.
# I guess, the next step is to apply MWU and then plot the pairs with 
# the highest predicted linkage, just to verify the validity of a program
np.random.seed(int(time.time()))
test_pairs = list(
                zip( 
                    np.random.choice(expression_df["IDENTIFIER"].tolist(), 10000, replace=True),
                    np.random.choice(genotypes_df["RQTL_name"].tolist(), 10000, replace=True)
                )
            )
min_pvalue = 1.
argmin_RNA, argmin_marker_name = "", ""
for RNA_name, marker_name in test_pairs:
    pvalue = plot_expression_to_marker_correlation(RNA_name, marker_name)
    if pvalue < min_pvalue:
        min_pvalue = pvalue
        argmin_RNA = RNA_name
        argmin_marker_name = marker_name
print("{}: {} to {}".format(min_pvalue, argmin_RNA, argmin_marker_name))

1.8012250539173446e-18: ARR1 to gKR08_1850
