In this notebook the eQTL identification is performed.

General workflow:
- Extract two dataframes: marker-based genotype data and expression data. Each column represents a strain. All columns are sorted according to the strain name.
- Transform them into matrices, perform the Mann–Whitney U (MWU) test for every (marker, expressed gene) pair and save the resulting p-values into a list. Use multiprocessing to speed up the computation.
	- For each marker, divide the strains by inherited variant.
	- For each gene, divide the expression data in two groups.
	- Test null hypothesis using MWU test.
- Adjust the p-values using the Benjamini–Hochberg procedure.
- Construct the bipartite linkage graph using calculated q-values.
- Plot the graph and the linkage map.

TODO: 
- Прологарифмировать данные белковой экспрессии? — Это бесполезно, т.к. тест ранговый, а на монотонность логарифм не влияет.
- Попытаться выяснить как можно больше про данные белковой экспрессии: 
	- В столбцах со средними не средние арифметические. Что там за данные, насколько существенно отклонение? Нужно провести усреднение самостоятельно.
	- Гистограмма p-values практически ровная.
	- Нормализация не сказывается на результате (да и без неё распределения довольно похожи на нормальное). + прочесть, для чего, в принципе, нужна нормализация, какие проблемы она решает, почему её нужно делать в моём случае. + прочесть о том, как были нормализованы данные экспрессии РНК и повторить аналогичную процедуру.
	- Даже без FDR-коррекции находятся не те linkages: это ОЧЕНЬ странно.
	- Та же функция корректно работает на данных экспрессии РНК.
- Написать функцию, которая будет рисовать таблицу, где клетки будут окрашены в зависимости от унаследованного варианта, а в конце будут два столбца с центрами масс облаков точек.

In [1]:
%matplotlib inline

import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import cm
import numpy as np
import time
from scipy import stats
import networkx as nx
import multiprocessing as mp
from statsmodels.sandbox.stats.multicomp import multipletests
from functools import partial

%autosave 15

Autosaving every 15 seconds


In [2]:
# Expression was not measured for every genotyped strain, so each genotype
# table is cut down to the strain columns of the matching expression table
# (the leading "RQTL_name" column is always kept) and saved next to it.
genotype_df = pd.read_table('./data/genotypes.csv')

rna_expression_df = pd.read_table('./data/rna_expression_avg.csv')
rna_strains = list(rna_expression_df.columns[1:])
rna_genotype_df = genotype_df[["RQTL_name"] + rna_strains]
rna_genotype_df.to_csv("./data/rna_genotypes.csv", sep='\t', index=False)

# NOTE: "./data/test_df.csv" is the file written by the replicate-averaging
# cell of this notebook, so that cell's output must exist before this runs.
protein_expression_df = pd.read_table("./data/test_df.csv")
protein_strains = list(protein_expression_df.columns[1:])
protein_genotype_df = genotype_df[["RQTL_name"] + protein_strains]
protein_genotype_df.to_csv("./data/protein_genotypes.csv", sep='\t', index=False)


In [13]:
def _observed_values(values):
    """Return *values* as a plain 1-D array without masked or NaN entries."""
    observed = np.ma.compressed(values)
    return observed[~np.isnan(observed)]


def calculate_p_values(genotype_matrix, expression_matrix, sample_pair):
    """Run the Mann-Whitney U test for every (marker, gene) pair of a chunk.

    Parameters
    ----------
    genotype_matrix : 2-D array
        One row per marker, one column per strain.  A value of 0 puts the
        strain into the first group (``from_BY``), a value of 1 into the
        second group (``from_RM``); strains with any other value are left
        out of the test for that marker.
    expression_matrix : 2-D array or masked array
        One row per gene, columns in the same strain order as
        ``genotype_matrix``.  Masked and NaN entries are treated as missing
        measurements and dropped before testing.
    sample_pair : tuple
        ``(markers_chunk, offset)``: only the length of ``markers_chunk`` is
        used; the rows ``offset .. offset + len(markers_chunk) - 1`` of
        ``genotype_matrix`` are processed.

    Returns
    -------
    numpy.ndarray of float32
        Flat array of p-values in marker-major order: all genes for the first
        marker of the chunk, then all genes for the second one, and so on.
        Pairs for which the test is undefined (an empty group, or a
        ``ValueError`` from SciPy, which some releases raise when all values
        are identical) get a p-value of 1.0 so that one degenerate pair does
        not abort the whole chunk.
    """
    markers_chunk, offset = sample_pair
    gene_cnt = expression_matrix.shape[0]
    p_values = np.zeros(len(markers_chunk) * gene_cnt, dtype=np.float32)

    out_idx = 0
    for marker_rownum in range(offset, offset + len(markers_chunk)):
        genotype_row = genotype_matrix[marker_rownum]
        # The strain masks depend on the marker only: build them once per marker.
        by_mask = genotype_row == 0
        rm_mask = genotype_row == 1
        for expression_row in expression_matrix:
            from_BY = _observed_values(expression_row[by_mask])
            from_RM = _observed_values(expression_row[rm_mask])
            if from_BY.size == 0 or from_RM.size == 0:
                p_value = 1.0
            else:
                try:
                    # CPU hog: this call dominates the running time.
                    # NOTE(review): the default `alternative` (and hence
                    # one- vs two-sided p-values) may depend on the installed
                    # SciPy version -- confirm.
                    p_value = stats.mannwhitneyu(x=from_BY, y=from_RM)[1]
                except ValueError:
                    p_value = 1.0
            p_values[out_idx] = p_value
            out_idx += 1
    return p_values


def perform_analysis(genotype_df, expression_df, analysis_name, max_markers=100):
    """Find marker-gene linkages and write the linkage map and graph to disk.

    Workflow: compute a p-value for every (marker, gene) pair in parallel
    with ``calculate_p_values``, adjust them with the Benjamini-Hochberg
    procedure, connect every rejected pair in a bipartite graph, then save
    a plot of linkages per marker and a text dump of the graph.

    Parameters
    ----------
    genotype_df : pandas.DataFrame
        First column holds marker names, the remaining columns hold the
        per-strain genotype codes.
    expression_df : pandas.DataFrame
        First column holds gene names, the remaining columns hold the
        per-strain expression values (NaN where missing).
    analysis_name : str
        Prefix of the output files ``./img/<name>_linkage_map.png`` and
        ``./data/<name>_linkage_graph.txt``.
    max_markers : int or None, optional
        Only the first ``max_markers`` markers are analysed.  The default of
        100 keeps the previously hard-coded limit; pass ``None`` to analyse
        every marker.

    Returns
    -------
    None
        Prints the time spent computing p-values; prints
        "No linkages found" and writes nothing when no pair is significant.
    """
    marker_cnt = genotype_df.shape[0]

    # `.values` replaces `DataFrame.as_matrix`, which newer pandas no longer has.
    genotype_matrix = genotype_df.iloc[:, 1:].values
    expression_matrix = expression_df.iloc[:, 1:].values

    # Per-gene z-score computed on a masked array in which NaNs are masked.
    expression_matrix = stats.zscore(
        np.ma.array(
            expression_matrix,
            mask=np.isnan(expression_matrix)
        ),
        axis=1
    )

    marker_list = genotype_df.iloc[:, 0].values
    gene_list = expression_df.iloc[:, 0].values
    analysed_markers = marker_list[:max_markers]

    # At least one worker: cpu_count() // 2 is 0 on a single-core machine.
    chunks_n = max(1, mp.cpu_count() // 2)
    marker_chunks = np.array_split(analysed_markers, chunks_n)
    # Offset of each chunk = number of markers in all preceding chunks.
    chunk_sizes = [len(chunk) for chunk in marker_chunks]
    chunk_offsets = np.concatenate(([0], np.cumsum(chunk_sizes)[:-1]))
    marker_samples = list(zip(marker_chunks, chunk_offsets))

    calculate_p_values_subroutine = partial(
        calculate_p_values,
        genotype_matrix, expression_matrix
    )

    pool = mp.Pool(processes=chunks_n)
    try:
        start_time = time.time()
        results = pool.map(calculate_p_values_subroutine, marker_samples)
        end_time = time.time()
    finally:
        # Release the worker processes even if a worker raised.
        pool.close()
        pool.join()
    print("Calculation of pvalues: {}".format(end_time - start_time))

    p_values = np.concatenate(results)
    if p_values.size == 0:
        # No markers or no genes: there is nothing to adjust or to link.
        print("No linkages found")
        return

    reject = multipletests(p_values, method="fdr_bh")[0]

    # Build the linkage graph from the rejected hypotheses.  The p-values are
    # laid out marker-major, so they reshape to (markers, genes).
    reject_matrix = np.asarray(reject).reshape(
        len(analysed_markers), len(gene_list)
    )
    linkage_graph = nx.Graph()
    for marker_idx, gene_idx in zip(*np.nonzero(reject_matrix)):
        marker_name = analysed_markers[marker_idx]
        gene_name = gene_list[gene_idx]
        if not linkage_graph.has_node(gene_name):
            linkage_graph.add_node(gene_name, bipartite=0)
        if not linkage_graph.has_node(marker_name):
            linkage_graph.add_node(marker_name, bipartite=1)
        linkage_graph.add_edge(gene_name, marker_name)

    if linkage_graph.number_of_nodes() == 0:
        print("No linkages found")
        return

    # The marker side of the graph is read back from the "bipartite" node
    # attribute instead of networkx's bipartite.sets().
    top_v = [
        node
        for node, data in linkage_graph.nodes(data=True)
        if data["bipartite"] != 0
    ]

    # Marker nodes with their number of linkages, ordered by the marker's
    # row in genotype_df.  dict(...) accepts the result of
    # degree() from both NetworkX 1.x and 2.x.
    marker_to_rownum = dict(zip(genotype_df.iloc[:, 0], np.arange(marker_cnt)))
    marker_nodes = sorted(
        dict(linkage_graph.degree(top_v)).items(),
        key=lambda p: marker_to_rownum[p[0]]
    )
    linkages = [degree for _, degree in marker_nodes]

    plt.figure(figsize=(40, 20))
    plt.plot(linkages)
    plt.savefig("./img/" + analysis_name + "_linkage_map.png")
    plt.close()

    # One "<marker>: <degree>" header line per marker, followed by the names
    # of the genes linked to it, one per line.
    with open("./data/" + analysis_name + "_linkage_graph.txt", "w") as graph_file:
        for u in top_v:
            graph_file.write("{}: {}\n".format(u, linkage_graph.degree(u)))
            for v in linkage_graph[u]:
                graph_file.write("{}\n".format(v))

In [14]:
# Run the analysis on the RNA expression data; the protein run stays disabled.
perform_analysis(
    genotype_df=rna_genotype_df,
    expression_df=rna_expression_df,
    analysis_name="eQTLs"
)
# perform_analysis(protein_genotype_df, protein_expression_df, "pQTLs")


Calculation of pvalues: 76.19243812561035


In [70]:
# Rebuild the marker-gene linkage graph from the table in "eRESULTS.csv"
# (first column: marker name, second column: gene name) and plot the number
# of linkages per marker.
# NOTE(review): judging by the output file name the table presumably comes
# from MatrixQTL -- confirm.
eQTL_df = pd.read_table(
    "eRESULTS.csv",
    sep='\t'
)

linkage_graph = nx.Graph()
for row in eQTL_df.itertuples(index=False):
    marker_name, gene_name = row[0], row[1]
    if not linkage_graph.has_node(gene_name):
        linkage_graph.add_node(gene_name, bipartite=0)
    if not linkage_graph.has_node(marker_name):
        linkage_graph.add_node(marker_name, bipartite=1)
    linkage_graph.add_edge(gene_name, marker_name)

# Split the nodes by the "bipartite" attribute: 0 = gene, otherwise marker.
top_v, bottom_v = [], []
for node, data in linkage_graph.nodes(data=True):
    if data["bipartite"] == 0:
        bottom_v.append(node)
    else:
        top_v.append(node)

# Row number of every marker in the genotype table, used as the sort key.
marker_to_rownum = {
    marker: rownum
    for rownum, marker in enumerate(rna_genotype_df.iloc[:, 0])
}

if linkage_graph.number_of_nodes() == 0:
    # Nothing to plot; unzipping an empty list below would raise ValueError.
    print("No linkages found")
    marker_nodes, marker_names, linkages = [], [], []
else:
    # dict(...) accepts the result of degree() from NetworkX 1.x and 2.x.
    marker_nodes = sorted(
        dict(linkage_graph.degree(top_v)).items(),
        key=lambda p: marker_to_rownum[p[0]]
    )
    marker_names, linkages = map(list, zip(*marker_nodes))

    plt.figure(figsize=(40, 20))
    plt.plot(linkages)
    plt.savefig("./img/" + "MatrixQTL" + "_eQTL_linkage_map.png")
    plt.close()


In [87]:
# Rebuild the marker-protein linkage graph from the table in "pRESULTS.csv"
# (first column: marker name, second column: gene/protein name) and plot the
# number of linkages per marker.
# NOTE(review): judging by the output file name the table presumably comes
# from MatrixQTL -- confirm.
pQTL_df = pd.read_table(
    "pRESULTS.csv",
    sep='\t'
)

linkage_graph = nx.Graph()
for row in pQTL_df.itertuples(index=False):
    marker_name, gene_name = row[0], row[1]
    if not linkage_graph.has_node(gene_name):
        linkage_graph.add_node(gene_name, bipartite=0)
    if not linkage_graph.has_node(marker_name):
        linkage_graph.add_node(marker_name, bipartite=1)
    linkage_graph.add_edge(gene_name, marker_name)

# Split the nodes by the "bipartite" attribute: 0 = gene, otherwise marker.
top_v, bottom_v = [], []
for node, data in linkage_graph.nodes(data=True):
    if data["bipartite"] == 0:
        bottom_v.append(node)
    else:
        top_v.append(node)

# Row number of every marker in the genotype table, used as the sort key.
marker_to_rownum = {
    marker: rownum
    for rownum, marker in enumerate(protein_genotype_df.iloc[:, 0])
}

if linkage_graph.number_of_nodes() == 0:
    # Nothing to plot; unzipping an empty list below would raise ValueError.
    print("No linkages found")
    marker_nodes, marker_names, linkages = [], [], []
else:
    # dict(...) accepts the result of degree() from NetworkX 1.x and 2.x.
    marker_nodes = sorted(
        dict(linkage_graph.degree(top_v)).items(),
        key=lambda p: marker_to_rownum[p[0]]
    )
    marker_names, linkages = map(list, zip(*marker_nodes))

    plt.figure(figsize=(40, 20))
    plt.plot(linkages)
    plt.savefig("./img/" + "MatrixQTL" + "_pQTL_linkage_map.png")
    plt.close()

In [3]:
# Write "CPY_" copies of the expression and genotype tables: tab-separated,
# without the index, missing values spelled "NA".  In the genotype copies the
# codes are remapped on the way out: 0 -> 0, 1 -> 2, 2 -> "NA".
export_options = dict(sep='\t', na_rep='NA', index=False)

rna_expression_df.to_csv(
    "data/CPY_rna_expression_avg.csv", **export_options
)
rna_genotype_df.replace([0, 1, 2], [0, 2, "NA"]).to_csv(
    "data/CPY_rna_genotypes.csv", **export_options
)
protein_expression_df.to_csv(
    "data/CPY_protein_expression_avg.csv", **export_options
)
protein_genotype_df.replace([0, 1, 2], [0, 2, "NA"]).to_csv(
    "data/CPY_protein_genotypes.csv", **export_options
)

In [82]:
# Average the raw per-replicate protein measurements into one column per
# strain and store the result as "./data/test_df.csv".
raw_protein_df = pd.read_csv(
    "./data/Foss2007_protein_expression.csv",
    sep='\t'
)

# Strains to average: every column of the protein genotype table but the first.
strain_names = protein_genotype_df.columns.tolist()[1:]

# Group the raw columns by strain.  A column is used when its dot-separated
# name reads "cond.<strain>.<suffix>" with a suffix other than "median";
# columns with fewer than three parts are skipped instead of raising
# IndexError.
columns_grouped = {strain_name: [] for strain_name in strain_names}
for column in raw_protein_df.columns.tolist():
    tokenized = column.split('.')
    if len(tokenized) < 3:
        continue
    if tokenized[0] == 'cond' and tokenized[2] != 'median':
        if tokenized[1] in columns_grouped:
            columns_grouped[tokenized[1]].append(column)

# The frame is sized from the data rather than a hard-coded (1318, 107).
averaged_df = pd.DataFrame(
    np.zeros((raw_protein_df.shape[0], len(strain_names))),
    columns=strain_names
)
for strain_name, group in columns_grouped.items():
    # Row-wise mean over the columns collected for this strain.
    averaged_df[strain_name] = raw_protein_df[group].mean(axis=1)

# To compare with the averages already on file:
# diff_df = protein_expression_df.iloc[:, 1:] - averaged_df

# NOTE(review): protein_expression_df is itself loaded from
# "./data/test_df.csv", the very file written below -- confirm where the
# "protein.group" column is meant to come from.
averaged_df.insert(0, "protein.group", protein_expression_df["protein.group"])

averaged_df.to_csv(
    "./data/test_df.csv",
    sep='\t',
    index=False,
    na_rep='NA'
)