This notebook performs eQTL identification.

General workflow:
- Extract two dataframes: marker-based genotype data and expression data. Each column represents a strain, and the columns of both dataframes are sorted by strain name.
- Transform them into matrices, perform the Mann-Whitney U (MWU) statistical test for every (marker, expressed gene) pair, and save the resulting p-values into a list. Use multiprocessing to speed the computation up.
	- For each marker, divide the strains by inherited variant.
	- For each gene, divide the expression data in two groups.
	- Test null hypothesis using MWU test.
- Adjust the p-values using the Benjamini-Hochberg procedure.
- Construct the bipartite linkage graph using calculated q-values.
- Plot the graph and the linkage map.

In [15]:
%matplotlib inline

# utilities
from functools import partial
import time

# data analysis tools
import numpy as np
import pandas as pd
from scipy import stats
from statsmodels.sandbox.stats.multicomp import multipletests

# network analysis tools
import igraph
import networkx as nx

# multiprocessing tools
import multiprocessing as mp

# visualization tools
import matplotlib.pyplot as plt
from matplotlib import cm

%autosave 15

Autosaving every 15 seconds


In [14]:
# Not every genotyped strain has expression measurements, so besides the
# full genotype table there is a pre-filtered genotype table for each
# expression data set (RNA and protein).

# Genotype tables
genotype_df = pd.read_table("./data/genotypes_full.csv")
rna_genotype_df = pd.read_table("./data/rna_genotypes.csv")
protein_genotype_df = pd.read_table("./data/protein_genotypes.csv")

# Averaged expression tables
rna_expression_df = pd.read_table("./data/rna_expression_avg.csv")
protein_expression_df = pd.read_table("./data/protein_expression_avg.csv")

In [4]:
def calculate_p_values(genotype_matrix, expression_matrix, sample_pair):
    """Run a two-sided Mann-Whitney U test for every (marker, gene) pair of a chunk.

    Parameters
    ----------
    genotype_matrix : 2-D array, one row per marker, one column per strain.
        Strains with value 0 form the `from_BY` group and strains with
        value 1 form the `from_RM` group; any other value is ignored.
    expression_matrix : 2-D array (plain or masked), one row per gene and
        one column per strain, with columns aligned to `genotype_matrix`.
    sample_pair : tuple (markers_chunk, offset)
        `markers_chunk` is only used for its length; `offset` is the row of
        `genotype_matrix` at which this chunk starts.

    Returns
    -------
    numpy.ndarray of float32 with len(markers_chunk) * n_genes entries,
    laid out marker-major (all genes for the first marker, then all genes
    for the second marker, and so on).

    Notes
    -----
    * Masked or NaN expression values are dropped before testing, because
      `mannwhitneyu` does not honour masks and would otherwise rank them.
    * `alternative="two-sided"` is passed explicitly: linkage has no
      expected direction, and older SciPy defaults to a one-sided p-value.
    * Pairs that cannot be tested (an empty group, or a SciPy ValueError
      such as "all numbers are identical") get p = 1.0, i.e. no evidence
      of linkage, instead of aborting the whole chunk.
    """
    markers_chunk, offset = sample_pair
    gene_cnt = expression_matrix.shape[0]
    p_values = np.zeros(len(markers_chunk) * gene_cnt, dtype=np.float32)

    # Work on the raw data plus an explicit "observed" mask so that missing
    # measurements never reach the rank test.
    expression_data = np.asarray(np.ma.getdata(expression_matrix), dtype=float)
    observed = ~(np.ma.getmaskarray(expression_matrix) | np.isnan(expression_data))

    slot = 0
    for marker_rownum in range(offset, offset + len(markers_chunk)):
        genotype_row = np.asarray(genotype_matrix[marker_rownum])
        # Loop-invariant for the inner loop: split strains by inherited variant
        is_BY = genotype_row == 0
        is_RM = genotype_row == 1
        for expression_row, observed_row in zip(expression_data, observed):
            from_BY = expression_row[is_BY & observed_row]
            from_RM = expression_row[is_RM & observed_row]
            if from_BY.size == 0 or from_RM.size == 0:
                p_value = 1.0
            else:
                try:
                    # CPU hog
                    p_value = stats.mannwhitneyu(
                        x=from_BY, y=from_RM, alternative="two-sided"
                    )[1]
                except ValueError:
                    p_value = 1.0
            p_values[slot] = p_value
            slot += 1
    return p_values

# This way to identify QTLs is by orders of magnitude slower,
# but provides a controllable environment, because I am fully aware
# of all statistical procedures used in the analysis, unlike MatrixQTL
def identify_QTLs(genotype_df, expression_df, analysis_name, max_markers=100):
    """Identify QTLs with per-pair MWU tests and save the resulting linkage data.

    Workflow: convert both dataframes to matrices, z-score the expression
    rows, compute a p-value for every (marker, gene) pair in parallel with
    `calculate_p_values`, adjust the p-values with the Benjamini-Hochberg
    procedure and link every pair whose null hypothesis is rejected.

    Parameters
    ----------
    genotype_df : pandas.DataFrame
        First column holds marker names, remaining columns hold per-strain
        genotypes. Row order is used as the marker order of the linkage map.
    expression_df : pandas.DataFrame
        First column holds gene names, remaining columns hold per-strain
        expression values (columns presumably aligned with `genotype_df`;
        this function does not verify it).
    analysis_name : str
        Prefix of the output files.
    max_markers : int or None, optional
        Only the first `max_markers` markers are analysed. The default of
        100 keeps the historical behaviour; pass None to use all markers.

    Returns
    -------
    None. Side effects: writes "./img/<analysis_name>_linkage_map.png" and
    "./data/<analysis_name>_linkage_graph.txt", and prints the elapsed time
    (or "No linkages found" when nothing is significant).
    """
    marker_cnt = genotype_df.shape[0]

    # `.values` is used instead of `DataFrame.as_matrix`, which newer
    # pandas versions no longer provide.
    genotype_matrix = genotype_df.iloc[:, 1:].values
    expression_matrix = expression_df.iloc[:, 1:].values

    # Mask missing measurements so they do not poison the per-gene z-scores
    expression_matrix = stats.zscore(
        np.ma.array(
            expression_matrix,
            mask=np.isnan(expression_matrix)
        ),
        axis=1
    )

    marker_list = genotype_df.iloc[:, 0].values[:max_markers]
    gene_list = expression_df.iloc[:, 0].values
    gene_cnt = len(gene_list)

    if len(marker_list) == 0 or gene_cnt == 0:
        print("No linkages found")
        return

    # At least one worker, even on a single-core machine
    CHUNKS_N = max(1, mp.cpu_count() // 2)
    marker_chunks = np.array_split(marker_list, CHUNKS_N)
    # Offset of each chunk = number of markers in all preceding chunks
    chunk_sizes = [len(chunk) for chunk in marker_chunks]
    chunk_offsets = np.concatenate(([0], np.cumsum(chunk_sizes)[:-1])).astype(int)
    marker_samples = list(zip(marker_chunks, chunk_offsets))

    calculate_p_values_subroutine = partial(
        calculate_p_values,
        genotype_matrix, expression_matrix
    )

    pool = mp.Pool(processes=CHUNKS_N)
    try:
        start_time = time.time()
        results = pool.map(calculate_p_values_subroutine, marker_samples)
        end_time = time.time()
    finally:
        # Release the worker processes even if a worker raised
        pool.close()
        pool.join()

    p_values = np.concatenate(results)

    adjusted_results = multipletests(p_values, method="fdr_bh")
    print("Calculation of pvalues: {}".format(end_time - start_time))

    # Build linkage graph from the rejected hypotheses.
    # p-values are laid out marker-major, so a flat index decomposes into
    # (marker index, gene index) by division with the number of genes.
    reject = adjusted_results[0]
    linkage_graph = nx.Graph()
    for flat_idx in np.flatnonzero(reject):
        marker_name = marker_list[flat_idx // gene_cnt]
        gene_name = gene_list[flat_idx % gene_cnt]
        if not linkage_graph.has_node(gene_name):
            linkage_graph.add_node(gene_name, bipartite=0)
        if not linkage_graph.has_node(marker_name):
            linkage_graph.add_node(marker_name, bipartite=1)
        linkage_graph.add_edge(gene_name, marker_name)

    if linkage_graph.number_of_nodes() == 0:
        print("No linkages found")
        return

    # The marker side of the bipartite graph is collected by hand from the
    # `bipartite` node attribute instead of using bipartite.sets().
    top_v = [
        node
        for node, data in linkage_graph.nodes(data=True)
        if data["bipartite"] != 0
    ]

    # Extract the marker-nodes and number of linkages to them
    # preserving their order based on genome location
    marker_to_rownum = dict(zip(genotype_df.iloc[:, 0], range(marker_cnt)))
    # dict(...) accepts both the dict returned by older NetworkX and the
    # degree view returned by newer NetworkX.
    marker_nodes = sorted(
        dict(linkage_graph.degree(top_v)).items(),
        key=lambda p: marker_to_rownum[p[0]]
    )
    linkages = [degree for _, degree in marker_nodes]

    plt.figure(figsize=(40, 20))
    plt.plot(linkages)
    plt.savefig("./img/" + analysis_name + "_linkage_map.png")
    plt.close()

    # One header line "<marker>: <degree>" followed by its linked genes
    with open("./data/" + analysis_name + "_linkage_graph.txt", "w") as graph_file:
        for u in top_v:
            graph_file.write("{}: {}\n".format(u, linkage_graph.degree(u)))
            for v in linkage_graph[u]:
                graph_file.write("{}\n".format(v))

In [5]:
# identify_QTLs(rna_genotype_df, rna_expression_df, "eQTLs")
# identify_QTLs(protein_genotype_df, protein_expression_df, "pQTLs")

In [45]:
def assemble_QTLs_graph(QTLs_type, genotypes_df):
    """Build the bipartite marker-gene linkage graph from a saved QTL table.

    Reads "./data/<QTLs_type>.csv" (tab-separated), taking the first column
    as the marker name and the second column as the gene name, and links
    every such pair. Gene nodes carry `bipartite=0`, marker nodes carry
    `bipartite=1`.

    Parameters
    ----------
    QTLs_type : str
        Name of the QTL table (without extension); also used in the names
        of the output files.
    genotypes_df : pandas.DataFrame
        Genotype table whose first column holds marker names; its row order
        defines the marker order of the linkage map.

    Returns
    -------
    networkx.Graph
        The linkage graph. When the table contains no linkages, an empty
        graph is returned and no files are written.

    Raises
    ------
    KeyError
        If a marker from the QTL table is absent from `genotypes_df`.

    Side effects: writes "./data/MatrixQTL_<QTLs_type>_linkage_graph.gexf"
    and "./img/MatrixQTL_<QTLs_type>_linkage_map.png".
    """
    QTLs_df = pd.read_table(
        "./data/" + QTLs_type + ".csv",
        sep='\t'
    )

    linkage_graph = nx.Graph()
    for row in QTLs_df.itertuples():
        # row[0] is the dataframe index; the data columns start at row[1]
        marker_name, gene_name = row[1], row[2]
        if not linkage_graph.has_node(gene_name):
            linkage_graph.add_node(gene_name, bipartite=0)
        if not linkage_graph.has_node(marker_name):
            linkage_graph.add_node(marker_name, bipartite=1)
        linkage_graph.add_edge(gene_name, marker_name)

    if linkage_graph.number_of_nodes() == 0:
        # Nothing to plot or export; bail out instead of failing later
        # when unpacking an empty list of marker nodes.
        print("No linkages found")
        return linkage_graph

    top_v = [
        node
        for node, data in linkage_graph.nodes(data=True)
        if data["bipartite"] != 0
    ]

    marker_to_rownum = dict(zip(
        genotypes_df.iloc[:, 0],
        range(genotypes_df.shape[0])
    ))

    # dict(...) accepts both the dict returned by older NetworkX and the
    # degree view returned by newer NetworkX.
    marker_nodes = sorted(
        dict(linkage_graph.degree(top_v)).items(),
        key=lambda p: marker_to_rownum[p[0]]
    )
    linkages = [degree for _, degree in marker_nodes]

    nx.write_gexf(linkage_graph, "./data/MatrixQTL_" + QTLs_type + "_linkage_graph.gexf")

    plt.figure(figsize=(40, 20))
    plt.plot(linkages)
    plt.savefig("./img/MatrixQTL_" + QTLs_type + "_linkage_map.png")
    plt.close()

    return linkage_graph

In [88]:
# Linkage graphs for expression QTLs and protein QTLs, each ordered by the
# genotype table that matches its expression data set.
eQTL_graph = assemble_QTLs_graph(
    QTLs_type="eQTLs",
    genotypes_df=rna_genotype_df,
)
pQTL_graph = assemble_QTLs_graph(
    QTLs_type="pQTLs",
    genotypes_df=protein_genotype_df,
)

In [251]:
# Raw tables: the full yeast interactome and the QTLs found for both
# expression data sets.
full_interactome_df = pd.read_table("./data/yeast_interactome.csv")
eQTL_df = pd.read_table('./data/eQTLs.csv')
pQTL_df = pd.read_table('./data/pQTLs.csv')

# Genes that are the target of at least one QTL
eQTL_target_set = set(eQTL_df['gene'])
pQTL_target_set = set(pQTL_df['gene'])

# Keep only interactions whose both interactors are QTL targets and index
# the result by interaction type.
interactome_df = full_interactome_df[
    full_interactome_df[['Interactor A', 'Interactor B']]
    .isin(eQTL_target_set | pQTL_target_set)
    .all(axis=1)
].set_index('Interaction Type')

# Interaction types grouped into the two coarse classes
genetic_interaction_codes = [
    'additive genetic interaction defined by inequality',
    'suppressive genetic interaction defined by inequality',
    'synthetic genetic interaction defined by inequality'
]
physical_interaction_codes = [
    'association',
    'colocalization',
    'direct interaction',
    'physical association',
]

biogrid_mi_mapping_df = pd.read_table('./data/biogrid_mi_mapping.csv')

genetic_interactions_df = interactome_df.loc[
    interactome_df.index.isin(genetic_interaction_codes)
]
physical_interactions_df = interactome_df.loc[
    interactome_df.index.isin(physical_interaction_codes)
]

In [268]:
def assemble_interactions_graph(interaction_type, interacting_genes_df):
    """Build a multigraph of gene interactions and export it as GEXF.

    Parameters
    ----------
    interaction_type : str
        Label used in the name of the exported file.
    interacting_genes_df : pandas.DataFrame
        Edge list with the columns 'Interactor A' and 'Interactor B';
        every row becomes one edge, so repeated pairs yield parallel edges.

    Returns
    -------
    networkx.MultiGraph
        The assembled graph, also written to
        "./data/interaction_graphs/<interaction_type>_interactions_graph.gexf".
    """
    import os

    # Newer NetworkX releases expose the edge-list constructor as
    # `from_pandas_edgelist`; older ones only have `from_pandas_dataframe`.
    build_from_edgelist = (
        getattr(nx, "from_pandas_edgelist", None) or nx.from_pandas_dataframe
    )
    interactions_graph = build_from_edgelist(
        interacting_genes_df,
        source='Interactor A',
        target='Interactor B',
        create_using=nx.MultiGraph()
    )

    # Make sure the export directory exists before writing into it
    os.makedirs("./data/interaction_graphs/", exist_ok=True)
    nx.write_gexf(interactions_graph, "./data/interaction_graphs/" + interaction_type + "_interactions_graph.gexf")
    return interactions_graph

In [269]:
# One graph per individual interaction type found in the filtered interactome
interactions_graphs_dict = {}
for interaction_type in set(interactome_df.index):
    interacting_genes_df = interactome_df.loc[
        interactome_df.index == interaction_type
    ]
    interactions_graphs_dict[interaction_type] = assemble_interactions_graph(
        interaction_type,
        interacting_genes_df,
    )

# Aggregated graphs for the two coarse interaction classes
genetic_interactions_graph = assemble_interactions_graph(
    'genetic',
    genetic_interactions_df,
)
physical_interactions_graph = assemble_interactions_graph(
    'physical',
    physical_interactions_df,
)

In [173]:
# full_interactome_df = pd.read_table(
#     './data/BIOGRID-ORGANISM-Saccharomyces_cerevisiae_S288c-3.4.152.mitab.csv',
#     usecols=['Alt IDs Interactor A', 'Alt IDs Interactor B', 'Interaction Types']
# )
# full_interactome_df['Alt IDs Interactor A'] = full_interactome_df['Alt IDs Interactor A'].apply(
#     lambda s: s.split('|')[1].split(':')[1]
# ) 
# full_interactome_df['Alt IDs Interactor B'] = full_interactome_df['Alt IDs Interactor B'].apply(
#     lambda s: s.split('|')[1].split(':')[1]
# ) 
# 
# full_interactome_df['Interaction Types'] = full_interactome_df['Interaction Types'].apply(
#     lambda s: s.split('(')[1].split(')')[0]
# ) 
# full_interactome_df.columns = [['Interactor A', 'Interactor B', 'Interaction Type']]
# full_interactome_df.to_csv(
#     './data/yeast_interactome.csv',
#     index=False,
#     sep='\t'
# )

In [271]:
# Size of each aggregated graph, followed by the sizes of the per-type
# graphs that belong to the same interaction class.
summary_sections = (
    (
        'Genetic interactions, total: {} vertices, {} edges',
        genetic_interactions_graph,
        genetic_interaction_codes,
    ),
    (
        'Physical interactions, total: {} vertices, {} edges',
        physical_interactions_graph,
        physical_interaction_codes,
    ),
)
for header_template, total_graph, member_codes in summary_sections:
    print(header_template.format(
        total_graph.number_of_nodes(),
        total_graph.number_of_edges()
    ))
    for name, graph in interactions_graphs_dict.items():
        if name not in member_codes:
            continue
        print('\t{}: {} vertices, {} edges'.format(
            name,
            graph.number_of_nodes(),
            graph.number_of_edges()
        ))

Genetic interactions, total: 3178 vertices, 188165 edges
	synthetic genetic interaction defined by inequality: 2521 vertices, 16387 edges
	additive genetic interaction defined by inequality: 3069 vertices, 139251 edges
	suppressive genetic interaction defined by inequality: 3026 vertices, 32527 edges
Physical interactions, total: 3168 vertices, 56066 edges
	association: 1574 vertices, 2436 edges
	colocalization: 595 vertices, 689 edges
	physical association: 3030 vertices, 38521 edges
	direct interaction: 2581 vertices, 14420 edges
