In this notebook, the eQTLs and pQTLs identified with the statistical package MatrixEQTL are analysed with respect to the yeast interactome (various kinds of genetic and physical interactions, with particular emphasis on protein-protein interactions).

In [425]:
%matplotlib inline

# utilities
from functools import partial
import time
import os

# data analysis tools
import random
import numpy as np
import pandas as pd
from scipy import stats
from statsmodels.sandbox.stats.multicomp import multipletests

# network analysis tools
import networkx as nx
import igraph as ig

# multiprocessing tools
import multiprocessing as mp

# visualization tools
import matplotlib.pyplot as plt
from matplotlib import cm

%autosave 15


Autosaving every 15 seconds


In [442]:
# Fixed seed so that the shuffled (null-model) graphs built further down are
# reproducible across kernel restarts. A time-based seed gives a different
# baseline on every run; change RANDOM_SEED to draw another randomisation.
RANDOM_SEED = 42
random.seed(RANDOM_SEED)

# All tables are read with pd.read_table (tab-separated by default).
# NOTE(review): the original remark here said that expression data was not
# measured for every genotyped strain, so some strains have to be filtered out
# beforehand. No filtering happens in this cell -- the tables are only loaded.
genotype_df = pd.read_table("./data/genotypes_full.csv")

rna_expression_df = pd.read_table("./data/rna_expression_avg.csv")
rna_genotype_df = pd.read_table("./data/rna_genotypes.csv")

protein_expression_df = pd.read_table("./data/protein_expression_avg.csv")
protein_genotype_df = pd.read_table("./data/protein_genotypes.csv")

full_interactome_df = pd.read_table("./data/yeast_interactome.csv")
# Optional restriction to older publications (disabled):
# full_interactome_df = full_interactome_df[full_interactome_df['Publication year'] <= 2011]

# Values of the 'Interaction Type' column that are treated as genetic
# interactions when the interactome is split later in the notebook.
genetic_interaction_codes = [
    'additive genetic interaction defined by inequality',
    'suppressive genetic interaction defined by inequality',
    'synthetic genetic interaction defined by inequality'
]
# Values of the 'Interaction Type' column that are treated as physical
# interactions.
physical_interaction_codes = [
    'association',
    'colocalization',
    'direct interaction',
    'physical association'
]

# TODO: this table is not referenced anywhere else in the part of the notebook
# visible here; document what it maps or drop the load.
biogrid_mi_mapping_df = pd.read_table(
    './data/biogrid_mi_mapping.csv'
)

In [427]:
# Estimate QTLs from the data. Average runtime — 10 seconds.  
# os.system("Rscript ./src/identify_QTLs.R")

In [431]:
def assemble_linkage_graph(QTL_df, shuffle_markers=False):
    """Build a directed SNP -> gene linkage graph from a QTL table.

    Parameters
    ----------
    QTL_df : pandas.DataFrame
        Table with (at least) the columns 'SNP' and 'gene'. Every row becomes
        one directed edge from the SNP vertex to the gene vertex.
    shuffle_markers : bool, optional
        When True, the gene end of the edges is randomised by ``m`` random
        pairwise swaps (``m`` = number of rows) before the edges are added.
        The module-level ``random`` generator is used, so seed it for
        reproducible results.

    Returns
    -------
    igraph.Graph
        Directed graph whose vertices are named after the SNPs and genes.
    """
    linkage_graph = ig.Graph(directed=True)
    # Per the original author's note, igraph silently accepts several copies
    # of the same vertex name, so the names are de-duplicated through a set
    # before they are added.
    vertices = set(QTL_df['SNP']) | set(QTL_df['gene'])
    linkage_graph.add_vertices(list(vertices))
    # Private, writable copy of the edge list: the swaps below must neither
    # touch the caller's frame nor fail on a read-only array.
    edges = QTL_df[['SNP', 'gene']].values.copy()
    m = len(edges)
    if shuffle_markers:
        for _ in range(m):
            # random.randint is inclusive on BOTH ends, hence the m - 1
            # (an upper bound of m indexes one past the last edge).
            i = random.randint(0, m - 1)
            j = random.randint(0, m - 1)
            edges[i, 1], edges[j, 1] = edges[j, 1], edges[i, 1]
    linkage_graph.add_edges(edges)
    return linkage_graph
    
def assemble_interactions_graph(interaction_df, shuffle=False):
    """Build an undirected interaction graph from an interactome table.

    Parameters
    ----------
    interaction_df : pandas.DataFrame
        Table with (at least) the columns 'Interactor A' and 'Interactor B'.
        Every row becomes one edge between the two interactors.
    shuffle : bool, optional
        When True, the 'Interactor B' end of the edges is randomised by ``m``
        random pairwise swaps (``m`` = number of rows) before the edges are
        added. The module-level ``random`` generator is used, so seed it for
        reproducible results.

    Returns
    -------
    igraph.Graph
        Undirected graph whose vertices are named after the interactors.
    """
    interaction_graph = ig.Graph()
    # Vertex names are de-duplicated through a set before they are added
    # (igraph would otherwise accept repeated names, per the author's note
    # in assemble_linkage_graph).
    vertices = set(interaction_df['Interactor A']) | set(interaction_df['Interactor B'])
    interaction_graph.add_vertices(list(vertices))
    # Private, writable copy of the edge list: the swaps below must neither
    # touch the caller's frame nor fail on a read-only array.
    edges = interaction_df[['Interactor A', 'Interactor B']].values.copy()
    m = len(edges)
    if shuffle:
        for _ in range(m):
            # randint is inclusive on both ends, so m - 1 is the last index.
            i = random.randint(0, m - 1)
            j = random.randint(0, m - 1)
            edges[i, 1], edges[j, 1] = edges[j, 1], edges[i, 1]
    interaction_graph.add_edges(edges)
    return interaction_graph

def mean_jaccard_coefficient(interaction_graph, QTL_graph):
    """Average linkage similarity over the edges of an interaction graph.

    For every interaction edge whose two endpoints are both vertices of
    ``QTL_graph``, the sets of incoming neighbours (``mode="IN"``) of the two
    endpoints in ``QTL_graph`` are compared with the Jaccard coefficient
    ``|A & B| / |A | B|``. The coefficients are summed and divided by the
    TOTAL number of interaction edges, so edges with an endpoint missing from
    ``QTL_graph`` count as 0.

    Parameters
    ----------
    interaction_graph : igraph.Graph
        Graph with named vertices; its edges are the pairs being scored.
    QTL_graph : igraph.Graph
        Graph with named vertices whose incoming neighbours are compared.

    Returns
    -------
    float
        The mean coefficient; 0.0 when the interaction graph has no edges.

    Side effects
    ------------
    Adds the wall-clock time spent in the neighbour look-ups to the
    module-level ``t1`` (profiling accumulator). ``t1`` is created, starting
    from 0.0, if it does not exist yet.
    """
    global t1
    linked_genes = {vertex["name"] for vertex in QTL_graph.vs}
    interacting_genes = [vertex["name"] for vertex in interaction_graph.vs]

    edge_count = interaction_graph.ecount()
    if not edge_count:
        return 0.

    jaccard_sum = 0.
    elapsed = 0.
    # Go over every interaction edge, collect for each endpoint the set of
    # vertices linked to it in the QTL graph, and relate the size of the
    # intersection of the two sets to the size of their union.
    for edge in interaction_graph.es:
        s_name = interacting_genes[edge.source]
        t_name = interacting_genes[edge.target]
        if s_name in linked_genes and t_name in linked_genes:
            started = time.time()
            s_neigh = set(QTL_graph.neighbors(s_name, mode="IN"))
            t_neigh = set(QTL_graph.neighbors(t_name, mode="IN"))
            union = s_neigh | t_neigh
            # Two endpoints without any incoming neighbour share nothing:
            # score the edge as 0 instead of dividing by zero.
            if union:
                jaccard_sum += len(s_neigh & t_neigh) / len(union)
            elapsed += time.time() - started
    # Tolerate a kernel in which the cell that initialises t1 was not run.
    t1 = globals().get("t1", 0.) + elapsed
    return jaccard_sum / edge_count

In [432]:
# Load the eQTL and pQTL linkage tables. Later cells threshold them at
# q-values in [1e-8; 1e-3], rebuild the linkage graph for every threshold and
# compute the mean Jaccard coefficient over the interaction-graph edges.
eQTL_df, pQTL_df = (
    pd.read_table(qtl_path)
    for qtl_path in ("./data/eQTLsUpdated.csv", "./data/pQTLsUpdated.csv")
)

In [435]:

# Open question (author's note, translated): why do the interaction graphs end
# up with so few edges? That is VERY odd -- there should be tens of thousands.

# Re-index a fresh frame by interaction type so rows can be selected by code;
# full_interactome_df itself is left untouched.
interactome_df = full_interactome_df.set_index('Interaction Type')

# Rows belonging to any genetic / any physical interaction type.
genetic_interactions_df = interactome_df.loc[
    interactome_df.index.isin(genetic_interaction_codes)
]
physical_interactions_df = interactome_df.loc[
    interactome_df.index.isin(physical_interaction_codes)
]

# One (unshuffled) graph per individual interaction type.
interactions_graphs_dict = {}
for interaction_type in genetic_interaction_codes + physical_interaction_codes:
    interacting_genes_df = interactome_df.loc[interactome_df.index == interaction_type]
    interactions_graphs_dict[interaction_type] = assemble_interactions_graph(interacting_genes_df)

# Module-level value that mean_jaccard_coefficient later adds its timings to.
t1 = time.time()

# Combined graphs for the two interaction families.
genetic_interactions_graph = assemble_interactions_graph(genetic_interactions_df)
physical_interactions_graph = assemble_interactions_graph(physical_interactions_df)

In [439]:
# NOTE(review): presumably the rewiring probability for the commented-out
# `rewire_edges(prob=PROB)` call in analyze_interactions; no live code in the
# visible notebook reads it -- confirm before removing.
PROB = 0.5
# Author's note (translated):
# this can be sped up: instead of churning the graph itself, simply keep the
# edges in a separate list, shuffle them randomly and then rebuild the graph
# from them.
# If I understand correctly how ig.Graph() is implemented internally, all the
# difficulties stem from the need for dynamic memory allocation and linear
# shifting of arrays, i.e. there is a vector inside and its insert/erase
# simply perform very badly.
# (the original note breaks off here: "this is needed ...")

def analyze_interactions(QTL_df, Q_RANGE, REWIRE_FLAG=False):
    """Mean linkage similarity of interacting genes for a series of thresholds.

    Builds the genetic and the physical interaction graph once (from the
    module-level ``genetic_interactions_df`` / ``physical_interactions_df``),
    then, for every threshold in ``Q_RANGE`` taken in REVERSED order, keeps
    the rows of ``QTL_df`` with ``'p.value' <= threshold``, assembles the
    linkage graph and records ``mean_jaccard_coefficient`` for both
    interaction graphs.

    Parameters
    ----------
    QTL_df : pandas.DataFrame
        QTL table with the columns 'SNP', 'gene' and 'p.value'. It is not
        modified.
    Q_RANGE : sequence supporting ``[::-1]``
        Thresholds to evaluate. They may be given in any order: every
        threshold is applied to the full ``QTL_df``.
    REWIRE_FLAG : bool, optional
        Passed as the ``shuffle`` argument of ``assemble_interactions_graph``;
        True yields randomised interaction graphs (null model).

    Returns
    -------
    dict
        ``{'genetic': [...], 'physical': [...]}``; each list holds one value
        per threshold, ordered like ``Q_RANGE[::-1]``.

    Side effects
    ------------
    Prints the graph-construction time (prefixed ``SHUFFLING:``) and the
    total run time in seconds.
    """
    started = time.time()

    shuffle_started = time.time()
    genetic_interactions_graph = assemble_interactions_graph(genetic_interactions_df, REWIRE_FLAG)
    physical_interactions_graph = assemble_interactions_graph(physical_interactions_df, REWIRE_FLAG)
    print("SHUFFLING: {}".format(time.time() - shuffle_started))

    average_similarity = {'genetic': [], 'physical': []}
    for Q_THRESHOLD in Q_RANGE[::-1]:
        # Author's note: according to the qvalue package in R all estimated
        # linkages are significant, so p- and q-values are interchangeable.
        #
        # Filter the ORIGINAL table every time. Re-filtering the previous
        # subset gives the same rows only while the thresholds keep
        # decreasing, i.e. only for an ascending Q_RANGE.
        significant_df = QTL_df[QTL_df['p.value'] <= Q_THRESHOLD]
        QTL_graph = assemble_linkage_graph(significant_df)
        average_similarity['genetic'].append(
            mean_jaccard_coefficient(genetic_interactions_graph, QTL_graph)
        )
        average_similarity['physical'].append(
            mean_jaccard_coefficient(physical_interactions_graph, QTL_graph)
        )

    print(time.time() - started)
    return average_similarity


In [438]:
# 20 evenly spaced thresholds between 1e-8 and 1e-3.
Q_RANGE = np.linspace(1e-8, 1e-3, 20)

# For eQTLs and then pQTLs: the interactome as given first, followed by the
# run with REWIRE_FLAG=True (shuffled interaction graphs).
average_similarity_eQTL = analyze_interactions(eQTL_df, Q_RANGE)
average_similarity_eQTL_shuffled = analyze_interactions(eQTL_df, Q_RANGE, REWIRE_FLAG=True)
average_similarity_pQTL = analyze_interactions(pQTL_df, Q_RANGE)
average_similarity_pQTL_shuffled = analyze_interactions(pQTL_df, Q_RANGE, REWIRE_FLAG=True)


24.207005262374878


27.92242956161499


8.48112678527832


10.126733541488647


In [443]:
def make_plots(average_similarity, average_similarity_shuffled, type_of_QTLs, Q_RANGE):
    """Save one similarity-vs-threshold figure per key of ``average_similarity``.

    Parameters
    ----------
    average_similarity : dict
        Maps a name to a list of values, one per threshold. The lists are
        reversed before plotting (analyze_interactions fills them for
        ``Q_RANGE[::-1]``).
    average_similarity_shuffled : dict
        Same layout and keys, for the second (shuffled) curve.
    type_of_QTLs : str
        Used in the figure title and as the prefix of the file name.
    Q_RANGE : sequence of float
        Values for the (log-scaled) x axis.

    Side effects
    ------------
    Writes ``./img/interactions/<type_of_QTLs>_<name>.png`` for every key,
    creating the directory if necessary. Nothing is returned or shown inline.
    """
    output_dir = "./img/interactions/"
    # savefig does not create missing directories, so make sure it exists.
    os.makedirs(output_dir, exist_ok=True)
    for name in average_similarity.keys():
        fig = plt.figure(figsize=(20, 10))
        plt.xscale('log')
        # Reverse the stored lists so that they line up with Q_RANGE.
        plt.plot(Q_RANGE, average_similarity[name][::-1], label='observed')
        plt.plot(Q_RANGE, average_similarity_shuffled[name][::-1], label='shuffled')
        # Without a legend the two curves cannot be told apart.
        plt.legend()
        plt.title('{} average linkage similarity'.format(type_of_QTLs))
        # NOTE(review): the x axis carries the thresholds from Q_RANGE, while
        # the label shows the name of the plotted series -- kept as in the
        # original.
        plt.xlabel(name)
        plt.savefig(output_dir + type_of_QTLs + '_' + name + ".png")
        # Close explicitly so that figures do not pile up in memory.
        plt.close(fig)
        
# Render and save the figures: eQTL results first, pQTL results second.
make_plots(
    average_similarity=average_similarity_eQTL,
    average_similarity_shuffled=average_similarity_eQTL_shuffled,
    type_of_QTLs='eQTLs',
    Q_RANGE=Q_RANGE,
)
make_plots(
    average_similarity=average_similarity_pQTL,
    average_similarity_shuffled=average_similarity_pQTL_shuffled,
    type_of_QTLs='pQTLs',
    Q_RANGE=Q_RANGE,
)
