In this notebook, the eQTLs and pQTLs identified with the statistical package MatrixEQTL are analysed with respect to the yeast interactome (various kinds of genetic and physical interactions, with particular emphasis on protein-protein interactions).

In [3]:
%matplotlib inline

# utilities
import time

# data analysis tools
import numpy as np
import pandas as pd
from scipy import stats
from statsmodels.sandbox.stats.multicomp import multipletests


# network analysis tools
import networkx as nx
import igraph as ig

# visualization tools
import matplotlib.pyplot as plt
from matplotlib import cm

%autosave 15

Autosaving every 15 seconds


In [4]:
# Interaction-type labels used further down to split the interactome into
# genetic and physical interactions.
genetic_interaction_codes = [
    'additive genetic interaction defined by inequality',
    'suppressive genetic interaction defined by inequality',
    'synthetic genetic interaction defined by inequality',
]
physical_interaction_codes = [
    'association',
    'colocalization',
    'direct interaction',
    'physical association',
]

# Expression data was not measured for every genotyped strain, so some strains
# have to be filtered out beforehand; presumably the rna_/protein_ genotype
# tables below are those filtered subsets -- TODO confirm.
genotype_df = pd.read_table("./data/genotypes_full.csv")

# Transcript-level data.
rna_expression_df = pd.read_table("./data/rna_expression_avg.csv")
rna_genotype_df = pd.read_table("./data/rna_genotypes.csv")

# Protein-level data.
protein_expression_df = pd.read_table("./data/protein_expression_avg.csv")
protein_genotype_df = pd.read_table("./data/protein_genotypes.csv")

# Interactome, restricted to rows whose 'Publication year' is 2011 or earlier.
full_interactome_df = pd.read_table("./data/yeast_interactome.csv")
full_interactome_df = full_interactome_df.loc[
    full_interactome_df['Publication year'] <= 2011
]

# TODO: this mapping is not referenced anywhere else in the visible part of
# the notebook -- document what it contains and whether it is still needed.
biogrid_mi_mapping_df = pd.read_table('./data/biogrid_mi_mapping.csv')

In [5]:
# Estimate QTLs from the data. Average runtime — 10 seconds.  
# os.system("Rscript ./src/identify_QTLs.R")

In [6]:
# Assemble a bipartite graph representing linkages between markers and genes of interest
def assemble_QTLs_graph(QTL_df, type_of_QTLs, output_filename):
    """Build a directed bipartite graph of gene -> marker linkages.

    Every row of ``QTL_df`` contributes one edge pointing from the gene
    (second column) to the marker (first column). Gene nodes carry the
    attribute ``bipartite=0`` and marker nodes ``bipartite=1``.

    Parameters
    ----------
    QTL_df : pandas.DataFrame
        Linkage table. Only the first two columns are read, by position:
        marker name, then gene name.
    type_of_QTLs : str
        Currently unused; kept so that existing callers keep working.
    output_filename : str
        Currently unused; kept so that existing callers keep working. It
        used to name the GEXF / PNG exports, which are disabled.

    Returns
    -------
    networkx.DiGraph
        The linkage graph. When ``QTL_df`` has no rows the graph is empty
        and "No linkages found" is printed.
    """
    linkage_graph = nx.DiGraph()
    for row in QTL_df.itertuples(index=False):
        marker_name, gene_name = row[0], row[1]
        # Guard with has_node so that a name seen earlier keeps the side it
        # was first assigned to instead of having the attribute overwritten.
        if not linkage_graph.has_node(gene_name):
            linkage_graph.add_node(gene_name, bipartite=0)
        if not linkage_graph.has_node(marker_name):
            linkage_graph.add_node(marker_name, bipartite=1)
        linkage_graph.add_edge(gene_name, marker_name)

    if linkage_graph.number_of_nodes() == 0:
        print("No linkages found")

    return linkage_graph

# Build a multigraph of protein-protein interactions
def assemble_interactions_graph(interaction_type, interacting_genes_df):
    """Build an undirected multigraph from a table of interacting gene pairs.

    Rows of ``interacting_genes_df`` are read as edges between the values in
    columns 'Interactor A' and 'Interactor B'. The result is created as an
    ``nx.MultiGraph``.

    Parameters
    ----------
    interaction_type : str
        Label of the interaction class. Currently unused; kept so that
        existing callers keep working (it used to name a GEXF export).
    interacting_genes_df : pandas.DataFrame
        Must contain the columns 'Interactor A' and 'Interactor B'.

    Returns
    -------
    networkx.MultiGraph
    """
    # Newer networkx releases provide from_pandas_edgelist in place of
    # from_pandas_dataframe; prefer the new name and fall back to the old one
    # so the notebook runs on either generation of the library.
    build_from_edges = getattr(nx, 'from_pandas_edgelist', None)
    if build_from_edges is None:
        build_from_edges = nx.from_pandas_dataframe
    return build_from_edges(
        interacting_genes_df,
        source='Interactor A',
        target='Interactor B',
        create_using=nx.MultiGraph()
    )

def mean_jaccard_coefficient(interactions_graph, QTL_graph):
    """Average Jaccard similarity of linkage sets over interaction edges.

    For every edge (u, v) of ``interactions_graph`` the neighbours of u and
    of v in ``QTL_graph`` are collected; a node that is absent from
    ``QTL_graph`` gets an empty set. The two sets A and B are compared with
    the Jaccard index ``|A & B| / |A | B|``.

    Edges whose endpoints both have empty neighbour sets add nothing to the
    sum, yet the sum is still divided by the total number of edges of
    ``interactions_graph``.

    Returns 0.0 when ``interactions_graph`` has no edges.
    """
    edge_total = interactions_graph.number_of_edges()
    if edge_total == 0:
        return 0.

    # Only nodes present in the QTL graph can have linkages at all.
    linked_nodes = set(QTL_graph.nodes())

    def linkage_set(node):
        # Neighbours of `node` in the QTL graph, or nothing if it is unknown.
        if node in linked_nodes:
            return set(QTL_graph.neighbors(node))
        return set()

    jaccard_total = 0.
    for first, second in interactions_graph.edges():
        first_links = linkage_set(first)
        second_links = linkage_set(second)
        combined = first_links | second_links
        if combined:
            jaccard_total += len(first_links & second_links) / len(combined)
    return jaccard_total / edge_total

In [7]:
# Load the eQTL and pQTL linkage tables. Later in the notebook, for each
# q-value threshold in the range [1e-8; 1e-3], the graph of linkages is
# rebuilt and the mean Jaccard coefficient over the interaction graphs
# is calculated.
eQTL_df, pQTL_df = map(
    pd.read_table,
    ("./data/eQTLs.csv", "./data/pQTLs.csv"),
)

In [None]:

# Open question: why do the interaction graphs end up with so few edges?
# That is VERY strange -- there should be tens of thousands of them!

# Index the interactome by interaction type so rows can be selected by label.
interactome_df = full_interactome_df.set_index('Interaction Type')

# Aggregated graphs: all genetic and all physical interactions respectively.
genetic_interactions_df = interactome_df.loc[
    interactome_df.index.isin(genetic_interaction_codes)
]
genetic_interactions_graph = assemble_interactions_graph(
    'genetic', genetic_interactions_df
)

physical_interactions_df = interactome_df.loc[
    interactome_df.index.isin(physical_interaction_codes)
]
physical_interactions_graph = assemble_interactions_graph(
    'physical', physical_interactions_df
)

# One graph per individual interaction type, keyed by the type label.
interactions_graphs_dict = {}
for interaction_type in genetic_interaction_codes + physical_interaction_codes:
    interacting_genes_df = interactome_df.loc[
        interactome_df.index == interaction_type
    ]
    interactions_graphs_dict[interaction_type] = assemble_interactions_graph(
        interaction_type, interacting_genes_df
    )

In [21]:
def analyze_interactions(QTL_df, type_of_QTLs, Q_RANGE):
    """Track mean linkage similarity on the interaction graphs across thresholds.

    For every threshold in ``Q_RANGE`` (visited from last to first) the rows
    of ``QTL_df`` whose 'p-value' does not exceed the threshold are turned
    into a linkage graph with ``assemble_QTLs_graph``. Then
    ``mean_jaccard_coefficient`` is evaluated for that graph against the
    aggregated genetic graph, the aggregated physical graph and every
    per-type graph in ``interactions_graphs_dict``.

    Relies on module-level state defined in earlier cells:
    ``genetic_interaction_codes``, ``physical_interaction_codes``,
    ``genetic_interactions_graph``, ``physical_interactions_graph`` and
    ``interactions_graphs_dict``.

    Parameters
    ----------
    QTL_df : pandas.DataFrame
        Linkage table with a 'p-value' column. It is not modified.
    type_of_QTLs : str
        Label forwarded to ``assemble_QTLs_graph``.
    Q_RANGE : sequence of float
        Thresholds to evaluate; must support slicing (``Q_RANGE[::-1]``).

    Returns
    -------
    dict
        Maps 'genetic', 'physical' and every individual interaction code to
        a list of mean Jaccard coefficients, one per threshold, ordered like
        ``Q_RANGE[::-1]``.

    Side effects
    ------------
    Prints the elapsed wall-clock time in seconds.
    """
    t0 = time.time()
    individual_codes = set(genetic_interaction_codes) | set(physical_interaction_codes)
    average_similarity = {
        name: [] for name in individual_codes | {'genetic', 'physical'}
    }

    for Q_THRESHOLD in Q_RANGE[::-1]:
        # Original author's note: according to the qvalue package in R, all
        # estimated linkages are significant, so p- and q-values are used
        # interchangeably here.
        #
        # Filter from the full table every time instead of narrowing QTL_df
        # cumulatively: the result for a threshold must not depend on which
        # thresholds were visited before it (Q_RANGE need not be sorted).
        significant_QTL_df = QTL_df[QTL_df['p-value'] <= Q_THRESHOLD]

        QTL_graph = assemble_QTLs_graph(
            QTL_df=significant_QTL_df,
            type_of_QTLs=type_of_QTLs,
            output_filename="tmp/" + type_of_QTLs
        )

        average_similarity['genetic'].append(
            mean_jaccard_coefficient(genetic_interactions_graph, QTL_graph)
        )
        average_similarity['physical'].append(
            mean_jaccard_coefficient(physical_interactions_graph, QTL_graph)
        )
        for name, graph in interactions_graphs_dict.items():
            # Graphs stored under a label outside the two code lists are skipped.
            if name in individual_codes:
                average_similarity[name].append(
                    mean_jaccard_coefficient(graph, QTL_graph)
                )

    print(time.time() - t0)
    return average_similarity


In [23]:
# Thresholds to scan: 50 evenly spaced values between 1e-8 and 1e-3.
Q_RANGE = np.linspace(1e-8, 0.001, num=50)

# Same analysis for both kinds of linkages.
average_similarity_eQTL = analyze_interactions(eQTL_df, 'eQTLs', Q_RANGE)
average_similarity_pQTL = analyze_interactions(pQTL_df, 'pQTLs', Q_RANGE)


111.98966860771179


74.42881608009338


In [25]:
def make_plots(average_similarity, type_of_QTLs, Q_RANGE):
    """Save one similarity-versus-threshold figure per interaction type.

    Parameters
    ----------
    average_similarity : dict
        Maps an interaction-type name to a sequence of similarity values
        stored in reversed threshold order (i.e. matching ``Q_RANGE[::-1]``).
    type_of_QTLs : str
        Label used in the figure title and in the output file name.
    Q_RANGE : sequence of float
        Thresholds plotted on the (logarithmic) x-axis.

    Side effects
    ------------
    Writes "./img/interactions/<type_of_QTLs>_<name>.png" for every key of
    ``average_similarity`` and closes each figure after saving it.
    """
    for name, similarities in average_similarity.items():
        fig, ax = plt.subplots(figsize=(20, 10))
        ax.set_xscale('log')
        # Values were collected for descending thresholds; flip them back so
        # they line up with Q_RANGE.
        ax.plot(Q_RANGE, similarities[::-1])
        # The interaction type belongs in the title; the axes describe the
        # quantities that are actually plotted.
        ax.set_title('{} average linkage similarity: {}'.format(type_of_QTLs, name))
        ax.set_xlabel('p-value threshold')
        ax.set_ylabel('mean Jaccard coefficient')
        fig.savefig("./img/interactions/" + type_of_QTLs + '_' + name + ".png")
        # Close explicitly so figures do not pile up in memory across the loop.
        plt.close(fig)
        
# Render and save the figures for both kinds of linkages.
make_plots(
    average_similarity=average_similarity_eQTL,
    type_of_QTLs='eQTLs',
    Q_RANGE=Q_RANGE,
)
make_plots(
    average_similarity=average_similarity_pQTL,
    type_of_QTLs='pQTLs',
    Q_RANGE=Q_RANGE,
)