In [2]:

import collections
import itertools
from importlib import reload
import pickle

# data analysis tools
import random
import numpy as np
import pandas as pd
from scipy import stats
from statsmodels.sandbox.stats.multicomp import multipletests

# os.chdir("/home/ivanov_vv/eQTL_analysis/")

import networks
import qtls
import util
import ontologies

# Genotypes of the 112 segregants, described by the marker variants they inherited.
full_genotypes_df = pd.read_table("./data/genotypes/processed_genotypes.csv")

# NOTE: where possible, gene names were converted from systematic to standard notation.

# eQTL data set: mRNA expression and genotypes of the strains the data is available for.
# Only the first row of every "SNP" value is kept in the genotype table.
eQTLs_expression_df = pd.read_table("./data/eQTLs/averaged_expression.csv")
eQTLs_genotypes_df = (
    pd.read_table("./data/eQTLs/processed_genotypes.csv")
    .drop_duplicates("SNP")
)

# pQTL data set: protein expression and genotypes of the strains the data is available for.
# Duplicated "SNP" rows are dropped in the same way.
pQTLs_expression_df = pd.read_table("./data/pQTLs/averaged_expression.csv")
pQTLs_genotypes_df = (
    pd.read_table("./data/pQTLs/processed_genotypes.csv")
    .drop_duplicates("SNP")
)

# pandas DataFrame -> numpy matrix = 2x speedup (original author's measurement).
# DataFrame.as_matrix() was deprecated in pandas 0.23 and removed in pandas 1.0,
# so the positional slice + .values form is used instead; it works on both old
# and current pandas releases.
# Every column except the first one is converted.
# NOTE(review): the first column is presumably the gene / SNP identifier — confirm.
pQTLs_expression_mx = pQTLs_expression_df.iloc[:, 1:].values
pQTLs_genotypes_mx = pQTLs_genotypes_df.iloc[:, 1:].values

# QTLs estimated with the MatrixEQTL package for R.
# Only associations whose "q.value" does not exceed 0.05 are kept.
eQTLs_df = pd.read_table("./data/eQTLs/results.csv").loc[
    lambda qtl_results: qtl_results["q.value"] <= 0.05
]
pQTLs_df = pd.read_table("./data/pQTLs/results.csv").loc[
    lambda qtl_results: qtl_results["q.value"] <= 0.05
]

# Physical and genetic interactions taken from the BioGRID project.
full_interactome_df = pd.read_table("./data/interactions/yeast_interactome.csv")
# Optional restriction by publication year (currently disabled):
# full_interactome_df = full_interactome_df[full_interactome_df['Publication year'] <= 2011]

# TODO: move these category lists to the part of the project that uses them;
# there is no need for them anywhere else.
genetic_interaction_categories = [
    "additive genetic interaction defined by inequality",
    "suppressive genetic interaction defined by inequality",
    "synthetic genetic interaction defined by inequality",
]
physical_interaction_categories = [
    "association",
    "colocalization",
    "direct interaction",
    "physical association",
]
# Genetic categories first, physical categories second.
interaction_categories = [
    *genetic_interaction_categories,
    *physical_interaction_categories,
]

# set_index returns a new frame, so full_interactome_df itself stays unmodified.
interactome_df = full_interactome_df.set_index('Interaction Type')

# Rows are selected by their index value, i.e. by interaction type.
genetic_interactions_df = interactome_df.loc[
    interactome_df.index.isin(genetic_interaction_categories)
]
physical_interactions_df = interactome_df.loc[
    interactome_df.index.isin(physical_interaction_categories)
]

def extract_interactions(interactions_df):
    '''
    Return the edge list of an interaction table.

    The "Interactor A" and "Interactor B" columns of interactions_df are
    returned as a two-column array, one row per interaction.
    '''
    interactor_columns = ["Interactor A", "Interactor B"]
    edge_endpoints_df = interactions_df.loc[:, interactor_columns]
    return edge_endpoints_df.values

# One graph per individual interaction category, built from the rows of
# interactome_df whose index (the interaction type) equals that category.
interaction_graphs_dict = dict()
for category_name in interaction_categories:
    interacting_genes_df = interactome_df.loc[interactome_df.index == category_name]
    interaction_graphs_dict[category_name] = networks.graph_from_edges(
        extract_interactions(interacting_genes_df)
    )

# Pooled graphs: every interaction, all genetic ones, all physical ones.
# All three edge lists are extracted before the first graph is built, and the
# dictionary is updated only once all three graphs exist.
# NOTE(review): the two positional False arguments are passed only here, not for
# the per-category graphs above; their meaning is defined by
# networks.graph_from_edges — confirm that the difference is intentional.
interaction_graphs_dict.update({
    graph_label: networks.graph_from_edges(pooled_edges, False, False)
    for graph_label, pooled_edges in [
        ("all", extract_interactions(interactome_df)),
        ("genetic", extract_interactions(genetic_interactions_df)),
        ("physical", extract_interactions(physical_interactions_df)),
    ]
})

In [3]:
# To regenerate the cached file, dump kegg_pathways_dict to the same path:
# with open("./results/kegg_pathways/modules_dict.pkl", "wb+") as pickle_file:
#     pickle.dump(kegg_pathways_dict, pickle_file)

# The cache is opened read-only ("rb", not "rb+"): loading needs no write access,
# and the file cannot be modified through this handle.
# NOTE(security): pickle.load can execute arbitrary code while unpickling — only
# load files that were produced by this project.
with open("./results/kegg_pathways/modules_dict.pkl", "rb") as pickle_file:
    kegg_pathways_dict = pickle.load(pickle_file)

In [4]:
# Re-execute the qtls module before calling into it — presumably so that edits
# made to qtls.py since the kernel started are picked up.
reload(qtls)
# Call qtls.process_ontologies for the KEGG pathway data loaded from the pickle.
# NOTE(review): the meaning of each argument and any outputs/side effects are
# defined inside qtls.process_ontologies, which is not visible here — confirm there.
qtls.process_ontologies(
    database_name="kegg_pathways",
    # The three lists below are given in the same order: eQTLs first, pQTLs second.
    expression_dfs=[eQTLs_expression_df, pQTLs_expression_df],
    qtl_dfs=[eQTLs_df, pQTLs_df],
    qtl_types=["eQTLs", "pQTLs"],
    # Graph pooled from all physical interaction categories.
    interactome_graph=interaction_graphs_dict["physical"],
    module_gene=kegg_pathways_dict,
    # 10 logarithmically spaced thresholds from 1e-5 to 1e-2.
    q_thresholds=np.logspace(-5, -2, 10)
)