In [None]:
import numpy as np
import pandas as pd
import os
import sys
import networkx as nx
import matplotlib.pyplot as plt
import pickle as pkl
from tqdm.notebook import tqdm
from itertools import combinations

%matplotlib inline

In [None]:
# Minimum combined STRING score, expressed in percent: the filtering cell keeps
# interactions whose 0-1 `interaction_score` is >= STRING_CUTOFF / 100.
STRING_CUTOFF = 60

### Define paths

In [None]:
# Input files
# Gene -> common-essential mapping (pickle) and the frame of essential genes (pickle).
GENE_COM_ESS_MAP = "results/candidate_genes/public_20Q2/gene-com_ess.dict"
ESSENTIAL_GENES  = 'results/essential_candidates/public_20Q2/essential_genes-all.pkl'

# Paralog sources: DGD duplicate-gene table and the two PANTHER tables.
PARALOG_DATA     = 'data/DGD/duplicate_genes_Hsapiens.tsv'
PANTHER_MAP      = 'data/PANTHER/paralogs-GeneID'
PANTHER_PARALOGS = 'data/PANTHER/paralogs'

# Interaction sources, plus the pickled identifier maps used to translate STRING and HuRI identifiers.
CORUM_DATA       = 'data/CORUM/allComplexes.txt'
STRING_DATA      = 'data/STRING/9606.protein.links.full.v11.0.txt'
STRING_MAP       = 'data/ens_entrez_maps/ensp_entrez_mapping.pkl'
SIGNOR_DATA      = 'data/SIGNOR/geneID_interactions.pkl'
HURI_DATA        = 'data/HuRI/HuRI.tsv'
HURI_MAP         = 'data/ens_entrez_maps/ensg_entrez_mapping.pkl'

# Gene-name cache (pickle); it is read below and rewritten by get_gene_name on a cache miss.
NCBI_GENE_NAMES  = 'data/misc/ncbi_gene_names.pkl'

# Output files
# Destination of the annotated results frame written by the last cell.
RESULTS_FILE = "results/essential_candidates/public_20Q2/essential_genes_annotated.pkl"

## 1.  Load data

### Load gene - common essential data

In [None]:
# Unpickle the mapping that later cells index by gene ID to obtain that gene's common essentials.
# NOTE: pickle executes arbitrary code on load; only use trusted files here.
with open(GENE_COM_ESS_MAP, mode='rb') as map_file:
    gene_com_ess_map = pkl.load(map_file)
gene_com_ess_map

### Load genes essential in cell lines

In [None]:
# Frame of essential genes; later cells iterate its index as gene IDs and read its `gene` column.
essential_genes = pd.read_pickle(ESSENTIAL_GENES)
essential_genes.head()

### Load DGD data

In [None]:
# Tab-delimited DGD table, indexed by its column at position 9 (0-based); the paralog cell
# looks gene IDs up in this index and groups rows by `group_id`.
# NOTE(review): presumably column 9 holds the Entrez gene ID — confirm against the file header.
dup_gens = pd.read_csv(PARALOG_DATA, delimiter='\t', index_col=9)
dup_gens.head()

### Load PANTHER data

In [None]:
# Space-delimited table indexed by its first column; the paralog cell looks gene IDs up in
# this index and reads the `uniprot` column.
panther_map = pd.read_csv(PANTHER_MAP, delimiter=' ', index_col=0)
panther_map.head()

In [None]:
# Headerless, space-delimited pair table; the two columns are named `gene` and `paralog` here.
paralogs = pd.read_csv(PANTHER_PARALOGS, delimiter=' ', header=None, names=['gene', 'paralog'])
paralogs.head()

### Load CORUM data

In [None]:
prot_complexes = pd.read_csv(CORUM_DATA, delimiter='\t')

# Keep rows whose organism string starts with "Homo". The explicit .copy() makes the
# result an independent frame, so the column assignment below does not write into a
# slice of `prot_complexes` (which triggers SettingWithCopyWarning).
human_prot_complexes = prot_complexes.loc[prot_complexes['SWISSPROT organism'].str.match("Homo")].copy()

# Turn each ';'-separated subunit string into a list of integer IDs. Tokens are
# stripped before being tested, so padded placeholders such as ' None' are skipped
# instead of crashing int().
human_prot_complexes['subunits(Entrez IDs)'] = [
    [int(token)
     for token in (raw.strip() for raw in entry.split(';'))
     if token not in ('None', '')]
    for entry in human_prot_complexes['subunits(Entrez IDs)']
]
human_prot_complexes.head()

In [None]:
# Every unordered pair of subunits within a complex counts as one interaction.
corum_interactions = set()
for subunits in human_prot_complexes['subunits(Entrez IDs)'].values:
    corum_interactions.update(combinations(subunits, 2))
corum_interactions

### Load STRING data

In [None]:
# Space-delimited STRING link table; the statements below read its per-channel score
# columns (`experiments`, `database`, `coexpression` and their `_transferred` variants).
prot_interactions = pd.read_csv(STRING_DATA, delimiter=' ')

def f(*args):
    """Combine scores as ``1 - prod(1 - s)`` over all supplied arguments.

    Each argument may be a scalar or an array-like; array-likes are combined
    element-wise because the product is taken along axis 0 of the stacked inputs.

    Raises:
        ValueError: if no argument is given.
    """
    if not args:
        raise ValueError("Function must have at least one provided argument")
    complements = [1. - score for score in args]
    return 1. - np.prod(complements, axis=0)

# For each evidence channel, merge the direct and the transferred score (both rescaled
# from 0-1000 to 0-1), then merge the three channel scores into one interaction score.
for channel in ('experiments', 'database', 'coexpression'):
    prot_interactions[f'{channel}_score'] = f(prot_interactions[channel] / 1000,
                                              prot_interactions[f'{channel}_transferred'] / 1000)

prot_interactions['interaction_score'] = f(prot_interactions['experiments_score'],
                                           prot_interactions['database_score'],
                                           prot_interactions['coexpression_score'])

In [None]:
# Keep interactions only if score >= threshold (STRING_CUTOFF is given in percent)
passes_cutoff = prot_interactions.interaction_score >= STRING_CUTOFF / 100
probable_prot_interactions = (
    prot_interactions
    .loc[passes_cutoff, ['protein1', 'protein2', 'interaction_score']]
    .reset_index(drop=True)
)

probable_prot_interactions.head()

In [None]:
# Use a dedicated handle name: `f` is the score-combining helper defined in the STRING
# cell, and `with ... as f` would overwrite it with a closed file object.
# NOTE: pickle executes arbitrary code on load; only use trusted files here.
with open(STRING_MAP, 'rb') as string_map_file:
    string_map = pkl.load(string_map_file)
# Preview the first five entries only.
dict(list(string_map.items())[:5])

In [None]:
# Translate both endpoints of every retained STRING link through `string_map`;
# pairs where either endpoint maps to -1 are left out.
string_interactions = set()
for protein_a, protein_b in probable_prot_interactions[['protein1', 'protein2']].values:
    mapped_a = string_map[protein_a]
    if mapped_a == -1:
        continue
    mapped_b = string_map[protein_b]
    if mapped_b == -1:
        continue
    string_interactions.add((mapped_a, mapped_b))
string_interactions

### Load SIGNOR data

In [None]:
# Use a dedicated handle name: `f` is the score-combining helper defined in the STRING
# cell, and `with ... as f` would overwrite it with a closed file object.
# NOTE: pickle executes arbitrary code on load; only use trusted files here.
with open(SIGNOR_DATA, 'rb') as signor_file:
    signor_data = np.array(list(pkl.load(signor_file)))
signor_data

In [None]:
# Each row of the array becomes one hashable tuple.
signor_interactions = {tuple(pair) for pair in signor_data}
signor_interactions

### Load HuRI data

In [None]:
# Headerless, tab-delimited table kept as a raw array; each row is unpacked as a pair of
# identifiers when the HuRI interaction set is built.
huri_data = pd.read_csv(HURI_DATA, delimiter='\t', header=None).values
huri_data

In [None]:
# Use a dedicated handle name: `f` is the score-combining helper defined in the STRING
# cell, and `with ... as f` would overwrite it with a closed file object.
# NOTE: pickle executes arbitrary code on load; only use trusted files here.
with open(HURI_MAP, 'rb') as huri_map_file:
    huri_map = pkl.load(huri_map_file)
# Preview the first five entries only.
dict(list(huri_map.items())[:5])

In [None]:
# Translate both endpoints of every HuRI pair through `huri_map`;
# pairs where either endpoint maps to -1 are left out.
huri_interactions = set()
for node_a, node_b in huri_data:
    mapped_a = huri_map[node_a]
    if mapped_a == -1:
        continue
    mapped_b = huri_map[node_b]
    if mapped_b == -1:
        continue
    huri_interactions.add((mapped_a, mapped_b))
huri_interactions

## 2.  Analysis of essential genes

### Get common essentials

In [None]:
# One line per essential gene: "<gene> (<ID>):<TAB><its common essentials>".
for gene_id, gene_label in essential_genes['gene'].items():
    common_essentials = gene_com_ess_map[int(gene_id)]
    print(f"{gene_label} ({gene_id}):\t{common_essentials}")

### Get paralogs

In [None]:
# Collect, per essential gene, the union of its PANTHER paralogs (looked up in both
# directions of the pair table) and the members of its DGD duplicate group.
gene_paralogs = {}

for idx in essential_genes.index:
    found = set()

    if idx in panther_map.index:
        uniprot = panther_map.loc[idx, 'uniprot']
        if type(uniprot) != str:
            # Several rows share this index value: use the first accession.
            uniprot = uniprot.values[0]

        listed_as_paralog = paralogs.loc[paralogs.paralog == uniprot, 'gene']
        listed_as_gene = paralogs.loc[paralogs.gene == uniprot, 'paralog']
        for partners in (listed_as_paralog, listed_as_gene):
            found.update(panther_map.loc[panther_map.uniprot.isin(partners)].index)

    if idx in dup_gens.index:
        group = dup_gens.loc[idx, 'group_id']
        found.update(dup_gens.loc[dup_gens.group_id == group].index)

    # -1 is not a usable gene ID, so it is never kept as a paralog.
    found.discard(-1)
    gene_paralogs[idx] = found
gene_paralogs

### Check for interaction between paralogs and common essential

In [None]:
# Pool the pairs from all four sources into one set.
all_interactions = corum_interactions | string_interactions | signor_interactions | huri_interactions
print(len(all_interactions))
list(all_interactions)[:5]

In [None]:
def add_to_dict_of_lists(a, b, dic=None):
    """Add ``b`` to the set stored under key ``a`` in ``dic``.

    Despite the name, the values are sets, so repeated additions are de-duplicated.

    Args:
        a: key to update.
        b: value to add to the set stored under ``a``.
        dic: dictionary to update in place. When omitted, a fresh dictionary is
            created for this call (previously a single default dictionary was
            silently shared between all calls).

    Returns:
        The updated dictionary (``dic`` itself when one was supplied).
    """
    if dic is None:
        dic = {}
    dic.setdefault(a, set()).add(b)
    return dic
    
# Adjacency map: every pair is registered in both directions so lookups are symmetric.
interaction_dict = dict()
for left, right in tqdm(all_interactions):
    for source, target in ((left, right), (right, left)):
        add_to_dict_of_lists(source, target, interaction_dict)

print(len(interaction_dict))
dict(list(interaction_dict.items())[:5])

In [None]:
def interaction(gene1, gene2):
    """Return whether ``gene2`` is recorded as an interaction partner of ``gene1``.

    Looks ``gene1`` up in the module-level ``interaction_dict``. A gene without any
    recorded interaction yields ``False`` (previously the function fell off the end
    and returned ``None`` in that case).
    """
    return gene2 in interaction_dict.get(gene1, ())

In [None]:
# Use a dedicated handle name: `f` is the score-combining helper defined in the STRING
# cell, and `with ... as f` would overwrite it with a closed file object.
# NOTE: pickle executes arbitrary code on load; only use trusted files here.
with open(NCBI_GENE_NAMES, 'rb') as names_file:
    ncbi_gene_names = pkl.load(names_file)

def get_gene_name(geneID):
    """Return the cached ``(label, description)`` pair for a gene ID, fetching it on a miss.

    On a cache miss the gene table is requested through ``Entrez.efetch``. The first
    whitespace-separated token of the first response line becomes the gene name
    (stored as ``"<name> (<geneID>)"``) and the remaining tokens the description.
    The new entry is added to ``ncbi_gene_names`` and the whole cache is re-pickled
    to ``NCBI_GENE_NAMES``.

    NOTE(review): ``Entrez`` is not imported anywhere in the visible notebook
    (presumably ``from Bio import Entrez``), so a cache miss raises NameError
    unless it is imported elsewhere — confirm.

    Args:
        geneID: gene identifier used as the cache key and sent to Entrez as a string.

    Returns:
        Tuple ``(label, description)``.
    """
    if geneID not in ncbi_gene_names:
        Entrez.email = "test@gmail.com"
        handle = Entrez.efetch("gene", id=str(geneID), rettype="gene_table", retmode="text")
        try:
            info = handle.readline().split()
        finally:
            # Release the handle even when reading the response fails.
            handle.close()
        name = info[0]
        ncbi_gene_names[geneID] = f"{name} ({geneID})", f"{' '.join(info[1:]).strip()}"
        with open(NCBI_GENE_NAMES, 'wb') as f:
            pkl.dump(ncbi_gene_names, f)
    return ncbi_gene_names[geneID]

In [None]:
# For every essential gene, measure how its paralogs interact with its common essentials:
#   gene_paralogs_interact[gene]      -> (fraction of paralogs interacting with ANY common
#                                         essential, fraction interacting with ALL of them)
#   gene_paralogs_com_ess_graph[gene] -> parallel lists of interacting paralog / common
#                                         essential labels (one entry per interacting pair)
# Side effects: invalid paralogs are removed from gene_paralogs, and genes left without
# any paralog are removed from essential_genes.
gene_paralogs_interact = {}
gene_paralogs_com_ess_graph = {}

# Genes to drop are collected and removed after the loop: a frame should not be
# modified while it is being iterated over.
dropped_genes = []

for idx, row in tqdm(essential_genes.iterrows(), total=len(essential_genes)):
    _any = 0
    _all = 0

    _paralogs = []
    _com_ess  = []

    remove  = []
    dropped = False

#     if len(gene_com_ess_map[idx]) > 50:
#         print(f"{idx} interacts with too many common essentials ({len(gene_com_ess_map[idx])})!")
#         dropped = True

    for paralog in gene_paralogs[idx]:
        if paralog == idx:
            print(f"{row.gene} ({idx}) has itself marked as paralog!")
            remove.append(paralog)
            if len(gene_paralogs[idx]) - len(remove) == 0:
                dropped = True
                break
            # Nothing was evaluated for this paralog, so it must not contribute to the
            # tallies below (previously a stale `interactions` list was counted again).
            continue

        interactions = []
        for com_ess in gene_com_ess_map[idx]:
            if paralog == com_ess:
                print(f"{row.gene} ({idx}) has the common essential {get_gene_name(com_ess)[0]} as paralog!")
                # TODO: Is this the right call? We could also remove it from the com_ess list instead.
                remove.append(paralog)
                if len(gene_paralogs[idx]) - len(remove) == 0:
                    print(f"No paralogs left!")
                    dropped = True
            else:
                interactions.append(interaction(paralog, com_ess))
                if interactions[-1]:
                    _paralogs.append(get_gene_name(paralog)[0])
                    _com_ess.append(get_gene_name(com_ess)[0])
            if dropped:
                break
        if dropped:
            break
        _any += any(interactions)
        _all += all(interactions)

    # Tolerates a paralog that was queued for removal more than once.
    gene_paralogs[idx].difference_update(remove)

    if not dropped and not gene_paralogs[idx]:
        # A gene without any paralog cannot be scored (the fractions below would divide by zero).
        print(f"{row.gene} ({idx}) has no paralogs!")
        dropped = True

    if dropped:
        print(f"Dropping {row.gene} ({idx}) from essential genes list...")
        dropped_genes.append(idx)
        continue

    gene_paralogs_interact[idx] = (float(_any)/len(gene_paralogs[idx]), float(_all)/len(gene_paralogs[idx]))
    gene_paralogs_com_ess_graph[idx] = {"paralogs": _paralogs, "com_ess": _com_ess}

essential_genes.drop(index=dropped_genes, inplace=True)

gene_paralogs_interact

## 3.  Results

In [None]:
# Spot check for gene 10006: show its interacting paralog labels and count how many
# of them contain '10152'. Raises KeyError if 10006 has no entry in the graph dict.
spot_check_labels = gene_paralogs_com_ess_graph[10006]['paralogs']
print(spot_check_labels)
sum('10152' in label for label in spot_check_labels)

In [None]:
pd.set_option("display.max_rows", 100)

# One row per remaining essential gene, assembled column-wise and indexed by gene ID.
results = {"gene": [],
           "geneID": [],
           "n_paralogs": [],
           "n_common_essentials": [],
           "% paralogs interacting with any": [],
           "% paralogs interacting with all": [],
           "interaction_graph": [],
           "paralogs": [],
           "common_essentials": [],
           "interacting_paralogs": [],
           "interacting_common_essentials": [],
          }

for idx, row in essential_genes.iterrows():
    graph = gene_paralogs_com_ess_graph[idx]
    frac_any, frac_all = gene_paralogs_interact[idx]

    results["gene"].append(row.gene)
    results["geneID"].append(idx)
    results["n_paralogs"].append(len(gene_paralogs[idx]))
    results["n_common_essentials"].append(len(gene_com_ess_map[idx]))
    results["% paralogs interacting with any"].append(int(frac_any*100))
    results["% paralogs interacting with all"].append(int(frac_all*100))
    results["interaction_graph"].append(graph)
    results["paralogs"].append(gene_paralogs[idx])
    results["common_essentials"].append(gene_com_ess_map[idx])
    # Graph labels are built by get_gene_name as "<name> (<ID>)". Match the
    # parenthesised ID: a bare substring test would let ID 100 match "(1000)" or
    # digits inside a gene symbol.
    # NOTE(review): assumes labels already stored in the name cache use this same format — confirm.
    results["interacting_paralogs"].append(
        set(p for p in gene_paralogs[idx]
            if any(f"({p})" in label for label in graph['paralogs'])))
    results["interacting_common_essentials"].append(
        set(c for c in gene_com_ess_map[idx]
            if any(f"({c})" in label for label in graph['com_ess'])))

results = pd.DataFrame(results).set_index('geneID')
results

In [None]:
def get_inf(geneID):
    """Print a gene, its common essentials and its paralogs as tab-separated lines.

    The gene itself is printed after a ``Gene:`` prefix; each common essential and
    each paralog is printed on its own tab-indented line under its section header.
    Every entry shows the fields returned by ``get_gene_name``.
    """
    common_essentials = gene_com_ess_map[geneID]
    paralog_ids = gene_paralogs[geneID]

    print("Gene:\t" + '\t'.join(get_gene_name(geneID)))
    for header, members in (("Com. ess.:", common_essentials), ("Paralogs:", paralog_ids)):
        print(header)
        for member in members:
            print('\t' + '\t'.join(get_gene_name(member)))

In [None]:
# These graphs were used to visualize the interactions between paralogs and common essentials
# In some cases the graphs are not visually very appealing, especially when there are a large number of genes involved

def draw_graph(graph, idx):
    """Draw the paralog / common-essential interaction network of one gene.

    Args:
        graph: mapping with equally long ``'paralogs'`` and ``'com_ess'`` sequences;
            entries at the same position form one edge.
        idx: gene ID; its label from ``get_gene_name`` is used in the figure title.

    Paralogs are placed in a column at x=0 and common essentials at x=20. Node labels
    are drawn slightly above the nodes.
    """
    edges = pd.DataFrame(graph)
    paralog_labels = edges['paralogs']
    com_ess_labels = edges['com_ess']

    network = nx.Graph()
    network.add_nodes_from(com_ess_labels, bipartite=0)
    network.add_nodes_from(paralog_labels, bipartite=1)
    network.add_weighted_edges_from(
        [(paralog, com_ess, 1) for paralog, com_ess in zip(paralog_labels, com_ess_labels)],
        weight='weight')

    # Two vertical columns; a label that occurs several times keeps its last row.
    pos = {}
    for rank, node in enumerate(paralog_labels):
        pos[node] = [0, rank]
    for rank, node in enumerate(com_ess_labels):
        pos[node] = [20, rank]
    nx.draw(network, pos, with_labels=False)

    # Raise the label positions so the text does not sit on top of the nodes.
    lift = .01 if len(pos) == 2 else len(pos)/10
    for coords in pos.values():
        coords[1] += lift

    nx.draw_networkx_labels(network, pos)

    plt.margins(.3)

    plt.figtext(.5,1.12,f'Interaction network for gene {get_gene_name(idx)[0]}', fontsize=17, ha='center')
    plt.figtext(.2,1.02,"Paralogs",fontsize=15,ha='center')
    plt.figtext(.78,1.02,"Common Essentials",fontsize=15,ha='center')

    plt.show()

In [None]:
# Report (and, where interactions exist, plot) every selected gene.
# The results frame has a lowercase `gene` column and is indexed by `geneID`, so the
# gene ID is taken from the index (there is no `Gene` or `GeneID` column).
# NOTE(review): `pediatric_genes` is not defined anywhere in the visible notebook; it has
# to be provided before this cell can run — confirm where it should come from.
selected = results.loc[results["gene"].isin(pediatric_genes),
                       ["% paralogs interacting with any", "interaction_graph"]]
for idx, selected_row in selected.iterrows():
    p_any = selected_row["% paralogs interacting with any"]
    graph = selected_row["interaction_graph"]
    get_inf(idx)
    if p_any > 0:
        draw_graph(graph, idx)
    else:
        print("### No interactions betwen paralogs and common essentials ###")
    print('------------------------------------------------------------')

### Save results

In [None]:
# Persist the annotated results frame as a pickle at RESULTS_FILE.
results.to_pickle(RESULTS_FILE)