In [None]:
import numpy as np
import pandas as pd
import os
import sys
from tqdm.notebook import tqdm
import pickle as pkl
import matplotlib.pyplot as plt
from matplotlib_venn import venn2, venn3
from re import search
from itertools import combinations
from venn import venn
from taigapy import TaigaClient
import seaborn as sns
tc = TaigaClient()

%matplotlib inline

In [None]:
# Minimum combined STRING interaction score, expressed in percent: it is divided
# by 100 before being compared with `interaction_score` in the filtering step below.
STRING_CUTOFF = 60

### Define paths

In [None]:
# Input files (all paths are relative to the notebook's working directory)
CORUM_DATA = 'data/CORUM/allComplexes.txt'
STRING_DATA = 'data/STRING/9606.protein.links.full.v11.0.txt'
SIGNOR_DATA = 'data/SIGNOR/geneID_interactions.pkl'
HURI_DATA = 'data/HuRI/HuRI.tsv'
PARALOG_DATA = 'data/DGD/duplicate_genes_Hsapiens.tsv'
PANTHER_PARALOGS = 'data/PANTHER/paralogs-GeneID'
ENSP_ENTREZ_MAP = 'data/ens_entrez_maps/ensp_entrez_mapping.pkl'
ENSG_ENTREZ_MAP = 'data/ens_entrez_maps/ensg_entrez_mapping.pkl'

# Output directory (the candidate lists and the gene mapping are written here)
SAVE_DIR = 'results/candidate_genes/public_20Q2'

## 1.  Load Data

### Get common essentials

In [None]:
# Fetch the Achilles common-essentials table (Taiga dataset public-20q2-075d, version 22)
com_ess = tc.get(name='public-20q2-075d', version=22, file='Achilles_common_essentials')

# Split every entry on its first space into a gene label and an ID wrapped in parentheses.
# `n` has to be passed by keyword: since pandas 2.0 every `str.split` argument except
# `pat` is keyword-only, so the positional form `split(' ', 1)` raises a TypeError.
com_ess = pd.DataFrame(com_ess.gene.str.split(' ', n=1).tolist(), columns=['gene','geneID']).set_index('geneID')
# Drop the surrounding parentheses so the frame is indexed by integer gene ID
com_ess.index = com_ess.index.str.strip('()').astype(int)
com_ess.head()

### Load CORUM protein complexes

In [None]:
# Tab-separated table read with pandas' default header handling
corum_data = pd.read_csv(CORUM_DATA, sep='\t')
corum_data.head()

### Load STRING database

In [None]:
# The STRING links file is parsed as single-space-separated columns
string_data = pd.read_csv(STRING_DATA, sep=' ')
string_data.head()

### Load preprocessed SIGNOR database

In [None]:
# NOTE: unpickling can execute arbitrary code - only load files you produced yourself.
# The handle is deliberately not called `f`: that name is used by the score-combining
# helper defined later in this notebook, and re-running this cell would overwrite it.
with open(SIGNOR_DATA, 'rb') as signor_file:
    signor_data = np.array(list(pkl.load(signor_file)))
signor_data

### Load HuRI database

In [None]:
# Tab-separated file without a header row (columns get integer labels)
huri_data = pd.read_csv(HURI_DATA, sep='\t', header=None)
huri_data.head()

### Load duplicate gene data

In [None]:
# Tab-separated table; its GeneID column is used later for the DGD paralog check
dup_gens = pd.read_csv(PARALOG_DATA, sep='\t')
dup_gens.head()

### Load PANTHER paralog data

In [None]:
# Space-separated table; its GeneID column is used later for the PANTHER paralog check
paralogs = pd.read_csv(PANTHER_PARALOGS, sep=' ')
paralogs.head()

### Load Ensembl protein ID (ENSP) to Entrez ID map

In [None]:
# NOTE: unpickling can execute arbitrary code - only load files you produced yourself.
# The handle is not called `f` so that re-running this cell cannot overwrite the
# score-combining helper `f` defined later in this notebook.
with open(ENSP_ENTREZ_MAP, 'rb') as ensp_map_file:
    ensp_entrez_map = pkl.load(ensp_map_file)
# Preview the first five entries of the mapping
dict(list(ensp_entrez_map.items())[:5])

### Load Ensembl gene ID (ENSG) to Entrez ID map

In [None]:
# NOTE: unpickling can execute arbitrary code - only load files you produced yourself.
# The handle is not called `f` so that re-running this cell cannot overwrite the
# score-combining helper `f` defined later in this notebook.
with open(ENSG_ENTREZ_MAP, 'rb') as ensg_map_file:
    ensg_entrez_map = pkl.load(ensg_map_file)
# Preview the first five entries of the mapping
dict(list(ensg_entrez_map.items())[:5])

## 2.  Filter and process data

In [None]:
# Get only human protein complexes.
# na=False treats a missing organism as "not human" instead of yielding a NaN mask
# (which .loc rejects). .copy() makes an independent frame, so the column assignment
# below does not trigger SettingWithCopyWarning.
is_human = corum_data['SWISSPROT organism'].str.match("Homo", na=False)
human_prot_complexes = corum_data.loc[is_human].copy()

# Turn each ';'-separated ID string into a list of ints, skipping 'None' and empty tokens
human_prot_complexes['subunits(Entrez IDs)'] = [
    [int(idx.strip()) for idx in x.split(';') if idx.strip() not in ('None', '')]
    for x in human_prot_complexes['subunits(Entrez IDs)']
]
human_prot_complexes.head()

In [None]:
def f(*args):
    """Combine scores as ``1 - prod(1 - score)``.

    Each argument may be a scalar or an array-like; array-likes are combined
    element-wise because the product is taken along axis 0 of the stacked inputs.

    Raises:
        ValueError: if called without any argument.
    """
    if not args:
        raise ValueError("Function must have at least one provided argument")
    complements = [1. - score for score in args]
    return 1. - np.prod(complements, axis=0)

In [None]:
# Get a custom interaction score from the STRING evidence columns. Scores are combined
# with f(), i.e. 1 - prod(1 - p), not by adding them up.
# NOTE: prot_interactions is the very same object as string_data, so the score columns
# created here are added to string_data as well.
prot_interactions = string_data
for channel in ('experiments', 'database', 'coexpression'):
    # Merge the direct and the "transferred" column of the channel, each scaled by 1/1000
    prot_interactions[f'{channel}_score'] = f(prot_interactions[channel] / 1000,
                                              prot_interactions[f'{channel}_transferred'] / 1000)
prot_interactions['interaction_score'] = f(prot_interactions['experiments_score'],
                                           prot_interactions['database_score'],
                                           prot_interactions['coexpression_score'])

# Keep interactions only if score >= threshold (STRING_CUTOFF is given in percent)
passes_cutoff = prot_interactions['interaction_score'] >= STRING_CUTOFF / 100
probable_prot_interactions = (
    prot_interactions
    .loc[passes_cutoff, ['protein1', 'protein2', 'interaction_score']]
    .reset_index(drop=True)
)
probable_prot_interactions.head()

## 3.  Find interactions

In [None]:
# Candidate genes per interaction source, split by the paralog resource that lists them
results = {source: {'DGD': set(), 'PANTHER': set()}
           for source in ('CORUM', 'STRING', 'SIGNOR', 'HuRI')}
# Filled by com_ess_interaction(): candidate gene -> set of common-essential partners
gene_com_ess_int = {}

In [None]:
def com_ess_interaction(com_ess, gene):
    """Record `com_ess` as an interaction partner of `gene` and return `gene`.

    The partner is added to the module-level dict ``gene_com_ess_int``
    (gene -> set of partners). Nothing is recorded when both arguments are equal.
    """
    if gene != com_ess:
        # setdefault creates the partner set the first time a gene is seen
        gene_com_ess_int.setdefault(gene, set()).add(com_ess)
    return gene

In [None]:
def get_candidate_prots(ppi_list, common_essentials=com_ess.index.values):
    """Find paralog candidates that interact with a common-essential gene.

    A pair from `ppi_list` is used only when exactly one of its two members is in
    `common_essentials`; the other member becomes a candidate and the pair is
    recorded through com_ess_interaction().

    Args:
        ppi_list: iterable of (id1, id2) interaction pairs.
        common_essentials: iterable of common-essential gene IDs. The default is
            evaluated once, when this function is defined.

    Returns:
        (dgd, panther): the candidates found in ``dup_gens.GeneID`` and in
        ``paralogs.GeneID`` respectively, each as a set.
    """
    # Build the lookup once: membership in a set is a hash lookup, whereas `in` on a
    # NumPy array scans the whole array for every single test.
    essential_ids = set(common_essentials)

    candidates = set()
    for p1, p2 in tqdm(ppi_list):
        p1_essential = p1 in essential_ids
        p2_essential = p2 in essential_ids
        if p1_essential == p2_essential:
            # Both or neither are common essentials: not informative
            continue
        if p1_essential:
            candidates.add(com_ess_interaction(p1, p2))
        else:
            candidates.add(com_ess_interaction(p2, p1))

    # Make sure no already known essential dependency is left among the candidates
    candidates -= essential_ids
    print(len(candidates), "candidates found")

    # Check for paralogs
    dgd     = set(dup_gens.loc[dup_gens.GeneID.isin(candidates)].GeneID)
    panther = set(paralogs.loc[paralogs.GeneID.isin(candidates)].GeneID)
    print(f"DGD: {len(dgd)}, PANTHER: {len(panther)}, total: {len(dgd.union(panther))}")

    return dgd, panther

### Select from CORUM

In [None]:
# Every pair of subunits within one complex is treated as an interaction
corum_interactions = {
    pair
    for subunits in human_prot_complexes['subunits(Entrez IDs)'].values
    for pair in combinations(subunits, 2)
}
corum_dgd, corum_panther = get_candidate_prots(corum_interactions)
results['CORUM']['DGD'] = corum_dgd
results['CORUM']['PANTHER'] = corum_panther

### Select from STRING

In [None]:
# Translate both protein IDs through ensp_entrez_map; pairs where either side maps to -1 are dropped
string_interactions = {
    (ensp_entrez_map[p1], ensp_entrez_map[p2])
    for p1, p2 in probable_prot_interactions[['protein1', 'protein2']].values
    if ensp_entrez_map[p1] != -1 and ensp_entrez_map[p2] != -1
}
string_dgd, string_panther = get_candidate_prots(string_interactions)
results['STRING']['DGD'] = string_dgd
results['STRING']['PANTHER'] = string_panther

### Select from SIGNOR

In [None]:
# signor_data is passed to the candidate search as loaded, without any ID translation
signor_dgd, signor_panther = get_candidate_prots(signor_data)
results['SIGNOR']['DGD'] = signor_dgd
results['SIGNOR']['PANTHER'] = signor_panther

### Select from HuRI

In [None]:
# Translate both gene IDs through ensg_entrez_map; pairs where either side maps to -1 are dropped
huri_interactions = {
    (ensg_entrez_map[p1], ensg_entrez_map[p2])
    for p1, p2 in huri_data.values
    if ensg_entrez_map[p1] != -1 and ensg_entrez_map[p2] != -1
}
huri_dgd, huri_panther = get_candidate_prots(huri_interactions)
results['HuRI']['DGD'] = huri_dgd
results['HuRI']['PANTHER'] = huri_panther

## 4.  Results

In [None]:
# Number of candidates per (interaction source, paralog resource) combination
for dataset, paralog_sets in results.items():
    for paralog_set in paralog_sets:
        genes = paralog_sets[paralog_set]
        print(f"{dataset}-{paralog_set}: {len(genes)}")

In [None]:
fig = plt.figure(figsize=(9, 9), dpi=124, facecolor='w', edgecolor='k')
# plt.title('Overlap between candidate genes from CORUM, HuRI, SIGNOR and STRING', fontsize=18)
# Per interaction source: union of its PANTHER and DGD candidates, labelled with the set size
datasets = {}
for source in ('CORUM', 'HuRI', 'SIGNOR', 'STRING'):
    source_candidates = results[source]['PANTHER'].union(results[source]['DGD'])
    datasets[f"{source} ({len(source_candidates)})"] = source_candidates
venn(datasets, ax=plt.gca(), cmap=['r', 'g', 'b', 'y'])
plt.show()

In [None]:
fig = plt.figure(figsize=(5, 5), dpi=124, facecolor='w', edgecolor='k')
# plt.title('Overlap between candidate genes found through PANTHER and DGD')
# Pool the candidates of all four interaction sources, separately per paralog resource
interaction_sources = ('CORUM', 'STRING', 'SIGNOR', 'HuRI')
panther_candidates = set().union(*(results[source]['PANTHER'] for source in interaction_sources))
dgd_candidates = set().union(*(results[source]['DGD'] for source in interaction_sources))
venn2([panther_candidates, dgd_candidates],
      set_labels=[f"PANTHER ({len(panther_candidates)})",
                  f"DGD ({len(dgd_candidates)})"])
plt.show()

### Save candidate genes

In [None]:
# The output directory constant is SAVE_DIR; no `save_dir` variable is defined anywhere.
# makedirs also creates missing parent directories and is a no-op if the path exists.
os.makedirs(SAVE_DIR, exist_ok=True)

# One file per (interaction source, paralog resource) pair, one gene ID per line
for dataset, paralog_sets in results.items():
    for paralog_set, genes in paralog_sets.items():
        # The handle is not called `f`, so the score helper `f` is not overwritten
        with open(os.path.join(SAVE_DIR, f"{dataset}-{paralog_set}"), 'w') as out_file:
            out_file.writelines(f"{gene}\n" for gene in genes)

### Save gene - common essential mapping

In [None]:
# The output directory constant is SAVE_DIR; no `save_dir` variable is defined anywhere.
# Creating it here lets this cell run on its own.
os.makedirs(SAVE_DIR, exist_ok=True)
# The handle is not called `f`, so the score helper `f` is not overwritten
with open(os.path.join(SAVE_DIR, "gene-com_ess.dict"), "wb") as out_file:
    pkl.dump(gene_com_ess_int, out_file)