In [1]:
# Clear the interactive namespace so the notebook starts from a clean state.
# NOTE(review): bare %reset asks for confirmation (see the prompt recorded in
# this cell's output); `%reset -f` would skip the prompt for unattended runs.
%reset
from IPython.core.interactiveshell import InteractiveShell
# Echo the value of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as sci

Once deleted, variables cannot be recovered. Proceed (y/[n])? y


## Hypergeometric tests for the significant TTs with CRISPR, UniProt, TTD and TSgene

### CRISPR

In [2]:
def get_CRISPR(cell, fdr):
    """Load the GeckoV2 screen table of one cell line and select its hits.

    Parameters
    ----------
    cell : str
        Cell-line name inserted into the file name
        ``data/CRISPR/GeckoV2_<cell>.txt``.
    fdr : float
        Exclusive upper bound on the ``FDR`` column for a row to be a hit.

    Returns
    -------
    tuple of (set, set)
        All index labels of the table, and the index labels of the rows
        whose ``FDR`` is strictly below ``fdr``.
    """
    path = 'data/CRISPR/GeckoV2_{}.txt'.format(cell)
    screen = pd.read_table(path, sep='\t', index_col=0, engine='python')

    # Boolean mask over the rows; the comparison is strict (FDR == fdr is excluded).
    below_cutoff = (screen['FDR'] < fdr).values

    return set(screen.index), set(screen.index[below_cutoff])

def hyperGeo_CRISPR(all_TF, sig_TT_dic, res_file):
    """Test the 'neg' significant TT sets for enrichment in CRISPR screen hits.

    For the cell lines A375 and HT29 and for each FDR cutoff in
    0.1, 0.2, 0.3 and 0.4, the population is restricted to the entries that
    are both in ``all_TF`` and in the CRISPR table, and one tab-separated
    result row is written to ``res_file``.

    Parameters
    ----------
    all_TF : set
        Population of considered TFs.
    sig_TT_dic : dict
        Maps ``(cell, sig_type)`` to the set of significant TTs.
    res_file : file-like
        Open, writable text handle that receives one row per (cell, FDR).
    """
    sig_type='neg'
    for cell in ['A375','HT29']:
        cell_sig_TT=sig_TT_dic[(cell,sig_type)]
        for fdr in [0.1,0.2,0.3,0.4]:
            CRISPR_all, CRISPR=get_CRISPR(cell, fdr)
            all_TF_both=all_TF&CRISPR_all

            # Restrict both the gold standard and the draw to the shared population.
            CRISPR=all_TF_both&CRISPR
            sig_TT=all_TF_both&cell_sig_TT
            hit=sig_TT&CRISPR

            # sf(k) is P(X > k); the enrichment p-value is P(X >= k), i.e. sf(k - 1),
            # so that the observed overlap itself is counted.
            p_val = sci.hypergeom.sf(len(hit)-1, len(all_TF_both), len(CRISPR), len(sig_TT))
            res_file.write('{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n'.format(cell, sig_type, 'CRISPR({})'.format(fdr), len(all_TF_both), len(CRISPR), len(sig_TT), len(hit), p_val, hit))

### uniprot

In [3]:
def get_uniprot(status):
    """Return the set of gene names listed in one of the UniProt tables.

    Parameters
    ----------
    status : str
        ``'OCG'`` reads ``data/Uniprot/oncogene_uniprot.txt``;
        ``'TSG'`` reads ``data/Uniprot/tumor_suppressor_uniprot.txt``.

    Returns
    -------
    set of str
        Every whitespace-separated token found in the ``Gene names`` column;
        rows with a missing value in that column are ignored.

    Raises
    ------
    ValueError
        If ``status`` is neither ``'OCG'`` nor ``'TSG'``.
    """
    paths={
        'OCG':'data/Uniprot/oncogene_uniprot.txt',
        'TSG':'data/Uniprot/tumor_suppressor_uniprot.txt',
    }
    # Fail early with a clear message instead of hitting an unbound local later.
    if status not in paths:
        raise ValueError("status must be 'OCG' or 'TSG', got {!r}".format(status))

    uniprot=pd.read_table(paths[status], sep='\t',engine='python')

    genes=set()
    for names in uniprot['Gene names'].dropna():
        # split() without an argument drops the empty tokens that repeated
        # spaces would otherwise introduce as a bogus '' gene name.
        genes.update(names.split())

    return genes

def hyperGeo_uniprot(all_TF, sig_TT_dic, status, res_file):
    """Test significant TT sets for enrichment in a UniProt gene list.

    With ``status='OCG'`` the 'neg' sets are tested against the oncogene
    list; with ``status='TSG'`` the 'pos' sets are tested against the tumor
    suppressor list. One tab-separated row is written to ``res_file`` for
    each of 'A375', 'HT29' and 'both'.

    Parameters
    ----------
    all_TF : set
        Population of considered TFs.
    sig_TT_dic : dict
        Maps ``(cell, sig_type)`` to the set of significant TTs.
    status : str
        ``'OCG'`` or ``'TSG'``.
    res_file : file-like
        Open, writable text handle.

    Raises
    ------
    ValueError
        If ``status`` is neither ``'OCG'`` nor ``'TSG'``.
    """
    sig_type_by_status={'OCG':'neg','TSG':'pos'}
    if status not in sig_type_by_status:
        raise ValueError("status must be 'OCG' or 'TSG', got {!r}".format(status))
    sig_type=sig_type_by_status[status]

    # Gold standard restricted to the considered population.
    uniprot=all_TF&get_uniprot(status)

    for cell in ['A375','HT29','both']:
        sig_TT=sig_TT_dic[(cell,sig_type)]
        hit=sig_TT&uniprot

        # sf(k) is P(X > k); the enrichment p-value is P(X >= k), i.e. sf(k - 1).
        p_val = sci.hypergeom.sf(len(hit)-1, len(all_TF), len(uniprot), len(sig_TT))
        res_file.write('{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n'.format(cell, sig_type, 'uniprot({})'.format(status), len(all_TF), len(uniprot), len(sig_TT), len(hit), p_val, hit))

### TTD

In [4]:
def get_TTD():
    """Collect the gene names of TTD targets linked to a cancer-related disease.

    A target counts as cancer-related when any ICD10 code listed for it
    (entries separated by ', ', range endpoints separated by '-') starts
    with 'C' or 'D'. Target IDs are translated to UniProt IDs (first
    space-separated token of the ``Uniprot ID`` column) and then to gene
    names (``To`` column); targets or UniProt IDs without a mapping are dropped.

    Returns
    -------
    set
        Gene names reachable from the selected targets.
    """
    def has_cancer_code(icd10):
        # Ranges such as 'C33-C34' are split so both endpoints are inspected.
        return any(
            part.startswith(('C', 'D'))
            for entry in icd10.split(', ')
            for part in entry.split('-')
        )

    # TTD target ID -> UniProt ID
    target_table = pd.read_table('data/TTD/P2-01-TTD_uniprot_all.txt', index_col=0, sep='\t', engine='python')
    uniprot_by_target = target_table['Uniprot ID'].map(lambda ids: ids.split(' ')[0]).to_dict()

    # UniProt ID -> gene name
    gene_table = pd.read_table('data/TTD/uniprot_to_gene.txt', index_col=0, sep='\t', engine='python')
    gene_by_uniprot = gene_table['To'].to_dict()

    # Keep only target/disease rows that carry a cancer-related ICD10 code.
    target_disease = pd.read_table('data/TTD/P1-05-Target_disease.txt', sep='\t', engine='python')
    target_disease = target_disease.loc[target_disease['ICD10'].notnull()]
    target_disease = target_disease.loc[target_disease['ICD10'].map(has_cancer_code)]

    uniprot_ids = {
        uniprot_by_target[target]
        for target in set(target_disease['TTDTargetID'])
        if target in uniprot_by_target
    }

    return {
        gene_by_uniprot[uniprot_id]
        for uniprot_id in uniprot_ids
        if uniprot_id in gene_by_uniprot
    }

    
def hyperGeo_TTD(all_TF, sig_TT_dic, res_file):
    """Test the 'neg' significant TT sets for enrichment in the TTD gene set.

    One tab-separated row is written to ``res_file`` for each of 'A375',
    'HT29' and 'both'.

    Parameters
    ----------
    all_TF : set
        Population of considered TFs.
    sig_TT_dic : dict
        Maps ``(cell, sig_type)`` to the set of significant TTs.
    res_file : file-like
        Open, writable text handle.
    """
    # Gold standard restricted to the considered population.
    TTD=all_TF&get_TTD()

    sig_type='neg'
    for cell in ['A375','HT29','both']:
        sig_TT=sig_TT_dic[(cell,sig_type)]
        hit=sig_TT&TTD

        # sf(k) is P(X > k); the enrichment p-value is P(X >= k), i.e. sf(k - 1).
        p_val = sci.hypergeom.sf(len(hit)-1, len(all_TF), len(TTD), len(sig_TT))
        res_file.write('{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n'.format(cell, sig_type, 'TTD', len(all_TF), len(TTD), len(sig_TT), len(hit), p_val, hit))

### TSgene

In [5]:
def hyperGeo_TSgene(all_TF, sig_TT_dic, res_file):
    """Test the 'pos' significant TT sets for enrichment in the TSgene list.

    The gold standard is the ``GeneSymbol`` column of
    ``data/TSgene2.0/Human_TSGs.txt`` intersected with ``all_TF``. One
    tab-separated row is written to ``res_file`` for each of 'A375', 'HT29'
    and 'both'.

    Parameters
    ----------
    all_TF : set
        Population of considered TFs.
    sig_TT_dic : dict
        Maps ``(cell, sig_type)`` to the set of significant TTs.
    res_file : file-like
        Open, writable text handle.
    """
    TSgene_table=pd.read_table('data/TSgene2.0/Human_TSGs.txt', sep='\t',index_col=0,engine='python')
    # Gold standard restricted to the considered population.
    TSgene=all_TF&set(TSgene_table['GeneSymbol'])

    sig_type='pos'
    for cell in ['A375','HT29','both']:
        sig_TT=sig_TT_dic[(cell,sig_type)]
        hit=sig_TT&TSgene

        # sf(k) is P(X > k); the enrichment p-value is P(X >= k), i.e. sf(k - 1).
        p_val = sci.hypergeom.sf(len(hit)-1, len(all_TF), len(TSgene), len(sig_TT))
        res_file.write('{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n'.format(cell, sig_type, 'TSgene', len(all_TF), len(TSgene), len(sig_TT), len(hit), p_val, hit))

### main

In [6]:
def get_sig_TT(cell, tp):
    """Read the significant TT set stored for one cell line and sign.

    The header-less table ``result/TT_score_viper/sig_TT.txt`` is indexed by
    its first column; the row labelled ``'<cell>(<tp>)'`` is looked up and
    the comma-separated string in its second remaining column is split.

    Parameters
    ----------
    cell : str
        Cell label used in the row key (e.g. 'A375', 'HT29', 'both').
    tp : str
        Sign label used in the row key (e.g. 'pos', 'neg').

    Returns
    -------
    set of str
        The individual names from the comma-separated field.
    """
    row_key = '{}({})'.format(cell, tp)
    table = pd.read_table('result/TT_score_viper/sig_TT.txt', sep='\t', index_col=0, header=None, engine='python')
    row = table.loc[row_key]
    # Position 1 of the row (after the index column) holds the comma-joined names.
    names = row.iloc[1]
    return set(names.split(','))

def get_all_considered_TF():
    """Return the index labels of ``result/TT_score_viper/TTS_A375.txt`` as a set.

    The table is read tab-separated with its first column as the index; the
    resulting labels form the population used by the hypergeometric tests.
    """
    score_table = pd.read_table('result/TT_score_viper/TTS_A375.txt', sep='\t', index_col=0, engine='python')
    return set(score_table.index)

# Population of considered TFs shared by every test below.
all_TF=get_all_considered_TF()
sig_TT_dic={}

# Significant TT sets keyed by (cell, sign).
for cell in ['A375','HT29','both']:
    for tp in ['pos','neg']:
        sig_TT_dic[(cell,tp)]=get_sig_TT(cell,tp)

# The context manager closes the result file even if one of the tests raises,
# so a failed run does not leak the handle or leave buffered rows unwritten.
with open('result/TT_score_viper/hyperGeo_result.txt','w+') as res_file:
    # Header row; the columns match the rows written by the hyperGeo_* functions.
    _=res_file.write('{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n'.format('cell','sig_TT_type','database','all_TF','GS','sig_TT','hit','p-value','hit_TF_list'))

    hyperGeo_TTD(all_TF, sig_TT_dic,res_file)
    hyperGeo_TSgene(all_TF, sig_TT_dic,res_file)
    hyperGeo_uniprot(all_TF, sig_TT_dic, 'OCG',res_file)
    hyperGeo_uniprot(all_TF, sig_TT_dic, 'TSG',res_file)
    hyperGeo_CRISPR(all_TF, sig_TT_dic, res_file)