In [11]:
import pandas as pd 
import numpy as np 
import networkx as nx

# Single source of truth for randomness: the same value seeds NumPy's global
# generator and is reused as `random_state` further down in the notebook.
SEED = 42
np.random.seed(SEED)

## Create random gene sets and contaminated ones 

* random: same size as the selected term; genes are drawn from the pool of genes that belong neither to the selected term itself nor to any term directly connected to it (its direct parents and children). 

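* contaminated: same size as the selected term; randomly preserve 50% of its genes and replace the remaining 50% with genes drawn at random from the random gene pool. When the term has an odd number of genes, the number of replaced genes is rounded up (ceiling). 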

In [5]:
# Full GO annotation table (first CSV column becomes the index); its 'GO' and
# 'Genes' columns are what the sampling helpers below read.
full_GO = pd.read_csv('data/go_terms.csv', index_col=0)

# GO terms that will receive contaminated gene sets
# (presumably 1000 terms, going by the file name).
selected_go = pd.read_csv('data/GO_term_analysis/1000_selected_go_terms.csv')

# GO hierarchy edges: tab-separated, no header row. Only rows whose edge type
# is 'default' are kept.
go_edges = pd.read_csv(
    './data/GO_BP/collapsed_go.symbol',
    sep='\t',
    header=None,
    names=['parent', 'child', 'edge_type'],
).loc[lambda edges: edges['edge_type'] == 'default']

In [7]:
def find_direct_connections(go_id, edge_list):
    '''
    Return the GO terms that share an edge with `go_id`.

    go_id: the go term id to look up
    edge_list: dataframe with 'parent' and 'child' columns, one edge per row
    returns: list of unique term ids that are a child of `go_id` or a parent
             of `go_id` (in no particular order)
    '''
    child_terms = edge_list.loc[edge_list['parent'] == go_id, 'child'].tolist()
    parent_terms = edge_list.loc[edge_list['child'] == go_id, 'parent'].tolist()
    # a term can be reachable through both directions; keep it only once
    return list({*child_terms, *parent_terms})

# find_direct_connections(go_id, go_edges)


In [8]:
def get_gene_pool(go_id, edge_list, go_terms):
    '''
    Build the pool of genes that may be drawn as "random" genes for a GO term.

    The pool is every gene listed in `go_terms` minus the genes annotated to
    `go_id` itself or to any term directly connected to it.

    go_id: the go term id
    edge_list: the edge list of the go terms (passed to find_direct_connections)
    go_terms: the full go terms dataframe, with a 'GO' column and a 'Genes'
              column holding gene symbols separated by a single space
    returns: set of gene symbols
    '''
    def collect_genes(gene_strings):
        # union of the symbols found in each space-separated string
        collected = set()
        for gene_string in gene_strings:
            collected.update(gene_string.split(' '))
        return collected

    all_genes = collect_genes(go_terms['Genes'])

    # the term itself plus its direct parents/children are off limits
    excluded_terms = find_direct_connections(go_id, edge_list) + [go_id]
    is_excluded = go_terms['GO'].isin(excluded_terms)
    excluded_genes = collect_genes(go_terms.loc[is_excluded, 'Genes'])

    candidate_pool = all_genes.difference(excluded_genes)

    # sanity check: nothing from the real gene set may remain in the pool
    own_genes = go_terms.loc[go_terms['GO'] == go_id, 'Genes'].tolist()[0].split(' ')
    assert candidate_pool.isdisjoint(own_genes), 'Warning: genes in the pool overlaps with the ones in this real set'
    return candidate_pool


def add_contamination(go_id, edge_list, go_terms, contamination_rate=0.5):
    '''
    Replace part of a GO term's gene set with genes from the random pool.

    go_id: the go term id
    edge_list: the edge list of the go terms
    go_terms: the full go terms dataframe ('GO' and space-separated 'Genes' columns)
    contamination_rate: the percentage of genes to be contaminated, in [0, 1];
                        the number of replaced genes is rounded up
    returns: list of gene symbols with the same length as the original set
             (preserved genes first, then the contaminating genes)
    raises: ValueError if contamination_rate is outside [0, 1] or the random
            pool is smaller than the number of genes to draw
    '''
    if not 0 <= contamination_rate <= 1:
        raise ValueError('contamination_rate must be between 0 and 1, got %r' % (contamination_rate,))

    # Sort the pool before sampling: the pool is a set of strings, and its
    # iteration order changes between Python processes (string hashes are
    # randomised), which would make the draw irreproducible even with a fixed
    # NumPy seed.
    random_pool = sorted(get_gene_pool(go_id, edge_list, go_terms))
    current_genes = go_terms.loc[go_terms['GO'] == go_id, 'Genes'].tolist()[0].split(' ')

    contamination_size = int(np.ceil(len(current_genes) * contamination_rate))
    preserved_size = len(current_genes) - contamination_size

    # draw order (contaminants first, then preserved genes) is kept from the
    # original implementation
    contamination_genes = np.random.choice(random_pool, size=contamination_size, replace=False)
    preserved_genes = np.random.choice(current_genes, size=preserved_size, replace=False)

    # .tolist() yields plain Python str objects rather than numpy.str_
    new_set = preserved_genes.tolist() + contamination_genes.tolist()
    assert len(new_set) == len(current_genes), 'length of new set is not the same as the original set'
    return new_set


In [9]:
# Add one contaminated gene-set column per contamination rate to every selected
# GO term, then save the table. The 50% column is generated before the 100%
# one, so the sequence of random draws is the same as running them one by one.
for column_name, rate in (
    ('50perc_contaminated_Genes', 0.5),
    ('100perc_contaminated_Genes', 1),
):
    selected_go[column_name] = selected_go.apply(
        lambda row, rate=rate: ' '.join(
            add_contamination(row['GO'], go_edges, full_GO, contamination_rate=rate)
        ),
        axis=1,
    )

selected_go.to_csv('data/GO_term_analysis/1000_selected_go_contaminated.csv', index=False)

In [10]:
# Preview the first five rows, including the two contaminated gene-set columns.
selected_go.head(n=5)

Unnamed: 0,GO,Genes,Gene_Count,Term_Description,50perc_contaminated_Genes,100perc_contaminated_Genes
0,GO:0048627,MEGF10 SDC1 WNT10B SOX15,4,myoblast development,MEGF10 WNT10B PDP2 INSL5,ENPP6 NAA80 BBS7 AJUBA
1,GO:1904888,CPLANE2 NEUROG1 GRHL2 TGFB3 EXT1 TGFBR2 TWIST1...,72,cranial skeletal system development,TFAP2A SIX4 MTHFD1L FGFR2 GNA11 TGFB3 IFT140 M...,LIFR ASB7 DEDD2 WSB1 USP51 OR2V2 CREBZF ELOVL4...
2,GO:0019585,DCXR UGT1A9 UGT2B7 PRKCE UGT1A7 UGT2A3 SORD UG...,26,glucuronate metabolic process,AKR1A1 DCXR UGT2A1 UGT2B7 UGT2A2 UGT2B28 UGT1A...,MSRB2 RAB3A DLL1 ERCC2 PIP4K2B HOXB13 TIMM50 Z...
3,GO:1902267,AZIN1 OAZ2 OAZ1 AZIN2 OAZ3,5,regulation of polyamine transmembrane transport,AZIN1 OAZ1 OR10A4 ZBTB10 BLNK,AP4B1 ULK4 MYCBP2 CD86 YTHDC2
4,GO:0019748,BDH2 CYP2A7 AKR1C1 ACMSD ATP7A ASIP DDT CYP3A4...,56,secondary metabolic process,AKR7A2 FMO1 UGT1A8 APPL1 WNT5A SULT1C4 ACMSD M...,CHRNB3 H2BC7 PSD ARL5B CLEC4C BSPH1 FREY1 POTE...


In [12]:
# Extract a toy example: 10 GO terms sampled from the contaminated table saved
# earlier. `random_state=SEED` keeps the selection reproducible.
# (pandas is already imported in the first cell, so it is not re-imported here.)
df = pd.read_csv('data/GO_term_analysis/1000_selected_go_contaminated.csv')

toy = df.sample(n=10, random_state=SEED)
toy.to_csv('data/GO_term_analysis/toy_example_w_contaminated.csv', index=False)