In [61]:
import pandas as pd 
import numpy as np 
import networkx as nx

# Seed NumPy's global RNG; add_contamination below samples through np.random.choice.
np.random.seed(42)

## Create random gene sets and contaminated ones 

* random: same size as the selected term; genes are drawn from the pool of genes that belong neither to the selected term nor to any term directly connected to it (its parents and children).

* contaminated: same size as the selected term; 50% of the genes (rounded up) are replaced with genes drawn at random from the random gene pool, and the remaining genes are randomly preserved from the real gene set.

In [62]:
# Full GO term table (first CSV column becomes the index) and the toy subset to contaminate.
full_GO = pd.read_csv('data/go_terms.csv', index_col=0)
toy_example = pd.read_csv('data/GO_term_analysis/toy_example.csv')

# Header-less, tab-separated edge list read as (parent, child, edge_type) rows;
# only rows whose edge_type is 'default' are kept.
go_edges = pd.read_csv(
    './data/GO_BP/collapsed_go.symbol',
    sep='\t',
    header=None,
    names=['parent', 'child', 'edge_type'],
).loc[lambda edges: edges['edge_type'] == 'default']

In [45]:
# Example GO term used to spot-check the helper functions defined in the cells below.
# NOTE(review): this cell's execution count (45) is out of sequence with its
# neighbours (62, 63) -- confirm the notebook still runs top to bottom on a fresh kernel.
go_id = 'GO:0033683'

In [63]:
def find_direct_connections(go_id, edge_list):
    '''
    Return the GO terms that share an edge with `go_id`.

    go_id: the go term id
    edge_list: dataframe with 'parent' and 'child' columns, one edge per row
    returns: de-duplicated list holding the children and the parents of go_id
             (order is not guaranteed)
    '''
    is_parent_of_edge = edge_list['parent'] == go_id
    is_child_of_edge = edge_list['child'] == go_id

    children = edge_list.loc[is_parent_of_edge, 'child']
    parents = edge_list.loc[is_child_of_edge, 'parent']

    # a set drops terms that are reachable through more than one edge
    neighbours = set(children.tolist())
    neighbours.update(parents.tolist())
    return list(neighbours)

# find_direct_connections(go_id, go_edges)


In [64]:
def get_gene_pool(go_id, edge_list, go_terms):
    '''
    Build the pool of genes that may be drawn as "random" genes for a term.

    go_id: the go term id
    edge_list: the edge list of the go terms
    go_terms: the full go terms dataframe (its 'GO' and 'Genes' columns are used)
    returns: set of every gene in go_terms that is annotated neither to go_id
             nor to any term directly connected to it
    '''
    def collect_genes(gene_strings):
        # each entry is a space-separated string of gene symbols; flatten into one set
        collected = set()
        for gene_string in gene_strings:
            collected.update(gene_string.split(' '))
        return collected

    all_genes = collect_genes(go_terms['Genes'])

    # the term itself plus its direct parents and children
    excluded_terms = find_direct_connections(go_id, edge_list) + [go_id]
    in_excluded_terms = go_terms['GO'].isin(excluded_terms)
    excluded_genes = collect_genes(go_terms.loc[in_excluded_terms, 'Genes'])

    candidate_pool = all_genes.difference(excluded_genes)

    term_genes = go_terms.loc[go_terms['GO'] == go_id, 'Genes'].tolist()[0].split(' ')

    # sanity check: none of the term's own genes may remain in the pool
    assert candidate_pool.isdisjoint(term_genes), 'Warning: genes in the pool overlaps with the ones in this real set'
    return candidate_pool
    
# Spot-check on the example term. add_contamination builds its own local pool,
# so it does not read this module-level `random_pool`.
random_pool = get_gene_pool(go_id, go_edges, full_GO)

def add_contamination(go_id, edge_list, go_terms, contamination_rate=0.5):
    '''
    Replace a fraction of a term's genes with genes drawn from unrelated terms.

    go_id: the go term id
    edge_list: the edge list of the go terms
    go_terms: the full go terms dataframe
    contamination_rate: the fraction (between 0 and 1) of genes to be contaminated;
        the number of replaced genes is rounded up
    returns: list of gene symbols (plain str) with the same length as the original
        set -- the preserved genes first, followed by the contaminating genes
    raises: ValueError if contamination_rate is outside [0, 1]
    '''
    if not 0 <= contamination_rate <= 1:
        raise ValueError('contamination_rate must be between 0 and 1, got {}'.format(contamination_rate))

    # Sort the pool before sampling: a set of strings has no stable iteration
    # order across interpreter sessions, so sampling from list(set) would pick
    # different genes after every kernel restart even with a fixed NumPy seed.
    random_pool = sorted(get_gene_pool(go_id, edge_list, go_terms))
    current_genes = go_terms.loc[go_terms['GO'] == go_id, 'Genes'].tolist()[0].split(' ')

    contamination_size = int(np.ceil(len(current_genes) * contamination_rate))
    # the contaminating genes are drawn before the preserved ones
    contamination_genes = np.random.choice(random_pool, size=contamination_size, replace=False)
    preserved_genes = np.random.choice(current_genes, size=len(current_genes) - contamination_size, replace=False)

    # .tolist() yields plain Python str rather than numpy.str_ scalars
    new_set = preserved_genes.tolist() + contamination_genes.tolist()
    assert len(new_set) == len(current_genes), 'length of new set is not the same as the original set'
    return new_set

# Spot-check: contaminate half (rounded up) of the example term's genes.
add_contamination(go_id,  go_edges, full_GO, contamination_rate=0.5)

['XPA', 'ERCC4', 'LRP1', 'IGSF3', 'POGZ']

In [65]:
# Add contamination to the toy example.
# Each new column holds the contaminated gene set for the row's GO term, joined
# back into a single space-separated string.
# Both columns draw from NumPy's global RNG (np.random.choice inside
# add_contamination), so the values depend on the RNG state when this cell runs
# and on the order of the two statements below.
toy_example['50perc_contaminated_Genes'] = toy_example.apply(lambda x: ' '.join(add_contamination(x['GO'], go_edges, full_GO, contamination_rate=0.5)), axis=1)
toy_example['100perc_contaminated_Genes'] = toy_example.apply(lambda x: ' '.join(add_contamination(x['GO'], go_edges, full_GO, contamination_rate=1)), axis=1)
# Persist the augmented table; the dataframe index is not written.
toy_example.to_csv('data/GO_term_analysis/toy_example_contaminated.csv', index=False)

In [66]:
# Preview the first rows, including the two contaminated-gene columns.
toy_example.head()

Unnamed: 0,GO,Genes,Gene_Count,Term_Description,50perc_contaminated_Genes,100perc_contaminated_Genes
0,GO:0032385,LDLRAP1 SCP2D1 ANXA2 SCP2,4,positive regulation of intracellular cholester...,LDLRAP1 SCP2 TRIM45 NME5,HMGA2 MID2 HSFX2 FOXP4
1,GO:0002468,NOD1 HLA-DRA CLEC4A HLA-DRB1 CCL21 NOD2 CCL19 ...,15,dendritic cell antigen processing and presenta...,CD68 HLA-DRB3 CCL19 CCL21 HLA-DRA NOD2 THBS1 T...,JAG1 LTK ARL17A SLCO4A1 PLEKHO2 NDUFS5 ZC3H12D...
2,GO:0033683,OGG1 ERCC5 XPA ERCC4 NTHL1,5,"nucleotide-excision repair, DNA incision",XPA NTHL1 NAA11 SCD5 CDCA8,MBTPS2 PRCD BUB3 SLC13A1 FADS2
3,GO:0035672,SLC7A11 SLC25A39 SLC26A6 ABCB9 SLC15A4 ABCC5 C...,15,oligopeptide transmembrane transport,GJA1 SLC15A4 SLC15A1 CDH17 SLC25A39 SLC26A6 SL...,DEFB113 GLMN CELA2B SIGLEC7 RIGI CCL3L3 DEFB11...
4,GO:0048023,OPN3 CDH3 ATP7A APPL1 ASIP RAB38 ZEB2 TYRP1 GIPC1,9,positive regulation of melanin biosynthetic pr...,TYRP1 CDH3 OPN3 RAB38 FGFRL1 ZNF429 DUS3L CTSK...,WEE2 STIM1 EXOC4 MYO15A GLIPR1L1 ATAD3A CDCA5 ...
