In [1]:
import pandas as pd 
import numpy as np 
import networkx as nx

# Seed NumPy's global RNG; the np.random.choice calls further down draw from it.
np.random.seed(42)

## Create random gene sets and contaminated ones 

* random: same size as the selected term; genes are drawn from the pool of genes that belong neither to the selected term nor to any term directly connected to it (its direct parents and children)

* contaminated: same size as the selected term; 50% of the genes (rounded up, i.e. ceiling) are replaced by genes randomly drawn from the random gene pool, and the remaining genes are randomly preserved from the original set.

In [2]:
# Full GO term table; the functions below read its 'GO' and 'Genes' columns.
full_GO = pd.read_csv('data/go_terms.csv', index_col=0)

# Toy-example table; each row's 'GO' id gets a contaminated gene set further down.
toy_example = pd.read_csv('data/GO_term_analysis/toy_example.csv')

# Header-less, tab-separated edge list; only edges of type 'default' are kept.
go_edges = pd.read_csv(
    './data/GO_BP/collapsed_go.symbol',
    sep='\t',
    header=None,
    names=['parent', 'child', 'edge_type'],
).loc[lambda edges: edges['edge_type'] == 'default']

In [3]:
def find_direct_connections(go_id, edge_list):
    '''
    Return the GO terms that share an edge with go_id.

    go_id: the go term id to look up
    edge_list: dataframe with 'parent' and 'child' columns, one edge per row

    Returns a de-duplicated list (in no particular order) containing the
    children of go_id followed by its parents before de-duplication.
    '''
    is_parent = edge_list['parent'] == go_id
    is_child = edge_list['child'] == go_id
    children = edge_list.loc[is_parent, 'child'].tolist()
    parents = edge_list.loc[is_child, 'parent'].tolist()
    # A term can be reachable through several edges, so collapse duplicates.
    return list(set(children + parents))

# find_direct_connections(go_id, go_edges)


In [4]:
def get_gene_pool(go_id, edge_list, go_terms):
    '''
    Build the pool of genes that are unrelated to go_id.

    go_id: the go term id
    edge_list: the edge list of the go terms ('parent' / 'child' columns)
    go_terms: the full go terms dataframe ('GO' column and a 'Genes' column
              holding space-separated gene symbols)

    Returns the set of every gene in go_terms minus the genes of go_id itself
    and of every term directly connected to it.
    '''
    def _collect_genes(gene_strings):
        # Flatten space-separated gene strings into one set of symbols.
        collected = set()
        for gene_string in gene_strings:
            collected.update(gene_string.split(' '))
        return collected

    all_genes = _collect_genes(go_terms['Genes'])

    # Terms to exclude: the direct neighbours plus the term itself.
    neighbours = find_direct_connections(go_id, edge_list)
    excluded_terms = neighbours + [go_id]
    excluded_rows = go_terms[go_terms['GO'].isin(excluded_terms)]
    excluded_genes = _collect_genes(excluded_rows['Genes'])

    candidate_pool = all_genes - excluded_genes

    # Sanity check: none of the term's own genes may remain in the pool.
    own_genes = go_terms.loc[go_terms['GO'] == go_id, 'Genes'].tolist()[0].split(' ')
    assert set(own_genes).isdisjoint(candidate_pool), 'Warning: genes in the pool overlaps with the ones in this real set'
    return candidate_pool


def add_contamination(go_id, edge_list, go_terms, contamination_rate=0.5):
    '''
    Build a contaminated version of the gene set of go_id.

    go_id: the go term id
    edge_list: the edge list of the go terms
    go_terms: the full go terms dataframe ('GO' column and a 'Genes' column
              holding space-separated gene symbols)
    contamination_rate: the percentage of genes to be contaminated, in [0, 1];
                        the number of replaced genes is rounded up (ceil)

    Returns a list of gene symbols with the same length as the original set:
    the randomly preserved original genes followed by the genes randomly
    drawn from the unrelated gene pool (see get_gene_pool).

    Raises ValueError if contamination_rate is outside [0, 1] or if the
    unrelated gene pool is too small to supply the requested replacements.
    '''
    if not 0 <= contamination_rate <= 1:
        raise ValueError(
            f'contamination_rate must be between 0 and 1, got {contamination_rate!r}'
        )

    # Sort the pool before sampling: the iteration order of a set of strings
    # changes between interpreter sessions (string hash randomization), so
    # sampling from list(set) is not reproducible even with a fixed RNG seed.
    random_pool = sorted(get_gene_pool(go_id, edge_list, go_terms))
    current_genes = go_terms.loc[go_terms['GO'] == go_id, 'Genes'].tolist()[0].split(' ')

    contamination_size = int(np.ceil(len(current_genes) * contamination_rate))
    if contamination_size > len(random_pool):
        raise ValueError(
            f'gene pool for {go_id} has only {len(random_pool)} genes, '
            f'cannot draw {contamination_size} contamination genes without replacement'
        )

    # Draw order (contamination first, then preserved genes) is kept as-is so
    # the sequence of RNG calls does not change.
    contamination_genes = np.random.choice(random_pool, size=contamination_size, replace=False)
    perserve_genes = np.random.choice(current_genes, size=len(current_genes) - contamination_size, replace=False)

    # .tolist() yields plain Python str objects instead of numpy.str_.
    new_set = perserve_genes.tolist() + contamination_genes.tolist()
    assert len(new_set) == len(current_genes), 'length of new set is not the same as the original set'
    return new_set


In [5]:
# Contaminate every toy-example term at each rate; one output column per rate.
contamination_columns = {
    '50perc_contaminated_Genes': 0.5,
    '100perc_contaminated_Genes': 1,
}
for column_name, rate in contamination_columns.items():
    # rate is bound as a default argument so the lambda uses this iteration's value.
    toy_example[column_name] = toy_example.apply(
        lambda row, rate=rate: ' '.join(
            add_contamination(row['GO'], go_edges, full_GO, contamination_rate=rate)
        ),
        axis=1,
    )

# Persist the table together with the new contaminated-gene columns.
toy_example.to_csv('data/GO_term_analysis/toy_example_contaminated.csv', index=False)

In [6]:
# Preview the first rows, including the two contaminated-gene columns added above.
toy_example.head()

Unnamed: 0,GO,Genes,Gene_Count,Term_Description,50perc_contaminated_Genes,100perc_contaminated_Genes
0,GO:0070350,FGF10 FTO VSTM2A,3,regulation of white fat cell proliferation,FGF10 INPP5J STXBP5,BCORL1 DEFB1 SS18L2
1,GO:0051936,SLC6A12 SLC6A11 SLC6A1 SLC6A13,4,gamma-aminobutyric acid reuptake,SLC6A11 SLC6A1 ENDOD1 LGALS7B,CSRNP3 ZNF572 RPL10 EXOSC4
2,GO:0070646,USP17L10 USP29 TNIP1 USP17L3 USP20 USP5 USP18 ...,135,protein modification by small protein removal,TNIP1 USP17L20 ABRAXAS1 USP38 USP17L21 USP14 U...,RPS27 PSMA3 EDC3 KLK13 TRIB2 PLA2G4E SLU7 UBLC...
3,GO:0071318,P2RY2 P2RY4 CIB2 PTGS2 P2RX7 TRPM4 SSH1 TAF1 P...,18,cellular response to ATP,PDXP P2RY4 TRPM4 TOP2B RYR3 P2RY1 HSP90B1 P2RY...,TDRP NAA30 SEPTIN6 HVCN1 SLC24A2 ZNF397 OR6C65...
4,GO:0038183,ABCB11 ABCG5 CYP8B1 ABCG8 NR1H4 PRKAA1 CIDEB V...,11,bile acid signaling pathway,CYP8B1 GPBAR1 VDR ABCG8 CYP7A1 CMTR1 SNX1 SLC2...,SYT10 NOPCHAP1 CLIP4 P3R3URF MAGOH SLC43A2 GLI...
