In [3]:
#MAKE_cytoscape_ready_format        22.04.19
#
#> topology
#- source, target, edge info
#- deg_group, goterm, number of degs in goterm
#
#> topology_info
#- node, node_size
#- deg_group, gene size
#- goterm, fixed size

import pandas as pd


def update_deg_info_dict(data_file, deg_dict, condition_regulation):
    """Store the row count of ``data_file`` in ``deg_dict``.

    The file is parsed as a tab-separated table without a header line, so the
    first line is counted as data. The count is written to
    ``deg_dict[condition_regulation]``; the dict is modified in place and
    returned for convenience.
    """
    deg_table = pd.read_csv(data_file, header=None, sep="\t")
    deg_dict[condition_regulation] = len(deg_table)
    return deg_dict

def update_topology_dict(data_file, topology_dict, goterm_list, condition_regulation,
                         category="GOTERM_BP_FAT", pval_cutoff=0.01):
    """Add the significant terms of one enrichment table to the topology.

    ``data_file`` is read as a tab-separated table with a header row; the
    columns "Category", "PValue", "Term" and "Fold Enrichment" are used.

    Parameters
    ----------
    data_file : path of the enrichment table.
    topology_dict : dict updated in place; for every retained row
        ``topology_dict[condition_regulation, term_name]`` is set to the
        row's fold enrichment.
    goterm_list : list updated in place; the name of every retained term is
        appended (duplicates are not removed here).
    condition_regulation : label used as the first element of the dict key.
    category : only rows whose "Category" equals this value are kept.
    pval_cutoff : only rows with "PValue" strictly below this value are kept
        (was 0.05 before being tightened to 0.01).

    Returns
    -------
    (topology_dict, goterm_list) : the same two objects that were passed in.
    """
    data_df = pd.read_csv(data_file, sep="\t")

    # Filter BEFORE touching "Term": rows from other categories may carry
    # identifiers without a '~' separator and must not be parsed at all.
    keep = (data_df["Category"] == category) & (data_df["PValue"] < pval_cutoff)
    significant = data_df.loc[keep, ["Term", "Fold Enrichment"]]

    for term, enrichment in zip(significant["Term"], significant["Fold Enrichment"]):
        # Terms look like "<accession>~<name>"; keep only the name for visual
        # simplicity. If there is no '~', fall back to the whole term.
        _, separator, name = term.partition("~")
        goterm = name if separator else term

        topology_dict[condition_regulation, goterm] = enrichment
        goterm_list.append(goterm)

    return topology_dict, goterm_list
    

In [4]:
# NOTE(review): absolute, user-specific directory; adjust before running elsewhere.
data_file_dir = "/Users/m221138/RA_acpa_multiomics/analysis/correlation_network/network_similarity_v3/gse"

condition_list = ["GSE.acpa_neg.protein.correlation.neg.clean"]
regulation_list = ["neg","pos"]

output_topology_file = '%s/proteomics.gse.topology.tsv' % data_file_dir
output_info_file = '%s/proteomics.gse.topology.info.tsv' % data_file_dir

topology_dict = {}   # (condition_regulation, goterm) -> edge value
deg_info_dict = {}   # condition_regulation -> number of rows in the DEG file
goterm_list = []

# Collect node sizes and edges for every condition/regulation combination.
for condition in condition_list:
    for regulation in regulation_list:
        deg_file = '%s/proteomics/proteomics.diff.%s.%s.tsv' % (data_file_dir, condition, regulation)
        gse_file = "%s/proteomics/gse/GSE.proteomics.%s.%s.clean.txt" % (data_file_dir, condition, regulation)

        condition_regulation = '%s_%s' % (condition, regulation)

        deg_info_dict = update_deg_info_dict(deg_file, deg_info_dict, condition_regulation)
        topology_dict, goterm_list = update_topology_dict(gse_file, topology_dict, goterm_list, condition_regulation)

# De-duplicate terms; sorting makes the row order of both output files
# reproducible (plain set iteration order of strings varies between runs).
goterm_list = sorted(set(goterm_list))

# Write topology file: one row per (condition_regulation, goterm) edge that exists.
# The context manager closes the file even if a write fails.
with open(output_topology_file, 'w') as output_topology_txt:
    output_topology_txt.write('source\ttarget\tedge_size\n')
    for condition in condition_list:
        for regulation in regulation_list:
            condition_regulation = '%s_%s' % (condition, regulation)
            for goterm in goterm_list:
                edge_key = (condition_regulation, goterm)
                # Not every term is enriched in every group; skip missing edges.
                if edge_key not in topology_dict:
                    continue
                edge_size = topology_dict[edge_key]
                output_topology_txt.write('%s\t%s\t%s\n' % (condition_regulation, goterm, edge_size))

# Write topology info file: node_type 1 = condition/regulation group (sized by
# its DEG-file row count), node_type 2 = term (fixed size 10).
with open(output_info_file, 'w') as output_info_txt:
    output_info_txt.write('node\tnode_type\tnode_size\n')
    for condition in condition_list:
        for regulation in regulation_list:
            condition_regulation = '%s_%s' % (condition, regulation)
            node_size = deg_info_dict[condition_regulation]
            output_info_txt.write('%s\t1\t%s\n' % (condition_regulation, node_size))

    for goterm in goterm_list:
        output_info_txt.write('%s\t2\t%s\n' % (goterm, '10'))

        
            
            
    