In [None]:
import numpy as np
import pandas as pd
import re

In [None]:
# load compartment data (the file has no header row, so its columns are
# addressed by integer position everywhere below)
df = pd.read_csv('human_compartment_integrated_full.tsv', sep='\t', header=None)

# load recon gene list: parse the file once and take both columns from the
# same frame, so gene_id[k] and gene_symbol[k] always describe the same row
recon_genes = pd.read_csv('../../../recon/genes.tsv', sep='\t')
gene_id = recon_genes['GENEID'].tolist()
gene_symbol = recon_genes['SYMBOL'].tolist()

# compartments list: `compartment` holds the codes used as column labels of the
# score table; `compartment_name` holds, at the same position, the name(s)
# matched against the compartment data. A nested list means several names are
# looked up for one code.
compartment = ['c','e','g','l','m','n','r','x','i']
compartment_name = ['Cytosol',['Extracellular region','Extracellular space'],'Golgi apparatus','Lysosome','Mitochondrion','Nucleus','Endoplasmic reticulum','Peroxisome','Mitochondrial intermembrane space']

In [None]:
# initialize results: one row per recon gene id, one column per compartment code
compartment_score = pd.DataFrame(index=gene_id, columns=compartment)

# gene symbols present in the compartment data, built once so the membership
# test inside the loop does not rescan the whole column for every gene
symbols_in_dataset = set(df[1].values.tolist())


def _single_score(gene_rows, name, symbol):
    """Return the score stored for compartment `name` among one gene's rows.

    gene_rows -- rows of the compartment data belonging to one gene symbol
    name      -- compartment name compared against column 3
    symbol    -- gene symbol, used only in the error message

    Returns the value found in column 4, or None when the gene has no row for
    `name`. Raises Exception when more than one row matches, because the score
    would be ambiguous.
    """
    matches = gene_rows.loc[gene_rows[3] == name, 4].values.tolist()
    if len(matches) > 1:
        raise Exception('%d values for %s in %s' % (len(matches), symbol, name))
    return matches[0] if matches else None


# get compartment scores for each gene; genes form the outer loop so the rows
# of each gene are selected only once (the filled table is the same either way)
for j in range(len(gene_id)):

    # if gene not in dataset, give -1 in every compartment
    if gene_symbol[j] not in symbols_in_dataset:
        for i in range(len(compartment)):
            compartment_score.at[gene_id[j],compartment[i]] = -1
        continue

    df_subset = df[df[1] == gene_symbol[j]]

    for i in range(len(compartment)):
        names = compartment_name[i]

        if isinstance(names, list):
            # several names share one code: keep the highest score found
            values = [_single_score(df_subset, name, gene_symbol[j]) for name in names]
            values = [v for v in values if v is not None]
            score = np.max(values) if len(values) > 0 else 0
        else:
            value = _single_score(df_subset, names, gene_symbol[j])
            score = 0 if value is None else value

        # a gene that is in the dataset but has no entry for this compartment gets 0
        compartment_score.at[gene_id[j],compartment[i]] = score

# print to file
compartment_score.to_csv('compartment_values.tsv', sep='\t')

In [None]:
# reaction-gene matrix: headerless CSV, converted to a plain NumPy array
rxnGeneMat = pd.read_csv('../../../recon/rxnGeneMat.csv', header=None).to_numpy()

# reaction table, indexed by its first column; the index labels are also kept
# as a list so reactions can be addressed by position
df_reactions = pd.read_csv('../../../recon/reactions.tsv', sep='\t', index_col=0)
reactions = df_reactions.index.tolist()

In [None]:
# get gene fraction for every reaction; rows are collected in a list and the
# DataFrame is built once at the end instead of being grown row by row
score_columns = ['REACTION','GENE ID','GENE SYMBOL','FRACTION']
score_rows = []

# square bracket regex (raw string, so \[ and \] reach the regex engine
# instead of being read as invalid string escapes)
p = re.compile(r'\[.*?\]')

# iterate over genes (columns of the reaction-gene matrix)
for i in range(rxnGeneMat.shape[1]):

    # get associated reactions: row positions where this gene's column equals 1
    rxns = np.flatnonzero(rxnGeneMat[:,i] == 1).tolist()

    # a gene with no associated reaction contributes no rows; without this
    # guard the max / division below would fail on empty lists
    if not rxns:
        continue

    # get compartments of each reaction: the character right after each '['
    # in the formula, de-duplicated and sorted for a deterministic order
    comparts = []
    for j in rxns:
        formula = df_reactions.at[reactions[j],'FORMULA']
        codes = sorted({tag[1] for tag in p.findall(formula)})
        if not codes:
            raise ValueError('no compartment found in formula of reaction %s' % reactions[j])
        comparts.append(codes)

    # assign scores for each compartment this gene's reactions touch
    comparts_unique = []
    for codes in comparts:
        for code in codes:
            if code not in comparts_unique:
                comparts_unique.append(code)
    scores_unique = [compartment_score.at[gene_id[i],code] for code in comparts_unique]

    # if some values greater than 0, normalize the scores so they sum to 1
    if np.max(scores_unique) > 0:
        total = np.sum(scores_unique)
        scores_unique = [x/total for x in scores_unique]

    # if not, divide evenly among all compartments
    else:
        scores_unique = [1/len(scores_unique) for x in scores_unique]

    fraction_by_code = dict(zip(comparts_unique, scores_unique))

    # assign scores for each reaction: the largest fraction among the
    # compartments appearing in that reaction's formula
    for j, codes in zip(rxns, comparts):
        values = [fraction_by_code[code] for code in codes]
        score_rows.append([reactions[j], gene_id[i], gene_symbol[i], np.max(values)])

df_score = pd.DataFrame(score_rows, columns=score_columns)

# print to file
df_score.to_csv('gene_fraction.tsv', sep='\t', index=False)