# Pathways workflow
## Calculating PII and NPII values

In [1]:
import os
import os.path
import glob
import pathlib

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from collections import defaultdict
from xlrd import open_workbook

In [2]:
#my_tag = "H3K4me3"
#my_tag = "H3K27me3"
my_tags = ["H3K4me3", "H3K9ac", "H3K27ac", "H3K27me3", "H3K9me3"]

In [3]:
pathway_tables = glob.glob("../pathways/*/gp.csv")

In [4]:
pathway_tables

['../pathways/Metabolism 1.1/gp.csv',
 '../pathways/Biocarta 1.0.1/gp.csv',
 '../pathways/KEGG Adjusted 1.2/gp.csv',
 '../pathways/Cytoskeleton 1.0/gp.csv',
 '../pathways/Reactome 1.1.1/gp.csv',
 '../pathways/Primary 1.2/gp.csv',
 '../pathways/NCI 1.0/gp.csv']

In [5]:
dfs = [pd.read_csv(table) for table in pathway_tables]
for i, df in enumerate(dfs):
    dfs[i] = df.set_index("SYMBOL")
    dfs[i].sort_index(inplace=True)
    #print(dfs[i].shape)
dfs[0]
united_df = pd.concat(dfs, axis=1, sort=True)  # sorting added
united_df.shape
len(set(united_df.index))

9134

In [6]:
## REPLACING NA WITH ZEROS (0)
united_df.fillna(value=0, inplace=True)
print(united_df.shape)
united_df.head()

(9134, 3126)


Unnamed: 0,125-dihydroxyvitamin_Dsub3sub_biosynthesis,1D-imyoi-inositol_hexakisphosphate_biosynthesis_II_mammalian,1D-imyoi-inositol_hexakisphosphate_biosynthesis_V_from_Ins134P3,2-amino-3-carboxymuconate_semialdehyde_degradation_to_glutaryl-CoA,2-deoxy-alpha-D-ribose_1-phosphate_degradation,2-oxobutanoate_degradation,2-oxoglutarate_decarboxylation_to_succinyl-CoA,2-oxoisovalerate_decarboxylation_to_isobutanoyl-CoA,3-phosphoinositide_biosynthesis,3-phosphoinositide_degradation,...,NCI_p53_Main_Pathway,NCI_p53_Pathway_(apoptosis),NCI_p53_Pathway_(proteasomal_ubiquitin_dependent_protein_catabolic_process),NCI_p73_transcription_factor_network_Main_Pathway,NCI_p73_transcription_factor_network_Pathway_(apoptosis),NCI_p75_NTR_mediated_signaling_Main_Pathway,NCI_p75_NTR_mediated_signaling_Pathway_(activation_of_caspase_activity),NCI_p75_NTR_mediated_signaling_Pathway_(cell_cycle_arrest),NCI_p75_NTR_mediated_signaling_Pathway_(neuron_apoptosis),NCI_p75_NTR_mediated_signaling_Pathway_(neuron_projection_morphogenesis)
A2M,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A4GALT,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A4GNT,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
AAAS,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
AACS,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [7]:
genes = list(united_df.index)

In [8]:
all_pathways = list(united_df.columns)

In [9]:
tables_human = dict()
tables_mouse = dict()
for hg_tag in my_tags:
    gre_human_tables = glob.glob("../human/gre/GRE*.xlsx")
    #gre_human_tables = [f"file://localhost{os.path.abspath(path)}" for path in glob.glob("./data/human/GRE*.xlsx")]
    gre_mouse_tables = glob.glob(f"../mouse/gre/*_GRE*{hg_tag}.txt")

    gre_human_dfs = [pd.read_excel(open_workbook(table), sheet_name=hg_tag, index_col="Gene_name", engine="xlrd") for table in gre_human_tables]

    gre_mouse_dfs = [pd.read_csv(table, index_col="gene", sep="\t") for table in gre_mouse_tables]

    # pair (GRE, GTE)
    tables_human[hg_tag] = [(df["gene_signal"], df["total_signal"]) for df in gre_human_dfs]
    tables_mouse[hg_tag] = [(df["gene_signal"], df["total_signal"]) for df in gre_mouse_dfs]

## HUMAN

In [10]:
FDATA_HUMAN = defaultdict(dict)
for i, hg_tag in enumerate(my_tags):
    total_gre_human = None
    total_gte_human = None
    for pair in tables_human[hg_tag]:
        total_gre_human = pair[1] if total_gre_human is None else total_gre_human + pair[1]
        total_gte_human = pair[0] if total_gte_human is None else total_gte_human + pair[0]

    GENES_HUMAN = sorted(set(genes) & set(total_gre_human.index) & set(total_gte_human.index))
    #print(len(genes), len(GENES_HUMAN))
    missing_genes = sorted(set(genes) - (set(genes) & set(total_gre_human.index) & set(total_gte_human.index)))
    #print(len(missing_genes) + len(GENES_HUMAN))

    #print("non-cut\t", len(total_gre_human), len(total_gte_human))
    total_gre_human = total_gre_human.reindex(GENES_HUMAN, copy=False)
    total_gte_human = total_gte_human.reindex(GENES_HUMAN, copy=False)
    #print("cut\t", len(total_gre_human), len(total_gte_human))

    united_df_human = united_df.reindex(GENES_HUMAN)
    print("matrix shape:", united_df_human.shape)

    PII_HUMAN = pd.Series()
    NPII_HUMAN = pd.Series()
    for column_id in united_df_human:
        column = united_df_human[column_id]
        n = column.fillna(0).sum()
        gre_score_n = sum(column.multiply(total_gre_human))
        gte_score_n = sum(column.multiply(total_gte_human))
        pii_score = gre_score_n / n
        try:
            npii_score = gre_score_n / gte_score_n
        except ZeroDivisionError:
            npii_score = 0.0
        PII_HUMAN = PII_HUMAN.append(pd.Series({column_id: pii_score}), verify_integrity=True)
        NPII_HUMAN = NPII_HUMAN.append(pd.Series({column_id: npii_score}), verify_integrity=True)
    FDATA_HUMAN["PII"][hg_tag] = PII_HUMAN
    FDATA_HUMAN["NPII"][hg_tag] = NPII_HUMAN

matrix shape: (8676, 3126)




matrix shape: (8676, 3126)
matrix shape: (8676, 3126)
matrix shape: (8676, 3126)
matrix shape: (8676, 3126)


## MOUSE

In [11]:
FDATA_MOUSE = defaultdict(dict)
for i, hg_tag in enumerate(my_tags):
    total_gre_mouse = None
    total_gte_mouse = None
    for pair in tables_mouse[hg_tag]:
        total_gre_mouse = pair[1] if total_gre_mouse is None else total_gre_mouse + pair[1]
        total_gte_mouse = pair[0] if total_gte_mouse is None else total_gte_mouse + pair[0]

    GENES_MOUSE = sorted(set(genes) & set(total_gre_mouse.index) & set(total_gte_mouse.index))
    #print(len(genes), len(GENES_MOUSE))
    missing_genes = sorted(set(genes) - (set(genes) & set(total_gre_mouse.index) & set(total_gte_mouse.index)))
    #print(len(missing_genes) + len(GENES_MOUSE))

    #print("non-cut\t", len(total_gre_mouse), len(total_gte_mouse))
    total_gre_mouse = total_gre_mouse.reindex(GENES_MOUSE, copy=False)
    total_gte_mouse = total_gte_mouse.reindex(GENES_MOUSE, copy=False)
    #print("cut\t", len(total_gre_mouse), len(total_gte_mouse))

    united_df_mouse = united_df.reindex(GENES_MOUSE)
    print("matrix shape:", united_df_mouse.shape)

    PII_MOUSE = pd.Series()
    NPII_MOUSE = pd.Series()
    for column_id in united_df_mouse:
        column = united_df_mouse[column_id]
        n = column.fillna(0).sum()
        gre_score_n = sum(column.multiply(total_gre_mouse))
        gte_score_n = sum(column.multiply(total_gte_mouse))
        pii_score = gre_score_n / n
        try:
            npii_score = gre_score_n / gte_score_n
        except ZeroDivisionError:
            npii_score = 0.0
        PII_MOUSE = PII_MOUSE.append(pd.Series({column_id: pii_score}), verify_integrity=True)
        NPII_MOUSE = NPII_MOUSE.append(pd.Series({column_id: npii_score}), verify_integrity=True)
    FDATA_MOUSE["PII"][hg_tag] = PII_MOUSE
    FDATA_MOUSE["NPII"][hg_tag] = NPII_MOUSE

matrix shape: (8121, 3126)




matrix shape: (8121, 3126)
matrix shape: (8121, 3126)
matrix shape: (8121, 3126)
matrix shape: (8121, 3126)


In [12]:
OUT_DICT = dict()
for i, hg_tag in enumerate(my_tags):
    OUT_DICT[f"PII_HUMAN_{hg_tag}"] = FDATA_HUMAN["PII"][hg_tag]
    OUT_DICT[f"NPII_HUMAN_{hg_tag}"] = FDATA_HUMAN["NPII"][hg_tag]
    OUT_DICT[f"PII_MOUSE_{hg_tag}"] = FDATA_MOUSE["PII"][hg_tag]
    OUT_DICT[f"NPII_MOUSE_{hg_tag}"] = FDATA_MOUSE["NPII"][hg_tag]

In [13]:
OUT_DF = pd.DataFrame(OUT_DICT)

In [14]:
OUT_DF.head()

Unnamed: 0,PII_HUMAN_H3K4me3,NPII_HUMAN_H3K4me3,PII_MOUSE_H3K4me3,NPII_MOUSE_H3K4me3,PII_HUMAN_H3K9ac,NPII_HUMAN_H3K9ac,PII_MOUSE_H3K9ac,NPII_MOUSE_H3K9ac,PII_HUMAN_H3K27ac,NPII_HUMAN_H3K27ac,PII_MOUSE_H3K27ac,NPII_MOUSE_H3K27ac,PII_HUMAN_H3K27me3,NPII_HUMAN_H3K27me3,PII_MOUSE_H3K27me3,NPII_MOUSE_H3K27me3,PII_HUMAN_H3K9me3,NPII_HUMAN_H3K9me3,PII_MOUSE_H3K9me3,NPII_MOUSE_H3K9me3
125-dihydroxyvitamin_Dsub3sub_biosynthesis,2.24836,0.37589,18.982333,1.600691,2.391201,0.636846,15.014,1.06075,2.607717,0.730111,10.488333,1.162761,3.312624,0.717225,16.368333,13.641415,3.757602,0.881069,11.621333,8.959243
1D-imyoi-inositol_hexakisphosphate_biosynthesis_II_mammalian,5.605292,0.799023,36.733933,8.579047,6.174049,0.861989,32.454067,5.485992,7.398026,1.086097,23.632267,5.620164,3.561952,0.831201,20.007467,12.63853,3.625598,0.869489,15.303,11.036719
1D-imyoi-inositol_hexakisphosphate_biosynthesis_V_from_Ins134P3,1.042087,1.500849,82.579,13.647767,1.810458,1.605907,65.185,7.035922,2.612022,0.984017,33.130333,4.765789,1.454336,0.50359,24.618,21.278668,2.147989,0.558334,19.076,15.254292
2-amino-3-carboxymuconate_semialdehyde_degradation_to_glutaryl-CoA,6.896377,1.313218,38.3785,30.351932,8.087387,1.375695,26.035,22.939337,9.11816,2.102414,14.8995,13.832335,7.61266,1.958742,28.476,20.090306,8.971827,1.58674,21.058,8.468932
2-deoxy-alpha-D-ribose_1-phosphate_degradation,8.35736,1.102882,24.0656,2.44436,8.118468,1.206915,23.0238,1.328723,11.054988,1.552252,12.9412,0.826807,6.275483,1.351197,13.6146,9.23675,5.20134,1.194269,9.229,8.535097


In [15]:
OUT_DF.dropna().shape
OUT_DF.dropna().to_csv(f"../extracted/pii-vs-npii.all.csv")