# Analysis of enrichment

In [1]:
import glob

import json
import math

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from functools import reduce
from collections import OrderedDict, defaultdict
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.stats import fisher_exact as fisher
from scipy.stats import chi2_contingency as chisq

In [2]:
def ease(n_outliers_path, n_total_path, n_outliers, n_total):
    """
    Calculates a contingency table EASE score
    [x y]
    [z k]
    :param n_in_path: number of outliers in the pathway
    :param n_total_path: total number of genes in the pathway
    :param n_outliers: total number of outliers
    :param n_total: total number of genes analysed
    :return:
    """

    x = max(0, n_outliers_path - 1)   # in category, enriched
    y = n_total_path                  # total,       enriched
    z = n_outliers - n_outliers_path  # in category, not enriched
    k = n_total - n_total_path        # total,       not enriched
    
    #if x <= 10:
    _, pvalue = fisher(([[x, y], [z, k]]), alternative='greater')
    #else:
    #    _, pvalue, _, _ = chisq(([[x, y], [z, k]]))

    return pvalue

## Collecting all pathway names

In [3]:
pathway_tables = glob.glob("../pathways/*/gp.csv")
dfs = [pd.read_csv(table) for table in pathway_tables]
for i, df in enumerate(dfs):
    dfs[i] = df.set_index("SYMBOL")
    dfs[i].sort_index(inplace=True)
    #print(dfs[i].shape)
dfs[0]
all_entries = list(pd.concat(dfs, axis=1, sort=True).columns)

In [4]:
all_entries[0:10]

['125-dihydroxyvitamin_Dsub3sub_biosynthesis',
 '1D-imyoi-inositol_hexakisphosphate_biosynthesis_II_mammalian',
 '1D-imyoi-inositol_hexakisphosphate_biosynthesis_V_from_Ins134P3',
 '2-amino-3-carboxymuconate_semialdehyde_degradation_to_glutaryl-CoA',
 '2-deoxy-alpha-D-ribose_1-phosphate_degradation',
 '2-oxobutanoate_degradation',
 '2-oxoglutarate_decarboxylation_to_succinyl-CoA',
 '2-oxoisovalerate_decarboxylation_to_isobutanoyl-CoA',
 '3-phosphoinositide_biosynthesis',
 '3-phosphoinositide_degradation']

In [5]:
structures = pd.read_csv("../extracted/classification_pathways.csv", header=0, index_col="Pathway")
structures = pd.DataFrame(structures, dtype=bool)

In [6]:
del structures["DUPLICATE?"], structures["TRUTHFULNESS"], structures["Garbage"]
structures.head()

Unnamed: 0_level_0,Carbohydrates Metabolism,Aminoacids and Polyamines Metabolism,Lipids Metabolism,Nucleic Base Metabolism,Catabolism of Xenobiotics,Vitamin Metabolism,Translation and protein maturation,Signaling,Perception and Neurotransmission,Immunity Linked Pathways,...,Cell cycle regulation and apoptosis,Molecular Transport pathways,Invasive Disease,Cytoskeleton Organisation and Cell Adhesion Linked Pathways,Cancer and Tumorigenesis Linked Pathways,Chromatin Structure Linked Pathways,"RNA synthesis, processing and degradation","DNA replication, recombination and repair",Mitochondria Linked Pathways,Other
Pathway,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
125-dihydroxyvitamin_Dsub3sub_biosynthesis,False,True,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1D-imyoi-inositol_hexakisphosphate_biosynthesis_II_mammalian,False,False,False,False,False,False,False,False,True,False,...,False,False,False,False,False,False,False,False,False,False
1D-imyoi-inositol_hexakisphosphate_biosynthesis_V_from_Ins134P3,False,False,False,False,False,False,False,False,True,False,...,False,False,False,False,False,False,False,False,False,False
2-amino-3-carboxymuconate_semialdehyde_degradation_to_glutaryl-CoA,False,False,False,False,True,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2-deoxy-alpha-D-ribose_1-phosphate_degradation,False,False,False,False,False,True,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [7]:
all_2 = set(structures.index)
set(all_entries) - all_2

set()

In [8]:
pathway_types = dict()
for pathway in sorted(all_entries):
    x = structures.loc[pathway]
    pathway_types[pathway] = x[x].index[0]

In [9]:
reverse_counter = defaultdict(int)
for pathway in sorted(all_entries):
    category = pathway_types[pathway]
    reverse_counter[category] += 1
reverse_counter

defaultdict(int,
            {'Aminoacids and Polyamines Metabolism': 119,
             'Perception and Neurotransmission': 83,
             'Catabolism of Xenobiotics': 48,
             'Vitamin Metabolism': 43,
             'Lipids Metabolism': 147,
             'Nucleic Base Metabolism': 78,
             'Other': 104,
             'Signaling': 1178,
             'Cancer and Tumorigenesis Linked Pathways': 65,
             'Cell cycle regulation and apoptosis': 277,
             'DNA replication, recombination and repair': 104,
             'Molecular Transport pathways': 105,
             'Chromatin Structure Linked Pathways': 42,
             'Hormone Linked Pathways': 91,
             'Immunity Linked Pathways': 213,
             'Carbohydrates Metabolism': 87,
             'Cytoskeleton Organisation and Cell Adhesion Linked Pathways ': 122,
             'Translation and protein maturation': 59,
             'Invasive Disease': 37,
             'Mitochondria Linked Pathways': 36,


In [10]:
ALL_PATHS = sum(reverse_counter.values())
ALL_PATHS

3126

# By histone tag:

In [11]:
my_tags = ["H3K4me3", "H3K9ac", "H3K27ac", "H3K27me3", "H3K9me3"]

In [12]:
ENR_COUNTERS = dict()
for hg_tag in my_tags:
    files_up_human = glob.glob(f"../extracted/Human_{hg_tag}_pathways_up*")
    files_down_human = glob.glob(f"../extracted/Human_{hg_tag}_pathways_down*")
    files_up_mouse = glob.glob(f"../extracted/Mouse_{hg_tag}_pathways_up*")
    files_down_mouse = glob.glob(f"../extracted/Mouse_{hg_tag}_pathways_down*")

    files = {"Human+": files_up_human[0],
             "Human-": files_down_human[0],
             "Mouse+": files_up_mouse[0],
             "Mouse-": files_down_mouse[0]}

    enriched_counter = defaultdict(lambda: defaultdict(int))
    for xtype in files:
        with open(files[xtype], "r") as file:
            en_pathways = file.read().strip().split("\n")
        for pw in en_pathways:
            cat = pathway_types[pw]
            enriched_counter[xtype][cat] += 1
    enriched_counter = pd.DataFrame(enriched_counter).T.fillna(0)
    enriched_counter = pd.DataFrame(enriched_counter, dtype=int)
    ENR_COUNTERS[hg_tag] = enriched_counter
ENR_COUNTERS[my_tags[0]]

Unnamed: 0,Aminoacids and Polyamines Metabolism,Cancer and Tumorigenesis Linked Pathways,Carbohydrates Metabolism,Catabolism of Xenobiotics,Cell cycle regulation and apoptosis,Chromatin Structure Linked Pathways,Cytoskeleton Organisation and Cell Adhesion Linked Pathways,"DNA replication, recombination and repair",Hormone Linked Pathways,Immunity Linked Pathways,...,Mitochondria Linked Pathways,Molecular Transport pathways,Nucleic Base Metabolism,Other,Perception and Neurotransmission,"RNA synthesis, processing and degradation",Signaling,Sulfur Metabolism and Linked Redox Reactions,Translation and protein maturation,Vitamin Metabolism
Human+,16,1,5,6,6,0,2,0,4,12,...,0,8,5,2,10,0,9,3,0,4
Human-,1,0,3,0,4,1,1,4,0,1,...,0,0,5,8,0,5,5,0,7,0
Mouse+,3,1,1,5,6,0,1,1,0,2,...,3,0,2,1,1,0,8,2,1,1
Mouse-,5,0,3,2,11,1,8,6,2,5,...,1,10,5,4,0,4,31,1,5,2


Calculates a contingency table EASE score  
[x y]  
[z k]  
:param n_in_path: number of outliers in the pathway  
:param n_total_path: total number of genes in the pathway  
:param n_outliers: total number of outliers  
:param n_total: total number of genes analysed  
:return:  

In [13]:
ksi = defaultdict(dict)
signs = {"+": "positively\u00A0enriched\u00A0(+)",
        "-": "negatively\u00A0enriched\u00A0(-)"}
for hg_tag in my_tags:
    enriched_counter = ENR_COUNTERS[hg_tag]
    for sign in ["+", "-"]:
        for org in ["Human", "Mouse"]:
            for category in enriched_counter:
                n1 = enriched_counter[category][f"{org}{sign}"]
                n2 = sum(enriched_counter.loc[f"{org}{sign}"])
                n3 = reverse_counter[category]
                n4 = ALL_PATHS
                #print(n1, n2, n3, n4)
                ksi[category][f"{org},\u00A0{hg_tag},\u00A0{signs[sign]}"] = ease(n1, n2, n3, n4)
pd.DataFrame(ksi).to_csv(f"../extracted/pvalues.csv")
pd.DataFrame(ksi)

Unnamed: 0,Aminoacids and Polyamines Metabolism,Cancer and Tumorigenesis Linked Pathways,Carbohydrates Metabolism,Catabolism of Xenobiotics,Cell cycle regulation and apoptosis,Chromatin Structure Linked Pathways,Cytoskeleton Organisation and Cell Adhesion Linked Pathways,"DNA replication, recombination and repair",Hormone Linked Pathways,Immunity Linked Pathways,...,Mitochondria Linked Pathways,Molecular Transport pathways,Nucleic Base Metabolism,Other,Perception and Neurotransmission,"RNA synthesis, processing and degradation",Signaling,Sulfur Metabolism and Linked Redox Reactions,Translation and protein maturation,Vitamin Metabolism
"Human, H3K27ac, negatively enriched (-)",1.0,1.0,1.0,1.0,0.089173,0.539746,1.0,0.5767476,1.0,0.562063,...,1.0,1.0,0.060329,0.001011,1.0,0.319614,0.984301,1.0,0.098382,1.0
"Human, H3K27ac, positively enriched (+)",2e-06,1.0,0.296924,0.017659,0.998263,1.0,0.741372,1.0,0.325504,0.002116,...,1.0,0.844481,0.012756,0.84066,0.737934,1.0,1.0,0.190137,1.0,0.389585
"Human, H3K27me3, negatively enriched (-)",0.403831,1.0,1.0,1.0,0.966991,1.0,1.0,1.0,0.102526,0.587725,...,0.496289,0.018326,1.0,1.0,0.474399,1.0,0.512876,1.0,1.0,1.0
"Human, H3K27me3, positively enriched (+)",0.71353,1.0,0.751213,0.434494,0.854441,1.0,0.977169,5.714523e-08,1.0,0.893701,...,1.0,0.961508,0.035842,0.625892,1.0,0.853926,0.999087,0.18436,0.110268,0.14518
"Human, H3K4me3, negatively enriched (-)",1.0,1.0,0.417812,1.0,0.826744,1.0,1.0,0.246793,1.0,1.0,...,1.0,1.0,0.041776,0.002161,1.0,0.020017,0.999965,1.0,0.000497,1.0
"Human, H3K4me3, positively enriched (+)",2e-05,1.0,0.333787,0.022003,0.952062,1.0,0.983318,1.0,0.58887,0.115611,...,1.0,0.067062,0.265831,0.969633,0.002182,1.0,1.0,0.207623,1.0,0.170779
"Human, H3K9ac, negatively enriched (-)",1.0,1.0,0.806742,1.0,0.764756,1.0,0.899741,0.5858523,1.0,0.981192,...,1.0,0.862225,0.017789,0.004542,1.0,0.114166,0.996597,1.0,0.000155,1.0
"Human, H3K9ac, positively enriched (+)",2e-06,1.0,0.512439,0.062502,0.971654,1.0,0.978046,1.0,0.777781,0.004593,...,1.0,0.236147,0.228788,0.836542,0.016672,1.0,1.0,0.187245,1.0,0.738754
"Human, H3K9me3, negatively enriched (-)",0.90533,1.0,1.0,1.0,0.203775,1.0,0.910736,1.0,0.835676,0.002068,...,0.507601,0.87535,0.204401,1.0,1.0,1.0,0.995954,1.0,1.0,0.572246
"Human, H3K9me3, positively enriched (+)",0.154421,1.0,0.735655,1.0,0.83431,0.354619,0.167189,0.6049911,1.0,0.952397,...,1.0,0.823176,0.414715,0.604991,1.0,0.116303,0.93869,0.175752,1.0,0.365553


In [14]:
TAU = pd.DataFrame(ksi)

In [15]:
def get_highlighter_min(color, point):
    def highlight_min(s):
        '''
        highlight the minimums in a Series.
        '''
        is_max = s <= point
        return [f'background-color: {color}' if v else '' for v in is_max]
    return highlight_min

In [16]:
data_round = np.round(TAU, 3)
cm = sns.light_palette("green", as_cmap=True, reverse=True)

s = data_round.style.apply(get_highlighter_min("green", 0.05), subset=([i for i in TAU.index if "+" in i], TAU.columns))
cm = sns.light_palette("red", as_cmap=True, reverse=True)

s.apply(get_highlighter_min("red", 0.05), subset=([i for i in TAU.index if "-" in i], TAU.columns))
s

Unnamed: 0,Aminoacids and Polyamines Metabolism,Cancer and Tumorigenesis Linked Pathways,Carbohydrates Metabolism,Catabolism of Xenobiotics,Cell cycle regulation and apoptosis,Chromatin Structure Linked Pathways,Cytoskeleton Organisation and Cell Adhesion Linked Pathways,"DNA replication, recombination and repair",Hormone Linked Pathways,Immunity Linked Pathways,Invasive Disease,Lipids Metabolism,Mitochondria Linked Pathways,Molecular Transport pathways,Nucleic Base Metabolism,Other,Perception and Neurotransmission,"RNA synthesis, processing and degradation",Signaling,Sulfur Metabolism and Linked Redox Reactions,Translation and protein maturation,Vitamin Metabolism
"Human, H3K27ac, negatively enriched (-)",1.0,1.0,1.0,1.0,0.089,0.54,1.0,0.577,1.0,0.562,1.0,0.934,1.0,1.0,0.06,0.001,1.0,0.32,0.984,1.0,0.098,1.0
"Human, H3K27ac, positively enriched (+)",0.0,1.0,0.297,0.018,0.998,1.0,0.741,1.0,0.326,0.002,1.0,0.009,1.0,0.844,0.013,0.841,0.738,1.0,1.0,0.19,1.0,0.39
"Human, H3K27me3, negatively enriched (-)",0.404,1.0,1.0,1.0,0.967,1.0,1.0,1.0,0.103,0.588,1.0,0.539,0.496,0.018,1.0,1.0,0.474,1.0,0.513,1.0,1.0,1.0
"Human, H3K27me3, positively enriched (+)",0.714,1.0,0.751,0.434,0.854,1.0,0.977,0.0,1.0,0.894,0.312,0.669,1.0,0.962,0.036,0.626,1.0,0.854,0.999,0.184,0.11,0.145
"Human, H3K4me3, negatively enriched (-)",1.0,1.0,0.418,1.0,0.827,1.0,1.0,0.247,1.0,1.0,1.0,0.102,1.0,1.0,0.042,0.002,1.0,0.02,1.0,1.0,0.0,1.0
"Human, H3K4me3, positively enriched (+)",0.0,1.0,0.334,0.022,0.952,1.0,0.983,1.0,0.589,0.116,1.0,0.013,1.0,0.067,0.266,0.97,0.002,1.0,1.0,0.208,1.0,0.171
"Human, H3K9ac, negatively enriched (-)",1.0,1.0,0.807,1.0,0.765,1.0,0.9,0.586,1.0,0.981,1.0,0.765,1.0,0.862,0.018,0.005,1.0,0.114,0.997,1.0,0.0,1.0
"Human, H3K9ac, positively enriched (+)",0.0,1.0,0.512,0.063,0.972,1.0,0.978,1.0,0.778,0.005,1.0,0.047,1.0,0.236,0.229,0.837,0.017,1.0,1.0,0.187,1.0,0.739
"Human, H3K9me3, negatively enriched (-)",0.905,1.0,1.0,1.0,0.204,1.0,0.911,1.0,0.836,0.002,1.0,0.34,0.508,0.875,0.204,1.0,1.0,1.0,0.996,1.0,1.0,0.572
"Human, H3K9me3, positively enriched (+)",0.154,1.0,0.736,1.0,0.834,0.355,0.167,0.605,1.0,0.952,1.0,0.455,1.0,0.823,0.415,0.605,1.0,0.116,0.939,0.176,1.0,0.366
