# Looking at the Dataset
The purpose of this notebook is to look closer at the dataset of genes, natural language descriptions, and ontology term annotations that are used in this work. As included in the preprocessing notebooks, these data are drawn from files from either publications supplements like Oellrich, Walls et al. (2015) or model species databases such as TAIR, MaizeGDB, and SGN. The datasets are already loaded and merged using classes available through the oats package.

In [1]:
import datetime
import nltk
import pandas as pd
import numpy as np
import time
import math
import sys
import gensim
import os
import warnings
from collections import defaultdict
from nltk.corpus import brown
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.probability import FreqDist
from gensim.parsing.preprocessing import strip_non_alphanum, stem_text, preprocess_string, remove_stopwords
from gensim.utils import simple_preprocess
from nltk.stem import WordNetLemmatizer

sys.path.append("../../oats")
from oats.utils.utils import save_to_pickle, load_from_pickle, merge_list_dicts, flatten, to_hms
from oats.biology.dataset import Dataset
from oats.biology.groupings import Groupings
from oats.biology.relationships import ProteinInteractions, AnyInteractions
from oats.annotation.ontology import Ontology
from oats.annotation.annotation import term_enrichment
from oats.nlp.vocabulary import token_enrichment

warnings.simplefilter('ignore')
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
nltk.download('punkt', quiet=True)
nltk.download('brown', quiet=True)
nltk.download('averaged_perceptron_tagger', quiet=True)

Warming up PyWSD (takes ~10 secs)... took 5.330681085586548 secs.


True

In [2]:
data = load_from_pickle("../data/pickles/gene_phenotype_dataset_all_text_and_annotations_unmerged.pickle")
data.to_pandas().head()
data.describe()

Unnamed: 0,species,num_genes,unique_descriptions
0,ath,628658,9110
1,gmx,156,49
2,mtr,342,155
3,osa,772,389
4,sly,786,314
5,zma,6526,998
6,total,637240,11015


In [3]:
data = load_from_pickle("../data/pickles/gene_phenotype_dataset_all_text_and_annotations.pickle")
data.filter_has_description()
data.to_pandas().head()
data.describe()

Unnamed: 0,species,num_genes,unique_descriptions
0,ath,5851,3527
1,gmx,30,24
2,mtr,37,36
3,osa,92,85
4,sly,70,70
5,zma,1405,811
6,total,7485,4553


### What's there for each species?
The previously loaded dataset contains all of the genes that across six plant species that have natural language description data for phenotype(s) related to that gene. Each gene can have multiple descriptions annotated to it, which were combined or concatenated when the datasets from multiple sources were merged in creating the pickled datasets. Arabidopsis has the highest number of genes that satisfy this criteria, followed by maize, and then followed by the other four species which have a relatively low number of genes that satisfy this criteria, atleast given the sources used for this work. Note that the number of unique descriptions is lower than the number of genes in call cases, because multiple genes can have the same phenotype description associated with them.

In [4]:
wnl = WordNetLemmatizer()
lemmatize_doc = lambda d: [wnl.lemmatize(x) for x in simple_preprocess(d)]

dists = defaultdict(list)

sent_lists = {}
token_lists = {}
stems_lists = {}
lemma_lists = {}


# For each individual species.
for species in data.get_species():
    df = data.to_pandas()
    subset = df[df["species"]==species]
    sentences = [sent_tokenize(d) for d in subset["description"].values]
    descriptions_not_stemmed = [simple_preprocess(d) for d in subset["description"].values]
    descriptions_stemmed = [preprocess_string(d) for d in subset["description"].values]
    descriptions_lemmatized = [lemmatize_doc(d) for d in subset["description"].values]
    sent_lists[species] = flatten(sentences)
    token_lists[species] = flatten(descriptions_not_stemmed)
    stems_lists[species] = flatten(descriptions_stemmed)    
    lemma_lists[species] = flatten(descriptions_lemmatized)
    
    # What about the distributions of words per gene and sentences per gene?
    dists["species"].extend([species]*subset.shape[0])
    dists["num_words"].extend([len(word_tokenize(x)) for x in subset["description"].values])
    dists["num_sents"].extend([len(sent_tokenize(x)) for x in subset["description"].values])
    
# For the entire dataset including all of the species.
df = data.to_pandas()
subset = df
sentences = [sent_tokenize(d) for d in subset["description"].values]
descriptions_not_stemmed = [simple_preprocess(d) for d in subset["description"].values]
descriptions_stemmed = [preprocess_string(d) for d in subset["description"].values]
descriptions_lemmatized = [lemmatize_doc(d) for d in subset["description"].values]
sent_lists["total"] = flatten(sentences)
token_lists["total"] = flatten(descriptions_not_stemmed)
stems_lists["total"] = flatten(descriptions_stemmed)    
lemma_lists["total"] = flatten(descriptions_lemmatized)

# What about lemmas that are uniquely used for a particular species?
lemma_sets_unique_to_species = {}
for species in data.get_species():
    other_species = [s for s in data.get_species() if s != species]
    lemmas_used_in_other_species = set(flatten([lemma_lists[s] for s in other_species]))
    unique_lemmas = set(lemma_lists[species]).difference(lemmas_used_in_other_species)
    lemma_sets_unique_to_species[species] = unique_lemmas
lemma_sets_unique_to_species["total"] = flatten([list(s) for s in lemma_sets_unique_to_species.values()])

    
# Create a dataframe to contain the summarizing information about this dataset, and sort it by number of genes.
df = data.describe() 
condition = (df.species=="total")
excluded = df[condition]
included = df[~condition]
df_sorted = included.sort_values(by="num_genes", ascending=False)
df = pd.concat([df_sorted,excluded])

# Add columns summarizing information about the text descriptions in the dataset.
df["total_sents"] = df["species"].map(lambda x: len(sent_lists[x]))
df["total_words"] = df["species"].map(lambda x: len(token_lists[x]))
df["unique_words"] = df["species"].map(lambda x: len(set(token_lists[x])))
df["unique_stems"] = df["species"].map(lambda x: len(set(stems_lists[x])))
df["total_lemmas"] = df["species"].map(lambda x: len(lemma_lists[x]))
df["unique_lemmas"] = df["species"].map(lambda x: len(set(lemma_lists[x])))
df["unique_lemmas_to_species"] = df["species"].map(lambda x: len(lemma_sets_unique_to_species[x]))
df

Unnamed: 0,species,num_genes,unique_descriptions,total_sents,total_words,unique_words,unique_stems,total_lemmas,unique_lemmas,unique_lemmas_to_species
0,ath,5851,3527,32401,264099,7085,5116,264099,6561,4864
5,zma,1405,811,8069,50029,1846,1317,50029,1722,503
3,osa,92,85,533,3887,826,586,3887,760,99
4,sly,70,70,399,1810,577,438,1810,552,99
2,mtr,37,36,281,2672,718,516,2672,671,126
1,gmx,30,24,66,233,81,68,233,78,12
6,total,7485,4553,41749,322730,8043,5802,322730,7443,5703


In [5]:
text_distributions = pd.DataFrame(dists)
text_distributions.to_csv("../data/scratch/word_sent_distributions.csv", index=False)
text_distributions.head(20)

Unnamed: 0,species,num_words,num_sents
0,ath,8,2
1,ath,17,4
2,ath,16,2
3,ath,30,3
4,ath,157,11
5,ath,22,4
6,ath,182,33
7,ath,104,7
8,ath,127,13
9,ath,343,30


### What about the ontology term annotations for each species?

In [6]:
# How many of the genes in this dataset for each species are mapped to atleast one term from a given ontology?
num_mapped_go = {}
num_mapped_po = {}
for species in data.get_species():
    d = data.to_pandas()
    subset = d[d["species"]==species]    
    num_mapped_po[species] = len([t for t in subset["term_ids"].values if "PO" in t])
    num_mapped_go[species] = len([t for t in subset["term_ids"].values if "GO" in t])
num_mapped_go["total"] = sum(list(num_mapped_go.values()))   
num_mapped_po["total"] = sum(list(num_mapped_po.values()))
df["go"] = df["species"].map(lambda x: num_mapped_go[x])
df["po"] = df["species"].map(lambda x: num_mapped_po[x])
df

Unnamed: 0,species,num_genes,unique_descriptions,total_sents,total_words,unique_words,unique_stems,total_lemmas,unique_lemmas,unique_lemmas_to_species,go,po
0,ath,5851,3527,32401,264099,7085,5116,264099,6561,4864,5387,4003
5,zma,1405,811,8069,50029,1846,1317,50029,1722,503,184,111
3,osa,92,85,533,3887,826,586,3887,760,99,46,92
4,sly,70,70,399,1810,577,438,1810,552,99,23,65
2,mtr,37,36,281,2672,718,516,2672,671,126,30,32
1,gmx,30,24,66,233,81,68,233,78,12,28,27
6,total,7485,4553,41749,322730,8043,5802,322730,7443,5703,5698,4330


### What about the biologically relevant groups like biochemical pathways and phenotypes?

In [7]:
# What are the groupings that we're interested in mapping to?
kegg_pathways_filename = "../data/pickles/groupings_from_kegg_pathways.pickle" 
pmn_pathways_filename = "../data/pickles/groupings_from_pmn_pathways.pickle"                        
lloyd_subsets_filename = "../data/pickles/groupings_from_lloyd_subsets.pickle"                     
groupings_dict = {"kegg":kegg_pathways_filename,"plantcyc":pmn_pathways_filename,"lloyd":lloyd_subsets_filename}

for name,filename in groupings_dict.items():
    groups = load_from_pickle(filename)
    id_to_group_ids, group_id_to_ids = groups.get_groupings_for_dataset(data)
    group_mapped_ids = [k for (k,v) in id_to_group_ids.items() if len(v)>0]
    species_dict = data.get_species_dictionary()
    num_mapped = {}
    for species in data.get_species():
        num_mapped[species] = len([x for x in group_mapped_ids if species_dict[x]==species])
    num_mapped["total"] = sum(list(num_mapped.values()))    
    df[name] = df["species"].map(lambda x: num_mapped[x])  
df

Unnamed: 0,species,num_genes,unique_descriptions,total_sents,total_words,unique_words,unique_stems,total_lemmas,unique_lemmas,unique_lemmas_to_species,go,po,kegg,plantcyc,lloyd
0,ath,5851,3527,32401,264099,7085,5116,264099,6561,4864,5387,4003,1345,807,2355
5,zma,1405,811,8069,50029,1846,1317,50029,1722,503,184,111,156,133,0
3,osa,92,85,533,3887,826,586,3887,760,99,46,92,0,3,0
4,sly,70,70,399,1810,577,438,1810,552,99,23,65,17,2,0
2,mtr,37,36,281,2672,718,516,2672,671,126,30,32,0,2,0
1,gmx,30,24,66,233,81,68,233,78,12,28,27,1,0,0
6,total,7485,4553,41749,322730,8043,5802,322730,7443,5703,5698,4330,1519,947,2355


### What about the other biologically relevant information like orthologous genes and protein interactions?

In [8]:
# PantherDB for plant orthologs.
ortholog_file_path = "../data/orthology_related_files/pantherdb/PlantGenomeOrthologs_IRB_Modified.txt"
ortholog_edgelist = AnyInteractions(data.get_name_to_id_dictionary(), ortholog_file_path)
species_dict = data.get_species_dictionary()
num_mapped = {}
for species in data.get_species():
    num_mapped[species] = len([x for x in ortholog_edgelist.ids if species_dict[x]==species])
num_mapped["total"] = sum(list(num_mapped.values()))
df["panther"] = df["species"].map(lambda x: num_mapped[x])    
df

Unnamed: 0,species,num_genes,unique_descriptions,total_sents,total_words,unique_words,unique_stems,total_lemmas,unique_lemmas,unique_lemmas_to_species,go,po,kegg,plantcyc,lloyd,panther
0,ath,5851,3527,32401,264099,7085,5116,264099,6561,4864,5387,4003,1345,807,2355,344
5,zma,1405,811,8069,50029,1846,1317,50029,1722,503,184,111,156,133,0,443
3,osa,92,85,533,3887,826,586,3887,760,99,46,92,0,3,0,86
4,sly,70,70,399,1810,577,438,1810,552,99,23,65,17,2,0,11
2,mtr,37,36,281,2672,718,516,2672,671,126,30,32,0,2,0,0
1,gmx,30,24,66,233,81,68,233,78,12,28,27,1,0,0,0
6,total,7485,4553,41749,322730,8043,5802,322730,7443,5703,5698,4330,1519,947,2355,884


In [9]:
# STRING DB for protein-protein interactions.
naming_file = "../data/group_related_files/string/all_organisms.name_2_string.tsv"
interaction_files = [
    "../data/group_related_files/string/3702.protein.links.detailed.v11.0.txt", # Arabidopsis thaliana
    "../data/group_related_files/string/4577.protein.links.detailed.v11.0.txt", # maize
    "../data/group_related_files/string/4530.protein.links.detailed.v11.0.txt", # tomato 
    "../data/group_related_files/string/4081.protein.links.detailed.v11.0.txt", # medicago
    "../data/group_related_files/string/3880.protein.links.detailed.v11.0.txt", # rice 
    "../data/group_related_files/string/3847.protein.links.detailed.v11.0.txt", # soybean
]
genes = data.get_gene_dictionary()
string_data = ProteinInteractions(genes, naming_file, *interaction_files)
species_dict = data.get_species_dictionary()
num_mapped = {}
for species in data.get_species():
    num_mapped[species] = len([x for x in string_data.ids if species_dict[x]==species])
num_mapped["total"] = sum(list(num_mapped.values()))
df["stringdb"] = df["species"].map(lambda x: num_mapped[x])    
df

Unnamed: 0,species,num_genes,unique_descriptions,total_sents,total_words,unique_words,unique_stems,total_lemmas,unique_lemmas,unique_lemmas_to_species,go,po,kegg,plantcyc,lloyd,panther,stringdb
0,ath,5851,3527,32401,264099,7085,5116,264099,6561,4864,5387,4003,1345,807,2355,344,3898
5,zma,1405,811,8069,50029,1846,1317,50029,1722,503,184,111,156,133,0,443,168
3,osa,92,85,533,3887,826,586,3887,760,99,46,92,0,3,0,86,45
4,sly,70,70,399,1810,577,438,1810,552,99,23,65,17,2,0,11,8
2,mtr,37,36,281,2672,718,516,2672,671,126,30,32,0,2,0,0,13
1,gmx,30,24,66,233,81,68,233,78,12,28,27,1,0,0,0,2
6,total,7485,4553,41749,322730,8043,5802,322730,7443,5703,5698,4330,1519,947,2355,884,4134


### How do the vocabularies used for different species compare?
One of the things we are interested in is discovering or recovering phenotype similarity between different species in order to identify phenologs (phenotypes between species that share some underlying genetic cause). For this reason, we are interested in how the vocabularies used to describe phenotypes between different species vary, because this will impact how feasible it is to use a dataset like this to identify phenologs. Because the Arabidopsis and maize datasets are the largest in this case, we will compare the vocabularies used in describing the phenotypes associated with the genes from these species in this dataset.

In [10]:
# Using lemmas as the vocabulary components.
vocabs = {s:set(lemma_list) for s,lemma_list in lemma_lists.items()}
fdist_zma = FreqDist(lemma_lists["zma"])
fdist_ath = FreqDist(lemma_lists["ath"])

# Using word stems as the vocabulary components.
#vocabs = {s:set(stems_list) for s,stems_list in stems_lists.items()}
#fdist_zma = FreqDist(stems_lists["zma"])
#fdist_ath = FreqDist(stems_lists["ath"])

# Using tokens (full words) as the vocabulary components.
#vocabs = {s:set(token_list) for s,token_list in token_lists.items()}
#fdist_zma = FreqDist(token_lists["zma"])
#fdist_ath = FreqDist(token_lists["ath"])

union_vocab = vocabs["zma"].union(vocabs["ath"])
table = pd.DataFrame({"token":list(union_vocab)})
stops = set(stopwords.words('english'))
table = table[~table.token.isin(stops)]
table["part_of_speech"] = table["token"].map(lambda x: nltk.pos_tag([x])[0][1][:2])
table["ath_freq"] = table["token"].map(lambda x: fdist_ath[x])
table["ath_rate"] = table["ath_freq"]*100/len(token_lists["ath"])
table["zma_freq"] = table["token"].map(lambda x: fdist_zma[x])
table["zma_rate"] = table["zma_freq"]*100/len(token_lists["zma"])
table["diff"] = table["ath_rate"]-table["zma_rate"]
table.to_csv("../data/scratch/token_frequencies.csv")
table.head(10)

Unnamed: 0,token,part_of_speech,ath_freq,ath_rate,zma_freq,zma_rate,diff
0,fast,NN,2,0.000757,0,0.0,0.000757
1,invasion,NN,2,0.000757,0,0.0,0.000757
2,sample,NN,4,0.001515,0,0.0,0.001515
3,unbend,NN,1,0.000379,0,0.0,0.000379
4,oxygen,NN,36,0.013631,0,0.0,0.013631
5,orobanche,NN,0,0.0,3,0.005997,-0.005997
6,novo,NN,7,0.002651,0,0.0,0.002651
8,arose,NN,2,0.000757,0,0.0,0.000757
9,nm,NN,24,0.009088,0,0.0,0.009088
10,supersensitive,NN,2,0.000757,0,0.0,0.000757


In [11]:
# What are the tokens more frequently used for Arabidopsis than maize descriptions in this dataset?
table.sort_values(by="diff", ascending=False, inplace=True)
table.head(30)

Unnamed: 0,token,part_of_speech,ath_freq,ath_rate,zma_freq,zma_rate,diff
3051,embryo,NN,4987,1.888307,151,0.301825,1.586482
1460,mutant,NN,4778,1.80917,256,0.511703,1.297467
51,phenotype,NN,3583,1.356688,78,0.15591,1.200779
247,root,NN,2942,1.113976,72,0.143917,0.97006
2331,type,NN,2540,0.961761,15,0.029983,0.931778
2352,wild,NN,2456,0.929954,7,0.013992,0.915962
402,defective,JJ,3332,1.261648,285,0.56967,0.691979
5367,reduced,VB,2857,1.081791,216,0.43175,0.650042
1314,stage,NN,1814,0.686864,63,0.125927,0.560937
4949,cotyledon,NN,1439,0.544871,0,0.0,0.544871


In [12]:
# What are the tokens more frequently used for maize than Arabidopsis descriptions in this dataset?
table.sort_values(by="diff", ascending=True, inplace=True)
table.head(30)

Unnamed: 0,token,part_of_speech,ath_freq,ath_rate,zma_freq,zma_rate,diff
4947,endosperm,NN,124,0.046952,1078,2.15475,-2.107798
6563,seedling,VB,1560,0.590688,1318,2.634472,-2.043784
3269,kernel,NN,0,0.0,766,1.531112,-1.531112
4002,yellow,NN,304,0.115108,775,1.549102,-1.433993
4349,leaf,NN,3876,1.467631,1445,2.888325,-1.420693
549,green,JJ,883,0.334344,788,1.575086,-1.240742
6395,white,JJ,375,0.141992,642,1.283256,-1.141264
5261,albino,NN,222,0.084059,396,0.791541,-0.707482
5422,usually,RB,46,0.017418,353,0.705591,-0.688173
6069,color,NN,43,0.016282,330,0.659617,-0.643336


In [13]:
# Is the mean absolute value of the rate differences different between the different parts of speech?
table["abs_diff"] = abs(table["diff"])
pos_table = table.groupby("part_of_speech").mean()
pos_table.sort_values(by="abs_diff", inplace=True, ascending=False)
pos_table = pos_table[["abs_diff"]]
pos_table.reset_index()

Unnamed: 0,part_of_speech,abs_diff
0,MD,0.059431
1,CD,0.027896
2,IN,0.026204
3,JJ,0.023686
4,DT,0.019075
5,RB,0.016302
6,NN,0.014463
7,VB,0.01213
8,CC,0.007617
9,WP,0.00162


In [14]:
# Working on the Venn Diagram for this part, unused currently.
#print(table.shape)
#zma_only = table[table["ath_rate"]==0]
#ath_only = table[table["zma_rate"]==0]
#print(zma_only.shape)
#print(ath_only.shape)
#print(ath_only.shape[0]+zma_only.shape[0])
#ath_only.head(10)
# We need to create a mapping between stems and the words that were present for them.
# This is because what we want is the stems that are exclusive to a species.
# but then the words that are actually there for those stems, so that we can count their parts of speech.

### Looking at Term and Word Enrichment for Groups of Genes

In [15]:
# Loading the dataset of phenotype descriptions and ontology annotations.
data = load_from_pickle("../data/pickles/gene_phenotype_dataset_all_text_and_annotations.pickle")
data.filter_has_description()
data.filter_has_annotation("GO")
data.filter_has_annotation("PO")
d = data.get_description_dictionary()
texts = {i:" ".join(simple_preprocess(t)) for i,t in d.items()}
len(texts)                              

3815

In [16]:
# Create ontology objects for all the biological ontologies being used.
go_filename = "../ontologies/go.obo"                                                                
po_filename = "../ontologies/po.obo"                                                                 
pato_filename = "../ontologies/pato.obo"   
pato = Ontology(pato_filename)
po = Ontology(po_filename)
go = Ontology(go_filename)

In [17]:
curated_go_annotations = data.get_annotations_dictionary("GO")
curated_po_annotations = data.get_annotations_dictionary("PO")

In [18]:
# Load the mappings from this dataset to PlantCyc information.
pmn_pathways_filename = "../data/pickles/groupings_from_pmn_pathways.pickle"                        
groups = load_from_pickle(pmn_pathways_filename)
id_to_group_ids, group_id_to_ids = groups.get_groupings_for_dataset(data)

# Look at which pathways are best represented in this dataset.
pathways_sorted = sorted(group_id_to_ids.items(), key=lambda item: len(item[1]), reverse=True)
pathways_sorted_lengths = [(i,len(l)) for (i,l) in pathways_sorted]
pathways_df = pd.DataFrame(pathways_sorted_lengths, columns=["pathway_id","num_genes"])
pathways_df["pathway_name"] = pathways_df["pathway_id"].map(lambda x: groups.get_long_name(x))
pathways_df = pathways_df[["pathway_name","pathway_id","num_genes"]]
pathways_df.head(15)

Unnamed: 0,pathway_name,pathway_id,num_genes
0,sporopollenin precursors biosynthesis,PWY-6733,21
1,gluconeogenesis III,PWY66-399,20
2,gluconeogenesis I,GLUCONEO-PWY,20
3,suberin monomers biosynthesis,PWY-1121,20
4,phosphatidylcholine acyl editing,PWY-6803,19
5,indole-3-acetate biosynthesis II,PWY-581,18
6,glycolysis I (from glucose 6-phosphate),GLYCOLYSIS,16
7,glycolysis II (from fructose 6-phosphate),PWY-5484,15
8,palmitate biosynthesis II (bacteria and plants),PWY-5971,15
9,3-phosphoinositide biosynthesis,PWY-6352,15


In [19]:
# For some example pathway to use.
pathway_id = "PWY-6733"
gene_ids_in_this_pathway = group_id_to_ids[pathway_id]

In [51]:
results = term_enrichment(curated_po_annotations, gene_ids_in_this_pathway, po).head(20)


from statsmodels.sandbox.stats.multicomp import multipletests
threshold = 0.05
results["p_value_adj"] = multipletests(results["p_value"].values, method='bonferroni')[1]
results["significant"] = results["p_value_adj"] < threshold
results = results.loc[results["significant"]==True]



results["info_content"] = results["term_id"].map(lambda x: po.ic(x))
results.sort_values(by="info_content", ascending=False, inplace=True)


# ns   P > 0.05
# *    P ≤ 0.05
# **   P ≤ 0.01
# ***  P ≤ 0.001
# **** P ≤ 0.0001

# This lambda won't work is passed a value greater than the minimum p-value for significance defined here.
significance_levels = {0.05:"*", 0.01:"**", 0.001:"***", 0.0001:"****"}
get_level = lambda x: significance_levels[min([level for level in significance_levels.keys() if x <= level])]

results["significance"] = results["p_value_adj"].map(get_level)
results













Unnamed: 0,term_id,term_label,genes_with,genes_without,group_genes_with,group_genes_without,p_value,p_value_adj,significant,info_content,significance
4,PO:0009071,anther wall tapetum,29,3786,4,17,2.452198e-05,0.0004904397,True,5.0,***
5,PO:0005059,root endodermis,18,3797,3,18,0.0001765972,0.003531944,True,4.0,**
3,PO:0025314,microsporangium tapetum,29,3786,4,17,2.452198e-05,0.0004904397,True,3.633786,***
7,PO:0009088,seed coat,37,3778,3,18,0.001226875,0.0245375,True,3.0,*
1,PO:0025313,tapetum,33,3782,6,15,3.593559e-08,7.187118e-07,True,2.450679,****
6,PO:0000252,endodermis,25,3790,3,18,0.0004243818,0.008487637,True,2.290013,**
2,PO:0025059,portion of ground tissue,173,3642,9,12,1.797079e-07,3.594158e-06,True,0.748851,****
0,PO:0020008,exine,9,3806,5,16,5.718655e-09,1.143731e-07,True,0.0,****


In [52]:
results = term_enrichment(curated_go_annotations, gene_ids_in_this_pathway, go).head(20)

from statsmodels.sandbox.stats.multicomp import multipletests
threshold = 0.05
results["p_value_adj"] = multipletests(results["p_value"].values, method='bonferroni')[1]
results["significant"] = results["p_value_adj"] < threshold


results = results.loc[results["significant"]==True]

results["info_content"] = results["term_id"].map(lambda x: go.ic(x))
results.sort_values(by="info_content", ascending=False, inplace=True)


# This lambda won't work is passed a value greater than the minimum p-value for significance defined here.
significance_levels = {0.05:"*", 0.01:"**", 0.001:"***", 0.0001:"****"}
get_level = lambda x: significance_levels[min([level for level in significance_levels.keys() if x <= level])]

results["significance"] = results["p_value_adj"].map(get_level)
results

Unnamed: 0,term_id,term_label,genes_with,genes_without,group_genes_with,group_genes_without,p_value,p_value_adj,significant,info_content,significance
19,GO:0050062,long-chain-fatty-acyl-CoA reductase activity,3,3812,3,18,2.799884e-06,5.599768e-05,True,5.0,****
4,GO:0080110,sporopollenin biosynthetic process,7,3808,7,14,1.617948e-13,3.235897e-12,True,5.0,****
7,GO:0010584,pollen exine formation,16,3799,7,14,1.122824e-11,2.245649e-10,True,5.0,****
5,GO:0004467,long-chain fatty acid-CoA ligase activity,7,3808,7,14,1.617948e-13,3.235897e-12,True,4.356238,****
16,GO:0005783,endoplasmic reticulum,154,3661,8,13,1.084597e-06,2.169193e-05,True,4.16795,****
6,GO:0015645,fatty acid ligase activity,8,3807,7,14,3.023935e-13,6.047871e-12,True,3.846069,****
9,GO:0010208,pollen wall assembly,21,3794,7,14,5.336455e-11,1.067291e-09,True,3.742495,****
12,GO:0045229,external encapsulating structure organization,57,3758,7,14,2.492753e-08,4.985505e-07,True,2.611453,****
0,GO:0016405,CoA-ligase activity,16,3799,10,11,9.561414e-18,1.912283e-16,True,2.511272,****
1,GO:0016878,acid-thiol ligase activity,17,3798,10,11,1.5145950000000002e-17,3.029189e-16,True,2.406091,****


In [53]:
results = token_enrichment(texts, gene_ids_in_this_pathway).head(20)

from statsmodels.sandbox.stats.multicomp import multipletests
threshold = 0.05
results["p_value_adj"] = multipletests(results["p_value"].values, method='bonferroni')[1]
results["significant"] = results["p_value_adj"] < threshold
results = results.loc[results["significant"]==True]



# This lambda won't work is passed a value greater than the minimum p-value for significance defined here.
significance_levels = {0.05:"*", 0.01:"**", 0.001:"***", 0.0001:"****"}
get_level = lambda x: significance_levels[min([level for level in significance_levels.keys() if x <= level])]

results["significance"] = results["p_value_adj"].map(get_level)
results

Unnamed: 0,token,genes_with,genes_without,group_genes_with,group_genes_without,p_value,p_value_adj,significant,significance
0,layer,65,3750,9,12,5.499473e-11,1.099895e-09,True,****
1,exine,19,3796,6,15,2.04505e-09,4.0901e-08,True,****
2,suberin,8,3807,3,18,2.269517e-05,0.0004539035,True,***
3,lacs,2,3813,2,19,0.0001701698,0.003403396,True,**
4,cutin,4,3811,2,19,0.0004226197,0.008452394,True,**
5,wax,32,3783,3,18,0.0008271913,0.01654383,True,*
6,permeability,7,3808,2,19,0.001004279,0.02008557,True,*
7,flavonoids,8,3807,2,19,0.001251207,0.02502414,True,*
8,flavonoid,9,3806,2,19,0.00152421,0.0304842,True,*
9,pollen,251,3564,6,15,0.001964829,0.03929658,True,*
