# Looking at the Dataset
The purpose of this notebook is to look closer at the dataset of genes, natural language descriptions, and ontology term annotations that are used in this work. As included in the preprocessing notebooks, these data are drawn from files from either publications supplements like Oellrich, Walls et al. (2015) or model species databases such as TAIR, MaizeGDB, and SGN. The datasets are already loaded and merged using classes available through the oats package.

In [5]:
import datetime
import nltk
import pandas as pd
import numpy as np
import time
import math
import sys
import gensim
import os
import random
import warnings
from collections import defaultdict
from nltk.corpus import brown
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.probability import FreqDist
from gensim.parsing.preprocessing import strip_non_alphanum, stem_text, preprocess_string, remove_stopwords
from gensim.utils import simple_preprocess
from nltk.stem import WordNetLemmatizer
from statsmodels.sandbox.stats.multicomp import multipletests

sys.path.append("../../oats")
sys.path.append("../../oats")
from oats.utils.utils import save_to_pickle, load_from_pickle, flatten, to_hms
from oats.utils.utils import function_wrapper_with_duration, remove_duplicates_retain_order
from oats.biology.dataset import Dataset
from oats.biology.groupings import Groupings
from oats.biology.relationships import ProteinInteractions, AnyInteractions
from oats.annotation.ontology import Ontology
from oats.annotation.annotation import annotate_using_noble_coder, term_enrichment
from oats.distances import pairwise as pw
from oats.nlp.vocabulary import get_overrepresented_tokens, get_vocab_from_tokens
from oats.nlp.vocabulary import reduce_vocab_connected_components, reduce_vocab_linares_pontes, token_enrichment

warnings.simplefilter('ignore')
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
nltk.download('punkt', quiet=True)
nltk.download('brown', quiet=True)
nltk.download('averaged_perceptron_tagger', quiet=True)

True

In [14]:
# Paths to the files that are used for this notebook.
plant_dataset_path = "../../plant-data/genes_texts_annots.csv"

# Paths to files with mappings to groups.
kegg_pathways_path = "../../plant-data/reshaped_data/kegg_pathways.csv" 
plantcyc_pathways_path = "../../plant-data/reshaped_data/plantcyc_pathways.csv" 
lloyd_meinke_subsets_path = "../../plant-data/reshaped_data/lloyd_meinke_subsets.csv" 
lloyd_meinke_classes_path = "../../plant-data/reshaped_data/lloyd_meinke_classes.csv" 

# Paths to files that map group identifers to longer group names.
kegg_pathways_names_path = "../../plant-data/reshaped_data/kegg_pathways_name_map.csv"
plantcyc_pathways_names_path = "../../plant-data/reshaped_data/plantcyc_pathways_name_map.csv"
lloyd_meinke_subsets_names_path = "../../plant-data/reshaped_data/lloyd_meinke_subsets_name_map.csv"
lloyd_meinke_classes_names_path = "../../plant-data/reshaped_data/lloyd_meinke_classes_name_map.csv"

# Path to file with plant ortholog mappings.
ortholog_file_path = "../data/orthology_related_files/pantherdb/PlantGenomeOrthologs_IRB_Modified.txt"

In [None]:
# Create and name an output directory according to when the notebooks was run.
OUTPUT_NAME = "composition"
OUTPUT_DIR = os.path.join("../outputs","{}_{}_{}".format(OUTPUT_NAME,datetime.datetime.now().strftime('%m_%d_%Y_h%Hm%Ms%S'),random.randrange(1000,9999)))
os.mkdir(OUTPUT_DIR)

In [7]:
# Reading in and describing the dataset of plant genes.
plant_dataset = Dataset(plant_dataset_path)
plant_dataset.filter_has_description()
plant_dataset.describe()

Unnamed: 0,species,unique_gene_identifiers,unique_descriptions
0,ath,5066,2975
1,gmx,30,23
2,mtr,37,36
3,osa,92,85
4,sly,70,70
5,zma,1405,810
6,total,6700,3999


### What's there for each species?
The previously loaded dataset contains all of the genes that across six plant species that have natural language description data for phenotype(s) related to that gene. Each gene can have multiple descriptions annotated to it, which were combined or concatenated when the datasets from multiple sources were merged in creating the pickled datasets. Arabidopsis has the highest number of genes that satisfy this criteria, followed by maize, and then followed by the other four species which have a relatively low number of genes that satisfy this criteria, atleast given the sources used for this work. Note that the number of unique descriptions is lower than the number of genes in call cases, because multiple genes can have the same phenotype description associated with them.

In [8]:
data = plant_dataset

wnl = WordNetLemmatizer()
lemmatize_doc = lambda d: [wnl.lemmatize(x) for x in simple_preprocess(d)]

dists = defaultdict(list)

sent_lists = {}
token_lists = {}
stems_lists = {}
lemma_lists = {}


# For each individual species.
for species in data.get_species():
    df = data.to_pandas()
    subset = df[df["species"]==species]
    sentences = [sent_tokenize(d) for d in subset["descriptions"].values]
    descriptions_not_stemmed = [simple_preprocess(d) for d in subset["descriptions"].values]
    descriptions_stemmed = [preprocess_string(d) for d in subset["descriptions"].values]
    descriptions_lemmatized = [lemmatize_doc(d) for d in subset["descriptions"].values]
    sent_lists[species] = flatten(sentences)
    token_lists[species] = flatten(descriptions_not_stemmed)
    stems_lists[species] = flatten(descriptions_stemmed)    
    lemma_lists[species] = flatten(descriptions_lemmatized)
    
    # What about the distributions of words per gene and sentences per gene?
    dists["species"].extend([species]*subset.shape[0])
    dists["num_words"].extend([len(word_tokenize(x)) for x in subset["descriptions"].values])
    dists["num_sents"].extend([len(sent_tokenize(x)) for x in subset["descriptions"].values])
    
# For the entire dataset including all of the species.
df = data.to_pandas()
subset = df
sentences = [sent_tokenize(d) for d in subset["descriptions"].values]
descriptions_not_stemmed = [simple_preprocess(d) for d in subset["descriptions"].values]
descriptions_stemmed = [preprocess_string(d) for d in subset["descriptions"].values]
descriptions_lemmatized = [lemmatize_doc(d) for d in subset["descriptions"].values]
sent_lists["total"] = flatten(sentences)
token_lists["total"] = flatten(descriptions_not_stemmed)
stems_lists["total"] = flatten(descriptions_stemmed)    
lemma_lists["total"] = flatten(descriptions_lemmatized)

# What about lemmas that are uniquely used for a particular species?
lemma_sets_unique_to_species = {}
for species in data.get_species():
    other_species = [s for s in data.get_species() if s != species]
    lemmas_used_in_other_species = set(flatten([lemma_lists[s] for s in other_species]))
    unique_lemmas = set(lemma_lists[species]).difference(lemmas_used_in_other_species)
    lemma_sets_unique_to_species[species] = unique_lemmas
lemma_sets_unique_to_species["total"] = flatten([list(s) for s in lemma_sets_unique_to_species.values()])

    
# Create a dataframe to contain the summarizing information about this dataset, and sort it by number of genes.
# Unique gene identifiers is just the total number of genes, this column name should be changed in the class...
df = data.describe() 
condition = (df.species=="total")
excluded = df[condition]
included = df[~condition]
df_sorted = included.sort_values(by="unique_gene_identifiers", ascending=False)
df = pd.concat([df_sorted,excluded])

# Add columns summarizing information about the text descriptions in the dataset.
df["total_sents"] = df["species"].map(lambda x: len(sent_lists[x]))
df["total_words"] = df["species"].map(lambda x: len(token_lists[x]))
df["unique_words"] = df["species"].map(lambda x: len(set(token_lists[x])))
df["unique_stems"] = df["species"].map(lambda x: len(set(stems_lists[x])))
df["total_lemmas"] = df["species"].map(lambda x: len(lemma_lists[x]))
df["unique_lemmas"] = df["species"].map(lambda x: len(set(lemma_lists[x])))
df["unique_lemmas_to_species"] = df["species"].map(lambda x: len(lemma_sets_unique_to_species[x]))
df

Unnamed: 0,species,unique_gene_identifiers,unique_descriptions,total_sents,total_words,unique_words,unique_stems,total_lemmas,unique_lemmas,unique_lemmas_to_species
0,ath,5066,2975,26043,232445,7085,5116,232445,6561,4864
5,zma,1405,810,5475,48983,1846,1317,48983,1722,503
3,osa,92,85,478,3689,826,586,3689,760,99
4,sly,70,70,359,1678,577,438,1678,552,99
2,mtr,37,36,263,2447,718,516,2447,671,126
1,gmx,30,23,62,222,81,68,222,78,12
6,total,6700,3999,32680,289464,8043,5802,289464,7443,5703


In [10]:
text_distributions = pd.DataFrame(dists)
text_distributions.to_csv(os.path.join(OUTPUT_DIR, "word_sent_distributions.csv"), index=False)
text_distributions.head(20)

Unnamed: 0,species,num_words,num_sents
0,zma,14,1
1,zma,115,7
2,zma,6,2
3,zma,13,3
4,zma,33,3
5,zma,105,9
6,zma,100,7
7,zma,43,3
8,zma,43,3
9,zma,115,7


### What about the ontology term annotations for each species?

In [11]:
# How many of the genes in this dataset for each species are mapped to atleast one term from a given ontology?
num_mapped_go = {}
num_mapped_po = {}
for species in data.get_species():
    d = data.to_pandas()
    subset = d[d["species"]==species]    
    num_mapped_po[species] = len([t for t in subset["annotations"].values if "PO" in t])
    num_mapped_go[species] = len([t for t in subset["annotations"].values if "GO" in t])
num_mapped_go["total"] = sum(list(num_mapped_go.values()))   
num_mapped_po["total"] = sum(list(num_mapped_po.values()))
df["go"] = df["species"].map(lambda x: num_mapped_go[x])
df["po"] = df["species"].map(lambda x: num_mapped_po[x])
df

Unnamed: 0,species,unique_gene_identifiers,unique_descriptions,total_sents,total_words,unique_words,unique_stems,total_lemmas,unique_lemmas,unique_lemmas_to_species,go,po
0,ath,5066,2975,26043,232445,7085,5116,232445,6561,4864,4691,3220
5,zma,1405,810,5475,48983,1846,1317,48983,1722,503,184,111
3,osa,92,85,478,3689,826,586,3689,760,99,46,92
4,sly,70,70,359,1678,577,438,1678,552,99,23,65
2,mtr,37,36,263,2447,718,516,2447,671,126,30,32
1,gmx,30,23,62,222,81,68,222,78,12,28,27
6,total,6700,3999,32680,289464,8043,5802,289464,7443,5703,5002,3547


### What about the biologically relevant groups like biochemical pathways and phenotypes?

In [12]:
# What are the groupings that we're interested in mapping to? Uses the paths defined at the top of the notebook.
groupings_dict = {
    "kegg":(kegg_pathways_path, kegg_pathways_names_path),
    "plantcyc":(plantcyc_pathways_path, plantcyc_pathways_names_path),
    "lloyd_meinke":(lloyd_meinke_subsets_path, lloyd_meinke_subsets_names_path)
}


for name,(filename,mapfile) in groupings_dict.items():
    groups = Groupings(filename, {row.group_id:row.group_name for row in pd.read_csv(mapfile).itertuples()})
    id_to_group_ids, group_id_to_ids = groups.get_groupings_for_dataset(data)
    group_mapped_ids = [k for (k,v) in id_to_group_ids.items() if len(v)>0]
    species_dict = data.get_species_dictionary()
    num_mapped = {}
    for species in data.get_species():
        num_mapped[species] = len([x for x in group_mapped_ids if species_dict[x]==species])
    num_mapped["total"] = sum(list(num_mapped.values()))    
    df[name] = df["species"].map(lambda x: num_mapped[x])  
df

Unnamed: 0,species,unique_gene_identifiers,unique_descriptions,total_sents,total_words,unique_words,unique_stems,total_lemmas,unique_lemmas,unique_lemmas_to_species,go,po,kegg,plantcyc,lloyd_meinke
0,ath,5066,2975,26043,232445,7085,5116,232445,6561,4864,4691,3220,1084,654,1570
5,zma,1405,810,5475,48983,1846,1317,48983,1722,503,184,111,157,133,0
3,osa,92,85,478,3689,826,586,3689,760,99,46,92,1,4,0
4,sly,70,70,359,1678,577,438,1678,552,99,23,65,18,3,0
2,mtr,37,36,263,2447,718,516,2447,671,126,30,32,0,2,0
1,gmx,30,23,62,222,81,68,222,78,12,28,27,3,0,0
6,total,6700,3999,32680,289464,8043,5802,289464,7443,5703,5002,3547,1263,796,1570


### What about the other biologically relevant information like orthologous genes and protein interactions?

In [15]:
# PantherDB for plant orthologs.
ortholog_edgelist = AnyInteractions(data.get_name_to_id_dictionary(), ortholog_file_path)
species_dict = data.get_species_dictionary()
num_mapped = {}
for species in data.get_species():
    num_mapped[species] = len([x for x in ortholog_edgelist.ids if species_dict[x]==species])
num_mapped["total"] = sum(list(num_mapped.values()))
df["panther"] = df["species"].map(lambda x: num_mapped[x])    
df

Unnamed: 0,species,unique_gene_identifiers,unique_descriptions,total_sents,total_words,unique_words,unique_stems,total_lemmas,unique_lemmas,unique_lemmas_to_species,go,po,kegg,plantcyc,lloyd_meinke,panther
0,ath,5066,2975,26043,232445,7085,5116,232445,6561,4864,4691,3220,1084,654,1570,317
5,zma,1405,810,5475,48983,1846,1317,48983,1722,503,184,111,157,133,0,443
3,osa,92,85,478,3689,826,586,3689,760,99,46,92,1,4,0,86
4,sly,70,70,359,1678,577,438,1678,552,99,23,65,18,3,0,11
2,mtr,37,36,263,2447,718,516,2447,671,126,30,32,0,2,0,0
1,gmx,30,23,62,222,81,68,222,78,12,28,27,3,0,0,1
6,total,6700,3999,32680,289464,8043,5802,289464,7443,5703,5002,3547,1263,796,1570,858


In [None]:
# STRING DB for protein-protein interactions.
naming_file = "../data/group_related_files/string/all_organisms.name_2_string.tsv"
interaction_files = [
    "../data/group_related_files/string/3702.protein.links.detailed.v11.0.txt", # Arabidopsis thaliana
    "../data/group_related_files/string/4577.protein.links.detailed.v11.0.txt", # maize
    "../data/group_related_files/string/4530.protein.links.detailed.v11.0.txt", # tomato 
    "../data/group_related_files/string/4081.protein.links.detailed.v11.0.txt", # medicago
    "../data/group_related_files/string/3880.protein.links.detailed.v11.0.txt", # rice 
    "../data/group_related_files/string/3847.protein.links.detailed.v11.0.txt", # soybean
]
genes = data.get_gene_dictionary()
string_data = ProteinInteractions(genes, naming_file, *interaction_files)
species_dict = data.get_species_dictionary()
num_mapped = {}
for species in data.get_species():
    num_mapped[species] = len([x for x in string_data.ids if species_dict[x]==species])
num_mapped["total"] = sum(list(num_mapped.values()))
df["stringdb"] = df["species"].map(lambda x: num_mapped[x])    
df

In [17]:
# Write that dataframe with all the information about datast to a file.
df.to_csv(os.path.join(OUTPUT_DIR,"full_dataset_composition.csv"),index=False)

### How do the vocabularies used for different species compare?
One of the things we are interested in is discovering or recovering phenotype similarity between different species in order to identify phenologs (phenotypes between species that share some underlying genetic cause). For this reason, we are interested in how the vocabularies used to describe phenotypes between different species vary, because this will impact how feasible it is to use a dataset like this to identify phenologs. Because the Arabidopsis and maize datasets are the largest in this case, we will compare the vocabularies used in describing the phenotypes associated with the genes from these species in this dataset.

In [20]:
# Using lemmas as the vocabulary components.
vocabs = {s:set(lemma_list) for s,lemma_list in lemma_lists.items()}
fdist_zma = FreqDist(lemma_lists["zma"])
fdist_ath = FreqDist(lemma_lists["ath"])

# Using word stems as the vocabulary components.
#vocabs = {s:set(stems_list) for s,stems_list in stems_lists.items()}
#fdist_zma = FreqDist(stems_lists["zma"])
#fdist_ath = FreqDist(stems_lists["ath"])

# Using tokens (full words) as the vocabulary components.
#vocabs = {s:set(token_list) for s,token_list in token_lists.items()}
#fdist_zma = FreqDist(token_lists["zma"])
#fdist_ath = FreqDist(token_lists["ath"])

union_vocab = vocabs["zma"].union(vocabs["ath"])
table = pd.DataFrame({"token":list(union_vocab)})
stops = set(stopwords.words('english'))
table = table[~table.token.isin(stops)]
table["part_of_speech"] = table["token"].map(lambda x: nltk.pos_tag([x])[0][1][:2])
table["ath_freq"] = table["token"].map(lambda x: fdist_ath[x])
table["ath_rate"] = table["ath_freq"]*100/len(token_lists["ath"])
table["zma_freq"] = table["token"].map(lambda x: fdist_zma[x])
table["zma_rate"] = table["zma_freq"]*100/len(token_lists["zma"])
table["diff"] = table["ath_rate"]-table["zma_rate"]
table.to_csv(os.path.join(OUTPUT_DIR,"token_frequencies.csv"), index=False)
table.head(10)

Unnamed: 0,token,part_of_speech,ath_freq,ath_rate,zma_freq,zma_rate,diff
0,twenty,NN,1,0.00043,0,0.0,0.00043
1,seedligs,NN,2,0.00086,0,0.0,0.00086
2,atpase,NN,9,0.003872,0,0.0,0.003872
3,polynucleate,NN,2,0.00086,0,0.0,0.00086
4,hub,NN,13,0.005593,0,0.0,0.005593
5,indicate,NN,7,0.003011,0,0.0,0.003011
6,blade,NN,67,0.028824,114,0.232734,-0.20391
7,galactose,NN,10,0.004302,0,0.0,0.004302
8,cacl,NN,1,0.00043,0,0.0,0.00043
9,evaluation,NN,0,0.0,1,0.002042,-0.002042


In [21]:
# What are the tokens more frequently used for Arabidopsis than maize descriptions in this dataset?
table.sort_values(by="diff", ascending=False, inplace=True)
table.head(30)

Unnamed: 0,token,part_of_speech,ath_freq,ath_rate,zma_freq,zma_rate,diff
5437,embryo,NN,4087,1.758265,150,0.306229,1.452037
2882,mutant,NN,4433,1.907118,254,0.518547,1.388571
4767,phenotype,NN,3393,1.4597,76,0.155156,1.304544
5715,type,NN,2364,1.017015,15,0.030623,0.986392
5801,wild,NN,2287,0.983889,7,0.014291,0.969598
795,root,NN,2487,1.069931,70,0.142907,0.927024
2612,stage,NN,1748,0.752006,55,0.112284,0.639722
6370,reduced,VB,2416,1.039386,215,0.438928,0.600458
1311,terminal,NN,1265,0.544215,4,0.008166,0.536049
2079,growth,NN,1590,0.684033,78,0.159239,0.524794


In [22]:
# What are the tokens more frequently used for maize than Arabidopsis descriptions in this dataset?
table.sort_values(by="diff", ascending=True, inplace=True)
table.head(30)

Unnamed: 0,token,part_of_speech,ath_freq,ath_rate,zma_freq,zma_rate,diff
4478,endosperm,NN,100,0.043021,1069,2.18239,-2.139369
4888,seedling,VB,1276,0.548947,1316,2.686646,-2.137699
6182,kernel,NN,0,0.0,756,1.543393,-1.543393
3835,leaf,NN,3224,1.386995,1401,2.860176,-1.473181
4981,yellow,NN,239,0.10282,766,1.563808,-1.460988
3900,green,JJ,743,0.319646,778,1.588306,-1.268661
1735,white,JJ,310,0.133365,642,1.310659,-1.177294
6019,albino,NN,182,0.078298,396,0.808444,-0.730146
2680,usually,RB,41,0.017639,342,0.698201,-0.680563
4332,color,NN,40,0.017208,326,0.665537,-0.648329


In [23]:
# Is the mean absolute value of the rate differences different between the different parts of speech?
table["abs_diff"] = abs(table["diff"])
pos_table = table.groupby("part_of_speech").mean()
pos_table.sort_values(by="abs_diff", inplace=True, ascending=False)
pos_table = pos_table[["abs_diff"]]
pos_table.reset_index()

Unnamed: 0,part_of_speech,abs_diff
0,MD,0.059567
1,CD,0.029173
2,IN,0.026193
3,JJ,0.02308
4,DT,0.019441
5,RB,0.016217
6,NN,0.014511
7,VB,0.012129
8,CC,0.007736
9,WP,0.001611


In [24]:
# Working on the Venn Diagram for this part, unused currently.
#print(table.shape)
#zma_only = table[table["ath_rate"]==0]
#ath_only = table[table["zma_rate"]==0]
#print(zma_only.shape)
#print(ath_only.shape)
#print(ath_only.shape[0]+zma_only.shape[0])
#ath_only.head(10)
# We need to create a mapping between stems and the words that were present for them.
# This is because what we want is the stems that are exclusive to a species.
# but then the words that are actually there for those stems, so that we can count their parts of speech.

### Looking at Term and Word Enrichment for Groups of Genes

In [25]:
# Loading the dataset of phenotype descriptions and ontology annotations.
data = plant_dataset
data.filter_has_description()
data.filter_has_annotation("GO")
data.filter_has_annotation("PO")
d = data.get_description_dictionary()
texts = {i:" ".join(simple_preprocess(t)) for i,t in d.items()}
len(texts)                              

3114

In [26]:
# Create ontology objects for all the biological ontologies being used.
go_pickle_path = "../ontologies/go.pickle"                                                                
po_pickle_path = "../ontologies/po.pickle"                                                             
pato_pickle_path = "../ontologies/pato.pickle"
pato = load_from_pickle(pato_pickle_path)
po = load_from_pickle(po_pickle_path)
go = load_from_pickle(go_pickle_path)

In [27]:
curated_go_annotations = data.get_annotations_dictionary("GO")
curated_po_annotations = data.get_annotations_dictionary("PO")

In [28]:
# Load the mappings from this dataset to PlantCyc information.
#pmn_pathways_filename = "../data/pickles/groupings_from_pmn_pathways.pickle"                        
#groups = load_from_pickle(pmn_pathways_filename)
#id_to_group_ids, group_id_to_ids = groups.get_groupings_for_dataset(data)


# Reading in the dataset of groupings for pathways in PlantCyc.
plantcyc_name_mapping = {row.group_id:row.group_name for row in pd.read_csv(plantcyc_pathways_names_path).itertuples()}
plantcyc_grouping = Groupings(plantcyc_pathways_path, plantcyc_name_mapping)
id_to_group_ids, group_id_to_ids = plantcyc_grouping.get_groupings_for_dataset(data)

# Look at which pathways are best represented in this dataset.
pathways_sorted = sorted(group_id_to_ids.items(), key=lambda item: len(item[1]), reverse=True)
pathways_sorted_lengths = [(i,len(l)) for (i,l) in pathways_sorted]
pathways_df = pd.DataFrame(pathways_sorted_lengths, columns=["pathway_id","num_genes"])
pathways_df["pathway_name"] = pathways_df["pathway_id"].map(lambda x: plantcyc_grouping.get_long_name(x))
pathways_df = pathways_df[["pathway_name","pathway_id","num_genes"]]
pathways_df.head(15)

Unnamed: 0,pathway_name,pathway_id,num_genes
0,suberin monomers biosynthesis,PWY-1121,19
1,gluconeogenesis I,GLUCONEO-PWY,19
2,gluconeogenesis III,PWY66-399,19
3,indole-3-acetate biosynthesis II,PWY-581,18
4,phosphatidylcholine acyl editing,PWY-6803,17
5,sporopollenin precursors biosynthesis,PWY-6733,16
6,glycolysis I (from glucose 6-phosphate),GLYCOLYSIS,16
7,3-phosphoinositide biosynthesis,PWY-6352,15
8,palmitate biosynthesis II (bacteria and plants),PWY-5971,15
9,glycolysis II (from fructose 6-phosphate),PWY-5484,15


In [30]:
# For some example pathway to use.
pathway_id = "PWY-6733"
gene_ids_in_this_pathway = group_id_to_ids[pathway_id]

In [31]:
results = term_enrichment(curated_po_annotations, gene_ids_in_this_pathway, po).head(20)
threshold = 0.05
results["p_value_adj"] = multipletests(results["p_value"].values, method='bonferroni')[1]
results["significant"] = results["p_value_adj"] < threshold
results = results.loc[results["significant"]==True]
results["info_content"] = results["term_id"].map(lambda x: po.ic(x))
results.sort_values(by="info_content", ascending=False, inplace=True)


# ns   P > 0.05
# *    P ≤ 0.05
# **   P ≤ 0.01
# ***  P ≤ 0.001
# **** P ≤ 0.0001

# This lambda won't work is passed a value greater than the minimum p-value for significance defined here.
significance_levels = {0.05:"*", 0.01:"**", 0.001:"***", 0.0001:"****"}
get_level = lambda x: significance_levels[min([level for level in significance_levels.keys() if x <= level])]

results["significance"] = results["p_value_adj"].map(get_level)
results

Unnamed: 0,term_id,term_label,genes_with,genes_without,group_genes_with,group_genes_without,p_value,p_value_adj,significant,info_content,significance
2,PO:0009071,anther wall tapetum,29,3085,3,13,0.000497,0.009938,True,5.0,**
7,PO:0009067,filament,52,3062,3,13,0.002446,0.048914,True,5.0,*
6,PO:0009073,stigma,47,3067,3,13,0.001856,0.037119,True,4.0,*
3,PO:0025314,microsporangium tapetum,29,3085,3,13,0.000497,0.009938,True,3.633786,**
1,PO:0025313,tapetum,32,3082,4,12,2.4e-05,0.000487,True,2.450679,***
4,PO:0005052,plant callus,141,2973,5,11,0.000593,0.011869,True,2.450679,*
5,PO:0025059,portion of ground tissue,156,2958,5,11,0.000931,0.018618,True,0.748851,*
0,PO:0020008,exine,7,3107,3,13,1.3e-05,0.000258,True,0.0,***


In [32]:
results = term_enrichment(curated_go_annotations, gene_ids_in_this_pathway, go).head(20)

from statsmodels.sandbox.stats.multicomp import multipletests
threshold = 0.05
results["p_value_adj"] = multipletests(results["p_value"].values, method='bonferroni')[1]
results["significant"] = results["p_value_adj"] < threshold


results = results.loc[results["significant"]==True]

results["info_content"] = results["term_id"].map(lambda x: go.ic(x))
results.sort_values(by="info_content", ascending=False, inplace=True)


# This lambda won't work is passed a value greater than the minimum p-value for significance defined here.
significance_levels = {0.05:"*", 0.01:"**", 0.001:"***", 0.0001:"****"}
get_level = lambda x: significance_levels[min([level for level in significance_levels.keys() if x <= level])]

results["significance"] = results["p_value_adj"].map(get_level)
results

Unnamed: 0,term_id,term_label,genes_with,genes_without,group_genes_with,group_genes_without,p_value,p_value_adj,significant,info_content,significance
14,GO:0080110,sporopollenin biosynthetic process,4,3110,4,12,3.152767e-08,6.305535e-07,True,5.0,****
5,GO:0004467,long-chain fatty acid-CoA ligase activity,7,3107,7,9,6.650099e-14,1.33002e-12,True,4.356238,****
15,GO:0005783,endoplasmic reticulum,133,2981,8,8,1.321393e-07,2.642786e-06,True,4.16795,****
6,GO:0015645,fatty acid ligase activity,8,3106,7,9,1.243744e-13,2.487488e-12,True,3.846069,****
18,GO:1901570,fatty acid derivative biosynthetic process,9,3105,4,12,3.171141e-07,6.342281e-06,True,2.775594,****
7,GO:0032787,monocarboxylic acid metabolic process,119,2995,11,5,1.527934e-12,3.055868e-11,True,2.591912,****
0,GO:0016405,CoA-ligase activity,16,3098,10,6,1.6871200000000001e-18,3.37424e-17,True,2.511272,****
1,GO:0016878,acid-thiol ligase activity,16,3098,10,6,1.6871200000000001e-18,3.37424e-17,True,2.406091,****
4,GO:0006631,fatty acid metabolic process,67,3047,11,5,4.423822e-15,8.847644e-14,True,2.240498,****
8,GO:0019752,carboxylic acid metabolic process,202,2912,12,4,1.107028e-11,2.214055e-10,True,1.778929,****


In [33]:
results = token_enrichment(texts, gene_ids_in_this_pathway).head(20)


threshold = 0.05
results["p_value_adj"] = multipletests(results["p_value"].values, method='bonferroni')[1]
results["significant"] = results["p_value_adj"] < threshold
results = results.loc[results["significant"]==True]


# This lambda won't work if passed a value greater than the minimum p-value for significance defined here.
significance_levels = {0.05:"*", 0.01:"**", 0.001:"***", 0.0001:"****"}
get_level = lambda x: significance_levels[min([level for level in significance_levels.keys() if x <= level])]
results["significance"] = results["p_value_adj"].map(get_level)
results

Unnamed: 0,token,genes_with,genes_without,group_genes_with,group_genes_without,p_value,p_value_adj,significant,significance
0,layer,56,3058,7,9,9.464101e-09,1.89282e-07,True,****
1,cuticle,10,3104,4,12,4.425946e-07,8.851892e-06,True,****
2,exine,17,3097,4,12,2.589817e-06,5.179633e-05,True,****
3,permeability,7,3107,3,13,1.287683e-05,0.0002575365,True,***
4,visible,589,2525,11,5,2.017943e-05,0.0004035885,True,***
5,humidity,14,3100,3,13,7.139017e-05,0.001427803,True,**
6,phenotype,1017,2097,13,3,9.587024e-05,0.001917405,True,**
7,lacs,2,3112,2,14,0.0001461562,0.002923124,True,**
8,extracellular,4,3110,2,14,0.0003632139,0.007264279,True,**
9,cutin,4,3110,2,14,0.0003632139,0.007264279,True,**
