# Looking at the Dataset
The purpose of this notebook is to look closer at the dataset of genes, natural language descriptions, and ontology term annotations that are used in this work. As included in the preprocessing notebooks, these data are drawn from files from either publications supplements like Oellrich, Walls et al. (2015) or model species databases such as TAIR, MaizeGDB, and SGN. The datasets are already loaded and merged using classes available through the oats package.

In [1]:
import datetime
import nltk
import pandas as pd
import numpy as np
import time
import math
import sys
import gensim
import os
import random
import warnings
from collections import defaultdict
from nltk.corpus import brown
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.probability import FreqDist
from gensim.parsing.preprocessing import strip_non_alphanum, stem_text, preprocess_string, remove_stopwords
from gensim.utils import simple_preprocess
from nltk.stem import WordNetLemmatizer
from statsmodels.sandbox.stats.multicomp import multipletests

sys.path.append("../../oats")
sys.path.append("../../oats")
from oats.utils.utils import save_to_pickle, load_from_pickle, flatten, to_hms
from oats.utils.utils import function_wrapper_with_duration, remove_duplicates_retain_order
from oats.biology.dataset import Dataset
from oats.biology.groupings import Groupings
from oats.biology.relationships import ProteinInteractions, AnyInteractions
from oats.annotation.ontology import Ontology
from oats.annotation.annotation import annotate_using_noble_coder, term_enrichment
from oats.distances import pairwise as pw
from oats.nlp.vocabulary import get_overrepresented_tokens, get_vocab_from_tokens
from oats.nlp.vocabulary import reduce_vocab_connected_components, reduce_vocab_linares_pontes, token_enrichment

warnings.simplefilter('ignore')
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
nltk.download('punkt', quiet=True)
nltk.download('brown', quiet=True)
nltk.download('averaged_perceptron_tagger', quiet=True)

Warming up PyWSD (takes ~10 secs)... took 5.291064262390137 secs.


True

In [2]:
# Paths to the files that are used for this notebook.
plant_dataset_path = "../../plant-data/genes_texts_annots.csv"

# Paths to files with mappings to groups.
kegg_pathways_path = "../../plant-data/reshaped_data/kegg_pathways.csv" 
plantcyc_pathways_path = "../../plant-data/reshaped_data/plantcyc_pathways.csv" 
lloyd_meinke_subsets_path = "../../plant-data/reshaped_data/lloyd_meinke_subsets.csv" 
lloyd_meinke_classes_path = "../../plant-data/reshaped_data/lloyd_meinke_classes.csv" 

# Paths to files that map group identifers to longer group names.
kegg_pathways_names_path = "../../plant-data/reshaped_data/kegg_pathways_name_map.csv"
plantcyc_pathways_names_path = "../../plant-data/reshaped_data/plantcyc_pathways_name_map.csv"
lloyd_meinke_subsets_names_path = "../../plant-data/reshaped_data/lloyd_meinke_subsets_name_map.csv"
lloyd_meinke_classes_names_path = "../../plant-data/reshaped_data/lloyd_meinke_classes_name_map.csv"

# Path to file with plant ortholog mappings.
ortholog_file_path = "../../plant-data/databases/panther/PlantGenomeOrthologs_IRB_Modified.txt"

In [3]:
# Create and name an output directory according to when the notebooks was run.
OUTPUT_NAME = "composition"
OUTPUT_DIR = os.path.join("../outputs","{}_{}_{}".format(OUTPUT_NAME,datetime.datetime.now().strftime('%m_%d_%Y_h%Hm%Ms%S'),random.randrange(1000,9999)))
os.mkdir(OUTPUT_DIR)

In [4]:
# Reading in and describing the dataset of plant genes.
plant_dataset = Dataset(plant_dataset_path)
plant_dataset.filter_has_description()
plant_dataset.describe()

Unnamed: 0,species,unique_gene_identifiers,unique_descriptions
0,ath,5066,2975
1,gmx,30,23
2,mtr,37,36
3,osa,92,85
4,sly,70,70
5,zma,1405,810
6,total,6700,3999


### What's there for each species?
The previously loaded dataset contains all of the genes that across six plant species that have natural language description data for phenotype(s) related to that gene. Each gene can have multiple descriptions annotated to it, which were combined or concatenated when the datasets from multiple sources were merged in creating the pickled datasets. Arabidopsis has the highest number of genes that satisfy this criteria, followed by maize, and then followed by the other four species which have a relatively low number of genes that satisfy this criteria, atleast given the sources used for this work. Note that the number of unique descriptions is lower than the number of genes in call cases, because multiple genes can have the same phenotype description associated with them.

In [5]:
data = plant_dataset

wnl = WordNetLemmatizer()
lemmatize_doc = lambda d: [wnl.lemmatize(x) for x in simple_preprocess(d)]

dists = defaultdict(list)

sent_lists = {}
token_lists = {}
stems_lists = {}
lemma_lists = {}


# For each individual species.
for species in data.get_species():
    df = data.to_pandas()
    subset = df[df["species"]==species]
    sentences = [sent_tokenize(d) for d in subset["descriptions"].values]
    descriptions_not_stemmed = [simple_preprocess(d) for d in subset["descriptions"].values]
    descriptions_stemmed = [preprocess_string(d) for d in subset["descriptions"].values]
    descriptions_lemmatized = [lemmatize_doc(d) for d in subset["descriptions"].values]
    sent_lists[species] = flatten(sentences)
    token_lists[species] = flatten(descriptions_not_stemmed)
    stems_lists[species] = flatten(descriptions_stemmed)    
    lemma_lists[species] = flatten(descriptions_lemmatized)
    
    # What about the distributions of words per gene and sentences per gene?
    dists["species"].extend([species]*subset.shape[0])
    dists["num_words"].extend([len(word_tokenize(x)) for x in subset["descriptions"].values])
    dists["num_sents"].extend([len(sent_tokenize(x)) for x in subset["descriptions"].values])
    
# For the entire dataset including all of the species.
df = data.to_pandas()
subset = df
sentences = [sent_tokenize(d) for d in subset["descriptions"].values]
descriptions_not_stemmed = [simple_preprocess(d) for d in subset["descriptions"].values]
descriptions_stemmed = [preprocess_string(d) for d in subset["descriptions"].values]
descriptions_lemmatized = [lemmatize_doc(d) for d in subset["descriptions"].values]
sent_lists["total"] = flatten(sentences)
token_lists["total"] = flatten(descriptions_not_stemmed)
stems_lists["total"] = flatten(descriptions_stemmed)    
lemma_lists["total"] = flatten(descriptions_lemmatized)

# What about lemmas that are uniquely used for a particular species?
lemma_sets_unique_to_species = {}
for species in data.get_species():
    other_species = [s for s in data.get_species() if s != species]
    lemmas_used_in_other_species = set(flatten([lemma_lists[s] for s in other_species]))
    unique_lemmas = set(lemma_lists[species]).difference(lemmas_used_in_other_species)
    lemma_sets_unique_to_species[species] = unique_lemmas
lemma_sets_unique_to_species["total"] = flatten([list(s) for s in lemma_sets_unique_to_species.values()])

    
# Create a dataframe to contain the summarizing information about this dataset, and sort it by number of genes.
# Unique gene identifiers is just the total number of genes, this column name should be changed in the class...
df = data.describe() 
condition = (df.species=="total")
excluded = df[condition]
included = df[~condition]
df_sorted = included.sort_values(by="unique_gene_identifiers", ascending=False)
df = pd.concat([df_sorted,excluded])

# Add columns summarizing information about the text descriptions in the dataset.
df["total_sents"] = df["species"].map(lambda x: len(sent_lists[x]))
df["total_words"] = df["species"].map(lambda x: len(token_lists[x]))
df["unique_words"] = df["species"].map(lambda x: len(set(token_lists[x])))
df["unique_stems"] = df["species"].map(lambda x: len(set(stems_lists[x])))
df["total_lemmas"] = df["species"].map(lambda x: len(lemma_lists[x]))
df["unique_lemmas"] = df["species"].map(lambda x: len(set(lemma_lists[x])))
df["unique_lemmas_to_species"] = df["species"].map(lambda x: len(lemma_sets_unique_to_species[x]))
df

Unnamed: 0,species,unique_gene_identifiers,unique_descriptions,total_sents,total_words,unique_words,unique_stems,total_lemmas,unique_lemmas,unique_lemmas_to_species
0,ath,5066,2975,26043,232445,7085,5116,232445,6561,4864
5,zma,1405,810,5475,48983,1846,1317,48983,1722,503
3,osa,92,85,478,3689,826,586,3689,760,99
4,sly,70,70,359,1678,577,438,1678,552,99
2,mtr,37,36,263,2447,718,516,2447,671,126
1,gmx,30,23,62,222,81,68,222,78,12
6,total,6700,3999,32680,289464,8043,5802,289464,7443,5703


In [6]:
text_distributions = pd.DataFrame(dists)
text_distributions.to_csv(os.path.join(OUTPUT_DIR, "word_sent_distributions.csv"), index=False)
text_distributions.head(20)

Unnamed: 0,species,num_words,num_sents
0,zma,14,1
1,zma,115,7
2,zma,6,2
3,zma,13,3
4,zma,33,3
5,zma,105,9
6,zma,100,7
7,zma,43,3
8,zma,43,3
9,zma,115,7


### What about the ontology term annotations for each species?

In [7]:
# How many of the genes in this dataset for each species are mapped to atleast one term from a given ontology?
num_mapped_go = {}
num_mapped_po = {}
for species in data.get_species():
    d = data.to_pandas()
    subset = d[d["species"]==species]    
    num_mapped_po[species] = len([t for t in subset["annotations"].values if "PO" in t])
    num_mapped_go[species] = len([t for t in subset["annotations"].values if "GO" in t])
num_mapped_go["total"] = sum(list(num_mapped_go.values()))   
num_mapped_po["total"] = sum(list(num_mapped_po.values()))
df["go"] = df["species"].map(lambda x: num_mapped_go[x])
df["po"] = df["species"].map(lambda x: num_mapped_po[x])
df

Unnamed: 0,species,unique_gene_identifiers,unique_descriptions,total_sents,total_words,unique_words,unique_stems,total_lemmas,unique_lemmas,unique_lemmas_to_species,go,po
0,ath,5066,2975,26043,232445,7085,5116,232445,6561,4864,4691,3220
5,zma,1405,810,5475,48983,1846,1317,48983,1722,503,184,111
3,osa,92,85,478,3689,826,586,3689,760,99,46,92
4,sly,70,70,359,1678,577,438,1678,552,99,23,65
2,mtr,37,36,263,2447,718,516,2447,671,126,30,32
1,gmx,30,23,62,222,81,68,222,78,12,28,27
6,total,6700,3999,32680,289464,8043,5802,289464,7443,5703,5002,3547


### What about the biologically relevant groups like biochemical pathways and phenotypes?

In [8]:
# What are the groupings that we're interested in mapping to? Uses the paths defined at the top of the notebook.
groupings_dict = {
    "kegg":(kegg_pathways_path, kegg_pathways_names_path),
    "plantcyc":(plantcyc_pathways_path, plantcyc_pathways_names_path),
    "lloyd_meinke":(lloyd_meinke_subsets_path, lloyd_meinke_subsets_names_path)
}


for name,(filename,mapfile) in groupings_dict.items():
    groups = Groupings(filename, {row.group_id:row.group_name for row in pd.read_csv(mapfile).itertuples()})
    id_to_group_ids, group_id_to_ids = groups.get_groupings_for_dataset(data)
    group_mapped_ids = [k for (k,v) in id_to_group_ids.items() if len(v)>0]
    species_dict = data.get_species_dictionary()
    num_mapped = {}
    for species in data.get_species():
        num_mapped[species] = len([x for x in group_mapped_ids if species_dict[x]==species])
    num_mapped["total"] = sum(list(num_mapped.values()))    
    df[name] = df["species"].map(lambda x: num_mapped[x])  
df

Unnamed: 0,species,unique_gene_identifiers,unique_descriptions,total_sents,total_words,unique_words,unique_stems,total_lemmas,unique_lemmas,unique_lemmas_to_species,go,po,kegg,plantcyc,lloyd_meinke
0,ath,5066,2975,26043,232445,7085,5116,232445,6561,4864,4691,3220,1084,654,1570
5,zma,1405,810,5475,48983,1846,1317,48983,1722,503,184,111,157,133,0
3,osa,92,85,478,3689,826,586,3689,760,99,46,92,1,4,0
4,sly,70,70,359,1678,577,438,1678,552,99,23,65,18,3,0
2,mtr,37,36,263,2447,718,516,2447,671,126,30,32,0,2,0
1,gmx,30,23,62,222,81,68,222,78,12,28,27,3,0,0
6,total,6700,3999,32680,289464,8043,5802,289464,7443,5703,5002,3547,1263,796,1570


### What about the other biologically relevant information like orthologous genes and protein interactions?

In [9]:
# PantherDB for plant orthologs.
ortholog_edgelist = AnyInteractions(data.get_name_to_id_dictionary(), ortholog_file_path)
species_dict = data.get_species_dictionary()
num_mapped = {}
for species in data.get_species():
    num_mapped[species] = len([x for x in ortholog_edgelist.ids if species_dict[x]==species])
num_mapped["total"] = sum(list(num_mapped.values()))
df["panther"] = df["species"].map(lambda x: num_mapped[x])    
df

Unnamed: 0,species,unique_gene_identifiers,unique_descriptions,total_sents,total_words,unique_words,unique_stems,total_lemmas,unique_lemmas,unique_lemmas_to_species,go,po,kegg,plantcyc,lloyd_meinke,panther
0,ath,5066,2975,26043,232445,7085,5116,232445,6561,4864,4691,3220,1084,654,1570,317
5,zma,1405,810,5475,48983,1846,1317,48983,1722,503,184,111,157,133,0,443
3,osa,92,85,478,3689,826,586,3689,760,99,46,92,1,4,0,86
4,sly,70,70,359,1678,577,438,1678,552,99,23,65,18,3,0,11
2,mtr,37,36,263,2447,718,516,2447,671,126,30,32,0,2,0,0
1,gmx,30,23,62,222,81,68,222,78,12,28,27,3,0,0,1
6,total,6700,3999,32680,289464,8043,5802,289464,7443,5703,5002,3547,1263,796,1570,858


In [10]:
# STRING DB for protein-protein interactions.
naming_file = "../../plant-data/databases/string/all_organisms.name_2_string.tsv"
interaction_files = [
    "../../plant-data/databases/string/3702.protein.links.detailed.v11.0.txt", # Arabidopsis
    "../../plant-data/databases/string/4577.protein.links.detailed.v11.0.txt", # Maize
    "../../plant-data/databases/string/4530.protein.links.detailed.v11.0.txt", # Tomato 
    "../../plant-data/databases/string/4081.protein.links.detailed.v11.0.txt", # Medicago
    "../../plant-data/databases/string/3880.protein.links.detailed.v11.0.txt", # Rice 
    "../../plant-data/databases/string/3847.protein.links.detailed.v11.0.txt", # Soybean
    "../../plant-data/databases/string/9606.protein.links.detailed.v11.0.txt", # Human
]
genes = data.get_gene_dictionary()
string_data = ProteinInteractions(genes, naming_file, *interaction_files)
species_dict = data.get_species_dictionary()
num_mapped = {}
for species in data.get_species():
    num_mapped[species] = len([x for x in string_data.ids if species_dict[x]==species])
num_mapped["total"] = sum(list(num_mapped.values()))
df["stringdb"] = df["species"].map(lambda x: num_mapped[x])    
df

Unnamed: 0,species,unique_gene_identifiers,unique_descriptions,total_sents,total_words,unique_words,unique_stems,total_lemmas,unique_lemmas,unique_lemmas_to_species,go,po,kegg,plantcyc,lloyd_meinke,panther,stringdb
0,ath,5066,2975,26043,232445,7085,5116,232445,6561,4864,4691,3220,1084,654,1570,317,3173
5,zma,1405,810,5475,48983,1846,1317,48983,1722,503,184,111,157,133,0,443,229
3,osa,92,85,478,3689,826,586,3689,760,99,46,92,1,4,0,86,45
4,sly,70,70,359,1678,577,438,1678,552,99,23,65,18,3,0,11,11
2,mtr,37,36,263,2447,718,516,2447,671,126,30,32,0,2,0,0,13
1,gmx,30,23,62,222,81,68,222,78,12,28,27,3,0,0,1,5
6,total,6700,3999,32680,289464,8043,5802,289464,7443,5703,5002,3547,1263,796,1570,858,3476


In [11]:
# Write that dataframe with all the information about datast to a file.
df.to_csv(os.path.join(OUTPUT_DIR,"full_dataset_composition.csv"),index=False)

### How do the vocabularies used for different species compare?
One of the things we are interested in is discovering or recovering phenotype similarity between different species in order to identify phenologs (phenotypes between species that share some underlying genetic cause). For this reason, we are interested in how the vocabularies used to describe phenotypes between different species vary, because this will impact how feasible it is to use a dataset like this to identify phenologs. Because the Arabidopsis and maize datasets are the largest in this case, we will compare the vocabularies used in describing the phenotypes associated with the genes from these species in this dataset.

In [12]:
# Using lemmas as the vocabulary components.
vocabs = {s:set(lemma_list) for s,lemma_list in lemma_lists.items()}
fdist_zma = FreqDist(lemma_lists["zma"])
fdist_ath = FreqDist(lemma_lists["ath"])

# Using word stems as the vocabulary components.
#vocabs = {s:set(stems_list) for s,stems_list in stems_lists.items()}
#fdist_zma = FreqDist(stems_lists["zma"])
#fdist_ath = FreqDist(stems_lists["ath"])

# Using tokens (full words) as the vocabulary components.
#vocabs = {s:set(token_list) for s,token_list in token_lists.items()}
#fdist_zma = FreqDist(token_lists["zma"])
#fdist_ath = FreqDist(token_lists["ath"])

union_vocab = vocabs["zma"].union(vocabs["ath"])
table = pd.DataFrame({"token":list(union_vocab)})
stops = set(stopwords.words('english'))
table = table[~table.token.isin(stops)]
table["part_of_speech"] = table["token"].map(lambda x: nltk.pos_tag([x])[0][1][:2])
table["ath_freq"] = table["token"].map(lambda x: fdist_ath[x])
table["ath_rate"] = table["ath_freq"]*100/len(token_lists["ath"])
table["zma_freq"] = table["token"].map(lambda x: fdist_zma[x])
table["zma_rate"] = table["zma_freq"]*100/len(token_lists["zma"])
table["diff"] = table["ath_rate"]-table["zma_rate"]
table.to_csv(os.path.join(OUTPUT_DIR,"token_frequencies.csv"), index=False)
table.head(10)

Unnamed: 0,token,part_of_speech,ath_freq,ath_rate,zma_freq,zma_rate,diff
0,effectively,RB,4,0.001721,0,0.0,0.001721
1,fill,NN,1,0.00043,4,0.008166,-0.007736
2,ever,RB,1,0.00043,0,0.0,0.00043
3,live,JJ,1,0.00043,0,0.0,0.00043
4,frequent,NN,5,0.002151,0,0.0,0.002151
5,pc,NN,7,0.003011,0,0.0,0.003011
6,atrz,NN,4,0.001721,0,0.0,0.001721
7,reduction,NN,418,0.179827,7,0.014291,0.165537
8,pointing,VB,4,0.001721,0,0.0,0.001721
9,cellularizes,NN,1,0.00043,0,0.0,0.00043


In [13]:
# What are the tokens more frequently used for Arabidopsis than maize descriptions in this dataset?
table.sort_values(by="diff", ascending=False, inplace=True)
table.head(30)

Unnamed: 0,token,part_of_speech,ath_freq,ath_rate,zma_freq,zma_rate,diff
2872,embryo,NN,4087,1.758265,150,0.306229,1.452037
4861,mutant,NN,4433,1.907118,254,0.518547,1.388571
4828,phenotype,NN,3393,1.4597,76,0.155156,1.304544
5112,type,NN,2364,1.017015,15,0.030623,0.986392
5501,wild,NN,2287,0.983889,7,0.014291,0.969598
2930,root,NN,2487,1.069931,70,0.142907,0.927024
5078,stage,NN,1748,0.752006,55,0.112284,0.639722
6960,reduced,VB,2416,1.039386,215,0.438928,0.600458
4843,terminal,NN,1265,0.544215,4,0.008166,0.536049
4040,growth,NN,1590,0.684033,78,0.159239,0.524794


In [14]:
# What are the tokens more frequently used for maize than Arabidopsis descriptions in this dataset?
table.sort_values(by="diff", ascending=True, inplace=True)
table.head(30)

Unnamed: 0,token,part_of_speech,ath_freq,ath_rate,zma_freq,zma_rate,diff
3093,endosperm,NN,100,0.043021,1069,2.18239,-2.139369
188,seedling,VB,1276,0.548947,1316,2.686646,-2.137699
1188,kernel,NN,0,0.0,756,1.543393,-1.543393
265,leaf,NN,3224,1.386995,1401,2.860176,-1.473181
211,yellow,NN,239,0.10282,766,1.563808,-1.460988
5398,green,JJ,743,0.319646,778,1.588306,-1.268661
1213,white,JJ,310,0.133365,642,1.310659,-1.177294
5266,albino,NN,182,0.078298,396,0.808444,-0.730146
1765,usually,RB,41,0.017639,342,0.698201,-0.680563
4734,color,NN,40,0.017208,326,0.665537,-0.648329


In [15]:
# Is the mean absolute value of the rate differences different between the different parts of speech?
table["abs_diff"] = abs(table["diff"])
pos_table = table.groupby("part_of_speech").mean()
pos_table.sort_values(by="abs_diff", inplace=True, ascending=False)
pos_table = pos_table[["abs_diff"]]
pos_table.reset_index()

Unnamed: 0,part_of_speech,abs_diff
0,MD,0.059567
1,CD,0.029173
2,IN,0.026193
3,JJ,0.02308
4,DT,0.019441
5,RB,0.016217
6,NN,0.014511
7,VB,0.012129
8,CC,0.007736
9,WP,0.001611


In [16]:
# Working on the Venn Diagram for this part, unused currently.
#print(table.shape)
#zma_only = table[table["ath_rate"]==0]
#ath_only = table[table["zma_rate"]==0]
#print(zma_only.shape)
#print(ath_only.shape)
#print(ath_only.shape[0]+zma_only.shape[0])
#ath_only.head(10)
# We need to create a mapping between stems and the words that were present for them.
# This is because what we want is the stems that are exclusive to a species.
# but then the words that are actually there for those stems, so that we can count their parts of speech.

### Looking at Term and Word Enrichment for Groups of Genes

In [84]:
# Loading the dataset of phenotype descriptions and ontology annotations.
plant_dataset = Dataset(plant_dataset_path)
data = plant_dataset
data.filter_has_description()
#data.filter_has_annotation("GO")
data.filter_has_annotation("PO")
d = data.get_description_dictionary()
texts = {i:" ".join(simple_preprocess(t)) for i,t in d.items()}
len(texts)                              

3547

In [62]:
# Create ontology objects for all the biological ontologies being used.
go_pickle_path = "../ontologies/go.pickle"                                                                
po_pickle_path = "../ontologies/po.pickle"                                                             
pato_pickle_path = "../ontologies/pato.pickle"
pato = load_from_pickle(pato_pickle_path)
po = load_from_pickle(po_pickle_path)
go = load_from_pickle(go_pickle_path)

In [85]:
curated_go_annotations = data.get_annotations_dictionary("GO")
curated_po_annotations = data.get_annotations_dictionary("PO")
print("done")

done


In [77]:
# Which GO terms are used to annotate the most genes in this dataset?
term_id_to_ids = defaultdict(list)
for i,term_id_list in curated_go_annotations.items():
    for term_id in term_id_list:
        term_id_to_ids[term_id].append(i)
term_id_to_num_ids = {k:len(v) for k,v in term_id_to_ids.items()}
terms_df = pd.DataFrame(term_id_to_num_ids.items(), columns=["term_id", "freq"])

def get_term_name(ont,i):
    try:
        return(ont[i].name)
    except:
        return("")

terms_df["term_name"] = terms_df["term_id"].map(lambda x: get_term_name(go,x))
terms_df.sort_values(by="freq", ascending=False, inplace=True)
terms_df.head(20)

Unnamed: 0,term_id,freq,term_name
9,GO:0005515,665,protein binding
58,GO:0005634,595,nucleus
19,GO:0009507,463,chloroplast
18,GO:0005886,414,plasma membrane
100,GO:0005829,349,cytosol
2,GO:0003674,340,molecular_function
1,GO:0008150,225,biological_process
130,GO:0005737,213,cytoplasm
57,GO:0006355,192,"regulation of transcription, DNA-templated"
46,GO:0009506,189,plasmodesma


In [78]:
# Make the group be ones that have that GO term anntation.
#go_term_id_of_interest = "GO:0009640"
#gene_ids_in_this_pathway = [k for k,v in curated_go_annotations.items() if go_term_id_of_interest in v]

In [86]:
# Load the mappings from this dataset to PlantCyc information.
#pmn_pathways_filename = "../data/pickles/groupings_from_pmn_pathways.pickle"                        
#groups = load_from_pickle(pmn_pathways_filename)
#id_to_group_ids, group_id_to_ids = groups.get_groupings_for_dataset(data)


# Reading in the dataset of groupings for pathways in PlantCyc.
plantcyc_name_mapping = {row.group_id:row.group_name for row in pd.read_csv(plantcyc_pathways_names_path).itertuples()}
plantcyc_grouping = Groupings(plantcyc_pathways_path, plantcyc_name_mapping)
id_to_group_ids, group_id_to_ids = plantcyc_grouping.get_groupings_for_dataset(data)

# Look at which pathways are best represented in this dataset.
pathways_sorted = sorted(group_id_to_ids.items(), key=lambda item: len(item[1]), reverse=True)
pathways_sorted_lengths = [(i,len(l)) for (i,l) in pathways_sorted]
pathways_df = pd.DataFrame(pathways_sorted_lengths, columns=["pathway_id","num_genes"])
pathways_df["pathway_name"] = pathways_df["pathway_id"].map(lambda x: plantcyc_grouping.get_long_name(x))
pathways_df = pathways_df[["pathway_name","pathway_id","num_genes"]]
pathways_df.head(15)

Unnamed: 0,pathway_name,pathway_id,num_genes
0,phenylpropanoid biosynthesis,PWY-361,21
1,indole-3-acetate biosynthesis II,PWY-581,20
2,suberin monomers biosynthesis,PWY-1121,20
3,gluconeogenesis III,PWY66-399,19
4,gluconeogenesis I,GLUCONEO-PWY,19
5,phosphatidylcholine acyl editing,PWY-6803,17
6,sporopollenin precursors biosynthesis,PWY-6733,17
7,L-leucine biosynthesis,LEUSYN-PWY,16
8,glycolysis I (from glucose 6-phosphate),GLYCOLYSIS,16
9,glycolysis II (from fructose 6-phosphate),PWY-5484,15


In [107]:
# For some example pathway to use.
#pathway_id = "PWY-361"
pathway_id = "PWY-581"
#pathway_id = "PWY-1121"
pathway_id = "PWY-695"
gene_ids_in_this_pathway = group_id_to_ids[pathway_id]
gene_ids_in_this_pathway

[718, 860, 871, 955, 1228, 1833, 4387, 4399]

In [98]:
wordcloud = defaultdict(list)

In [108]:
results = term_enrichment(curated_po_annotations, gene_ids_in_this_pathway, po).head(20)
threshold = 0.05
results["p_value_adj"] = multipletests(results["p_value"].values, method='bonferroni')[1]
results["significant"] = results["p_value_adj"] < threshold
results = results.loc[results["significant"]==True]
results["info_content"] = results["term_id"].map(lambda x: po.ic(x))
results.sort_values(by="info_content", ascending=False, inplace=True)


# ns   P > 0.05
# *    P ≤ 0.05
# **   P ≤ 0.01
# ***  P ≤ 0.001
# **** P ≤ 0.0001

# This lambda won't work is passed a value greater than the minimum p-value for significance defined here.
significance_levels = {0.05:"*", 0.01:"**", 0.001:"***", 0.0001:"****"}
get_level = lambda x: significance_levels[min([level for level in significance_levels.keys() if x <= level])]

results["significance"] = results["p_value_adj"].map(get_level)
results

Unnamed: 0,term_id,term_label,genes_with,genes_without,group_genes_with,group_genes_without,p_value,p_value_adj,significant,info_content,significance
3,PO:0000258,root cortex,26,3521,2,6,0.001627,0.03254,True,4.0,*
1,PO:0007057,seed germination stage,94,3453,3,5,0.000999,0.019971,True,3.633786,*
0,PO:0020031,radicle,17,3530,2,6,0.000744,0.014871,True,3.419565,*
4,PO:0006203,pericycle,29,3518,2,6,0.001995,0.039895,True,3.0,*
2,PO:0000045,embryo root,23,3524,2,6,0.001296,0.025913,True,2.362259,*
5,PO:0005708,cortex,31,3516,2,6,0.00226,0.045197,True,2.015353,*


In [109]:
for row in results.itertuples():
    wordcloud["Weight"].append(int(1/row.p_value_adj))
    wordcloud["Word"].append("{} ({})".format(row.term_id,row.term_label))

In [102]:
results = term_enrichment(curated_go_annotations, gene_ids_in_this_pathway, go).head(20)

from statsmodels.sandbox.stats.multicomp import multipletests
threshold = 0.05
results["p_value_adj"] = multipletests(results["p_value"].values, method='bonferroni')[1]
results["significant"] = results["p_value_adj"] < threshold


results = results.loc[results["significant"]==True]

results["info_content"] = results["term_id"].map(lambda x: go.ic(x))
results.sort_values(by="info_content", ascending=False, inplace=True)


# This lambda won't work is passed a value greater than the minimum p-value for significance defined here.
significance_levels = {0.05:"*", 0.01:"**", 0.001:"***", 0.0001:"****"}
get_level = lambda x: significance_levels[min([level for level in significance_levels.keys() if x <= level])]

results["significance"] = results["p_value_adj"].map(get_level)
results

Unnamed: 0,term_id,term_label,genes_with,genes_without,group_genes_with,group_genes_without,p_value,p_value_adj,significant,info_content,significance
7,GO:0009684,indoleacetic acid biosynthetic process,9,3538,6,14,6.605807e-11,1.321161e-09,True,4.489831,****
19,GO:0048825,cotyledon development,23,3524,5,15,2.928356e-07,5.856712e-06,True,4.0,****
5,GO:0042435,indole-containing compound biosynthetic process,20,3527,8,12,5.716694e-13,1.143339e-11,True,3.632673,****
11,GO:0072330,monocarboxylic acid biosynthetic process,66,3481,8,12,2.411502e-09,4.823003e-08,True,3.313892,****
0,GO:0009851,auxin biosynthetic process,17,3530,12,8,7.257074e-22,1.451415e-20,True,3.277093,****
8,GO:0009683,indoleacetic acid metabolic process,10,3537,6,14,1.053362e-10,2.106724e-09,True,3.144588,****
16,GO:0032787,monocarboxylic acid metabolic process,119,3428,8,12,1.823356e-07,3.646712e-06,True,2.591912,****
6,GO:0042430,indole-containing compound metabolic process,25,3522,8,12,2.515474e-12,5.030948e-11,True,2.54668,****
17,GO:0046394,carboxylic acid biosynthetic process,120,3427,8,12,1.938932e-07,3.877864e-06,True,2.368647,****
1,GO:0009850,auxin metabolic process,26,3521,12,8,3.715543e-20,7.431085999999999e-19,True,2.165312,****


In [110]:
results = token_enrichment(texts, gene_ids_in_this_pathway).head(20)


threshold = 0.05
results["p_value_adj"] = multipletests(results["p_value"].values, method='bonferroni')[1]
results["significant"] = results["p_value_adj"] < threshold
results = results.loc[results["significant"]==True]


# This lambda won't work if passed a value greater than the minimum p-value for significance defined here.
significance_levels = {0.05:"*", 0.01:"**", 0.001:"***", 0.0001:"****"}
get_level = lambda x: significance_levels[min([level for level in significance_levels.keys() if x <= level])]
results["significance"] = results["p_value_adj"].map(get_level)
results

Unnamed: 0,token,genes_with,genes_without,group_genes_with,group_genes_without,p_value,p_value_adj,significant,significance
0,bps,9,3538,4,4,7.472458e-09,1.494492e-07,True,****
1,ga,49,3498,4,4,2.951314e-06,5.902628e-05,True,****
2,entirely,12,3535,3,5,3.362705e-06,6.72541e-05,True,****
3,inhibitor,56,3491,4,4,4.88352e-06,9.76704e-05,True,****
4,germinate,62,3485,4,4,7.178407e-06,0.0001435681,True,***
5,paclobutrazol,21,3526,3,5,1.481661e-05,0.0002963323,True,***
6,aba,201,3346,5,3,3.022841e-05,0.0006045683,True,***
7,become,43,3504,3,5,0.0001085597,0.002171194,True,**
8,if,43,3504,3,5,0.0001085597,0.002171194,True,**
9,neoxanthin,6,3541,2,6,0.0001232688,0.002465377,True,**


In [111]:
for row in results.itertuples():
    wordcloud["Weight"].append(int(1/row.p_value_adj))
    wordcloud["Word"].append(row.token)

In [112]:
pd.DataFrame(wordcloud).to_csv(os.path.join(OUTPUT_DIR, "{}_word_cloud.csv".format(pathway_id)), index=False)
pd.DataFrame(wordcloud)

Unnamed: 0,Weight,Word
0,91,PO:0000229 (flower meristem)
1,47,PO:0006020 (lateral root apical meristem)
2,113,PO:0006056 (cotyledon epidermis)
3,81,PO:0008028 (reproductive shoot apical meristem)
4,27,PO:0020147 (root apical meristem)
5,839,PO:0006085 (root meristem)
6,169,PO:0025277 (pollen sac)
7,648,PO:0000016 (lateral root primordium)
8,5306,PO:0000056 (flower bud)
9,173,PO:0020144 (apical meristem)
