# Looking at the Dataset
The purpose of this notebook is to look closer at the dataset of genes, natural language descriptions, and ontology term annotations that are used in this work. As included in the preprocessing notebooks, these data are drawn from files from either publications supplements like Oellrich, Walls et al. (2015) or model species databases such as TAIR, MaizeGDB, and SGN. The datasets are already loaded and merged using classe available through the oats package.

In [1]:
import datetime
import nltk
import pandas as pd
import numpy as np
import time
import math
import sys
import gensim
import os
import warnings
from collections import defaultdict
from nltk.corpus import brown
from nltk.tokenize import word_tokenize, sent_tokenize
from gensim.parsing.preprocessing import strip_non_alphanum, stem_text, preprocess_string, remove_stopwords
from gensim.utils import simple_preprocess
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

sys.path.append("../../oats")
from oats.utils.utils import save_to_pickle, load_from_pickle, merge_list_dicts, flatten, to_hms
from oats.biology.dataset import Dataset
from oats.biology.groupings import Groupings
from oats.biology.relationships import ProteinInteractions, AnyInteractions
from oats.annotation.ontology import Ontology

warnings.simplefilter('ignore')
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
nltk.download('punkt', quiet=True)
nltk.download('brown', quiet=True)
nltk.download('averaged_perceptron_tagger', quiet=True)

True

In [2]:
data = load_from_pickle("../data/pickles/gene_phenotype_dataset_all_text_and_annotations_unmerged.pickle")
data.to_pandas().head()
data.describe()

Unnamed: 0,species,num_genes,unique_descriptions
0,ath,628658,9110
1,gmx,156,49
2,mtr,342,155
3,osa,772,389
4,sly,786,314
5,zma,6526,998
6,total,637240,11015


In [3]:
data = load_from_pickle("../data/pickles/gene_phenotype_dataset_all_text_and_annotations.pickle")
data.filter_has_description()
data.to_pandas().head()
data.describe()

Unnamed: 0,species,num_genes,unique_descriptions
0,ath,6364,3813
1,gmx,30,24
2,mtr,37,36
3,osa,92,85
4,sly,70,70
5,zma,1406,811
6,total,7999,4839


### What's there for each species?
The previously loaded dataset contains all of the genes that across six plant species that have natural language description data for phenotype(s) related to that gene. Each gene can have multiple descriptions annotated to it, which were combined or concatenated when the datasets from multiple sources were merged in creating the pickled datasets. Arabidopsis has the highest number of genes that satisfy this criteria, followed by maize, and then followed by the other four species which have a relatively low number of genes that satisfy this criteria, atleast given the sources used for this work. Note that the number of unique descriptions is lower than the number of genes in call cases, because multiple genes can have the same phenotype description associated with them.

In [22]:
wnl = WordNetLemmatizer()
lemmatize_doc = lambda d: [wnl.lemmatize(x) for x in simple_preprocess(d)]

dists = defaultdict(list)


token_lists = {}
stems_lists = {}
lemma_lists = {}

# For each individual species.
for species in data.get_species():
    df = data.to_pandas()
    subset = df[df["species"]==species]
    descriptions_not_stemmed = [simple_preprocess(d) for d in subset["description"].values]
    descriptions_stemmed = [preprocess_string(d) for d in subset["description"].values]
    descriptions_lemmatized = [lemmatize_doc(d) for d in subset["description"].values]
    token_lists[species] = flatten(descriptions_not_stemmed)
    stems_lists[species] = flatten(descriptions_stemmed)    
    lemma_lists[species] = flatten(descriptions_lemmatized)
    
    # What about the distributions of words per gene and sentences per gene?
    dists["species"].extend([species]*subset.shape[0])
    dists["num_words"].extend([len(word_tokenize(x)) for x in subset["description"].values])
    dists["num_sents"].extend([len(sent_tokenize(x)) for x in subset["description"].values])
    
    # What about the number of ontology annotations?
    #po = len([t for t in subset["term_ids"].values if "PO" in t])
    #go = len([t for t in subset["term_ids"].values if "GO" in t])
    
# For the entire dataset including all of the species.
df = data.to_pandas()
subset = df
descriptions_not_stemmed = [simple_preprocess(d) for d in subset["description"].values]
descriptions_stemmed = [preprocess_string(d) for d in subset["description"].values]
descriptions_lemmatized = [lemmatize_doc(d) for d in subset["description"].values]
token_lists["total"] = flatten(descriptions_not_stemmed)
stems_lists["total"] = flatten(descriptions_stemmed)    
lemma_lists["total"] = flatten(descriptions_lemmatized)

# What about lemmas that are uniquely used for a particular species?
lemma_sets_unique_to_species = {}
for species in data.get_species():
    other_species = [s for s in data.get_species() if s != species]
    lemmas_used_in_other_species = set(flatten([lemma_lists[s] for s in other_species]))
    unique_lemmas = set(lemma_lists[species]).difference(lemmas_used_in_other_species)
    lemma_sets_unique_to_species[species] = unique_lemmas
lemma_sets_unique_to_species["total"] = flatten([list(s) for s in lemma_sets_unique_to_species.values()])

    
# Create a dataframe to contain the summarizing information about this dataset, and sort it by number of genes.
df = data.describe() 
condition = (df.species=="total")
excluded = df[condition]
included = df[~condition]
df_sorted = included.sort_values(by="num_genes", ascending=False)
df = pd.concat([df_sorted,excluded])

# Add columns summarizing information about the text descriptions in the dataset.
df["total_words"] = df["species"].map(lambda x: len(token_lists[x]))
df["unique_words"] = df["species"].map(lambda x: len(set(token_lists[x])))
df["unique_stems"] = df["species"].map(lambda x: len(set(stems_lists[x])))
df["total_lemmas"] = df["species"].map(lambda x: len(lemma_lists[x]))
df["unique_lemmas"] = df["species"].map(lambda x: len(set(lemma_lists[x])))
df["unique_lemmas_to_species"] = df["species"].map(lambda x: len(lemma_sets_unique_to_species[x]))
df

Unnamed: 0,species,num_genes,unique_descriptions,total_words,unique_words,unique_stems,total_lemmas,unique_lemmas,unique_lemmas_to_species
0,ath,6364,3813,264189,7085,5116,264189,6561,4864
5,zma,1406,811,50029,1846,1317,50029,1722,503
3,osa,92,85,3887,826,586,3887,760,99
4,sly,70,70,1810,577,438,1810,552,99
2,mtr,37,36,2672,718,516,2672,671,126
1,gmx,30,24,233,81,68,233,78,12
6,total,7999,4839,322820,8043,5802,322820,7443,5703


In [8]:
text_distributions = pd.DataFrame(dists)
text_distributions.to_csv("../data/scratch/word_sent_distributions.csv", index=False)
text_distributions.head(20)

Unnamed: 0,species,num_words,num_sents
0,ath,8,1
1,ath,3,1
2,ath,15,2
3,ath,67,5
4,ath,18,3
5,ath,78,10
6,ath,117,9
7,ath,71,6
8,ath,24,2
9,ath,7,1


### What about the ontology term annotations for each species?

In [9]:
# How many of the genes in this dataset for each species are mapped to atleast one term from a given ontology?
num_mapped_go = {}
num_mapped_po = {}
for species in data.get_species():
    d = data.to_pandas()
    subset = d[d["species"]==species]    
    num_mapped_po[species] = len([t for t in subset["term_ids"].values if "PO" in t])
    num_mapped_go[species] = len([t for t in subset["term_ids"].values if "GO" in t])
num_mapped_go["total"] = sum(list(num_mapped_go.values()))   
num_mapped_po["total"] = sum(list(num_mapped_po.values()))
df["go"] = df["species"].map(lambda x: num_mapped_go[x])
df["po"] = df["species"].map(lambda x: num_mapped_po[x])
df

Unnamed: 0,species,num_genes,unique_descriptions,total_words,unique_words,unique_stems,total_lemmas,unique_lemmas,unique_lemmas_to_species,go,po
0,ath,6364,3813,264189,7085,5116,264189,6561,4864,5434,4420
5,zma,1406,811,50029,1846,1317,50029,1722,503,184,111
3,osa,92,85,3887,826,586,3887,760,99,46,92
4,sly,70,70,1810,577,438,1810,552,99,23,65
2,mtr,37,36,2672,718,516,2672,671,126,30,32
1,gmx,30,24,233,81,68,233,78,12,28,27
6,total,7999,4839,322820,8043,5802,322820,7443,5703,5745,4747


### What about the biologically relevant groups like biochemical pathways and phenotypes?

In [10]:
# What are the groupings that we're interested in mapping to?
kegg_pathways_filename = "../data/pickles/groupings_from_kegg_pathways.pickle" 
pmn_pathways_filename = "../data/pickles/groupings_from_pmn_pathways.pickle"                        
lloyd_subsets_filename = "../data/pickles/groupings_from_lloyd_subsets.pickle"                     
groupings_dict = {"kegg":kegg_pathways_filename,"plantcyc":pmn_pathways_filename,"lloyd":lloyd_subsets_filename}

for name,filename in groupings_dict.items():
    groups = load_from_pickle(filename)
    id_to_group_ids, group_id_to_ids = groups.get_groupings_for_dataset(data)
    group_mapped_ids = [k for (k,v) in id_to_group_ids.items() if len(v)>0]
    species_dict = data.get_species_dictionary()
    num_mapped = {}
    for species in data.get_species():
        num_mapped[species] = len([x for x in group_mapped_ids if species_dict[x]==species])
    num_mapped["total"] = sum(list(num_mapped.values()))    
    df[name] = df["species"].map(lambda x: num_mapped[x])  
df

Unnamed: 0,species,num_genes,unique_descriptions,total_words,unique_words,unique_stems,total_lemmas,unique_lemmas,unique_lemmas_to_species,go,po,kegg,plantcyc,lloyd
0,ath,6364,3813,264189,7085,5116,264189,6561,4864,5434,4420,1572,926,2868
5,zma,1406,811,50029,1846,1317,50029,1722,503,184,111,156,133,0
3,osa,92,85,3887,826,586,3887,760,99,46,92,0,3,0
4,sly,70,70,1810,577,438,1810,552,99,23,65,17,2,0
2,mtr,37,36,2672,718,516,2672,671,126,30,32,0,2,0
1,gmx,30,24,233,81,68,233,78,12,28,27,1,0,0
6,total,7999,4839,322820,8043,5802,322820,7443,5703,5745,4747,1746,1066,2868


### What about the other biologically relevant information like orthologous genes and protein interactions?

In [None]:
# PantherDB for plant orthologs.
ortholog_file_path = "../data/orthology_related_files/pantherdb/PlantGenomeOrthologs_IRB_Modified.txt"
ortholog_edgelist = AnyInteractions(data.get_name_to_id_dictionary(), ortholog_file_path)
species_dict = data.get_species_dictionary()
num_mapped = {}
for species in data.get_species():
    num_mapped[species] = len([x for x in ortholog_edgelist.ids if species_dict[x]==species])
num_mapped["total"] = sum(list(num_mapped.values()))
df["panther"] = df["species"].map(lambda x: num_mapped[x])    
df

In [None]:
# STRING DB for protein-protein interactions.
naming_file = "../data/group_related_files/string/all_organisms.name_2_string.tsv"
interaction_files = [
    "../data/group_related_files/string/3702.protein.links.detailed.v11.0.txt", # Arabidopsis thaliana
    "../data/group_related_files/string/4577.protein.links.detailed.v11.0.txt", # maize
    "../data/group_related_files/string/4530.protein.links.detailed.v11.0.txt", # tomato 
    "../data/group_related_files/string/4081.protein.links.detailed.v11.0.txt", # medicago
    "../data/group_related_files/string/3880.protein.links.detailed.v11.0.txt", # rice 
    "../data/group_related_files/string/3847.protein.links.detailed.v11.0.txt", # soybean
]
genes = data.get_gene_dictionary()
string_data = ProteinInteractions(genes, naming_file, *interaction_files)
species_dict = data.get_species_dictionary()
num_mapped = {}
for species in data.get_species():
    num_mapped[species] = len([x for x in string_data.ids if species_dict[x]==species])
num_mapped["total"] = sum(list(num_mapped.values()))
df["stringdb"] = df["species"].map(lambda x: num_mapped[x])    
df

### How do the vocabularies used for different species compare?
One of the things we are interested in is discovering or recovering phenotype similarity between different species in order to identify phenologs (phenotypes between species that share some underlying genetic cause). For this reason, we are interested in how the vocabularies used to describe phenotypes between different species vary, because this will impact how feasible it is to use a dataset like this to identify phenologs. Because the Arabidopsis and maize datasets are the largest in this case, we will compare the vocabularies used in describing the phenotypes associated with the genes from these species in this dataset.

In [17]:
import nltk
from nltk.corpus import stopwords
from nltk.probability import FreqDist

vocabs = {s:set(lemma_list) for s,lemma_list in lemma_lists.items()}
fdist_zma = FreqDist(lemma_lists["zma"])
fdist_ath = FreqDist(lemma_lists["ath"])

#vocabs = {s:set(stems_list) for s,stems_list in stems_lists.items()}
#fdist_zma = FreqDist(stems_lists["zma"])
#fdist_ath = FreqDist(stems_lists["ath"])

#vocabs = {s:set(token_list) for s,token_list in token_lists.items()}
#fdist_zma = FreqDist(token_lists["zma"])
#fdist_ath = FreqDist(token_lists["ath"])


union_vocab = vocabs["zma"].union(vocabs["ath"])
table = pd.DataFrame({"token":list(union_vocab)})
stops = set(stopwords.words('english'))
table = table[~table.token.isin(stops)]
table["part_of_speech"] = table["token"].map(lambda x: nltk.pos_tag([x])[0][1][:2])
table["ath_freq"] = table["token"].map(lambda x: fdist_ath[x])
table["ath_rate"] = table["ath_freq"]*100/len(token_lists["ath"])
table["zma_freq"] = table["token"].map(lambda x: fdist_zma[x])
table["zma_rate"] = table["zma_freq"]*100/len(token_lists["zma"])
table["diff"] = table["ath_rate"]-table["zma_rate"]
table.to_csv("../data/scratch/token_frequencies.csv")
table.head(10)

Unnamed: 0,token,part_of_speech,ath_freq,ath_rate,zma_freq,zma_rate,diff
0,adding,VB,2,0.000757,0,0.0,0.000757
1,profound,NN,5,0.001893,0,0.0,0.001893
2,cancer,NN,1,0.000379,0,0.0,0.000379
3,central,JJ,29,0.010977,1,0.001999,0.008978
4,erf,NN,2,0.000757,0,0.0,0.000757
5,ptohrcc,NN,1,0.000379,0,0.0,0.000379
6,phytoglycogens,NN,1,0.000379,0,0.0,0.000379
7,attribute,NN,2,0.000757,0,0.0,0.000757
8,respond,NN,25,0.009463,0,0.0,0.009463
9,change,NN,162,0.06132,1,0.001999,0.059321


In [21]:
# What are the tokens more frequently used for Arabidopsis than maize descriptions in this dataset?
table.sort_values(by="diff", ascending=False, inplace=True)
table.head(30)

Unnamed: 0,token,part_of_speech,ath_freq,ath_rate,zma_freq,zma_rate,diff
6839,embryo,NN,4995,1.890692,151,0.301825,1.588867
2766,mutant,NN,4778,1.808554,256,0.511703,1.296851
2757,phenotype,NN,3583,1.356226,78,0.15591,1.200316
1170,root,NN,2943,1.113975,72,0.143917,0.970059
3249,type,NN,2540,0.961433,15,0.029983,0.93145
6757,wild,NN,2456,0.929637,7,0.013992,0.915646
3704,defective,JJ,3341,1.264625,285,0.56967,0.694955
814,reduced,VB,2860,1.082558,216,0.43175,0.650809
5341,stage,NN,1814,0.68663,63,0.125927,0.560703
2718,cotyledon,NN,1443,0.5462,0,0.0,0.5462


In [20]:
# What are the tokens more frequently used for maize than Arabidopsis descriptions in this dataset?
table.sort_values(by="diff", ascending=True, inplace=True)
table.head(30)

Unnamed: 0,token,part_of_speech,ath_freq,ath_rate,zma_freq,zma_rate,diff
3620,endosperm,NN,124,0.046936,1078,2.15475,-2.107814
6073,seedling,VB,1560,0.590486,1318,2.634472,-2.043986
3494,kernel,NN,0,0.0,766,1.531112,-1.531112
133,yellow,NN,304,0.115069,775,1.549102,-1.434032
1288,leaf,NN,3879,1.468267,1445,2.888325,-1.420058
2004,green,JJ,884,0.334609,788,1.575086,-1.240478
4668,white,JJ,375,0.141944,642,1.283256,-1.141312
866,albino,NN,222,0.084031,396,0.791541,-0.70751
6849,usually,RB,46,0.017412,353,0.705591,-0.688179
782,color,NN,43,0.016276,330,0.659617,-0.643341


In [14]:
# Is the mean absolute value of the rate differences different between the different parts of speech?
table["abs_diff"] = abs(table["diff"])
pos_table = table.groupby("part_of_speech").mean()
pos_table.sort_values(by="abs_diff", inplace=True, ascending=False)
pos_table = pos_table[["abs_diff"]]
pos_table.reset_index()

Unnamed: 0,part_of_speech,abs_diff
0,MD,0.05943
1,CD,0.027894
2,IN,0.026202
3,JJ,0.023688
4,DT,0.019077
5,RB,0.016302
6,NN,0.014463
7,VB,0.012131
8,CC,0.007617
9,WP,0.00162


In [15]:
print(table.shape)
zma_only = table[table["ath_rate"]==0]
ath_only = table[table["zma_rate"]==0]
print(zma_only.shape)
print(ath_only.shape)
print(ath_only.shape[0]+zma_only.shape[0])
ath_only.head(10)
# We need to create a mapping between stems and the words that were present for them.
# This is because what we want is the stems that are exclusive to a species.
# but then the words that are actually there for those stems, so that we can count their parts of speech.

(7000, 8)
(540, 8)
(5352, 8)
5892


Unnamed: 0,token,part_of_speech,ath_freq,ath_rate,zma_freq,zma_rate,diff,abs_diff
861,fmo,NN,1,0.000379,0,0.0,0.000379,0.000379
6323,rastafari,NN,1,0.000379,0,0.0,0.000379,0.000379
853,reorientation,NN,1,0.000379,0,0.0,0.000379,0.000379
841,glycoprotein,NN,1,0.000379,0,0.0,0.000379,0.000379
443,undergone,NN,1,0.000379,0,0.0,0.000379,0.000379
839,thad,NN,1,0.000379,0,0.0,0.000379,0.000379
837,reside,NN,1,0.000379,0,0.0,0.000379,0.000379
5449,coffee,NN,1,0.000379,0,0.0,0.000379,0.000379
850,wmv,NN,1,0.000379,0,0.0,0.000379,0.000379
6321,vernalized,VB,1,0.000379,0,0.0,0.000379,0.000379
