### Description of Notebook
This notebook takes a file describing a list of genes, along with each gene's species and identifying strings. It loads the dataset that is used by the search tool, determines which of those genes are in the dataset, and then subsets the list of genes to include just those. It then uses the descriptions of those genes to query the tool (using the script rather than the Streamlit web app, but the results are identical), and keeps track of where the other genes from the list fall in the resulting gene rankings. The output of the notebook is a summary of these results that specifies the mean and standard deviation of the binned ranks for each search.

In [13]:
import datetime
import math
import os
import random
import subprocess
import sys
import time
import warnings
from collections import defaultdict
from itertools import product

import gensim
import nltk
import numpy as np
import pandas as pd
from gensim.parsing.preprocessing import strip_non_alphanum, stem_text, preprocess_string, remove_stopwords
from gensim.utils import simple_preprocess
from nltk.corpus import brown
from nltk.corpus import stopwords
from nltk.probability import FreqDist
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize, sent_tokenize
from statsmodels.sandbox.stats.multicomp import multipletests

# Add the relative location of the oats package to the import path (once is enough).
sys.path.append("../../oats")
from oats.utils.utils import save_to_pickle, load_from_pickle, flatten, to_hms
from oats.utils.utils import function_wrapper_with_duration, remove_duplicates_retain_order
from oats.biology.dataset import Dataset
from oats.biology.groupings import Groupings
from oats.biology.relationships import ProteinInteractions, AnyInteractions
from oats.annotation.ontology import Ontology
from oats.annotation.annotation import annotate_using_noble_coder, term_enrichment
from oats.distances import pairwise as pw
from oats.nlp.vocabulary import get_overrepresented_tokens, get_vocab_from_tokens
from oats.nlp.vocabulary import reduce_vocab_connected_components, reduce_vocab_linares_pontes, token_enrichment

# Suppress all warnings for the rest of the notebook.
warnings.simplefilter('ignore')

# Raise the pandas display limits used when rendering tables.
for option_name, option_value in (
    ('display.max_rows', 500),
    ('display.max_columns', 500),
    ('display.width', 1000),
):
    pd.set_option(option_name, option_value)

# Download the NLTK resources quietly. The final call is left as a bare expression
# so that its return value is still what the cell displays.
for resource_name in ('punkt', 'brown'):
    nltk.download(resource_name, quiet=True)
nltk.download('averaged_perceptron_tagger', quiet=True)

True

In [2]:
# Path to the dataset file used by this notebook, which is loaded into a Dataset object.
# NOTE(review): keep_ids=True presumably retains the IDs stored in the file rather than
# reassigning them -- confirm against the Dataset class.
dataset_path = "resources/genes_texts_annots.csv"
dataset = Dataset(dataset_path, keep_ids=True)
# Display a summary of the loaded dataset as the cell output.
dataset.describe()

Unnamed: 0,species,unique_gene_identifiers,unique_descriptions
0,ath,5065,2974
1,gmx,30,23
2,mtr,37,36
3,osa,92,85
4,sly,68,68
5,zma,1400,807
6,total,6692,3993


Previously, out of this list, we had 10 of 16 maize genes and 16 of 18 Arabidopsis genes in the dataset. Now we have 13 of 16 maize genes and 18 of 18 Arabidopsis genes.

In [3]:
# Look up every listed gene in the dataset using its species code and its stripped,
# lowercased identifier.
mapping = dataset.get_species_to_name_to_ids_dictionary(include_synonyms=False, lowercase=True)
genes_listed = pd.read_csv("../data/anthocyanin_biosynthesis_genes.csv")

# Each lookup yields the dataset IDs stored for that gene name, or -1 when either the
# species code or the identifier is absent (a missing species no longer raises KeyError).
# The "id" column is added before "in_current_dataset" on purpose: the column order is
# relied on wherever rows of this table are read by position.
genes_listed["id"] = genes_listed.apply(
    lambda row: mapping.get(row["species_code"], {}).get(row["identifier"].strip().lower(), -1),
    axis=1,
)
genes_listed["in_current_dataset"] = genes_listed["id"].map(lambda x: x != -1)

# Looking just at the ones that are in the current dataset.
# .copy() makes the subset an independent frame, so the assignment below modifies it
# directly instead of writing into a slice of the full table.
genes = genes_listed[genes_listed["in_current_dataset"]].copy()
# Keep only the first of the IDs found for each gene.
genes["id"] = genes["id"].map(lambda ids: ids[0])
genes

Unnamed: 0,species,species_code,identifier,in_previous_dataset,id,in_current_dataset
0,Maize,zma,GRMZM2G422750,True,863,True
2,Maize,zma,GRMZM2G025832,False,2971,True
4,Maize,zma,GRMZM2G026930,True,149,True
5,Maize,zma,GRMZM2G345717,True,152,True
6,Maize,zma,GRMZM2G165390,True,193,True
7,Maize,zma,GRMZM2G016241,True,194,True
9,Maize,zma,GRMZM5G822829,False,153,True
10,Maize,zma,GRMZM2G005066,True,148,True
11,Maize,zma,GRMZM2G084799,True,1640,True
12,Maize,zma,GRMZM2G701063,True,147,True


In [4]:
# Dictionary of description texts from the dataset; it is indexed by gene ID below.
texts = dataset.get_description_dictionary()
# Spot-check one entry: 153 is the ID listed for GRMZM5G822829 in the genes table above.
texts[153]

"Aleuron layer is colored. Anthers are green. Anthocyanin pigment in pericarp. Aleurone layer mottled. Red anthers. Strong anthocyanin on leaf blade after 4-6 leaf stage. Aleuron layer is colored, leading to a colored kernel. Anthers are green, rather than the normal yellow. Irregular distribution of color among aleurone cells with appropriate anthocyanin genes. Typically, R mottling. Pale pink to red anthers. Cherry pericarp. Colored aleurone. Anthocyanin color with A1 A2 C1 C2 R1. Would be purple if with Pr1, Bz1 and Bz2. Colored leaf. Colored plant. Colored scutellum. Anthocyanin pigment in scutellar tissue and embryo. Distinct from purple plumule which has pigment in the embro axis only. Also distinct from embryonic axis profile/color halo trait. Expression varies between backgrounds. Colorless aleurone. No color in the aleurone, specifically no color due to anthocyanins. Colorless aleurone with pigmented sectors. However, anthocyanin pigmented sectors are present. Colorless roots.

In [10]:
# Prepare dictionaries to hold the resulting arrays.
# The outer key is the species code of the query gene and the inner key is the species
# code of the genes being ranked; every pair starts out with an empty list.
resulting_bin_arrays = defaultdict(dict)
for from_code, to_code in (("zma", "zma"), ("ath", "ath"), ("zma", "ath"), ("ath", "zma")):
    resulting_bin_arrays[from_code][to_code] = []
resulting_bin_arrays

defaultdict(dict,
            {'zma': {'zma': [], 'ath': []}, 'ath': {'ath': [], 'zma': []}})

In [11]:
# The searches within the same species.
# Each gene's description is sent as a free-text query to the search script, and the ranks
# at which the other listed genes of the same species come back are binned and appended to
# resulting_bin_arrays[species_code][species_code].
rank_for_not_found = 100
bins = [0,11,21,31,41,51,rank_for_not_found]
limit = 50
# Stop after this many queries.
# NOTE(review): 13 matches the number of maize genes reported as present in the dataset,
# so this presumably restricts the run to the maize queries -- confirm before changing.
max_queries = 13

# Genes missing from the results are given rank_for_not_found, which has to lie beyond
# every rank the script can return so that they all land in the final bin.
assert rank_for_not_found > limit

# The script writes one CSV per query into this directory.
os.makedirs("outputs", exist_ok=True)

for ctr, gene in enumerate(genes.itertuples(), start=1):
    # Read the row by column name rather than by position.
    species = gene.species
    species_code = gene.species_code
    gene_id = gene.id

    # Quotes are still removed so that the query text is exactly what the earlier
    # shell-quoted command passed to the script.
    text = texts[gene_id].replace("'", "").replace('"', '')

    path = "outputs/output_{}.csv".format(ctr)
    # An argument list avoids building a shell command from the description text, and
    # check=True raises if the script fails instead of silently reading a stale CSV.
    # The call blocks until the script exits, so no sleep is needed before reading.
    subprocess.run(
        ["python", "app_as_script.py",
         "-s", str(species),
         "-t", "text",
         "-q", text,
         "-l", str(limit),
         "-o", path],
        check=True,
    )
    df = pd.read_csv(path)
    id_to_rank = dict(zip(df["id"].values, df["Rank"].values))

    # For within the same species, get rid of the identical gene (always rank 1).
    ids_of_interest = [i for i in genes[genes["species"]==species]["id"].values if i != gene_id]
    ranks = [id_to_rank.get(i, rank_for_not_found) for i in ids_of_interest]
    resulting_bin_arrays[species_code][species_code].append(np.histogram(ranks, bins=bins)[0])

    if ctr == max_queries:
        break

print('done with all queries')

done with all queries


In [23]:
# Summarise the per-query bin counts: for every (from, to) species pair, one row holds
# the per-bin mean and one row the per-bin standard deviation across queries.
summary_stats = (("mean", np.mean), ("stdev", np.std))

output_rows = []
for from_code, to_code in product(["ath","zma"], ["ath","zma"]):
    stacked_counts = np.array(resulting_bin_arrays[from_code][to_code])
    for metric_name, metric_func in summary_stats:
        row = (from_code, to_code, metric_name, metric_func(stacked_counts, axis=0))
        output_rows.append(flatten(row))

names = [
    "from", "to", "metric",
    "bin1", "bin2", "bin3", "bin4", "bin5", "bin6",
]
output_df = pd.DataFrame(output_rows, columns=names)
output_df

Unnamed: 0,from,to,metric,bin1,bin2,bin3,bin4,bin5,bin6
0,ath,ath,mean,,,,,,
1,ath,ath,stdev,,,,,,
2,ath,zma,mean,,,,,,
3,ath,zma,stdev,,,,,,
4,zma,ath,mean,,,,,,
5,zma,ath,stdev,,,,,,
6,zma,zma,mean,5.923077,2.153846,0.923077,0.461538,0.461538,1.923077
7,zma,zma,stdev,2.092534,0.948371,0.916644,0.634324,0.634324,2.973254


In [24]:
# Save the summary table as a CSV file, leaving out the DataFrame index.
output_path = "outputs/anthocyanin_plot_data.csv"
output_df.to_csv(output_path, index=False)

array([[7, 1, 0, 2, 2, 0],
       [8, 1, 0, 2, 1, 0],
       [7, 1, 0, 2, 2, 0],
       [7, 1, 0, 2, 2, 0]])

[7.25 1.75 1.   0.75 0.75 0.5 ]


array([7, 1, 0, 2, 2, 0])