### Making plots to illustrate the results of some queries with the web tool
This notebook takes a file describing a list of genes, along with each gene's species and identifying string. It loads the dataset that is used by the search tool, checks which of those genes are in the dataset, and then subsets the list of genes to include just those. It then uses the descriptions of those genes to query the tool (using the script rather than the streamlit web app, but the results are identical), and keeps track of where the other genes from the list fall in the resulting gene rankings. The output of the notebook is a summary of these results that specifies the mean and standard deviation of the binned ranks for each search.

In [1]:
import datetime
import math
import os
import random
import subprocess
import sys
import time
import warnings
from collections import defaultdict
from itertools import product

import gensim
import nltk
import numpy as np
import pandas as pd
from gensim.parsing.preprocessing import strip_non_alphanum, stem_text, preprocess_string, remove_stopwords
from gensim.utils import simple_preprocess
from nltk.corpus import brown
from nltk.corpus import stopwords
from nltk.probability import FreqDist
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize, sent_tokenize
from statsmodels.sandbox.stats.multicomp import multipletests

# Make the local oats package importable. The membership check keeps the entry from being
# added more than once, including when this cell is re-run in the same kernel.
if "../../oats" not in sys.path:
    sys.path.append("../../oats")
from oats.utils.utils import save_to_pickle, load_from_pickle, flatten, to_hms
from oats.utils.utils import function_wrapper_with_duration, remove_duplicates_retain_order
from oats.biology.dataset import Dataset
from oats.biology.groupings import Groupings
from oats.biology.relationships import ProteinInteractions, AnyInteractions
from oats.annotation.ontology import Ontology
from oats.annotation.annotation import annotate_using_noble_coder, term_enrichment
from oats.distances import pairwise as pw
from oats.nlp.vocabulary import get_overrepresented_tokens, get_vocab_from_tokens
from oats.nlp.vocabulary import reduce_vocab_connected_components, reduce_vocab_linares_pontes, token_enrichment

# Suppress all warnings for the rest of the notebook session.
warnings.simplefilter('ignore')

# Raise the pandas display limits so that wide and long tables are shown in full.
for option_name, option_value in (
    ('display.max_rows', 500),
    ('display.max_columns', 500),
    ('display.width', 1000),
):
    pd.set_option(option_name, option_value)

# Fetch the NLTK resources without progress chatter. The final call is deliberately left as the
# last bare expression of the cell so that its return value remains the displayed cell output.
nltk.download('punkt', quiet=True)
nltk.download('brown', quiet=True)
nltk.download('averaged_perceptron_tagger', quiet=True)

Warming up PyWSD (takes ~10 secs)... took 6.5060601234436035 secs.


True

In [2]:
# Paths to the files that are used for this notebook.
dataset_path = "resources/genes_texts_annots.csv"
# Load the gene dataset from that CSV.
# NOTE(review): keep_ids=True presumably preserves the gene IDs stored in the file instead of
# reassigning them -- confirm against oats.biology.dataset.Dataset.
dataset = Dataset(dataset_path, keep_ids=True)
# Last expression of the cell, so the per-species summary table it returns is displayed.
dataset.describe()

Unnamed: 0,species,unique_gene_identifiers,unique_descriptions
0,ath,5850,3493
1,gmx,30,23
2,mtr,37,36
3,osa,92,85
4,sly,69,69
5,zma,1405,810
6,total,7483,4516


Previously, out of this list, we had 10 of 16 maize genes in the dataset and 16 of 18 Arabidopsis genes. Now we have 13 of 16 maize genes and 18 of 18 Arabidopsis genes.

In [3]:
# Look up every listed gene in the dataset by species code and by its stripped, lowercased
# identifier. Genes that are found get whatever the mapping stores for that name (an indexable
# collection of dataset IDs, see below); genes that are not found get the sentinel -1.
mapping = dataset.get_species_to_name_to_ids_dictionary(include_synonyms=False, lowercase=True)
genes = pd.read_csv("plots/anthocyanin_biosynthesis_genes.csv")
genes["id"] = genes.apply(lambda x: mapping[x["species_code"]].get(x["identifier"].strip().lower(),-1), axis=1)
genes["in_current_dataset"] = genes["id"].map(lambda x: x!=-1)

# Looking just at the ones that are in the current dataset.
# .copy() makes the subset an independent frame, so the column assignment below really lands on
# `genes` rather than on a temporary slice (the earlier chained-indexing assignment that tried to
# do this before filtering only ever modified a throwaway copy and has been removed).
genes = genes[genes["in_current_dataset"]].copy()
# Keep only the first dataset ID recorded for each gene.
genes["id"] = genes["id"].map(lambda x: x[0])
genes

Unnamed: 0,species,species_code,identifier,in_previous_dataset,id,in_current_dataset
0,Maize,zma,GRMZM2G422750,True,1216,True
2,Maize,zma,GRMZM2G025832,False,3761,True
4,Maize,zma,GRMZM2G026930,True,166,True
5,Maize,zma,GRMZM2G345717,True,169,True
6,Maize,zma,GRMZM2G165390,True,218,True
7,Maize,zma,GRMZM2G016241,True,219,True
9,Maize,zma,GRMZM5G822829,False,170,True
10,Maize,zma,GRMZM2G005066,True,165,True
11,Maize,zma,GRMZM2G084799,True,2330,True
12,Maize,zma,GRMZM2G701063,True,164,True


In [4]:
# Grabbing the texts dictionary from the dataset that we can use to grab the descriptions to query.
# The search cells below index it with the gene IDs stored in genes["id"].
texts = dataset.get_description_dictionary()
# Spot-check a single entry (ID 153); as the last expression of the cell its description is displayed.
texts[153]

'Abnormal pectin levels in cell walls. Abnormal xylan levels in cell walls. Abnormal xylan and pectin levels in cell walls.'

In [5]:
# Set up the nested result container, keyed as [query species code][searched species code].
# Every direction starts with its own empty list that the search cells append histograms to.
resulting_bin_arrays = defaultdict(dict)
for query_code, target_code in (("zma", "zma"), ("ath", "ath"), ("zma", "ath"), ("ath", "zma")):
    resulting_bin_arrays[query_code][target_code] = []
resulting_bin_arrays

defaultdict(dict,
            {'zma': {'zma': [], 'ath': []}, 'ath': {'ath': [], 'zma': []}})

In [None]:
# The searches within the same species.
# Every gene's description is used as a text query against its own species, and the ranks at
# which the other listed genes of that species come back are histogrammed into rank bins.
rank_for_not_found = 100
limit = 50
# np.histogram treats every bin as half-open except the last one, which is closed, so these
# edges give ranks 1-10, 11-20, 21-30, 31-40, 41-50, and a final bin that holds the
# rank_for_not_found placeholder.
bins =[0,11,21,31,41,51,rank_for_not_found]
bin_names = [10,20,30,40,50,rank_for_not_found]
assert len(bin_names) == len(bins)-1
# The placeholder rank must lie beyond anything the search can return.
assert rank_for_not_found > limit

for ctr, gene in enumerate(genes.itertuples(), start=1):

    # Columns are read by name so that reordering the columns of `genes` cannot silently
    # shift which value is used.
    species = gene.species
    species_code = gene.species_code
    gene_id = gene.id
    text = texts[gene_id]

    # Quotes are still stripped so that the query string is the same one earlier runs sent
    # (the search script removes them as a preprocessing step anyway).
    text = text.replace("'","")
    text = text.replace('"','')

    path = "plots/outputs/output_{}.csv".format(ctr)
    # The arguments are passed as a list, so no shell is involved and the description needs no
    # shell quoting. check=True raises if the script fails, instead of carrying on and reading
    # an output file left behind by an earlier run. run() returns only after the script has
    # exited, so no extra waiting is needed before reading the file.
    subprocess.run(
        ["python", "app_as_script.py", "-s", species, "-t", "text", "-q", text, "-l", str(limit), "-o", path],
        check=True,
    )
    df = pd.read_csv(path)
    id_to_rank = dict(zip(df["id"].values,df["Rank"].values))

    # For within the same species, get rid of the identical gene (always rank 1).
    ids_of_interest = [i for i in genes[genes["species"]==species]["id"].values if i != gene_id]
    # Genes that are missing from the returned results get the placeholder rank.
    ranks = [id_to_rank.get(i, rank_for_not_found) for i in ids_of_interest]
    resulting_bin_arrays[species_code][species_code].append(np.histogram(ranks, bins=bins)[0])

    print(ranks)
    print("done with {} queries".format(ctr))
print('done with all queries')

In [27]:
# The searches across different species.
# Every gene's description is used as a text query against the *other* species, and the ranks
# at which the listed genes of that other species come back are histogrammed into rank bins.
rank_for_not_found = 100
limit = 50
# np.histogram treats every bin as half-open except the last one, which is closed, so these
# edges give ranks 1-10, 11-20, 21-30, 31-40, 41-50, and a final bin that holds the
# rank_for_not_found placeholder.
bins =[0,11,21,31,41,51,rank_for_not_found]
bin_names = [10,20,30,40,50,rank_for_not_found]
assert len(bin_names) == len(bins)-1
# The placeholder rank must lie beyond anything the search can return.
assert rank_for_not_found > limit

# Lookup tables for switching to the opposite species; built once rather than on every iteration.
opposite_species = {"Arabidopsis":"Maize","Maize":"Arabidopsis"}
opposite_species_code = {"ath":"zma","zma":"ath"}

for ctr, gene in enumerate(genes.itertuples(), start=1):

    # Columns are read by name so that reordering the columns of `genes` cannot silently
    # shift which value is used.
    from_species = gene.species
    from_species_code = gene.species_code
    gene_id = gene.id
    text = texts[gene_id]

    # Do the switch to make the species for the query and IDs of interest the opposite one.
    to_species = opposite_species[from_species]
    to_species_code = opposite_species_code[from_species_code]

    # Quotes are still stripped so that the query string is the same one earlier runs sent
    # (the search script removes them as a preprocessing step anyway).
    text = text.replace("'","")
    text = text.replace('"','')

    path = "plots/outputs/output_{}.csv".format(ctr)
    # The arguments are passed as a list, so no shell is involved and the description needs no
    # shell quoting. check=True raises if the script fails, instead of carrying on and reading
    # an output file left behind by an earlier run. run() returns only after the script has
    # exited, so no extra waiting is needed before reading the file.
    subprocess.run(
        ["python", "app_as_script.py", "-s", to_species, "-t", "text", "-q", text, "-l", str(limit), "-o", path],
        check=True,
    )
    df = pd.read_csv(path)
    id_to_rank = dict(zip(df["id"].values,df["Rank"].values))

    # The IDs of interest are the listed genes of the searched species. The query gene belongs
    # to the other species, so the `!= gene_id` filter is only a safeguard here.
    ids_of_interest = [i for i in genes[genes["species"]==to_species]["id"].values if i != gene_id]
    # Genes that are missing from the returned results get the placeholder rank.
    ranks = [id_to_rank.get(i, rank_for_not_found) for i in ids_of_interest]
    resulting_bin_arrays[from_species_code][to_species_code].append(np.histogram(ranks, bins=bins)[0])

    print(ranks)
    print("done with {} queries".format(ctr))
print('done with all queries')

[12, 100, 100, 100, 100, 100, 100, 100, 100, 100, 19, 100, 40, 100, 100, 100, 100, 100]
done with 1 queries
[25, 100, 100, 100, 100, 100, 19, 100, 100, 100, 100, 100, 28, 100, 100, 100, 100, 100]
done with 2 queries
[100, 100, 10, 100, 11, 100, 100, 100, 13, 100, 100, 12, 100, 100, 100, 100, 100, 100]
done with 3 queries
[100, 100, 1, 100, 2, 100, 100, 16, 4, 13, 100, 3, 100, 100, 100, 100, 34, 100]
done with 4 queries
[25, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 33, 100, 100, 100, 100, 100]
done with 5 queries
[24, 100, 100, 100, 100, 100, 100, 48, 100, 100, 100, 100, 32, 100, 100, 100, 100, 100]
done with 6 queries
[100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 32, 100]
done with 7 queries
[32, 100, 100, 100, 100, 100, 16, 100, 100, 100, 12, 100, 39, 100, 100, 100, 100, 100]
done with 8 queries
[100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100]
done with 9 queries
[100, 100, 100, 100, 100, 100, 1

In [28]:
# Create the output dataframe with the means and standard deviation for each bin and direction.
# For every (from, to) species pair the per-query histograms are stacked into a 2-D array
# (one row per query, one column per bin) and summarised column-wise. The standard deviation
# uses numpy's default ddof=0.
output_rows = []
for (s1,s2) in product(["ath","zma"],["ath","zma"]):
    bin_counts = np.array(resulting_bin_arrays[s1][s2])
    # With no recorded histograms the array is not 2-D and the per-bin indexing below would
    # fail with an opaque error, so fail early with an explanation instead.
    if bin_counts.ndim != 2 or bin_counts.shape[1] != len(bin_names):
        raise ValueError(
            "No usable binned ranks for {} -> {}; run both search cells before building the summary.".format(s1, s2)
        )
    means = np.mean(bin_counts,axis=0)
    std_devs = np.std(bin_counts,axis=0)
    for bin_name, mean, std_dev in zip(bin_names, means, std_devs):
        output_rows.append([s1, s2, bin_name, mean, std_dev])
names = ["from","to","bin","mean","sd"]
output_df = pd.DataFrame(output_rows,columns=names)
output_df

Unnamed: 0,from,to,bin,mean,sd
0,ath,ath,10,4.944444,2.914947
1,ath,ath,20,3.611111,2.452638
2,ath,ath,30,1.222222,1.717736
3,ath,ath,40,1.055556,2.460177
4,ath,ath,50,0.333333,0.816497
5,ath,ath,100,5.833333,3.919325
6,ath,zma,10,1.222222,2.015373
7,ath,zma,20,0.111111,0.31427
8,ath,zma,30,0.222222,0.711458
9,ath,zma,40,0.222222,0.53287


In [29]:
# Write the summary dataframe (one row per from/to direction and rank bin) to a CSV file.
output_path = "plots/anthocyanin_plot_data.csv"
# index=False leaves the dataframe's row index out of the written file.
output_df.to_csv(output_path, index=False)