In [1]:
import datetime
import nltk
import pandas as pd
import numpy as np
import time
import math
import sys
import gensim
import os
import warnings
from collections import defaultdict
from nltk.corpus import brown
from nltk.tokenize import word_tokenize, sent_tokenize
from gensim.parsing.preprocessing import strip_non_alphanum, stem_text, preprocess_string, remove_stopwords
from gensim.utils import simple_preprocess
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize, sent_tokenize
from itertools import combinations
import random

sys.path.append("../../oats")
from oats.utils.utils import save_to_pickle, load_from_pickle, merge_list_dicts, flatten, to_hms
from oats.biology.dataset import Dataset
from oats.biology.groupings import Groupings
from oats.biology.relationships import ProteinInteractions, AnyInteractions
from oats.annotation.ontology import Ontology

warnings.simplefilter('ignore')
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
nltk.download('punkt', quiet=True)
nltk.download('brown', quiet=True)
nltk.download('averaged_perceptron_tagger', quiet=True)

True

In [2]:
DATASETS_DIR = "../data/pickles/"
INPUT_DATASET_PATH = "../data/pickles/gene_phenotype_dataset_all_text_and_annotations.pickle"
OUTPUT_TABLE = defaultdict(dict)
OUTPUT_PAIR_TUPLES = []

### Dataset that can be used to predict whether two genes share a phenotype

In [3]:
# Starting off with the full dataset that is available.
data = load_from_pickle(INPUT_DATASET_PATH)
data.filter_has_description()
print(data.describe())
# Find and save the dataset that is relevant for predicting whether two genes share a phenotype
lloyd_subsets_filename = "../data/pickles/groupings_from_lloyd_subsets.pickle"  
groups = load_from_pickle(lloyd_subsets_filename)
id_to_group_ids, group_id_to_ids = groups.get_groupings_for_dataset(data)
group_mapped_ids = [k for (k,v) in id_to_group_ids.items() if len(v)>0]
data.filter_with_ids(group_mapped_ids)
print("\n",data.describe())
save_to_pickle(obj=data, path=os.path.join(DATASETS_DIR, "gene_phenotype_dataset_for_predicting_phenotypes.pickle"))

  species  num_genes  unique_descriptions
0     ath       6364                 3813
1     gmx         30                   24
2     mtr         37                   36
3     osa         92                   85
4     sly         70                   70
5     zma       1406                  811
6   total       7999                 4839

   species  num_genes  unique_descriptions
0     ath       2868                 2365
1   total       2868                 2365


In [4]:
# Update an output table to show the size of the datasets and the class sizes.
shared = lambda x,y: len(set(id_to_group_ids[x]).intersection(set(id_to_group_ids[y])))>0
pair_list = [shared(x,y) for x,y in list(combinations(group_mapped_ids, 2))]
OUTPUT_TABLE["phenotype"]["relevant_genes"] = len(group_mapped_ids)
OUTPUT_TABLE["phenotype"]["positive"] = pair_list.count(True)
OUTPUT_TABLE["phenotype"]["negative"] = pair_list.count(False)
OUTPUT_TABLE["phenotype"]["class_ratio"] = "{:0.4f}".format(pair_list.count(True)/pair_list.count(False))

In [5]:
# Adding the pairs of phenotype descriptions to be used for curation.
pos_pair_list = [(x,y) for x,y in list(combinations(group_mapped_ids, 2)) if shared(x,y)]
sampling = random.choices(pos_pair_list, k=50)
to_desc = data.get_description_dictionary()
for (x,y) in sampling:
    OUTPUT_PAIR_TUPLES.append((to_desc[x],to_desc[y]))

### Dataset that can be used to predict whether two genes share a biochemical pathway

In [6]:
# Starting off with the full dataset that is available.
data = load_from_pickle(INPUT_DATASET_PATH)
data.filter_has_description()
print(data.describe())
# Using the KEGG pathways from the saved groupings object.
kegg_pathways_filename = "../data/pickles/groupings_from_kegg_pathways.pickle"                         
kegg_groups = load_from_pickle(kegg_pathways_filename)
id_to_kegg_group_ids, kegg_group_id_to_ids = kegg_groups.get_groupings_for_dataset(data)
# Using the PlantCyc pathways from the saved groupings object.
pmn_pathways_filename = "../data/pickles/groupings_from_pmn_pathways.pickle"  
pmn_groups = load_from_pickle(pmn_pathways_filename)
id_to_pmn_group_ids, pmn_group_id_to_ids = pmn_groups.get_groupings_for_dataset(data)
# Find and save the dataset that is relevant for predicting whether two genes share a phenotype
group_mapped_ids = []
group_mapped_ids.extend([k for (k,v) in id_to_kegg_group_ids.items() if len(v)>0])
group_mapped_ids.extend([k for (k,v) in id_to_pmn_group_ids.items() if len(v)>0])
group_mapped_ids = list(set(group_mapped_ids))
data.filter_with_ids(group_mapped_ids)
print("\n",data.describe())
save_to_pickle(obj=data, path=os.path.join(DATASETS_DIR, "gene_phenotype_dataset_for_predicting_pathways.pickle"))

  species  num_genes  unique_descriptions
0     ath       6364                 3813
1     gmx         30                   24
2     mtr         37                   36
3     osa         92                   85
4     sly         70                   70
5     zma       1406                  811
6   total       7999                 4839

   species  num_genes  unique_descriptions
0     ath       1769                 1469
1     gmx          1                    1
2     mtr          2                    2
3     osa          3                    3
4     sly         18                   18
5     zma        185                  160
6   total       1978                 1653


In [7]:
# Update an output table to show the size of the datasets and the class sizes.
shared_kegg = lambda x,y: len(set(id_to_kegg_group_ids[x]).intersection(set(id_to_kegg_group_ids[y])))>0
shared_pmn = lambda x,y: len(set(id_to_pmn_group_ids[x]).intersection(set(id_to_pmn_group_ids[y])))>0
pair_list = [(shared_kegg(x,y) or shared_pmn(x,y)) for x,y in list(combinations(group_mapped_ids, 2))]
OUTPUT_TABLE["pathways"]["relevant_genes"] = len(group_mapped_ids)
OUTPUT_TABLE["pathways"]["positive"] = pair_list.count(True)
OUTPUT_TABLE["pathways"]["negative"] = pair_list.count(False)
OUTPUT_TABLE["pathways"]["class_ratio"] = "{:0.4f}".format(pair_list.count(True)/pair_list.count(False))

In [8]:
# Adding the pairs of phenotype descriptions to be used for curation.
pos_pair_list = [(x,y) for x,y in list(combinations(group_mapped_ids, 2)) if shared(x,y)]
sampling = random.choices(pos_pair_list, k=50)
to_desc = data.get_description_dictionary()
for (x,y) in sampling:
    OUTPUT_PAIR_TUPLES.append((to_desc[x],to_desc[y]))

### Dataset that can be used to predict whether a protein-protein interaction exists

In [9]:
# Starting off with the full dataset that is available.
data = load_from_pickle(INPUT_DATASET_PATH)
data.filter_has_description()
print(data.describe())

# Filter the dataset based on whether or not the genes were successfully mapped to an interaction.
# Reduce size of the dataset by removing genes not mentioned in the STRING.
naming_file = "../data/group_related_files/string/all_organisms.name_2_string.tsv"
interaction_files = [
    "../data/group_related_files/string/3702.protein.links.detailed.v11.0.txt", # Arabidopsis thaliana
    "../data/group_related_files/string/4577.protein.links.detailed.v11.0.txt", # maize
    "../data/group_related_files/string/4530.protein.links.detailed.v11.0.txt", # tomato 
    "../data/group_related_files/string/4081.protein.links.detailed.v11.0.txt", # medicago
    "../data/group_related_files/string/3880.protein.links.detailed.v11.0.txt", # rice 
    "../data/group_related_files/string/3847.protein.links.detailed.v11.0.txt", # soybean
]
genes = data.get_gene_dictionary()
string_data = ProteinInteractions(genes, naming_file, *interaction_files)
data.filter_with_ids(string_data.ids)
print("\n",data.describe())
save_to_pickle(obj=data, path=os.path.join(DATASETS_DIR, "gene_phenotype_dataset_for_predicting_interactions.pickle"))

  species  num_genes  unique_descriptions
0     ath       6364                 3813
1     gmx         30                   24
2     mtr         37                   36
3     osa         92                   85
4     sly         70                   70
5     zma       1406                  811
6   total       7999                 4839

   species  num_genes  unique_descriptions
0     ath       3674                 2950
1     mtr         10                   10
2     osa         44                   42
3     sly          3                    3
4     zma        160                  147
5   total       3891                 3152


In [10]:
# Note that the ProteinInteraction class guarantees the dataframe contains the reverse of each edge as well.
#interaction = lambda x,y: ((string_data.df["from"] == x) & (string_data.df["to"] == y)).any()
#interaction_strings = set(["{}:{}".format(int(i),int(j)) for i,j in zip(string_data.df["from"].values, string_data.df["to"].values)])
#has_interaction = lambda x,y: "{}:{}".format(int(x),int(y)) in interaction_strings

In [11]:
# Update an output table to show the size of the datasets and the class sizes.
num_ids = len(data.get_ids())
num_total_pairs = ((num_ids**2)/2)-num_ids
num_positive = string_data.df.shape[0]/2
num_negative = num_total_pairs - num_positive
OUTPUT_TABLE["interactions"]["relevant_genes"] = num_ids
OUTPUT_TABLE["interactions"]["positive"] = num_positive
OUTPUT_TABLE["interactions"]["negative"] = num_negative
OUTPUT_TABLE["interactions"]["class_ratio"] = "{:0.4f}".format(num_positive/num_negative)

### Dataset that can be used to predict whether two genes are orthologous

In [12]:
# Starting off with the full dataset that is available.
data = load_from_pickle(INPUT_DATASET_PATH)
data.filter_has_description()
print(data.describe())

# Filter the dataset based on whether or not the genes were successfully mapped to an interaction.
# Reduce size of the dataset by removing genes not mentioned in the STRING.
ortholog_file_path = "../data/orthology_related_files/pantherdb/PlantGenomeOrthologs_IRB_Modified.txt"
ortholog_edgelist = AnyInteractions(data.get_name_to_id_dictionary(), ortholog_file_path)
data.filter_with_ids(ortholog_edgelist.ids)
print("\n", data.describe())
save_to_pickle(obj=data, path=os.path.join(DATASETS_DIR, "gene_phenotype_dataset_for_predicting_orthologs.pickle"))

  species  num_genes  unique_descriptions
0     ath       6364                 3813
1     gmx         30                   24
2     mtr         37                   36
3     osa         92                   85
4     sly         70                   70
5     zma       1406                  811
6   total       7999                 4839

   species  num_genes  unique_descriptions
0     ath        350                  303
1     osa         86                   80
2     sly          7                    7
3     zma        443                  337
4   total        886                  727


In [13]:
# Note that the AnyInteraction class guarantees the dataframe contains the reverse of each edge as well.
#ortholog_strings = set(["{}:{}".format(int(i),int(j)) for i,j in zip(ortholog_edgelist.df["from"].values, ortholog_edgelist.df["to"].values)])
#are_orthologs = lambda x,y: "{}:{}".format(int(x),int(y)) in ortholog_strings
#pair_list = [are_orthologs(x,y) for x,y in list(combinations(data.get_ids(), 2))]
# Update an output table to show the size of the datasets and the class sizes.
#OUTPUT_TABLE["orthologs"]["positive"] = pair_list.count(True)
#OUTPUT_TABLE["orthologs"]["negative"] = pair_list.count(False)
#OUTPUT_TABLE["orthologs"]["class_ratio"] = "{:0.4f}".format(pair_list.count(True)/pair_list.count(False))

In [14]:
# Update an output table to show the size of the datasets and the class sizes, without enumerating.
num_ids = len(data.get_ids())
num_total_pairs = ((num_ids**2)/2)-num_ids
num_positive = ortholog_edgelist.df.shape[0]/2
num_negative = num_total_pairs - num_positive
OUTPUT_TABLE["orthologs"]["relevant_genes"] = num_ids
OUTPUT_TABLE["orthologs"]["positive"] = num_positive
OUTPUT_TABLE["orthologs"]["negative"] = num_negative
OUTPUT_TABLE["orthologs"]["class_ratio"] = "{:0.4f}".format(num_positive/num_negative)

### Summarzing table of the classes for each prediction problem.

In [23]:
summary_df = pd.DataFrame(OUTPUT_TABLE).transpose()
summary_df["pos_fraction"] = summary_df["positive"]/(((summary_df["relevant_genes"]**2)-summary_df["relevant_genes"])/2)
summary_df["neg_fraction"] = summary_df["negative"]/(((summary_df["relevant_genes"]**2)-summary_df["relevant_genes"])/2)
summary_df

Unnamed: 0,relevant_genes,positive,negative,class_ratio,pos_fraction,neg_fraction
phenotype,2868,458461,3652817.0,0.1255,0.111513,0.888487
pathways,1978,52530,1902723.0,0.0276,0.0268661,0.973134
interactions,3891,450298,7115750.0,0.0633,0.0595004,0.940243
orthologs,886,77,391535.0,0.0002,0.000196401,0.998674


In [16]:
# Methodology
# These are phenotype pairs drawn from the 

pairs = pd.DataFrame(OUTPUT_PAIR_TUPLES)
pairs.columns = ["Phenotype 1","Phenotype 2"]
pairs["Phenotype 1"] = pairs["Phenotype 1"].str.casefold()
pairs["Phenotype 2"] = pairs["Phenotype 2"].str.casefold()
pairs["Phenotype 1"] = pairs["Phenotype 1"].map(lambda x: "; ".join([s[:-1] for s in sent_tokenize(x)[0:2]]))
pairs["Phenotype 2"] = pairs["Phenotype 2"].map(lambda x: "; ".join([s[:-1] for s in sent_tokenize(x)[0:2]]))
pairs.to_csv("../data/corpus_related_files/phenotype_pairs/unscored.csv",index=False)
pairs

Unnamed: 0,Phenotype 1,Phenotype 2
0,dwarf; late flowering,abnormal root hairs; increased trichome density
1,abnormal inflorescence stem gravitropism; few ...,complete lack of recognizable stomata; cluster...
2,"early flowering; early flowering, normal statu...",abnormal leaf morphology; early flowering
3,chlorotic cotyledons; no homozygous mutant pla...,increased shoot branching; increased hypocotyl...
4,early flowering; long hypocotyl under blue light,this mutant has delayed flowering compared to ...
5,embryo defective-transition; embryo defective;...,embryo defective-transition; pale green cotyle...
6,germination insensitive to aba,increased seed dormancy; sensitive to aba
7,intermediate salt sensitive phenotype; sensiti...,plants grown on agar plates have roots that sk...
8,embryo defective-preglobular; embryo defective...,embryo defective-preglobular; embryo defective...
9,embryo defective-globular; embryo defective; g...,complete female gametophyte defective; embryo ...
