# Creating a Dataset of Phenotype Pairs
The purpose of this notebook is to generate a dataset of pairs of phenotype descriptions, which are composed of either fragments of full sentences separated by semicolons. The reason for creating this dataset of phenotype pairs is so that the similarity between the pairs of phenotypes can be later scored based on some scoring scheme by humans with domain knowledge about these phenotypes or plant biology in general, and then computational approaches can be used to generate similarity scores for each pair as well. Correlation between the scores generated by computational approaches and the similarity scores generated by hand can be used to measure the relative performance of each computational approach. In order to make help ensure that the phenotype pairs are actually drawn from a population of pairs that have a relatively consistent likelihood of each similarity level, the pairs are drawn from phenotype descriptions for genes that either were assigned to the same function group or function in the same biochemical pathway.

In [1]:
import datetime
import nltk
import pandas as pd
import numpy as np
import time
import math
import sys
import gensim
import os
import warnings
from collections import defaultdict
from nltk.corpus import brown
from nltk.tokenize import word_tokenize, sent_tokenize
from gensim.parsing.preprocessing import strip_non_alphanum, stem_text, preprocess_string, remove_stopwords
from gensim.utils import simple_preprocess
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize, sent_tokenize
from itertools import combinations
import random

sys.path.append("../../oats")
from oats.utils.utils import save_to_pickle, load_from_pickle, flatten
from oats.utils.utils import remove_duplicates_retain_order
from oats.biology.dataset import Dataset
from oats.biology.groupings import Groupings
from oats.biology.relationships import ProteinInteractions, AnyInteractions
from oats.annotation.ontology import Ontology

warnings.simplefilter('ignore')
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
nltk.download('punkt', quiet=True)
nltk.download('brown', quiet=True)
nltk.download('averaged_perceptron_tagger', quiet=True)

True

In [2]:
INPUT_DATASET_PATH = "../data/pickles/gene_phenotype_dataset_all_text_and_annotations.pickle"
OUTPUT_DATASET_PATH = "../data/corpus_related_files/phenotype_pairs/unscored.csv"
OUTPUT_PAIR_TUPLES = []
TOTAL_PAIRS_TO_FIND = 100
NUMBER_FROM_PHENOTYPE_SUBSETS = 75
NUMBER_FROM_PATHWAYS = 25
assert TOTAL_PAIRS_TO_FIND == NUMBER_FROM_PHENOTYPE_SUBSETS + NUMBER_FROM_PATHWAYS

random.seed(1)

### Sampling pairs of phenotypes from genes in the same functional subset

In [3]:
# Starting off with the full dataset that is available.
data = load_from_pickle(INPUT_DATASET_PATH)
data.filter_has_description()
data.describe()

Unnamed: 0,species,num_genes,unique_descriptions
0,ath,6364,3813
1,gmx,30,24
2,mtr,37,36
3,osa,92,85
4,sly,70,70
5,zma,1406,811
6,total,7999,4839


In [4]:
# Find the dataset that is relevant for predicting whether two genes share a phenotype
lloyd_subsets_filename = "../data/pickles/groupings_from_lloyd_subsets.pickle"  
groups = load_from_pickle(lloyd_subsets_filename)
id_to_group_ids, group_id_to_ids = groups.get_groupings_for_dataset(data)
group_mapped_ids = [k for (k,v) in id_to_group_ids.items() if len(v)>0]
data.filter_with_ids(group_mapped_ids)
data.describe()

Unnamed: 0,species,num_genes,unique_descriptions
0,ath,2868,2365
1,total,2868,2365


In [5]:
# Adding the pairs of phenotype descriptions to be used for curation.
shared = lambda x,y: len(set(id_to_group_ids[x]).intersection(set(id_to_group_ids[y])))>0
pos_pair_list = [(x,y) for x,y in list(combinations(group_mapped_ids, 2)) if shared(x,y)]
sampling = random.choices(pos_pair_list, k=NUMBER_FROM_PHENOTYPE_SUBSETS)
to_desc = data.get_description_dictionary()
for (x,y) in sampling:
    OUTPUT_PAIR_TUPLES.append((to_desc[x],to_desc[y]))
print(len(OUTPUT_PAIR_TUPLES))

75


### Sampling pairs of phenotypes from genes sharing a biochemical pathway

In [6]:
# Starting off with the full dataset that is available.
data = load_from_pickle(INPUT_DATASET_PATH)
data.filter_has_description()
data.describe()

Unnamed: 0,species,num_genes,unique_descriptions
0,ath,6364,3813
1,gmx,30,24
2,mtr,37,36
3,osa,92,85
4,sly,70,70
5,zma,1406,811
6,total,7999,4839


In [7]:
# Using the KEGG pathways from the saved groupings object.
kegg_pathways_filename = "../data/pickles/groupings_from_kegg_pathways.pickle"                         
kegg_groups = load_from_pickle(kegg_pathways_filename)
id_to_kegg_group_ids, kegg_group_id_to_ids = kegg_groups.get_groupings_for_dataset(data)
# Using the PlantCyc pathways from the saved groupings object.
pmn_pathways_filename = "../data/pickles/groupings_from_pmn_pathways.pickle"  
pmn_groups = load_from_pickle(pmn_pathways_filename)
id_to_pmn_group_ids, pmn_group_id_to_ids = pmn_groups.get_groupings_for_dataset(data)
# Find the dataset that is relevant for predicting whether two genes share a phenotype
group_mapped_ids = []
group_mapped_ids.extend([k for (k,v) in id_to_kegg_group_ids.items() if len(v)>0])
group_mapped_ids.extend([k for (k,v) in id_to_pmn_group_ids.items() if len(v)>0])
group_mapped_ids = list(set(group_mapped_ids))
data.filter_with_ids(group_mapped_ids)
data.describe()

Unnamed: 0,species,num_genes,unique_descriptions
0,ath,1769,1469
1,gmx,1,1
2,mtr,2,2
3,osa,3,3
4,sly,18,18
5,zma,185,160
6,total,1978,1653


In [8]:
# Adding the pairs of phenotype descriptions to be used for curation.
shared_kegg = lambda x,y: len(set(id_to_kegg_group_ids[x]).intersection(set(id_to_kegg_group_ids[y])))>0
shared_pmn = lambda x,y: len(set(id_to_pmn_group_ids[x]).intersection(set(id_to_pmn_group_ids[y])))>0
pos_pair_list = [(x,y) for x,y in list(combinations(group_mapped_ids, 2)) if shared_pmn(x,y)]
sampling = random.choices(pos_pair_list, k=NUMBER_FROM_PATHWAYS)
to_desc = data.get_description_dictionary()
for (x,y) in sampling:
    OUTPUT_PAIR_TUPLES.append((to_desc[x],to_desc[y]))
print(len(OUTPUT_PAIR_TUPLES))

100


### Creating the final phenotype pair dataset and writing to a file

In [9]:
# Write the phenotypes pairs to a file that can then be opened and read and similarity scores can be assigned.
pairs = pd.DataFrame(OUTPUT_PAIR_TUPLES)
pairs.columns = ["Phenotype 1","Phenotype 2"]

# Making sure only periods are used to separate phrases/phenes/sentences, not semicolons.
pairs["Phenotype 1"] = pairs["Phenotype 1"].str.casefold().map(lambda x: x.replace(";","."))
pairs["Phenotype 2"] = pairs["Phenotype 2"].str.casefold().map(lambda x: x.replace(";","."))

# Don't include duplicate sentences or fragments if they are present in the dataset, and limit the number of sentences.
sentence_limit = 4
pairs["Phenotype 1"] = pairs["Phenotype 1"].map(lambda x: " ".join([s for s in remove_duplicates_retain_order(sent_tokenize(x))][0:sentence_limit]))
pairs["Phenotype 2"] = pairs["Phenotype 2"].map(lambda x: " ".join([s for s in remove_duplicates_retain_order(sent_tokenize(x))][0:sentence_limit]))

# Make sure that this dataset of phenotype pairs is the expected size and save it to a csv file.
assert pairs.shape[0] == TOTAL_PAIRS_TO_FIND
pairs.to_csv(OUTPUT_DATASET_PATH, index=False)
pairs.head(20)

Unnamed: 0,Phenotype 1,Phenotype 2
0,abnormal phyllotaxy. delayed leaf growth. few ...,abnormal floral organ margins. abnormal leaf m...
1,anthocyanin accumulation. red seedlings. seedl...,bent leaf margins. dark green leaves. long lea...
2,abnormal growth in the dark. anthocyanin accum...,albino. seedling lethal (inferred from pigment...
3,dwarf. increased branching. thin inflorescence...,dark green cotyledons. dwarf. exaggerated apic...
4,embryo defective-preglobular / globular. embry...,hypersensitive to exogenous and endogenous aba...
5,embryo defective-globular. embryo defective. g...,embryo defective. developmental arrest of muta...
6,abnormal cotyledon vasculature. abnormal pisti...,pale green leaves. short roots. concentration ...
7,resistant to disease.,resistant to tobacco mosaic virus. reduced lev...
8,complete loss of post-illumination chlorophyll...,decreased post-illumination chlorophyll fluore...
9,abnormal leaf morphology. abnormal leaf shape ...,incomplete penetrance of exposed ovules. incom...
