In [1]:
import sys
import pandas as pd
import numpy as np
import glob
import os
from itertools import chain
from collections import defaultdict
import pandas as pd
import itertools
import networkx as nx
from gensim.parsing.preprocessing import strip_non_alphanum, stem_text, preprocess_string
from gensim.parsing.preprocessing import remove_stopwords, strip_punctuation
from nltk.corpus import brown, stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
import warnings
warnings.simplefilter('ignore')

sys.path.append("../../oats")
from oats.nlp.preprocess import concatenate_with_delim
from oats.annotation.ontology import Ontology
from oats.annotation.annotation import annotate_using_noble_coder

In [2]:
# File names of the reshaped datasets that should be combined.
# Each name is joined onto ../reshaped_data when the files are read in.
paths = [
    "oellrich_walls_phene_descriptions.csv", 
    "oellrich_walls_phenotype_descriptions.csv", 
    "oellrich_walls_annotations.csv",
    "sgn_phenotype_descriptions.csv", 
    "maizegdb_phenotype_descriptions.csv", 
    "maizegdb_curated_go_annotations.csv",
    "tair_phenotype_descriptions.csv",
    "tair_curated_go_annotations.csv", 
    "tair_curated_po_annotations.csv",
    "planteome_curated_annotations.csv"
]

In [3]:
# Columns that every reshaped input file must contain (order does not matter).
expected_columns = [
 "species_name",
 "species_code",
 "unique_gene_identifiers", 
 "other_gene_identifiers", 
 "gene_models", 
 "text_unprocessed", 
 "annotations", 
 "reference_name",
 "reference_link",
 "reference_file"]

# Read each file, verify its columns, then stack everything into one new dataframe.
dfs_to_be_stacked = []
for path in paths:
    df = pd.read_csv(os.path.join("..","reshaped_data",path))
    # Report which file is wrong and how, rather than failing with a bare AssertionError.
    found_columns = set(df.columns)
    missing_columns = sorted(set(expected_columns) - found_columns)
    unexpected_columns = sorted(found_columns - set(expected_columns))
    assert not missing_columns and not unexpected_columns, (
        "{}: missing columns {}, unexpected columns {}".format(path, missing_columns, unexpected_columns)
    )
    dfs_to_be_stacked.append(df)
df = pd.concat(dfs_to_be_stacked, ignore_index=True)
# Drop the reference to the per-file frames now that they have been stacked.
dfs_to_be_stacked = None
df.head()

Unnamed: 0,species_name,species_code,unique_gene_identifiers,other_gene_identifiers,gene_models,text_unprocessed,annotations,reference_name,reference_link,reference_file
0,Zea mays ssp mays,zma,ns1|GRMZM2G069028,narrow sheath1,GRMZM2G069028,Plants (stems) do not elongate in response to...,,"Oellrich, Walls et al., 2015",https://plantmethods.biomedcentral.com/article...,13007_2015_53_MOESM1_ESM.csv
1,Zea mays ssp mays,zma,ns2|NM_001111772.1,narrow sheath2,,Plants (stems) do not elongate in response to...,,"Oellrich, Walls et al., 2015",https://plantmethods.biomedcentral.com/article...,13007_2015_53_MOESM1_ESM.csv
2,Zea mays ssp mays,zma,ns1|GRMZM2G069028,narrow sheath1,GRMZM2G069028,Plants are reduced in height,,"Oellrich, Walls et al., 2015",https://plantmethods.biomedcentral.com/article...,13007_2015_53_MOESM1_ESM.csv
3,Zea mays ssp mays,zma,ns1|GRMZM2G069028,narrow sheath1,GRMZM2G069028,extremely shortened internodes,,"Oellrich, Walls et al., 2015",https://plantmethods.biomedcentral.com/article...,13007_2015_53_MOESM1_ESM.csv
4,Zea mays ssp mays,zma,ns2|NM_001111772.1,narrow sheath2,,Plants are reduced in height,,"Oellrich, Walls et al., 2015",https://plantmethods.biomedcentral.com/article...,13007_2015_53_MOESM1_ESM.csv


In [4]:
# All the information in the gene identifier columns is needed to create internal gene IDs.
# Keep the current row position as an "old_id" column, one value per existing row.
df = df.reset_index().rename(columns={"index": "old_id"})

# Edges for the gene graph: each one links a row (by its old row ID) to one of
# the unique gene identifier strings listed on that row.
def generate_edges(row,case_sensitive):
    """Return the graph edges contributed by a single row.

    row: mapping with "old_id", "species_code" and a "|"-delimited
        "unique_gene_identifiers" string.
    case_sensitive: when false, the identifiers are lowercased before splitting.

    Returns a list of (str(old_id), "<species_code>[SEP]<identifier>") tuples,
    one per identifier, in the order they appear in the row.
    """
    identifiers = row["unique_gene_identifiers"]
    if not case_sensitive:
        identifiers = identifiers.lower()
    row_node = str(row["old_id"])
    # Prefixing with the species code keeps identical names from different species apart.
    return [
        (row_node, "{}[SEP]{}".format(row["species_code"], identifier))
        for identifier in identifiers.split("|")
    ]

# Build an undirected graph from the row-to-identifier edges.
# Rows that share any identifier end up in the same connected component.
case_sensitive=False
g = nx.Graph()
edges = df.apply(generate_edges, case_sensitive=case_sensitive, axis=1)
edges = list(chain.from_iterable(edges.values))
g.add_edges_from(edges)

# Number the connected components and map every node (row IDs and identifier
# strings alike) to the number of the component that contains it.
node_to_component = {
    node: component_number
    for component_number, node_set in enumerate(nx.connected_components(g))
    for node in node_set
}

# The connected component number now serves as the gene ID.
# Row nodes were added to the graph as strings, so look them up as strings.
df["old_id"] = df["old_id"].map(str)
df["_gene_id"] = df["old_id"].map(node_to_component)
df.head()

Unnamed: 0,old_id,species_name,species_code,unique_gene_identifiers,other_gene_identifiers,gene_models,text_unprocessed,annotations,reference_name,reference_link,reference_file,_gene_id
0,0,Zea mays ssp mays,zma,ns1|GRMZM2G069028,narrow sheath1,GRMZM2G069028,Plants (stems) do not elongate in response to...,,"Oellrich, Walls et al., 2015",https://plantmethods.biomedcentral.com/article...,13007_2015_53_MOESM1_ESM.csv,0
1,1,Zea mays ssp mays,zma,ns2|NM_001111772.1,narrow sheath2,,Plants (stems) do not elongate in response to...,,"Oellrich, Walls et al., 2015",https://plantmethods.biomedcentral.com/article...,13007_2015_53_MOESM1_ESM.csv,1
2,2,Zea mays ssp mays,zma,ns1|GRMZM2G069028,narrow sheath1,GRMZM2G069028,Plants are reduced in height,,"Oellrich, Walls et al., 2015",https://plantmethods.biomedcentral.com/article...,13007_2015_53_MOESM1_ESM.csv,0
3,3,Zea mays ssp mays,zma,ns1|GRMZM2G069028,narrow sheath1,GRMZM2G069028,extremely shortened internodes,,"Oellrich, Walls et al., 2015",https://plantmethods.biomedcentral.com/article...,13007_2015_53_MOESM1_ESM.csv,0
4,4,Zea mays ssp mays,zma,ns2|NM_001111772.1,narrow sheath2,,Plants are reduced in height,,"Oellrich, Walls et al., 2015",https://plantmethods.biomedcentral.com/article...,13007_2015_53_MOESM1_ESM.csv,1


In [5]:
df._gene_id.value_counts()

23       54676
2621     14272
170       1425
2804       781
2792       762
         ...  
30028        1
30347        1
4078         1
4799         1
4422         1
Name: _gene_id, Length: 61860, dtype: int64

In [6]:
# Now the dataset looks clean, but the gene columns don't yet reflect all the information
# that was used in the network creation step. For example, we want all the gene identifiers
# found on any row to be present everywhere in the dataset for that given line.

In [7]:
# For every gene ID, pool the values of each gene-name column across all of its
# rows into one "|"-delimited string (via concatenate_with_delim).
gene_name_columns = ("unique_gene_identifiers", "other_gene_identifiers", "gene_models")
agg_df = df.groupby("_gene_id").agg({
    column: (lambda values: concatenate_with_delim("|", values))
    for column in gene_name_columns
})

In [8]:
# Cleans up the list of other gene identifiers after merging.
# Anything already listed as a unique gene identifier is removed from the other identifiers.
# This can happen after merging when one entry was unsure whether a string was a
# unique identifier but some other entry confirms that it is.
def remove_duplicate_names(row):
    """Return the row's other gene identifiers without those that are unique identifiers.

    row: mapping with "|"-delimited "unique_gene_identifiers" and
        "other_gene_identifiers" strings.

    Returns the remaining identifiers, in their original order, combined with
    concatenate_with_delim using "|".
    """
    confirmed_names = row["unique_gene_identifiers"].split("|")
    remaining_synonyms = []
    for synonym in row["other_gene_identifiers"].split("|"):
        if synonym in confirmed_names:
            continue
        remaining_synonyms.append(synonym)
    return concatenate_with_delim("|", remaining_synonyms)


# Cleans up the list of unique gene identifiers after merging.
# The original order is kept, except that gene models are moved to the end.
def reorder_unique_gene_identifers(row):
    """Return the row's unique gene identifiers with the gene models placed last.

    row: mapping with "|"-delimited "unique_gene_identifiers" and
        "gene_models" strings.

    Returns the identifiers that are not gene models followed by every entry of
    the gene models column, combined with concatenate_with_delim using "|".
    """
    identifiers = row["unique_gene_identifiers"].split("|")
    models = row["gene_models"].split("|")
    non_model_identifiers = [name for name in identifiers if name not in models]
    # NOTE(review): every gene model is appended, including any that were not
    # among the unique identifiers to begin with — confirm this is intended.
    return concatenate_with_delim("|", non_model_identifiers + models)

  
# Apply both clean-up steps row by row; the second step sees the result of the first.
agg_df["other_gene_identifiers"] = agg_df.apply(remove_duplicate_names, axis=1)
agg_df["unique_gene_identifiers"] = agg_df.apply(reorder_unique_gene_identifers, axis=1)

In [9]:
# Columns carried over unchanged from the row-level dataframe; the three gene
# name columns are replaced by their per-gene aggregated versions from agg_df.
cols_to_retain_from_old_df = [
    "_gene_id",
    "species_name",
    "species_code",
    "text_unprocessed",
    "annotations",
    "reference_name",
    "reference_link",
    "reference_file",
]
new_df = pd.merge(df[cols_to_retain_from_old_df], agg_df, how="left", on="_gene_id")
new_df.head(10)

Unnamed: 0,_gene_id,species_name,species_code,text_unprocessed,annotations,reference_name,reference_link,reference_file,unique_gene_identifiers,other_gene_identifiers,gene_models
0,0,Zea mays ssp mays,zma,Plants (stems) do not elongate in response to...,,"Oellrich, Walls et al., 2015",https://plantmethods.biomedcentral.com/article...,13007_2015_53_MOESM1_ESM.csv,ns1|ns1-R|uniprot=Q70UV1|ncbi=542051|narrow sh...,HB-type transcription factor|prs|WUSCHEL-relat...,GRMZM2G069028|Zm00001d004889
1,1,Zea mays ssp mays,zma,Plants (stems) do not elongate in response to...,,"Oellrich, Walls et al., 2015",https://plantmethods.biomedcentral.com/article...,13007_2015_53_MOESM1_ESM.csv,ns2|NM_001111772.1|ns2-R|uniprot=Q6S3I3|ncbi=5...,WUSCHEL-related homeobox 3B,Zm00001d052598
2,0,Zea mays ssp mays,zma,Plants are reduced in height,,"Oellrich, Walls et al., 2015",https://plantmethods.biomedcentral.com/article...,13007_2015_53_MOESM1_ESM.csv,ns1|ns1-R|uniprot=Q70UV1|ncbi=542051|narrow sh...,HB-type transcription factor|prs|WUSCHEL-relat...,GRMZM2G069028|Zm00001d004889
3,0,Zea mays ssp mays,zma,extremely shortened internodes,,"Oellrich, Walls et al., 2015",https://plantmethods.biomedcentral.com/article...,13007_2015_53_MOESM1_ESM.csv,ns1|ns1-R|uniprot=Q70UV1|ncbi=542051|narrow sh...,HB-type transcription factor|prs|WUSCHEL-relat...,GRMZM2G069028|Zm00001d004889
4,1,Zea mays ssp mays,zma,Plants are reduced in height,,"Oellrich, Walls et al., 2015",https://plantmethods.biomedcentral.com/article...,13007_2015_53_MOESM1_ESM.csv,ns2|NM_001111772.1|ns2-R|uniprot=Q6S3I3|ncbi=5...,WUSCHEL-related homeobox 3B,Zm00001d052598
5,1,Zea mays ssp mays,zma,extremely shortened internodes,,"Oellrich, Walls et al., 2015",https://plantmethods.biomedcentral.com/article...,13007_2015_53_MOESM1_ESM.csv,ns2|NM_001111772.1|ns2-R|uniprot=Q6S3I3|ncbi=5...,WUSCHEL-related homeobox 3B,Zm00001d052598
6,2,Arabidopsis thaliana,ath,50% defective seeds,,"Oellrich, Walls et al., 2015",https://plantmethods.biomedcentral.com/article...,13007_2015_53_MOESM1_ESM.csv,UBP26|At3g49600|AT3G49600,Ubiquitin-Specific Protease|SUP32|ATUBP26|ubiq...,At3g49600|AT3G49600
7,2,Arabidopsis thaliana,ath,Low penetrance of endosperm development withou...,,"Oellrich, Walls et al., 2015",https://plantmethods.biomedcentral.com/article...,13007_2015_53_MOESM1_ESM.csv,UBP26|At3g49600|AT3G49600,Ubiquitin-Specific Protease|SUP32|ATUBP26|ubiq...,At3g49600|AT3G49600
8,2,Arabidopsis thaliana,ath,Reduced fertility,,"Oellrich, Walls et al., 2015",https://plantmethods.biomedcentral.com/article...,13007_2015_53_MOESM1_ESM.csv,UBP26|At3g49600|AT3G49600,Ubiquitin-Specific Protease|SUP32|ATUBP26|ubiq...,At3g49600|AT3G49600
9,2,Arabidopsis thaliana,ath,Shriveled seeds,,"Oellrich, Walls et al., 2015",https://plantmethods.biomedcentral.com/article...,13007_2015_53_MOESM1_ESM.csv,UBP26|At3g49600|AT3G49600,Ubiquitin-Specific Protease|SUP32|ATUBP26|ubiq...,At3g49600|AT3G49600


In [10]:
new_df.shape

(942592, 11)

In [11]:
df.shape

(942592, 12)

In [12]:
new_df["text_unprocessed"].values

array([' Plants (stems) do not elongate in response to gibberellins.',
       ' Plants (stems) do not elongate in response to gibberellins.',
       'Plants are reduced in height', ..., nan, nan, nan], dtype=object)

In [13]:
new_df["text_unprocessed"].sample(5).values

array([nan, nan, nan, nan, nan], dtype=object)

In [14]:
def _mark_sentences(text):
    """Split text into sentences and prefix each one with a "[SENT]" marker.

    Semicolons are replaced by periods before the text is passed to sent_tokenize.
    The marked sentences are joined back together with single spaces.
    """
    sentences = sent_tokenize(text.replace(";","."))
    return " ".join("[SENT] {}".format(sentence) for sentence in sentences)


# Rows without text are skipped and stay missing.
new_df["text_tokenized_sents"] = new_df["text_unprocessed"].map(_mark_sentences, na_action="ignore")

In [15]:
# Sanity check: split the marked text back apart on the "[SENT]" marker and look at a few rows.
# The "test" column is scratch output only; it is not listed in final_column_order.
new_df["test"] = new_df["text_tokenized_sents"].map(lambda x: x.split("[SENT]"), na_action="ignore")
new_df["test"].sample(5).values

array([nan, nan, nan, nan, nan], dtype=object)

In [16]:
# Full preprocessing of sentence-delimited text.
def preprocess_sentences_full(text, sentence_delimiter):
    """Run gensim's preprocess_string on every sentence of a delimited string.

    text: one string whose sentences are separated by sentence_delimiter.
    sentence_delimiter: the marker to split on and to re-insert between sentences.

    Each sentence is replaced by the space-joined tokens that preprocess_string
    returns with its default filters (per the original note: stemming, lowercasing,
    and removal of punctuation and stopwords). The sentences are then re-joined
    with the delimiter surrounded by single spaces, and the result is stripped.
    """
    separator = " {} ".format(sentence_delimiter)
    processed_sentences = (
        " ".join(preprocess_string(sentence))
        for sentence in text.split(sentence_delimiter)
    )
    return separator.join(processed_sentences).strip()


# Light preprocessing of sentence-delimited text.
def preprocess_sentences_partial(text, sentence_delimiter):
    """Tokenize every sentence of a delimited string without altering the tokens.

    text: one string whose sentences are separated by sentence_delimiter.
    sentence_delimiter: the marker to split on and to re-insert between sentences.

    Each sentence is replaced by its word_tokenize tokens joined with single
    spaces. The sentences are then re-joined with the delimiter surrounded by
    single spaces, and the result is stripped.
    """
    separator = " {} ".format(sentence_delimiter)
    tokenized_sentences = (
        " ".join(word_tokenize(sentence))
        for sentence in text.split(sentence_delimiter)
    )
    return separator.join(tokenized_sentences).strip()


    
# Marker that separates sentences in the "text_tokenized_sents" column.
SENT_DELIMITER = "[SENT]"

# Derive the stemmed and the word-tokenized columns from the sentence-marked text.
# Rows without text are skipped and stay missing.
for target_column, preprocess in (
    ("text_tokenized_stems", preprocess_sentences_full),
    ("text_tokenized_words", preprocess_sentences_partial),
):
    new_df[target_column] = new_df["text_tokenized_sents"].map(
        lambda text, preprocess=preprocess: preprocess(text, SENT_DELIMITER),
        na_action="ignore",
    )

new_df[["text_tokenized_stems","text_tokenized_words"]].sample(5).values

array([[nan, nan],
       [nan, nan],
       [nan, nan],
       [nan, nan],
       [nan, nan]], dtype=object)

In [17]:
# Columns, in order, that make up the combined dataset.
# Columns of new_df that are not listed here are left out when it is subset by this list.
final_column_order = [
    "_gene_id",
    "species_name",
    "species_code",
    "unique_gene_identifiers",
    "other_gene_identifiers",
    "gene_models",
    "annotations",
    "text_unprocessed",
    "text_tokenized_sents",
    "text_tokenized_words",
    "text_tokenized_stems",
    "reference_name",
    "reference_link",
    "reference_file"
]

In [18]:
# Keep only the final columns, order the rows by gene ID, and drop rows that are
# exact duplicates across all of those columns (the first occurrence is kept).
df = (
    new_df[final_column_order]
    .sort_values(by="_gene_id", ascending=True, ignore_index=True)
    .drop_duplicates(keep="first", ignore_index=True)
)
df.head(100)

Unnamed: 0,_gene_id,species_name,species_code,unique_gene_identifiers,other_gene_identifiers,gene_models,annotations,text_unprocessed,text_tokenized_sents,text_tokenized_words,text_tokenized_stems,reference_name,reference_link,reference_file
0,0,Zea mays ssp mays,zma,ns1|ns1-R|uniprot=Q70UV1|ncbi=542051|narrow sh...,HB-type transcription factor|prs|WUSCHEL-relat...,GRMZM2G069028|Zm00001d004889,,Plants (stems) do not elongate in response to...,[SENT] Plants (stems) do not elongate in resp...,[SENT] Plants ( stems ) do not elongate in res...,[SENT] plant stem elong respons gibberellin,"Oellrich, Walls et al., 2015",https://plantmethods.biomedcentral.com/article...,13007_2015_53_MOESM1_ESM.csv
1,0,maize,zma,ns1|ns1-R|uniprot=Q70UV1|ncbi=542051|narrow sh...,HB-type transcription factor|prs|WUSCHEL-relat...,GRMZM2G069028|Zm00001d004889,PO:0009025,,,,,Planteome,https://planteome.org/,plant_anatomical_entity.txt
2,0,maize,zma,ns1|ns1-R|uniprot=Q70UV1|ncbi=542051|narrow sh...,HB-type transcription factor|prs|WUSCHEL-relat...,GRMZM2G069028|Zm00001d004889,PO:0020142,,,,,Planteome,https://planteome.org/,plant_anatomical_entity.txt
3,0,maize,zma,ns1|ns1-R|uniprot=Q70UV1|ncbi=542051|narrow sh...,HB-type transcription factor|prs|WUSCHEL-relat...,GRMZM2G069028|Zm00001d004889,PO:0000003,,,,,Planteome,https://planteome.org/,plant_anatomical_entity.txt
4,0,Zea mays ssp mays,zma,ns1|ns1-R|uniprot=Q70UV1|ncbi=542051|narrow sh...,HB-type transcription factor|prs|WUSCHEL-relat...,GRMZM2G069028|Zm00001d004889,PO:0000003|PATO:0000569,,,,,"Oellrich, Walls et al., 2015",https://plantmethods.biomedcentral.com/article...,13007_2015_53_MOESM1_ESM.csv
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,2,Arabidopsis,ath,UBP26|At3g49600|AT3G49600,Ubiquitin-Specific Protease|SUP32|ATUBP26|ubiq...,At3g49600|AT3G49600,PO:0020137,,,,,TAIR,https://www.arabidopsis.org/,po_anatomy_gene_arabidopsis_tair.assoc
96,2,Arabidopsis,ath,UBP26|At3g49600|AT3G49600,Ubiquitin-Specific Protease|SUP32|ATUBP26|ubiq...,At3g49600|AT3G49600,PO:0025281,,,,,TAIR,https://www.arabidopsis.org/,po_anatomy_gene_arabidopsis_tair.assoc
97,2,Arabidopsis thaliana,ath,UBP26|At3g49600|AT3G49600,Ubiquitin-Specific Protease|SUP32|ATUBP26|ubiq...,At3g49600|AT3G49600,,50% defective seeds,[SENT] 50% defective seeds,[SENT] 50 % defective seeds,[SENT] defect seed,"Oellrich, Walls et al., 2015",https://plantmethods.biomedcentral.com/article...,13007_2015_53_MOESM1_ESM.csv
98,2,Arabidopsis,ath,UBP26|At3g49600|AT3G49600,Ubiquitin-Specific Protease|SUP32|ATUBP26|ubiq...,At3g49600|AT3G49600,PO:0001054,,,,,TAIR,https://www.arabidopsis.org/,po_termporal_gene_arabidopsis_tair.assoc


### Running NOBLE Coder on the text columns

In [20]:
noblecoder_jarfile_path = "../lib/NobleCoder-1.0.jar"  

In [21]:
# Small test for paths.
#texts = {1: "small plants had elongated leaves", 2:"short plants had dwarfism leaves"}
#annots = annotate_using_noble_coder(texts, noblecoder_jarfile_path, "pato", precise=1)
#annots

In [22]:
# Map the row index of every row that has text to that row's unprocessed text.
has_text = df["text_unprocessed"].notnull()
rows_with_text = df.loc[has_text, "text_unprocessed"]
index_to_text = dict(zip(rows_with_text.index, rows_with_text))

In [23]:
# Create the set of precise NOBLE Coder annotations, one run per ontology
# (PATO, then PO, then GO) over the same index-to-text mapping.
pato_annotations, po_annotations, go_annotations = (
    annotate_using_noble_coder(index_to_text, noblecoder_jarfile_path, ontology_name, precise=1)
    for ontology_name in ("pato", "po", "go")
)
print("done running noble coder with precise parameter")

done running noble coder with precise parameter


In [24]:
# Combine the PATO, PO and GO annotations of each text row into one "|"-delimited
# string and add them to the dataset as the "annotations_nc" column.
indices = list(index_to_text.keys())
list_of_annotation_lists = [
    concatenate_with_delim("|", [*pato_annotations[index], *po_annotations[index], *go_annotations[index]])
    for index in indices
]

# Build the column as an object-dtype Series keyed by row index. Assigning it
# aligns on the index, so rows without text are left missing. This avoids creating
# a float NaN column and then writing strings into it, which relies on silent
# dtype upcasting that pandas has deprecated.
df["annotations_nc"] = pd.Series(list_of_annotation_lists, index=indices, dtype=object)

# Move the new column so that it sits directly after the curated "annotations" column.
annotations_nc_col = df.pop("annotations_nc")
df.insert(df.columns.get_loc("annotations") + 1, "annotations_nc", annotations_nc_col)
df.head()

Unnamed: 0,_gene_id,species_name,species_code,unique_gene_identifiers,other_gene_identifiers,gene_models,annotations,annotations_nc,text_unprocessed,text_tokenized_sents,text_tokenized_words,text_tokenized_stems,reference_name,reference_link,reference_file
0,0,Zea mays ssp mays,zma,ns1|ns1-R|uniprot=Q70UV1|ncbi=542051|narrow sh...,HB-type transcription factor|prs|WUSCHEL-relat...,GRMZM2G069028|Zm00001d004889,,PATO:0000077|PO:0009047|GO:0009739,Plants (stems) do not elongate in response to...,[SENT] Plants (stems) do not elongate in resp...,[SENT] Plants ( stems ) do not elongate in res...,[SENT] plant stem elong respons gibberellin,"Oellrich, Walls et al., 2015",https://plantmethods.biomedcentral.com/article...,13007_2015_53_MOESM1_ESM.csv
1,0,maize,zma,ns1|ns1-R|uniprot=Q70UV1|ncbi=542051|narrow sh...,HB-type transcription factor|prs|WUSCHEL-relat...,GRMZM2G069028|Zm00001d004889,PO:0009025,,,,,,Planteome,https://planteome.org/,plant_anatomical_entity.txt
2,0,maize,zma,ns1|ns1-R|uniprot=Q70UV1|ncbi=542051|narrow sh...,HB-type transcription factor|prs|WUSCHEL-relat...,GRMZM2G069028|Zm00001d004889,PO:0020142,,,,,,Planteome,https://planteome.org/,plant_anatomical_entity.txt
3,0,maize,zma,ns1|ns1-R|uniprot=Q70UV1|ncbi=542051|narrow sh...,HB-type transcription factor|prs|WUSCHEL-relat...,GRMZM2G069028|Zm00001d004889,PO:0000003,,,,,,Planteome,https://planteome.org/,plant_anatomical_entity.txt
4,0,Zea mays ssp mays,zma,ns1|ns1-R|uniprot=Q70UV1|ncbi=542051|narrow sh...,HB-type transcription factor|prs|WUSCHEL-relat...,GRMZM2G069028|Zm00001d004889,PO:0000003|PATO:0000569,,,,,,"Oellrich, Walls et al., 2015",https://plantmethods.biomedcentral.com/article...,13007_2015_53_MOESM1_ESM.csv


In [25]:
df.sample(10)

Unnamed: 0,_gene_id,species_name,species_code,unique_gene_identifiers,other_gene_identifiers,gene_models,annotations,annotations_nc,text_unprocessed,text_tokenized_sents,text_tokenized_words,text_tokenized_stems,reference_name,reference_link,reference_file
241755,11588,Arabidopsis,ath,SDH5|succinate dehydrogenase 5|AT1G47420,AT1G47420.1|T3F24.12,AT1G47420,PO:0008019,,,,,,TAIR,https://www.arabidopsis.org/,po_anatomy_gene_arabidopsis_tair.assoc
571691,27291,Arabidopsis,ath,AT4G37560.1|AT4G37560,F19F18.50|F19F18_50,AT4G37560.1|AT4G37560,PO:0000037,,,,,,TAIR,https://www.arabidopsis.org/,po_anatomy_gene_arabidopsis_tair.assoc
525263,25935,Arabidopsis,ath,AT1G24440.1|AT1G24440,F21J9.10,AT1G24440.1|AT1G24440,PO:0000230,,,,,,TAIR,https://www.arabidopsis.org/,po_anatomy_gene_arabidopsis_tair.assoc
113977,2621,tomato,sly,APS1|sp|STARCH BRANCHING ENZYME III|SBEIII*|SH...,adp|D|self-pruning|+|1|prov2|Dominant,Solyc06g074350,GO:0004024,,,,,,Planteome,https://planteome.org/,molecular_function.txt
50124,1166,Arabidopsis,ath,GASA5|At3g02885|AT3G02885,GAST1 Protein Homolog|GAST1 protein homolog 5,At3g02885|AT3G02885,GO:0009506,,,,,,TAIR,https://www.arabidopsis.org/,ATH_GO_GOSLIM.txt
522283,25849,Arabidopsis,ath,CYP71B28|AT1G13090,"""cytochrome P450, family 71, subfamily B, poly...",AT1G13090,PO:0001081,,,,,,TAIR,https://www.arabidopsis.org/,po_termporal_gene_arabidopsis_tair.assoc
270225,13107,Arabidopsis,ath,AT1G74510|AT1G74510.2,F1M20.19|F1M20_19,AT1G74510|AT1G74510.2,GO:0008150,,,,,,TAIR,https://www.arabidopsis.org/,ATH_GO_GOSLIM.txt
216781,10309,Arabidopsis,ath,WAKL2|AT1G16130,wall associated kinase-like 2|T24D18.21|T24D18_21,AT1G16130,PO:0009047,,,,,,TAIR,https://www.arabidopsis.org/,po_anatomy_gene_arabidopsis_tair.assoc
588810,27790,Arabidopsis,ath,RLK|AT5G67280,receptor-like kinase|K3G17.4|K3G17_4,AT5G67280,PO:0007611,,,,,,TAIR,https://www.arabidopsis.org/,po_termporal_gene_arabidopsis_tair.assoc
559744,26942,Arabidopsis,ath,UMAMIT10|AT3G56620,Usually multiple acids move in and out Transpo...,AT3G56620,PO:0007064,,,,,,TAIR,https://www.arabidopsis.org/,po_termporal_gene_arabidopsis_tair.assoc


In [26]:
print(df.shape)

(743658, 15)


In [27]:
# Genes that have ontology annotations but no text at all are not needed.
# Keep every row (text or annotation) of any gene that has at least one text row.
rows_with_text_mask = df["text_unprocessed"].notnull()
gene_ids_with_text = df.loc[rows_with_text_mask, "_gene_id"].values
df = df[df["_gene_id"].isin(gene_ids_with_text)]
print(df.shape)

(194948, 15)


### Saving the combined datasets to new files

In [28]:
# Save the full combined dataset, tab-separated first and then comma-separated.
csv_path = "../final_data/genes_texts_annotations.csv"
tsv_path = "../final_data/genes_texts_annotations.tsv"
for output_path, separator in ((tsv_path, "\t"), (csv_path, ",")):
    df.to_csv(output_path, sep=separator, index=False)

In [31]:
# Saving sample versions that should be viewable in the browser on GitHub.
# Prepare a sample file from that whole dataset that makes it easy to understand what the context is.

# Helpers that shorten long string fields so the sample files stay readable.
def truncate_string(text, char_limit):
    """Return text cut to char_limit characters, with "..." appended if anything was cut.

    text: the string to shorten.
    char_limit: maximum number of characters of text to keep.

    Strings no longer than char_limit are returned without an ellipsis.
    """
    truncated_text = text[:char_limit]
    if len(text)>char_limit:
        truncated_text = "{}...".format(truncated_text)
    return(truncated_text)

def truncate_fields(sample_df):
    """Return a copy of sample_df with its long string columns truncated.

    sample_df: dataframe containing all of the columns listed below; a missing
        column raises KeyError.

    Returns a new dataframe. The input is not modified, so passing a slice of a
    larger frame (e.g. the result of head()) neither touches that frame nor
    triggers chained-assignment warnings. Missing values are left as they are.
    """
    # Maximum number of characters kept for each column.
    char_limits = {
        "unique_gene_identifiers": 30,
        "other_gene_identifiers": 20,
        "gene_models": 30,
        "text_unprocessed": 100,
        "text_tokenized_sents": 100,
        "text_tokenized_words": 100,
        "text_tokenized_stems": 100,
        "annotations": 60,
        "annotations_nc": 60,
    }
    truncated_df = sample_df.copy()
    for column, char_limit in char_limits.items():
        # Bind the limit as a default argument so each column uses its own value.
        truncated_df[column] = truncated_df[column].map(
            lambda text, char_limit=char_limit: truncate_string(text, char_limit),
            na_action="ignore",
        )
    return(truncated_df)


csv_sample_path = "../final_samples/genes_texts_annotations.csv"
tsv_sample_path = "../final_samples/genes_texts_annotations.tsv"

# The sample is the first 100 rows with the long fields truncated.
# It is written tab-separated first and then comma-separated.
sample_df = truncate_fields(df.head(100))
for sample_path, separator in ((tsv_sample_path, "\t"), (csv_sample_path, ",")):
    sample_df.to_csv(sample_path, sep=separator, index=False)
print("done")

done


### Saving smaller subsets of the dataset to new files

In [37]:
# Save the rows that have text, i.e. the text fields plus their corresponding annotations.
df_subset = df.loc[df["text_unprocessed"].notnull()]
subset_csv_path = "../final_data/genes_texts.csv"
subset_tsv_path = "../final_data/genes_texts.tsv"
for output_path, separator in ((subset_tsv_path, "\t"), (subset_csv_path, ",")):
    df_subset.to_csv(output_path, sep=separator, index=False)

# Save the sample version: first 100 rows, long fields truncated, and without
# the curated "annotations" column.
cols_to_drop = ["annotations"]
sample_df_subset = truncate_fields(df_subset.head(100)).drop(columns=cols_to_drop)
subset_csv_path = "../final_samples/genes_texts.csv"
subset_tsv_path = "../final_samples/genes_texts.tsv"
for output_path, separator in ((subset_tsv_path, "\t"), (subset_csv_path, ",")):
    sample_df_subset.to_csv(output_path, sep=separator, index=False)

In [38]:
# Save the rows that have curated annotations, i.e. the annotation fields rather than the text ones.
df_subset = df.loc[df["annotations"].notnull()]
subset_csv_path = "../final_data/genes_annotations.csv"
subset_tsv_path = "../final_data/genes_annotations.tsv"
for output_path, separator in ((subset_tsv_path, "\t"), (subset_csv_path, ",")):
    df_subset.to_csv(output_path, sep=separator, index=False)

# Save the sample version: first 100 rows, long fields truncated, and without
# the NOBLE Coder annotations and the text columns.
cols_to_drop = ["annotations_nc","text_unprocessed", "text_tokenized_sents","text_tokenized_words","text_tokenized_stems"]
sample_df_subset = truncate_fields(df_subset.head(100)).drop(columns=cols_to_drop)
subset_csv_path = "../final_samples/genes_annotations.csv"
subset_tsv_path = "../final_samples/genes_annotations.tsv"
for output_path, separator in ((subset_tsv_path, "\t"), (subset_csv_path, ",")):
    sample_df_subset.to_csv(output_path, sep=separator, index=False)

In [39]:
print("done with combining all files")

done with combining all files
