In [9]:
from nltk.corpus import stopwords
from nltk import word_tokenize, sent_tokenize
from string import punctuation
from time import time
import pandas as pd
import numpy as np
from gensim.models import KeyedVectors, Word2Vec
from nltk.stem import WordNetLemmatizer
import os

In [23]:
stop_words = set(stopwords.words('english'))
def process_sentence(text):
    text = text.replace('/', ' / ')
    text = text.replace('.-', ' .- ')
    text = text.replace("(", " ( ")
    text = text.replace(")", " ) ")
    text = text.replace("-"," - ")
    text = text.replace('.', ' . ')
    text = text.replace('\'', ' \' ')
    lemmatizer = WordNetLemmatizer()

    tokens = [token for token in text.split() if token not in punctuation and token not in stop_words]
    tokens = [token.lower() if not token.startswith("Gene_") else token for token in tokens]
    tokens = [lemmatizer.lemmatize(token) if not token.startswith("Gene_") else token  for token in tokens]

    return " ".join(tokens)

In [8]:
from gensim.models  import KeyedVectors, Word2Vec
from utils import load_embedding
from time import time

t = time()
w2v_cbow_pt = load_embedding("./Embeddings/GO/anc2vec_200.bin", binary=True)
print("Time to load cbow embeddings in mins: ", round(((time() - t)/60.0),2))

t = time()
w2v_sg_pt = load_embedding("./Embeddings/Concepts/bioconceptvec_word2vec_skipgram.bin", binary=True)
print("Time to load skipgram embeddings in mins: ", round(((time() - t)/60.0),2))

embedding loaded from ./Embeddings/GO/anc2vec_200.bin
Time to load cbow embeddings in mins:  0.0
embedding loaded from ./Embeddings/Concepts/bioconceptvec_word2vec_skipgram.bin
Time to load skipgram embeddings in mins:  0.58


In [27]:
# process pubmed abstract texts and collect all gene_tags into a dictionary
from constants import pbt_path
pubmed_dat_df = pd.read_csv(os.path.join(pbt_path, "pubmed_annotations.tsv"), sep = "\t")
pubmed_data_list = []
pmids = pubmed_dat_df.pmid.unique()

for pmid in pmids:
    data_entry = dict()
    data_entry["pmid"] = pmid
    dat_of_pmid = pubmed_dat_df[pubmed_dat_df["pmid"] == pmid]
    data_entry["genes"] = list(dat_of_pmid.Gene.unique())
    data_entry["gene_eids"] = list(dat_of_pmid.gene_eid.unique())
    data_entry["title"] = dat_of_pmid.title.array[0]
    data_entry["abstract"] = dat_of_pmid.abstract.array[0]
    sentences = sent_tokenize(data_entry["abstract"])
    data_entry["processed_abstract"] = [process_sentence(sent) for sent in sentences]
    pubmed_data_list.append(data_entry)



In [52]:
# update data entry to associate probable GO terms based on genes found in the abstract
#from utils import get_filtered_uniprot_ids
uniprot_go_dat = pd.read_csv("./Data/UniprotGO/UniprotGO-BP.txt", sep = "\t", header=None)
uniprot_go_dat.columns = ["Uniprot_id", "GO_ids"]
all_uids = uniprot_go_dat.Uniprot_id.to_list()


In [59]:
from unipressed import IdMappingClient
import time
mappings = []
for uid in all_uids:
    request = IdMappingClient.submit(
        source="UniProtKB_AC-ID",  dest="Gene_Name", ids={uid}
    )
    time.sleep(3.0)
    mappings.extend(list(request.each_result()))
    print(mappings)

[{'from': 'P49848', 'to': 'TAF6'}]
[{'from': 'P49848', 'to': 'TAF6'}, {'from': 'P10275', 'to': 'AR'}]
[{'from': 'P49848', 'to': 'TAF6'}, {'from': 'P10275', 'to': 'AR'}, {'from': 'P47897', 'to': 'QARS1'}]
[{'from': 'P49848', 'to': 'TAF6'}, {'from': 'P10275', 'to': 'AR'}, {'from': 'P47897', 'to': 'QARS1'}, {'from': 'P14778', 'to': 'IL1R1'}]
[{'from': 'P49848', 'to': 'TAF6'}, {'from': 'P10275', 'to': 'AR'}, {'from': 'P47897', 'to': 'QARS1'}, {'from': 'P14778', 'to': 'IL1R1'}, {'from': 'Q14781', 'to': 'CBX2'}]
[{'from': 'P49848', 'to': 'TAF6'}, {'from': 'P10275', 'to': 'AR'}, {'from': 'P47897', 'to': 'QARS1'}, {'from': 'P14778', 'to': 'IL1R1'}, {'from': 'Q14781', 'to': 'CBX2'}, {'from': 'O60383', 'to': 'GDF9'}]
[{'from': 'P49848', 'to': 'TAF6'}, {'from': 'P10275', 'to': 'AR'}, {'from': 'P47897', 'to': 'QARS1'}, {'from': 'P14778', 'to': 'IL1R1'}, {'from': 'Q14781', 'to': 'CBX2'}, {'from': 'O60383', 'to': 'GDF9'}, {'from': 'Q9HBM1', 'to': 'SPC25'}]
[{'from': 'P49848', 'to': 'TAF6'}, {'from':

IdMappingError: UniProt has not yet processed the results, consider using time.sleep() to wait until they are complete.