In [1]:
from nltk.corpus import stopwords
from nltk import word_tokenize, sent_tokenize
from string import punctuation
from time import time
import pandas as pd
import numpy as np
from gensim.models import KeyedVectors, Word2Vec
from nltk.stem import WordNetLemmatizer
from constants import pubtator_output

In [2]:
stop_words = set(stopwords.words('english'))
def process_sentence(text):
    text = text.replace('/', ' / ')
    text = text.replace('.-', ' .- ')
    text = text.replace("(", " ( ")
    text = text.replace(")", " ) ")
    text = text.replace("-"," - ")
    text = text.replace('.', ' . ')
    text = text.replace('\'', ' \' ')
    lemmatizer = WordNetLemmatizer()

    tokens = [token for token in text.split() if token not in punctuation and token not in stop_words]
    tokens = [token.lower() if not token.startswith("Gene_") else token for token in tokens]
    tokens = [lemmatizer.lemmatize(token) if not token.startswith("Gene_") else token  for token in tokens]

    return tokens

In [3]:
data = pd.read_csv(pubtator_output)
texts = data.text.to_numpy().tolist()
sentences = []
for text in texts:
    sentences.extend(sent_tokenize(text))

print("the number of sentences: ", len(sentences))

sentences = [process_sentence(sent) for sent in sentences]
print(sentences[0:10])


the number of sentences:  236815
[['recessive', 'dystrophic', 'epidermolysis', 'bullosa', 'rdeb', 'manifest', 'blistering', 'erosion', 'skin', 'mucous', 'membrane', 'due', 'mutation', 'Gene_1294'], ['the', 'scarring', 'driven', 'inflammatory', 'processes,', 'particularly', 'Gene_7039', 'signaling', 'pathways,', 'resulting', 'excess', 'synthesis', 'deposition', 'extracellular', 'matrix,', 'especially', 'collagen'], ['losartan,', 'angiotensin', 'ii', 'type', '1', 'receptor', 'antagonist,', 'inhibitor', 'Gene_7039', 'activity'], ['previous', 'preclinical', 'study', 'hypomorphic', 'Gene_12836', 'mouse', 'recapitulating', 'feature', 'rdeb', 'suggested', 'losartan', 'may', 'improve', 'clinical', 'feature', 'rdeb'], ['the', 'diagnosis', 'based', 'characteristic', 'clinical', 'feature', 'presence', 'biallelic', 'loss', 'function', 'mutation', 'Gene_1294'], ['objective:', 'this', 'study', 'intended', 'explore', 'regulatory', 'function', 'Gene_100133205', 'nasopharyngeal', 'carcinoma', 'npc'], [

In [4]:
from gensim.models  import KeyedVectors, Word2Vec
from utils import load_embedding
from time import time

t = time()
w2v_cbow_pt = load_embedding("./WordVectors/Pretrained/bioconceptvec_word2vec_cbow.bin", binary=True)
print("Time to load cbow embeddings in mins: ", round(((time() - t)/60.0),2))

t = time()
w2v_sg_pt = load_embedding("./WordVectors/Pretrained/bioconceptvec_word2vec_skipgram.bin", binary=True)
print("Time to load skipgram embeddings in mins: ", round(((time() - t)/60.0),2))


embedding loaded from ./WordVectors/Pretrained/bioconceptvec_word2vec_cbow.bin
Time to load cbow embeddings in mins:  0.59
embedding loaded from ./WordVectors/Pretrained/bioconceptvec_word2vec_skipgram.bin
Time to load skipgram embeddings in mins:  0.66


In [19]:
import multiprocessing
from gensim.models import Word2Vec

cores = multiprocessing.cpu_count() # Count the number of cores in a computer
t = time()
w2v_cbow = Word2Vec(min_count=5, window=10, vector_size=100, sample=0.001, alpha=0.025, negative=5, workers= cores - 1, epochs=10, sg = 0)
w2v_cbow.build_vocab(sentences)
w2v_cbow.wv.vectors_lockf = np.ones(len(w2v_cbow.wv))
w2v_cbow.wv.intersect_word2vec_format("./WordVectors/Pretrained/bioconceptvec_word2vec_cbow.bin", binary=True)
w2v_cbow.train(sentences, total_examples= w2v_cbow.corpus_count, epochs=30)
print("Time to train cbow embeddings in mins: ", round(((time() - t)/60.0),2))

Time to train cbow embeddings in mins:  2.12


In [20]:
vocab = list(w2v_cbow.wv.key_to_index)
print("Total number of words: ", len(vocab))

print(vocab[0:100])

Total number of words:  33731
['cell', 'expression', 'the', 'wound', 'healing', 'level', 'in', 'effect', 'protein', 'migration', 'tissue', 'cancer', 'role', 'study', 'significantly', 'increased', 'mouse', 'group', 'human', 'gene', 'result', 'assay', 'we', 'treatment', 'also', 'pathway', 'patient', 'growth', 'signaling', 'showed', 'may', 'proliferation', 'factor', 'activity', 'using', 'compared', 'invasion', 'fibroblast', 'mrna', 'tumor', 'activation', 'control', 'results:', 'skin', 'analysis', 'Gene_7422', 'found', 'western', 'function', 'used', 'day', 'decreased', 'collagen', 'inhibited', 'target', 'vitro', 'mechanism', 'epithelial', '1', 'bone', 'reduced', 'could', 'potential', 'formation', 'proliferation,', 'induced', 'inhibitor', 'associated', 'model', 'receptor', 'increase', 'normal', 'overexpression', 'vivo', 'cells,', 'demonstrated', '2', 'Gene_7039', 'via', 'these', 'inhibition', 'line', 'this', '0', 'response', 'repair', 'however,', 'Gene_7040', 'type', 'p', 'expressed', 'high

In [21]:
t = time()
w2v_sg = Word2Vec(min_count=5, window=10, vector_size=100, sample=0.001, alpha=0.025, negative=5, workers= cores - 1, epochs=10, sg = 1)
w2v_sg.build_vocab(sentences)
w2v_sg.wv.vectors_lockf = np.ones(len(w2v_sg.wv))
w2v_sg.wv.intersect_word2vec_format("./WordVectors/Pretrained/bioconceptvec_word2vec_cbow.bin", binary=True)
w2v_sg.train(sentences, total_examples= w2v_sg.corpus_count, epochs=30)
print("Time to train cbow embeddings in mins: ", round(((time() - t)/60.0),2))

Time to train cbow embeddings in mins:  1.37


In [22]:
all_genes = [word for word in vocab if word.startswith("Gene_")]

print("Total number of genes in vocab: ", len(all_genes))

#with open("GeneLists/all_genes.txt","w") as f:
    #for gene in all_genes:
        #f.write(gene + "\n")
#f.close()

Total number of genes in vocab:  9151


In [23]:
# save the trained vectors 
w2v_cbow.wv.save_word2vec_format("./WordVectors/Computed/word2vec_cbow.bin", binary = True)
w2v_sg.wv.save_word2vec_format("./WordVectors/Computed/word2vec_skipgram.bin", binary = True)