## Imports

In [1]:
import pandas as pd
import numpy as np

import regex as re
from nltk.tokenize import RegexpTokenizer
from gensim.models.word2vec import Word2Vec

## Read in data

In [2]:
articles = pd.read_csv('merged_filtered_articles.csv')
articles.head(2)

Unnamed: 0,source,title,abstract,publish_time,authors,journal,url,paper_id,discussion,text_body
0,PMC,"Crystal structure of murine sCEACAM1a[1,4]: a ...",CEACAM1 is a member of the carcinoembryonic an...,2002-05-01,"Tan, Kemin; Zelus, Bruce D.; Meijers, Rob; Liu...",The EMBO Journal,http://europepmc.org/articles/pmc125375?pdf=re...,e3d0d482ebd9a8ba81c254cc433f314142e72174,,Carcinoembryonic antigen (CEA; CD66e) was init...
1,PMC,Structure of coronavirus main proteinase revea...,The key enzyme in coronavirus polyprotein proc...,2002-07-01,"Anand, Kanchan; Palm, Gottfried J.; Mesters, J...",The EMBO Journal,http://europepmc.org/articles/pmc126080?pdf=re...,cf584e00f637cbd8f1bb35f3f09f5ed07b71aeb0,,Transmissible gastroenteritis virus (TGEV) bel...


## Tokenize text data

In [3]:
# Function that strips text of leading and trailing punctuation
import string

def strip_punc(list_of_words):
    return [word.strip(string.punctuation) for word in list_of_words]

In [4]:
# Function that lowercases, tokenizes, and strips the text of any trailing punctuation

def clean_text(text):
    # Cast text as string (floats in body text)
    text = str(text)
    
    # Lowercase all words
    lower = text.lower()
    
    # Tokenizes by whitespace
    tokenizer = RegexpTokenizer('\s+', gaps=True)
    tokens = tokenizer.tokenize(lower)
    
    # Strip leading and trailing punctuation
    tokens_stripped = strip_punc(tokens)
    
    # Return tokens stripped of trailing punctuation
    return tokens_stripped

In [5]:
# Tokenize titles
articles['title'] = articles['title'].map(clean_text)

# Tokenize abstract
articles['abstract'] = articles['abstract'].map(clean_text)

# Tokenize body text
articles['text_body'] = articles['text_body'].map(clean_text)

In [6]:
articles.head(2)

Unnamed: 0,source,title,abstract,publish_time,authors,journal,url,paper_id,discussion,text_body
0,PMC,"[crystal, structure, of, murine, sceacam1a[1,4...","[ceacam1, is, a, member, of, the, carcinoembry...",2002-05-01,"Tan, Kemin; Zelus, Bruce D.; Meijers, Rob; Liu...",The EMBO Journal,http://europepmc.org/articles/pmc125375?pdf=re...,e3d0d482ebd9a8ba81c254cc433f314142e72174,,"[carcinoembryonic, antigen, cea, cd66e, was, i..."
1,PMC,"[structure, of, coronavirus, main, proteinase,...","[the, key, enzyme, in, coronavirus, polyprotei...",2002-07-01,"Anand, Kanchan; Palm, Gottfried J.; Mesters, J...",The EMBO Journal,http://europepmc.org/articles/pmc126080?pdf=re...,cf584e00f637cbd8f1bb35f3f09f5ed07b71aeb0,,"[transmissible, gastroenteritis, virus, tgev, ..."


## Word2Vec Model - CBOW

In [7]:
# Train a Word2Vec model on body text using bag of words

model = Word2Vec(articles['text_body'],
                 min_count=1,
                 workers=4)

In [8]:
model.wv.most_similar('coronavirus', topn=25)

[('cov', 0.8692415952682495),
 ('coronaviruses', 0.7392231225967407),
 ('betacoronavirus', 0.6885976195335388),
 ('coronaviral', 0.6696343421936035),
 ('covs', 0.6676793098449707),
 ('preprintdesigning', 0.6473985910415649),
 ('coronavirus-china.hypoxemia', 0.6204733848571777),
 ('sars-coronavirus', 0.6188088059425354),
 ('corona-virus', 0.610124409198761),
 ('virus', 0.6073490381240845),
 ('beta-coronavirus', 0.5982087254524231),
 ('β-coronavirus', 0.5929840207099915),
 ('kobuvirus', 0.587209939956665),
 ('hcov', 0.5823780298233032),
 ('sars-cov', 0.5814834237098694),
 ('henipavirus', 0.5751717686653137),
 ('alphacoronavirus', 0.5709920525550842),
 ('ncov-19,the', 0.5706366896629333),
 ('betacov', 0.5705119371414185),
 ('viral', 0.5690200924873352),
 ('paramyxovirus', 0.5684882402420044),
 ('torovirus', 0.565484344959259),
 ('hcov-emc', 0.5619979500770569),
 ('negative.defined', 0.5616637468338013),
 ('filovirus', 0.556121826171875)]

In [12]:
# Update model with words from abstract
# https://www.machinelearningplus.com/nlp/gensim-tutorial/#15howtoupdateanexistingword2vecmodelwithnewdata

model.build_vocab(articles['abstract'], update=True)
model.train(articles['abstract'],
            total_examples = model.corpus_count,
            epochs = model.epochs)

(10730671, 13470080)

In [13]:
model.wv.most_similar('coronavirus', topn=25)

[('cov', 0.7612472772598267),
 ('preprintdesigning', 0.6432745456695557),
 ('corona-virus', 0.6327853202819824),
 ('eoronavirus', 0.6172501444816589),
 ('coronaviruses', 0.6105033159255981),
 ('betacoronavirus', 0.6028242111206055),
 ('coronaviral', 0.6002240777015686),
 ('beta-coronavirus', 0.5967655181884766),
 ('kobuvirus', 0.5890163779258728),
 ('sars-coronavirus', 0.5869377851486206),
 ('β-coronavirus', 0.585595428943634),
 ('coronavirus.the', 0.5662718415260315),
 ('virus', 0.565331757068634),
 ('coronavirus-china.hypoxemia', 0.5557613372802734),
 ('available.spike', 0.5557141900062561),
 ('negative.defined', 0.5533357858657837),
 ('coronavirus"all', 0.5406161546707153),
 ('alphacoronavirus', 0.5374699831008911),
 ('parvovirus', 0.5236529111862183),
 ('betacov', 0.5220209360122681),
 ('calicivirus', 0.5193199515342712),
 ('hcov-emc', 0.5166037082672119),
 ('filovirus', 0.5147768259048462),
 ('covs', 0.5146432518959045),
 ('henipavirus', 0.5111933350563049)]

## Word2Vec Model - SkipGram

In [10]:
# Train Word2Vec model on body text using SkipGram

model2 = Word2Vec(articles['text_body'],
                  min_count=1,
                  sg=1,
                  workers=4)

In [11]:
model2.wv.most_similar('coronavirus', topn=25)

[('cov', 0.8551164865493774),
 ('coronaviruses', 0.7946915626525879),
 ('sarscoronavirus', 0.7800401449203491),
 ('sars-coronavirus', 0.7679806351661682),
 ('sarscov-2', 0.7635361552238464),
 ('coronavirus.the', 0.7550227642059326),
 ('sars-ncov', 0.7525564432144165),
 ('sarscov2', 0.7520612478256226),
 ('betacoronavirus', 0.7496750950813293),
 ('hecv', 0.7488572001457214),
 ('sars-cov', 0.7478749752044678),
 ('coronavirus.a', 0.7467749714851379),
 ('beta-coronavirus', 0.746642529964447),
 ('betacov', 0.7464969158172607),
 ('coronaviral', 0.743518590927124),
 ('sarsassociated', 0.7423727512359619),
 ('corona', 0.7416399121284485),
 ('hypervirulent', 0.7405579090118408),
 ('h-cov', 0.7396107912063599),
 ('coronavims', 0.739063560962677),
 ('hcov)-229e', 0.7387552261352539),
 ('coronavirus-2', 0.7384424805641174),
 ('bocov', 0.738169252872467),
 ('corona-virus', 0.7380975484848022),
 ('virus', 0.7364841103553772)]

In [14]:
# Update model with words from abstract

model2.build_vocab(articles['abstract'], update=True)
model2.train(articles['abstract'],
             total_examples = model2.corpus_count,
             epochs = model2.epochs)

(10729853, 13470080)

In [15]:
model2.wv.most_similar('coronavirus', topn=25)

[('cov', 0.854965090751648),
 ('sarscoronavirus', 0.8060474991798401),
 ('virus', 0.8028204441070557),
 ('cov-sars', 0.7906704545021057),
 ('corona-virus', 0.7903923988342285),
 ('sar-cov-2', 0.7901962995529175),
 ('coronavirus.the', 0.7863421440124512),
 ('navirus', 0.7827399969100952),
 ('bocov', 0.7815808653831482),
 ('sarscov-2', 0.7806040048599243),
 ('mers--cov', 0.7787435054779053),
 ('mcov', 0.7784234881401062),
 ('sars)-cov', 0.7772793769836426),
 ('new-emerging', 0.7769531011581421),
 ('syndromeassociated', 0.7758957147598267),
 ('h-cov', 0.7744204998016357),
 ('cornavirus', 0.7719203233718872),
 ('viruses.since', 0.7712740898132324),
 ('co-v', 0.7698397040367126),
 ('hcov)-229e', 0.7696917653083801),
 ('sarsassociated', 0.7682657837867737),
 ('coronavims', 0.7671185731887817),
 ('editor-the', 0.7669306993484497),
 ('syndrome-coronavirus-2', 0.7668635845184326),
 ('sarscov2', 0.7661660313606262)]