In [108]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
nltk.download('punkt')

import numpy as np
import pandas as pd
from gensim.models import word2vec

from google.colab import drive
drive.mount('/content/drive')

import re # For regular expressions

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## (a) Load the dataset

In [109]:
def load_data(path='/content/drive/MyDrive/Corona_Tweets_large.csv'):
    """ Read tweets from a header-less CSV file and tokenize them.

        Every value in the first column is stripped of URLs and of all
        characters other than ASCII letters, tokenized with
        nltk.word_tokenize, lower-cased, and filtered against the NLTK
        English stop-word list.

        Args:
            path: location of the CSV file. Defaults to the Drive path this
                  notebook was written against.
        Return:
            list of lists (list_words), with words from each of the processed
            tweets; one inner list per CSV row, in file order. A row whose
            first cell is empty yields an empty inner list.
    """
    tweets = pd.read_csv(path, header=None)
    # define stop words
    stop_words = set(stopwords.words('english'))

    list_words = []
    ### iterate over all tweets from the dataset (column 0 holds the text)
    for tweet in tweets[0]:
        # pandas reads empty cells as NaN (a float) and may parse purely
        # numeric cells as numbers; re.sub() only accepts strings, so coerce
        # here instead of crashing part-way through the file.
        if not isinstance(tweet, str):
            tweet = '' if pd.isna(tweet) else str(tweet)

        # Remove URLs
        text = re.sub(r'https?://\S+', '', tweet)
        ### remove non-letter.
        text = re.sub(r'[^a-zA-Z]', ' ', text)
        ### tokenize
        words = nltk.word_tokenize(text)

        ### lower-case every word and drop the stop words
        lowered = (w.lower() for w in words)
        list_words.append([w for w in lowered if w not in stop_words])
    return list_words

# Build the tokenized corpus and preview its first three tweets
twitter_corpus = load_data()
preview = twitter_corpus[:3]
print(preview)

[['smelled', 'scent', 'hand', 'sanitizers', 'today', 'someone', 'past', 'would', 'think', 'intoxicated'], ['hey', 'yankees', 'yankeespr', 'mlb', 'made', 'sense', 'players', 'pay', 'respects'], ['diane', 'wdunlap', 'realdonaldtrump', 'trump', 'never', 'claimed', 'covid', 'hoax', 'claim', 'effort']]


## (b) Word2Vec

In [110]:
# Creating the word2vec model and setting values for the various parameters

# Function for model training

def train_word2vec(twitter_corpus, num_features, context,
                   num_workers=4, min_word_count=10, downsampling=1e-3):
  """Train a Word2Vec model on the tokenized corpus and unit-normalize its vectors.

  Args:
      twitter_corpus: iterable of token lists, passed to Word2Vec as the sentences.
      num_features: passed as ``vector_size`` (dimensionality of the word vectors).
      context: passed as ``window`` (context window size).
      num_workers: passed as ``workers`` (number of parallel threads).
      min_word_count: passed as ``min_count`` (minimum word count).
      downsampling: passed as ``sample`` (downsample setting for frequent words).

  Returns:
      The trained ``Word2Vec`` model, with ``model.wv`` normalized in place.
  """
  # Initializing the train model
  print(f"Training Word2Vec model with {num_features} features and {context} context window size....")
  model = word2vec.Word2Vec(
      twitter_corpus,
      workers=num_workers,
      vector_size=num_features,
      min_count=min_word_count,
      window=context,
      sample=downsampling,
  )

  # The original called model.init_sims(replace=True), which is deprecated in
  # gensim 4 and emits a warning on every call. unit_normalize_all() is the
  # supported way to normalize the vectors in place.
  # NOTE(review): this is destructive - confirm nothing downstream continues
  # training or needs the raw vector magnitudes.
  model.wv.unit_normalize_all()

  return model

# (num_features, context) pairs to compare
settings = [(20, 5), (50, 10), (75, 15)]

# Train one model per pair; models[i] corresponds to settings[i]
models = []
for num_features, context in settings:
    models.append(train_word2vec(twitter_corpus, num_features, context))

print("All models trained")






Training Word2Vec model with 20 features and 5 context window size....


  model.init_sims(replace=True)


Training Word2Vec model with 50 features and 10 context window size....




Training Word2Vec model with 75 features and 15 context window size....




All models trained


In [111]:
# Nearest neighbours of "covid" under each trained configuration
query = "covid"
for model, (num_features, context) in zip(models, settings):
    print(f"Model with {num_features} features and a context window of {context}:")
    neighbours = model.wv.most_similar(query)
    print(neighbours)




Model with 20 features and a context window of 5:
[('coronavirus', 0.880866527557373), ('current', 0.7242090702056885), ('repor', 0.6912548542022705), ('abroad', 0.6783382892608643), ('unknown', 0.6728489398956299), ('recover', 0.6718835830688477), ('underestimate', 0.6703233122825623), ('upsurge', 0.6688410043716431), ('emergence', 0.6636218428611755), ('arising', 0.6616073846817017)]
Model with 50 features and a context window of 10:
[('coronavirus', 0.42557772994041443), ('concerning', 0.4255542755126953), ('infe', 0.4168890118598938), ('races', 0.413379430770874), ('mln', 0.4132644832134247), ('numb', 0.41047918796539307), ('accurately', 0.4094751477241516), ('decrease', 0.40897104144096375), ('underestimate', 0.4088849127292633), ('mapping', 0.4070318639278412)]
Model with 75 features and a context window of 15:
[('infe', 0.466819167137146), ('infecti', 0.4321218729019165), ('climbed', 0.41643258929252625), ('incr', 0.407633900642395), ('mln', 0.40618956089019775), ('cou', 0.40054

In [112]:
# Nearest neighbours of "grocery" under each trained configuration
query = "grocery"
for model, (num_features, context) in zip(models, settings):
    print(f"Model with {num_features} features and a context window of {context}:")
    neighbours = model.wv.most_similar(query)
    print(neighbours)


Model with 20 features and a context window of 5:
[('shopping', 0.9466589689254761), ('store', 0.9262773394584656), ('shop', 0.9156877398490906), ('buses', 0.8945198059082031), ('dining', 0.8872543573379517), ('door', 0.8849291801452637), ('shoppers', 0.8841686844825745), ('restaurant', 0.8757009506225586), ('stores', 0.8668549656867981), ('outdoor', 0.8583507537841797)]
Model with 50 features and a context window of 10:
[('shopping', 0.9162435531616211), ('shop', 0.9002662301063538), ('store', 0.898946225643158), ('stores', 0.8803632855415344), ('restaurant', 0.8647862076759338), ('outdoor', 0.8508056998252869), ('mall', 0.8480250835418701), ('barber', 0.847423255443573), ('dining', 0.837873637676239), ('distanced', 0.8351019620895386)]
Model with 75 features and a context window of 15:
[('shop', 0.9381406903266907), ('store', 0.9290154576301575), ('stores', 0.9211677312850952), ('shopping', 0.903228223323822), ('pack', 0.8588300347328186), ('shoppers', 0.8552000522613525), ('customer

In [113]:
# Nearest neighbours of "virus" under each trained configuration
query = "virus"
for model, (num_features, context) in zip(models, settings):
    print(f"Model with {num_features} features and a context window of {context}:")
    neighbours = model.wv.most_similar(query)
    print(neighbours)

Model with 20 features and a context window of 5:
[('deadly', 0.7818624973297119), ('contagious', 0.7509030699729919), ('control', 0.7446085810661316), ('viruses', 0.7289645671844482), ('uncontrolled', 0.7250337600708008), ('mutation', 0.7197945713996887), ('bat', 0.7112981677055359), ('known', 0.7036827802658081), ('faster', 0.7033466696739197), ('disease', 0.6989998817443848)]
Model with 50 features and a context window of 10:
[('deadly', 0.6614598035812378), ('contagious', 0.6559390425682068), ('controlled', 0.6473415493965149), ('wuhan', 0.6172404289245605), ('mutation', 0.6028766632080078), ('bat', 0.5987941026687622), ('viruses', 0.5984153747558594), ('herd', 0.5943875312805176), ('underestimate', 0.5918603539466858), ('stopping', 0.5887576341629028)]
Model with 75 features and a context window of 15:
[('deadly', 0.6631927490234375), ('controlled', 0.6271705627441406), ('herd', 0.6147304177284241), ('mutation', 0.6085119247436523), ('contagious', 0.6076118350028992), ('wuhan', 0.

In [114]:
# Nearest neighbours of "corona" under each trained configuration
query = "corona"
for model, (num_features, context) in zip(models, settings):
    print(f"Model with {num_features} features and a context window of {context}:")
    neighbours = model.wv.most_similar(query)
    print(neighbours)

Model with 20 features and a context window of 5:
[('coronainfoch', 0.7689714431762695), ('worldnews', 0.7321677803993225), ('source', 0.7146664261817932), ('utc', 0.7091984748840332), ('shalinitelevision', 0.695848822593689), ('coronavirus', 0.6747753620147705), ('bat', 0.6710646152496338), ('repor', 0.6609982252120972), ('coronaoutbreak', 0.6604428887367249), ('wuhan', 0.6534472703933716)]
Model with 50 features and a context window of 10:
[('coronainfoch', 0.702916145324707), ('worldnews', 0.6888173222541809), ('source', 0.60324627161026), ('coronainindia', 0.5926268100738525), ('utc', 0.5862802267074585), ('repor', 0.5820522308349609), ('bat', 0.5597086548805237), ('modified', 0.539826512336731), ('virus', 0.5249661207199097), ('coronovirus', 0.5178118944168091)]
Model with 75 features and a context window of 15:
[('coronainfoch', 0.692014217376709), ('worldnews', 0.6426941156387329), ('source', 0.5553585290908813), ('bat', 0.5439143180847168), ('korona', 0.5367870926856995), ('eff

In [115]:
# Nearest neighbours of "pandemic" under each trained configuration
query = "pandemic"
for model, (num_features, context) in zip(models, settings):
    print(f"Model with {num_features} features and a context window of {context}:")
    neighbours = model.wv.most_similar(query)
    print(neighbours)

Model with 20 features and a context window of 5:
[('crisis', 0.8270560503005981), ('disruption', 0.7848400473594666), ('midst', 0.783033549785614), ('pan', 0.7794546484947205), ('wemerry', 0.7750093936920166), ('gracepoint', 0.7741866111755371), ('depression', 0.7621597051620483), ('climatechange', 0.7589793801307678), ('uncertainty', 0.7582972645759583), ('pande', 0.7578797340393066)]
Model with 50 features and a context window of 10:
[('crisis', 0.7169919610023499), ('pandemi', 0.6076443791389465), ('pande', 0.6002398729324341), ('pan', 0.5887707471847534), ('disruption', 0.5620629787445068), ('fears', 0.5607689619064331), ('pandem', 0.5534719228744507), ('wemerry', 0.5512080788612366), ('warming', 0.548931896686554), ('pand', 0.5382226705551147)]
Model with 75 features and a context window of 15:
[('crisis', 0.7158800959587097), ('downturns', 0.6055337190628052), ('disruption', 0.5616447329521179), ('pan', 0.5516571402549744), ('crises', 0.5516414642333984), ('uncertainty', 0.54411

In [116]:
# Nearest neighbours of "lockdown" under each trained configuration
query = "lockdown"
for model, (num_features, context) in zip(models, settings):
    print(f"Model with {num_features} features and a context window of {context}:")
    neighbours = model.wv.most_similar(query)
    print(neighbours)

Model with 20 features and a context window of 5:
[('restrictions', 0.9008350968360901), ('tourists', 0.8164910078048706), ('easing', 0.8008719682693481), ('travel', 0.7970287203788757), ('melbourne', 0.7955950498580933), ('imposed', 0.7938512563705444), ('australia', 0.7771329283714294), ('stricter', 0.7623907923698425), ('auckland', 0.7609186172485352), ('spain', 0.7499379515647888)]
Model with 50 features and a context window of 10:
[('restrictions', 0.7411366701126099), ('easing', 0.712208092212677), ('lifted', 0.6908116340637207), ('imposed', 0.6876134276390076), ('eased', 0.6648248434066772), ('stricter', 0.6464563012123108), ('curfew', 0.6428375244140625), ('complete', 0.6231052875518799), ('parts', 0.6230484843254089), ('melbourne', 0.6221562623977661)]
Model with 75 features and a context window of 15:
[('restrictions', 0.7326436042785645), ('imposed', 0.6752158403396606), ('lifted', 0.6513366103172302), ('manchester', 0.6435272693634033), ('easing', 0.6407857537269592), ('hot

In [117]:
# function to find odd word out
def odd_one_out(model, words):
  """Return the entry of ``words`` that ``model.wv.doesnt_match`` singles out.

  Args:
      model: object exposing ``wv.doesnt_match`` (here, a trained Word2Vec model).
      words: list of words to compare.
  """
  return model.wv.doesnt_match(words)

# Candidate words handed to every model for the odd-one-out check
words = ["covid", "grocery", "virus", "corona", "pandemic"]

for model, (num_features, context) in zip(models, settings):
    print(f"Model with {num_features} features and a context window of {context}:")
    outlier = odd_one_out(model, words)
    print(f"The odd word out is: {outlier}")


Model with 20 features and a context window of 5:
The odd word out is: grocery
Model with 50 features and a context window of 10:
The odd word out is: grocery
Model with 75 features and a context window of 15:
The odd word out is: grocery
