# Assignment 3 - Word Embedding

In [114]:
# import all the needed libraries
import pandas as pd
import gensim        # Gensim library for word embedding 
from gensim.models import KeyedVectors  # access to pre-trained word embeddings
from gensim.utils import simple_preprocess # preprocess the text data

import re
import nltk          # Natural Language Toolkit for text processing
from nltk.corpus import stopwords         # for stopwords
from nltk.stem import WordNetLemmatizer   # for lemmatization

In [2]:
# read the CSV file

df = pd.read_csv("master_covid19_2020_03_15-31 - master_covid19_2020_03_15-31.csv")

In [3]:
# select only the English tweets

texts = df.loc[df['lang'] == "en"]['text']
print(texts)

0        RT @realDonaldTrump: 30 DAYS TO SLOW THE SPREA...
2        RT @spectatorindex: JUST IN: 13 year old who t...
4        RT @StephMcNasty: The Coronavirus is really sh...
6        RT @CharlieDaniels: Hazel and myself send our ...
7        RT @PrakritiGaba: Last night in the ICU of a #...
                               ...                        
24339    RT @Penalosa_G: "If the Swedes are doing it, i...
24343    RT @SenWarren: The House changes to the corona...
24347    RT @sapnamadan: His wife is being quarantined ...
24348    RT @jsolomonReports: Chris Cuomo, CNN anchor a...
24349    RT @NorbertElekes: NEW: India reports 272 new ...
Name: text, Length: 14849, dtype: object


#### Pre-processing

In [118]:
# pre-processing code retrived from: https://gist.github.com/MrEliptik/b3f16179aa2f530781ef8ca9a16499af

def preprocess_text(text):
    
    # replace URLs with placeholder URL
    text = re.sub(r'http\S+', 'URL', text) 

    # replace user mentions with placeholder USER
    text = re.sub(r'@\w+', '@USER', text)
   
    # Tokenize text
    tokens = text.split()
    
    # Lemmatize tokens
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(token) for token in tokens]
    
    # Rejoin tokens into a string
    text = ' '.join(tokens)
    
    return text

In [119]:
prep_texts = texts.apply(preprocess_text)

In [120]:
prep_texts

0        RT @USER: 30 DAYS TO SLOW THE SPREAD #COVIDー19...
2        RT @USER: JUST IN: 13 year old who tested posi...
4        RT @USER: The Coronavirus is really showing wh...
6        RT @USER: Hazel and myself send our love and p...
7        RT @USER: Last night in the ICU of a #NYC hosp...
                               ...                        
24339    RT @USER: "If the Swedes are doing it, it must...
24343    RT @USER: The House change to the coronavirus ...
24347    RT @USER: His wife is being quarantined for 14...
24348    RT @USER: Chris Cuomo, CNN anchor and brother ...
24349    RT @USER: NEW: India report 272 new case of co...
Name: text, Length: 14849, dtype: object

In [121]:
# remove stopwords
def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]

stop_words = stopwords.words('english')
clean_texts = remove_stopwords(prep_texts)

In [122]:
print(clean_texts[1])

['rt', 'user', 'year', 'old', 'tested', 'positive', 'coronavirus', 'uk', 'ha', 'died']


In [117]:
# convert clean_texts list into a pandas series
clean_texts = pd.Series(clean_texts)

# split into samples of different sizes
sample1 = clean_texts.sample(n=1000, random_state=42)
sample2 = clean_texts.sample(n=1500, random_state=42)
sample3 = clean_texts.sample(n=2000, random_state=42)
sample4 = clean_texts.sample(n=2500, random_state=42)

# save samples to separate files
sample1.to_csv('sample1.csv', index=False)
sample2.to_csv('sample2.csv', index=False)
sample3.to_csv('sample3.csv', index=False)
sample4.to_csv('sample4.csv', index=False)

#### Sample 1 (1000)

In [116]:
# Train a Word2Vec model
w2v_model_sample1 = gensim.models.Word2Vec(
        sample1,
        vector_size = 50,
        window = 5,
        min_count = 1,
        workers = 1,
        sg = 1,
        epochs=10)

In [98]:
print(w2v_model_sample1)

Word2Vec<vocab=3757, vector_size=50, alpha=0.025>


In [99]:
# save the model
w2v_model_sample1.save('sample1.model')

# load the model
s1_model = KeyedVectors.load("sample1.model")
               
# Printing the first 20 words that in our vocabulary
print(s1_model.wv.index_to_key[0:20])

['user', 'rt', 'coronavirus', 'covid', 'url', 'trump', 'new', 'case', 'say', 'one', 'people', 'pandemic', 'china', 'positive', 'amp', 'death', 'virus', 'test', 'us', 'time']


In [102]:
# Get similar words for covid
print(f'covid:{w2v_model_sample1.wv.most_similar("covid", topn=10)}') 

# Get similar words for coronavirus
print(f'coronavirus:{w2v_model_sample1.wv.most_similar("coronavirus", topn=10)}')

# Get similar words for virus
print(f'virus:{w2v_model_sample1.wv.most_similar("virus", topn=10)}')

covid:[('mid', 0.9612714648246765), ('ecoin', 0.9607360363006592), ('register', 0.9586893916130066), ('slated', 0.9585748314857483), ('world', 0.95509272813797), ('please', 0.9550577402114868), ('way', 0.9533871412277222), ('corona', 0.9519773125648499), ('fools', 0.9513662457466125), ('results', 0.9513639211654663)]
coronavirus:[('url', 0.9954591393470764), ('cases', 0.9949049949645996), ('user', 0.994881808757782), ('england', 0.9945002198219299), ('th', 0.9941933155059814), ('said', 0.9939618110656738), ('country', 0.993722140789032), ('update', 0.9935998916625977), ('year', 0.993543803691864), ('outbreak', 0.9934816360473633)]
virus:[('wuhan', 0.9948258399963379), ('called', 0.992692232131958), ('name', 0.9924917221069336), ('hubei', 0.9924367666244507), ('chinese', 0.9923813343048096), ('market', 0.9923664927482605), ('bed', 0.9922053217887878), ('make', 0.9921841025352478), ('hoax', 0.9921717643737793), ('focus', 0.992072582244873)]


#### Sample 2 (1500)

In [103]:
# Train a Word2Vec model
w2v_model_sample2 = gensim.models.Word2Vec(
        sample2,
        vector_size = 50,
        window = 5,
        min_count = 1,
        workers = 1,
        sg = 1,
        epochs=10)

print(w2v_model_sample2)

Word2Vec<vocab=4744, vector_size=50, alpha=0.025>


In [104]:
# save the model
w2v_model_sample2.save('sample2.model')

# load the model
s2_model = KeyedVectors.load("sample2.model")
               
# Printing the first 20 words that in our vocabulary
print(s2_model.wv.index_to_key[0:20])

['user', 'rt', 'coronavirus', 'covid', 'url', 'trump', 'new', 'case', 'pandemic', 'people', 'positive', 'say', 'amp', 'china', 'death', 'one', 'us', 'time', 'president', 'virus']


In [105]:
# Get similar words for covid
print(f'covid:{w2v_model_sample2.wv.most_similar("covid", topn=10)}') 

# Get similar words for coronavirus
print(f'coronavirus:{w2v_model_sample2.wv.most_similar("coronavirus", topn=10)}')

# Get similar words for virus
print(f'virus:{w2v_model_sample2.wv.most_similar("virus", topn=10)}')

covid:[('mid', 0.94792640209198), ('fools', 0.9372761845588684), ('fighting', 0.9371911883354187), ('pandemic', 0.9367079138755798), ('may', 0.9351383447647095), ('via', 0.9344696402549744), ('third', 0.9338407516479492), ('lt', 0.9329092502593994), ('ecoin', 0.9322813749313354), ('two', 0.9316513538360596)]
coronavirus:[('breaking', 0.9843987226486206), ('first', 0.98375403881073), ('update', 0.9818281531333923), ('via', 0.980609118938446), ('pm', 0.9777363538742065), ('url', 0.9776174426078796), ('today', 0.9761503338813782), ('reported', 0.9742425084114075), ('uk', 0.9734832644462585), ('people', 0.9727181792259216)]
virus:[('wuhan', 0.9791972041130066), ('chinese', 0.9755679965019226), ('called', 0.9734005331993103), ('epicenter', 0.9728990197181702), ('corona', 0.9722326397895813), ('hubei', 0.9714521765708923), ('racist', 0.9700241684913635), ('make', 0.96927809715271), ('funky', 0.9692081212997437), ('came', 0.9688615202903748)]


#### Sample 3 (2000) 

In [106]:
# Train a Word2Vec model
w2v_model_sample3 = gensim.models.Word2Vec(
        sample3,
        vector_size = 50,
        window = 5,
        min_count = 1,
        workers = 1,
        sg = 1,
        epochs=10)

print(w2v_model_sample3)

Word2Vec<vocab=5704, vector_size=50, alpha=0.025>


In [107]:
# save the model
w2v_model_sample3.save('sample3.model')

# load the model
s3_model = KeyedVectors.load("sample3.model")
               
# Printing the first 20 words that in our vocabulary
print(s3_model.wv.index_to_key[0:20])

['user', 'rt', 'coronavirus', 'covid', 'url', 'trump', 'new', 'case', 'pandemic', 'people', 'positive', 'amp', 'say', 'china', 'death', 'test', 'one', 'us', 'president', 'time']


In [108]:
# Get similar words for covid
print(f'covid:{w2v_model_sample3.wv.most_similar("covid", topn=10)}') 

# Get similar words for coronavirus
print(f'coronavirus:{w2v_model_sample3.wv.most_similar("coronavirus", topn=10)}')

# Get similar words for virus
print(f'virus:{w2v_model_sample3.wv.most_similar("virus", topn=10)}')

covid:[('two', 0.9347029328346252), ('due', 0.9273419976234436), ('day', 0.9199617505073547), ('patient', 0.9188029766082764), ('health', 0.9145526885986328), ('stupidity', 0.9122781157493591), ('mid', 0.9114583134651184), ('swear', 0.9104412198066711), ('confirms', 0.9098772406578064), ('per', 0.9096565246582031)]
coronavirus:[('live', 0.9668242931365967), ('reporting', 0.9663543105125427), ('died', 0.9642297029495239), ('et', 0.9632266163825989), ('united', 0.9630823731422424), ('rate', 0.9630070924758911), ('fauci', 0.9619635939598083), ('end', 0.9617858529090881), ('watch', 0.9600171446800232), ('states', 0.9580413103103638)]
virus:[('wuhan', 0.9578901529312134), ('chinese', 0.9554226398468018), ('called', 0.9388936161994934), ('china', 0.9087188243865967), ('bro', 0.9020308256149292), ('president', 0.8953123688697815), ('impeachment', 0.8927450180053711), ('really', 0.8894342184066772), ('make', 0.8889905214309692), ('think', 0.8863164782524109)]


#### Sample 4 (2500) 

In [109]:
# Train a Word2Vec model
w2v_model_sample4 = gensim.models.Word2Vec(
        sample4,
        vector_size = 50,
        window = 5,
        min_count = 1,
        workers = 1,
        sg = 1,
        epochs=10)

print(w2v_model_sample4)

Word2Vec<vocab=6470, vector_size=50, alpha=0.025>


In [110]:
# save the model
w2v_model_sample4.save('sample3.model')

# load the model
s4_model = KeyedVectors.load("sample3.model")
               
# Printing the first 20 words that in our vocabulary
print(s4_model.wv.index_to_key[0:20])

['user', 'rt', 'coronavirus', 'url', 'covid', 'trump', 'case', 'new', 'pandemic', 'people', 'positive', 'amp', 'china', 'say', 'death', 'us', 'test', 'one', 'time', 'president']


In [111]:
# Get similar words for covid
print(f'covid:{w2v_model_sample4.wv.most_similar("covid", topn=10)}') 

# Get similar words for coronavirus
print(f'coronavirus:{w2v_model_sample4.wv.most_similar("coronavirus", topn=10)}')

# Get similar words for virus
print(f'virus:{w2v_model_sample4.wv.most_similar("virus", topn=10)}')

covid:[('treatment', 0.8980750441551208), ('url', 0.8853574991226196), ('due', 0.8849505186080933), ('amid', 0.8792859315872192), ('fighting', 0.879266619682312), ('story', 0.8787024617195129), ('mid', 0.878048837184906), ('two', 0.8771635890007019), ('lockdown', 0.8769465088844299), ('stupidity', 0.8740412592887878)]
coronavirus:[('tuesday', 0.9345659017562866), ('latest', 0.9324128031730652), ('rise', 0.9303003549575806), ('et', 0.9301825761795044), ('past', 0.9286713600158691), ('celebrity', 0.9285599589347839), ('updates', 0.9269441962242126), ('germany', 0.9258491396903992), ('announced', 0.9256832003593445), ('national', 0.9249150156974792)]
virus:[('wuhan', 0.9257809519767761), ('chinese', 0.9158656001091003), ('china', 0.8530799150466919), ('called', 0.8423262238502502), ('bro', 0.8248056173324585), ('corona', 0.8187914490699768), ('travelled', 0.8134801387786865), ('blame', 0.8023353219032288), ('chinesevirus', 0.7956954836845398), ('cover', 0.790327787399292)]


#### Use existing model

In [129]:
import gensim.downloader as api  # importing the gensim downloader

# load the GloVe Twitter pre-trained model of 50 dimensions
glove_vectors = api.load("glove-twitter-50") 

In [124]:
word = 'covid'
top10_sim = glove_vectors.most_similar(positive = word, topn = 10)
print(top10_sim)

KeyError: "Key 'covid' not present in vocabulary"

In [134]:
word = 'coronavirus'
top10_sim = glove_vectors.most_similar(positive = word, topn = 10)
print(top10_sim)

[('wikileaks', 0.6786817908287048), ('abduction', 0.6699736714363098), ('plotter', 0.6616529822349548), ('megaupload', 0.6569192409515381), ('amazigh', 0.6552634835243225), ('norovirus', 0.6525553464889526), ('rotavirus', 0.6516395211219788), ('sandero', 0.6494700312614441), ('yugoslavia', 0.6478256583213806), ('mikhail', 0.6476300358772278)]


In [135]:
word = 'virus'
top10_sim = glove_vectors.most_similar(positive = word, topn = 10)
print(top10_sim)

[('malware', 0.7857435941696167), ('hacker', 0.740230917930603), ('hepatitis', 0.7397270202636719), ('hiv', 0.7216421365737915), ('tumor', 0.7198520302772522), ('tsunami', 0.7155616283416748), ('anti', 0.710803210735321), ('hack', 0.7009544372558594), ('mosquito', 0.6940008997917175), ('hackers', 0.6936617493629456)]
