In [1]:
from datetime import datetime as dt
import unicodedata
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
from nltk.tokenize import sent_tokenize, word_tokenize
import gensim
from gensim.models import Word2Vec

import string
from nltk.corpus import stopwords

# Quality Checks

We will conduct two quality checks:
1. Analogy Relationships
2. Nearest Neighbors

## Word2Vec Models

We load our word2vec models trained in `embeddings_wordCategories.ipynb`.

In [3]:
# Load the previously saved word2vec model for each month (January through July)
(
    jan_model,
    feb_model,
    mar_model,
    apr_model,
    may_model,
    june_model,
    july_model,
) = (
    Word2Vec.load(model_path)
    for model_path in (
        "models/jan_model.model",
        "models/feb_model.model",
        "models/mar_model.model",
        "models/apr_model.model",
        "models/may_model.model",
        "models/june_model.model",
        "models/july_model.model",
    )
)

First, we compute the analogy score of each monthly model on both the syntactic and the semantic analogy sets.

## Analogy Relationships

### January 

In [4]:
# Evaluate the January model on the syntactic, then the semantic, analogy file
syntactic_analogy_word2vec, semantic_analogy_word2vec = (
    jan_model.wv.evaluate_word_analogies(analogy_file)
    for analogy_file in ('analogy/syntactic.txt', 'analogy/semantic.txt')
)

In [5]:
# Print the score (element 0 of each evaluation result), one line per analogy type
print(
    *(
        f"{label} {result[0]}"
        for label, result in (
            ('Syntactic: ', syntactic_analogy_word2vec),
            ('Semantic: ', semantic_analogy_word2vec),
        )
    ),
    sep='\n',
)

Syntactic:  0.07773808057163976
Semantic:  0.09923664122137404


### February 

In [6]:
# Evaluate the February model on the syntactic, then the semantic, analogy file
syntactic_analogy_word2vec, semantic_analogy_word2vec = (
    feb_model.wv.evaluate_word_analogies(analogy_file)
    for analogy_file in ('analogy/syntactic.txt', 'analogy/semantic.txt')
)

In [7]:
# Print the score (element 0 of each evaluation result), one line per analogy type
print(
    *(
        f"{label} {result[0]}"
        for label, result in (
            ('Syntactic: ', syntactic_analogy_word2vec),
            ('Semantic: ', semantic_analogy_word2vec),
        )
    ),
    sep='\n',
)

Syntactic:  0.12883231876037632
Semantic:  0.1786030061892131


### March 

In [8]:
# Evaluate the March model on the syntactic, then the semantic, analogy file
syntactic_analogy_word2vec, semantic_analogy_word2vec = (
    mar_model.wv.evaluate_word_analogies(analogy_file)
    for analogy_file in ('analogy/syntactic.txt', 'analogy/semantic.txt')
)

In [9]:
# Print the score (element 0 of each evaluation result), one line per analogy type
print(
    *(
        f"{label} {result[0]}"
        for label, result in (
            ('Syntactic: ', syntactic_analogy_word2vec),
            ('Semantic: ', semantic_analogy_word2vec),
        )
    ),
    sep='\n',
)

Syntactic:  0.14215899218071243
Semantic:  0.1953962396766825


### April

In [10]:
# Evaluate the April model on the syntactic, then the semantic, analogy file
syntactic_analogy_word2vec, semantic_analogy_word2vec = (
    apr_model.wv.evaluate_word_analogies(analogy_file)
    for analogy_file in ('analogy/syntactic.txt', 'analogy/semantic.txt')
)

In [11]:
# Print the score (element 0 of each evaluation result), one line per analogy type
print(
    *(
        f"{label} {result[0]}"
        for label, result in (
            ('Syntactic: ', syntactic_analogy_word2vec),
            ('Semantic: ', semantic_analogy_word2vec),
        )
    ),
    sep='\n',
)

Syntactic:  0.16232153941651148
Semantic:  0.16810592976396085


### May

In [12]:
# Evaluate the May model on the syntactic, then the semantic, analogy file
syntactic_analogy_word2vec, semantic_analogy_word2vec = (
    may_model.wv.evaluate_word_analogies(analogy_file)
    for analogy_file in ('analogy/syntactic.txt', 'analogy/semantic.txt')
)

In [13]:
# Print the score (element 0 of each evaluation result), one line per analogy type
print(
    *(
        f"{label} {result[0]}"
        for label, result in (
            ('Syntactic: ', syntactic_analogy_word2vec),
            ('Semantic: ', semantic_analogy_word2vec),
        )
    ),
    sep='\n',
)

Syntactic:  0.1566078613432374
Semantic:  0.16787459969661217


### June

In [14]:
# Evaluate the June model on the syntactic, then the semantic, analogy file
syntactic_analogy_word2vec, semantic_analogy_word2vec = (
    june_model.wv.evaluate_word_analogies(analogy_file)
    for analogy_file in ('analogy/syntactic.txt', 'analogy/semantic.txt')
)

In [15]:
# Print the score (element 0 of each evaluation result), one line per analogy type
print(
    *(
        f"{label} {result[0]}"
        for label, result in (
            ('Syntactic: ', syntactic_analogy_word2vec),
            ('Semantic: ', semantic_analogy_word2vec),
        )
    ),
    sep='\n',
)

Syntactic:  0.14471143531431024
Semantic:  0.1932935393258427


### July

In [16]:
# Evaluate the July model on the syntactic, then the semantic, analogy file
syntactic_analogy_word2vec, semantic_analogy_word2vec = (
    july_model.wv.evaluate_word_analogies(analogy_file)
    for analogy_file in ('analogy/syntactic.txt', 'analogy/semantic.txt')
)

In [17]:
# Print the score (element 0 of each evaluation result), one line per analogy type
print(
    *(
        f"{label} {result[0]}"
        for label, result in (
            ('Syntactic: ', syntactic_analogy_word2vec),
            ('Semantic: ', semantic_analogy_word2vec),
        )
    ),
    sep='\n',
)

Syntactic:  0.15183842568617298
Semantic:  0.15172298427567749


## Nearest Neighbors
We are going to look at the nearest neighbors for words like `car` and `food` where we can expect to see words related to vehicles and words related to food. 

### Car

In [18]:
# Ten most similar words to 'car' in the January model, as (word, similarity) pairs
jan_model.wv.most_similar(positive=['car'], topn=10)

[('webaul', 0.6419444680213928),
 ('cars', 0.6231793165206909),
 ('jaguar', 0.6225389838218689),
 ('rentals', 0.6224567890167236),
 ('wheel', 0.6180456280708313),
 ('fuse', 0.6026849746704102),
 ('ghosttown', 0.5929633378982544),
 ('gibbons', 0.5929521322250366),
 ('skoda', 0.5910767912864685),
 ('halewood', 0.5897871255874634)]

In [19]:
# Ten most similar words to 'car' in the February model, as (word, similarity) pairs
feb_model.wv.most_similar(positive=['car'], topn=10)

[('cars', 0.7409082055091858),
 ('vehicle', 0.7394551038742065),
 ('dealership', 0.6509534120559692),
 ('selfdriving', 0.6507112383842468),
 ('auto', 0.649484395980835),
 ('benz', 0.6451889872550964),
 ('dealerships', 0.6439517736434937),
 ('vehicles', 0.6356154680252075),
 ('electric', 0.6338905096054077),
 ('carmakers', 0.6305521726608276)]

In [20]:
# Ten most similar words to 'car' in the March model, as (word, similarity) pairs
mar_model.wv.most_similar(positive=['car'], topn=10)

[('cars', 0.7202456593513489),
 ('vehicle', 0.6924893856048584),
 ('bicycle', 0.6816522479057312),
 ('accident', 0.6595501899719238),
 ('suv', 0.6548261046409607),
 ('parked', 0.6494114398956299),
 ('scooter', 0.6460229754447937),
 ('quad', 0.6371784806251526),
 ('privatehire', 0.6344361901283264),
 ('minivan', 0.6318668127059937)]

In [21]:
# Ten most similar words to 'car' in the April model, as (word, similarity) pairs
apr_model.wv.most_similar(positive=['car'], topn=10)

[('cars', 0.7886906266212463),
 ('vehicle', 0.7585102915763855),
 ('parked', 0.6883108615875244),
 ('motorcycle', 0.6816992163658142),
 ('tires', 0.6785660982131958),
 ('bikes', 0.6752482652664185),
 ('rides', 0.6723027229309082),
 ('bike', 0.6718657612800598),
 ('bicycle', 0.6684290170669556),
 ('motorbike', 0.6613712906837463)]

In [22]:
# Ten most similar words to 'car' in the May model, as (word, similarity) pairs
may_model.wv.most_similar(positive=['car'], topn=10)

[('vehicle', 0.7911328077316284),
 ('cars', 0.7624408006668091),
 ('bicycle', 0.7585214972496033),
 ('bicycles', 0.7117853164672852),
 ('dealership', 0.6917356848716736),
 ('vehicles', 0.6787472367286682),
 ('bikes', 0.6775545477867126),
 ('scooter', 0.6659582853317261),
 ('motorbike', 0.6625623106956482),
 ('suv', 0.6607215404510498)]

In [23]:
# Ten most similar words to 'car' in the June model, as (word, similarity) pairs
june_model.wv.most_similar(positive=['car'], topn=10)

[('dealership', 0.7867011427879333),
 ('cars', 0.7773681282997131),
 ('vehicle', 0.7706952095031738),
 ('truck', 0.752478837966919),
 ('scooter', 0.7405713200569153),
 ('motorcycle', 0.7253866791725159),
 ('oncoming', 0.7225614786148071),
 ('tow', 0.7121523022651672),
 ('bikes', 0.7113629579544067),
 ('suv', 0.7105033993721008)]

In [24]:
# Ten most similar words to 'car' in the July model, as (word, similarity) pairs
july_model.wv.most_similar(positive=['car'], topn=10)

[('cars', 0.7965576648712158),
 ('vehicle', 0.7698507905006409),
 ('motorcycle', 0.7038615942001343),
 ('scooter', 0.6954658031463623),
 ('suv', 0.6807191371917725),
 ('suvs', 0.6779518127441406),
 ('bicycle', 0.6732996702194214),
 ('tow', 0.6677586436271667),
 ('amtrak', 0.6642812490463257),
 ('parked', 0.6576599478721619)]

We can see that for each month, we do have words that are associated with cars.

### Food

In [25]:
# Ten most similar words to 'food' in the January model, as (word, similarity) pairs
jan_model.wv.most_similar(positive=['food'], topn=10)

[('wellcooked', 0.6954552531242371),
 ('necessities', 0.6764464378356934),
 ('nutritious', 0.6761460900306702),
 ('essentials', 0.6693312525749207),
 ('fruits', 0.6598597764968872),
 ('unsanitary', 0.6593190431594849),
 ('ganesan', 0.6554536819458008),
 ('stored', 0.6516037583351135),
 ('contraband', 0.6512461304664612),
 ('perishable', 0.6499354243278503)]

In [26]:
# Ten most similar words to 'food' in the February model, as (word, similarity) pairs
feb_model.wv.most_similar(positive=['food'], topn=10)

[('bottled', 0.7137508988380432),
 ('essentials', 0.7079458236694336),
 ('nonperishable', 0.6901780366897583),
 ('tinned', 0.6887165904045105),
 ('packaged', 0.6866973638534546),
 ('fruits', 0.6864193677902222),
 ('diapers', 0.6857799887657166),
 ('vegetables', 0.6826168894767761),
 ('pastas', 0.6799852848052979),
 ('canned', 0.6771701574325562)]

In [27]:
# Ten most similar words to 'food' in the March model, as (word, similarity) pairs
mar_model.wv.most_similar(positive=['food'], topn=10)

[('parcels', 0.7904275059700012),
 ('pantries', 0.7715544104576111),
 ('essentials', 0.7600551247596741),
 ('toiletries', 0.7586692571640015),
 ('groceries', 0.738259494304657),
 ('nonperishable', 0.7236242294311523),
 ('perishable', 0.718319833278656),
 ('pantry', 0.714946985244751),
 ('fruits', 0.7083743810653687),
 ('rations', 0.7070468664169312)]

In [28]:
# Ten most similar words to 'food' in the April model, as (word, similarity) pairs
apr_model.wv.most_similar(positive=['food'], topn=10)

[('nonperishable', 0.7473136782646179),
 ('parcels', 0.747100293636322),
 ('packaged', 0.7314946055412292),
 ('grains', 0.7314867973327637),
 ('meals', 0.7289881706237793),
 ('necessities', 0.7113165855407715),
 ('essentials', 0.7085106372833252),
 ('groceries', 0.7081417441368103),
 ('pantries', 0.7071753740310669),
 ('toiletries', 0.7066894173622131)]

In [29]:
# Ten most similar words to 'food' in the May model, as (word, similarity) pairs
may_model.wv.most_similar(positive=['food'], topn=10)

[('rations', 0.7541570067405701),
 ('pantries', 0.7484046816825867),
 ('toiletries', 0.7461102604866028),
 ('groceries', 0.7415049076080322),
 ('meals', 0.7262624502182007),
 ('packaged', 0.7222284078598022),
 ('necessities', 0.720681369304657),
 ('grains', 0.7160525918006897),
 ('beverage', 0.7147131562232971),
 ('perishable', 0.7080368995666504)]

In [30]:
# Ten most similar words to 'food' in the June model, as (word, similarity) pairs
june_model.wv.most_similar(positive=['food'], topn=10)

[('groceries', 0.7549852728843689),
 ('nutritious', 0.731977105140686),
 ('parcels', 0.7210739254951477),
 ('meals', 0.7209938764572144),
 ('shelfstable', 0.7108553051948547),
 ('nonperishable', 0.707969605922699),
 ('pantries', 0.7064752578735352),
 ('pantry', 0.7020173072814941),
 ('necessities', 0.7001433968544006),
 ('diapers', 0.6960179805755615)]

In [31]:
# Ten most similar words to 'food' in the July model, as (word, similarity) pairs
july_model.wv.most_similar(positive=['food'], topn=10)

[('pantries', 0.7791734337806702),
 ('pantry', 0.7139326333999634),
 ('meals', 0.6907495260238647),
 ('meal', 0.6881261467933655),
 ('parcels', 0.6852877140045166),
 ('nutritious', 0.6851959824562073),
 ('groceries', 0.6845822334289551),
 ('nonperishable', 0.6805075407028198),
 ('drink', 0.6780723929405212),
 ('necessities', 0.675387442111969)]

We can see that for each month, we do have words that are associated with food. It is interesting to note that, since this period falls during the pandemic, the neighbors are dominated by words such as pantries, necessities, and rations rather than words like hamburger or hot dog. This fits the trend of people stocking up on food in case of a lockdown or shutdown in their city.