In [1]:
import os
import pickle
import sqlite3
from datetime import datetime

import gensim
import pandas as pd
from gensim.models import KeyedVectors
from gensim.models import Word2Vec
from gensim.models.phrases import Phrases, ENGLISH_CONNECTOR_WORDS
from gensim.test.utils import datapath
from scipy.sparse import save_npz, csc_matrix, load_npz
from tqdm import tqdm

In [2]:
# File locations used throughout the notebook.
model_path = "./models/"            # prefix prepended to each saved model name
corpus_file = "./data/tokens.txt"   # corpus read through LineSentence
sparse_dtm = "./data/dtm.npz"       # destination of the sparse document-term matrix

## Train and test embedding models

In [5]:
# Learn ngrams
# Iterate over the corpus file through gensim's LineSentence wrapper.
sentences = gensim.models.word2vec.LineSentence(corpus_file)

# Reuse the saved phrase model when it is already on disk; otherwise learn it
# from the corpus. Without the fallback a fresh "Restart & Run All" fails,
# because the file is only written by a later cell.
ngrams_file = './data/ngrams'
if os.path.exists(ngrams_file):
    ngrams = Phrases.load(ngrams_file)
else:
    ngrams = Phrases(sentences, connector_words=ENGLISH_CONNECTOR_WORDS, min_count=15)

In [6]:
# Hyperparameter grid, following Arseniev-Koehler et al. (2021), appendix 2.
# They also use CBOW with negative sampling (negative=5).
vector_size = [50, 100, 200, 300]
window_size = [5, 7, 10]

In [7]:
# Train one Word2Vec model per (window, vector size) combination, evaluate it
# on the analogy and word-pair files resolved through gensim's datapath(),
# and save it under model_path.

# Create the output directory up front so model.save() cannot fail on a
# missing folder after a long training run.
os.makedirs(model_path, exist_ok=True)

for window in window_size:
    for vector in vector_size:
        name = f"gensim_model_window{window}_vector_{vector}"
        print(f"Starting with {name} at {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
        # sg=0 (CBOW) and negative=5 are written out to document the intended
        # setup; both equal gensim's defaults, so training is unchanged.
        model = Word2Vec(
            ngrams[sentences],
            vector_size=vector,
            window=window,
            epochs=10,
            min_count=15,
            sg=0,
            negative=5,
        )

        # Element 0 of the analogy result is printed as the overall score.
        google_test = model.wv.evaluate_word_analogies(datapath('questions-words.txt'))
        print(google_test[0])
        similarities = model.wv.evaluate_word_pairs(datapath('wordsim353.tsv'))
        print(similarities)
        print()
        print("-"*80)

        # Save the full model so it can be reloaded later by name.
        filename=f"{model_path}{name}"
        model.save(filename)

print(f"Finished at {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")

Starting with gensim_model_window5_vector_50 at 2022-02-21 11:26:48
0.2826943747760659
{'section': 'Total accuracy', 'correct': [('ATHENS', 'GREECE', 'STOCKHOLM', 'SWEDEN'), ('BAGHDAD', 'IRAQ', 'PARIS', 'FRANCE'), ('BANGKOK', 'THAILAND', 'MOSCOW', 'RUSSIA'), ('BEIJING', 'CHINA', 'MOSCOW', 'RUSSIA'), ('BERLIN', 'GERMANY', 'PARIS', 'FRANCE'), ('LONDON', 'ENGLAND', 'PARIS', 'FRANCE'), ('LONDON', 'ENGLAND', 'BERLIN', 'GERMANY'), ('PARIS', 'FRANCE', 'BERLIN', 'GERMANY'), ('STOCKHOLM', 'SWEDEN', 'ROME', 'ITALY'), ('TOKYO', 'JAPAN', 'PARIS', 'FRANCE'), ('DUBLIN', 'IRELAND', 'MOSCOW', 'RUSSIA'), ('LONDON', 'ENGLAND', 'PARIS', 'FRANCE'), ('VIENNA', 'AUSTRIA', 'ATHENS', 'GREECE'), ('PHILADELPHIA', 'PENNSYLVANIA', 'PORTLAND', 'OREGON'), ('DALLAS', 'TEXAS', 'MIAMI', 'FLORIDA'), ('AUSTIN', 'TEXAS', 'MIAMI', 'FLORIDA'), ('AUSTIN', 'TEXAS', 'ORLANDO', 'FLORIDA'), ('SACRAMENTO', 'CALIFORNIA', 'ATLANTA', 'GEORGIA'), ('MESA', 'ARIZONA', 'CLEVELAND', 'OHIO'), ('MESA', 'ARIZONA', 'CINCINNATI', 'OHIO'), ('

0.38068792547474023
{'section': 'Total accuracy', 'correct': [('ATHENS', 'GREECE', 'MOSCOW', 'RUSSIA'), ('ATHENS', 'GREECE', 'ROME', 'ITALY'), ('BAGHDAD', 'IRAQ', 'PARIS', 'FRANCE'), ('BAGHDAD', 'IRAQ', 'TOKYO', 'JAPAN'), ('BANGKOK', 'THAILAND', 'ROME', 'ITALY'), ('BEIJING', 'CHINA', 'MOSCOW', 'RUSSIA'), ('BEIJING', 'CHINA', 'TOKYO', 'JAPAN'), ('BERLIN', 'GERMANY', 'LONDON', 'ENGLAND'), ('BERLIN', 'GERMANY', 'PARIS', 'FRANCE'), ('LONDON', 'ENGLAND', 'PARIS', 'FRANCE'), ('LONDON', 'ENGLAND', 'BERLIN', 'GERMANY'), ('MOSCOW', 'RUSSIA', 'ATHENS', 'GREECE'), ('PARIS', 'FRANCE', 'BERLIN', 'GERMANY'), ('DUBLIN', 'IRELAND', 'MOSCOW', 'RUSSIA'), ('KIEV', 'UKRAINE', 'MOSCOW', 'RUSSIA'), ('LONDON', 'ENGLAND', 'PARIS', 'FRANCE'), ('MOSCOW', 'RUSSIA', 'TALLINN', 'ESTONIA'), ('CHICAGO', 'ILLINOIS', 'MIAMI', 'FLORIDA'), ('PHILADELPHIA', 'PENNSYLVANIA', 'PORTLAND', 'OREGON'), ('AUSTIN', 'TEXAS', 'MIAMI', 'FLORIDA'), ('AUSTIN', 'TEXAS', 'ORLANDO', 'FLORIDA'), ('DETROIT', 'MICHIGAN', 'MIAMI', 'FLORIDA')

Starting with gensim_model_window5_vector_200 at 2022-02-21 12:28:14
0.44509136510211394
{'section': 'Total accuracy', 'correct': [('ATHENS', 'GREECE', 'CAIRO', 'EGYPT'), ('ATHENS', 'GREECE', 'STOCKHOLM', 'SWEDEN'), ('BAGHDAD', 'IRAQ', 'PARIS', 'FRANCE'), ('BANGKOK', 'THAILAND', 'BERLIN', 'GERMANY'), ('BANGKOK', 'THAILAND', 'ROME', 'ITALY'), ('BEIJING', 'CHINA', 'STOCKHOLM', 'SWEDEN'), ('BEIJING', 'CHINA', 'TOKYO', 'JAPAN'), ('BERLIN', 'GERMANY', 'PARIS', 'FRANCE'), ('LONDON', 'ENGLAND', 'PARIS', 'FRANCE'), ('LONDON', 'ENGLAND', 'BERLIN', 'GERMANY'), ('LONDON', 'ENGLAND', 'CAIRO', 'EGYPT'), ('MOSCOW', 'RUSSIA', 'STOCKHOLM', 'SWEDEN'), ('MOSCOW', 'RUSSIA', 'CAIRO', 'EGYPT'), ('PARIS', 'FRANCE', 'BERLIN', 'GERMANY'), ('ROME', 'ITALY', 'STOCKHOLM', 'SWEDEN'), ('STOCKHOLM', 'SWEDEN', 'BERLIN', 'GERMANY'), ('STOCKHOLM', 'SWEDEN', 'CAIRO', 'EGYPT'), ('TOKYO', 'JAPAN', 'BERLIN', 'GERMANY'), ('ATHENS', 'GREECE', 'CAIRO', 'EGYPT'), ('BANGKOK', 'THAILAND', 'BERLIN', 'GERMANY'), ('COPENHAGEN', 'D

Starting with gensim_model_window5_vector_300 at 2022-02-21 13:00:24
0.45378000716589034
{'section': 'Total accuracy', 'correct': [('ATHENS', 'GREECE', 'CAIRO', 'EGYPT'), ('BAGHDAD', 'IRAQ', 'PARIS', 'FRANCE'), ('BANGKOK', 'THAILAND', 'ROME', 'ITALY'), ('BANGKOK', 'THAILAND', 'ATHENS', 'GREECE'), ('BEIJING', 'CHINA', 'CAIRO', 'EGYPT'), ('BEIJING', 'CHINA', 'MOSCOW', 'RUSSIA'), ('BEIJING', 'CHINA', 'TOKYO', 'JAPAN'), ('BERLIN', 'GERMANY', 'PARIS', 'FRANCE'), ('LONDON', 'ENGLAND', 'PARIS', 'FRANCE'), ('LONDON', 'ENGLAND', 'CAIRO', 'EGYPT'), ('MADRID', 'SPAIN', 'PARIS', 'FRANCE'), ('MOSCOW', 'RUSSIA', 'CAIRO', 'EGYPT'), ('PARIS', 'FRANCE', 'BERLIN', 'GERMANY'), ('PARIS', 'FRANCE', 'LONDON', 'ENGLAND'), ('ROME', 'ITALY', 'STOCKHOLM', 'SWEDEN'), ('STOCKHOLM', 'SWEDEN', 'ROME', 'ITALY'), ('TOKYO', 'JAPAN', 'BERLIN', 'GERMANY'), ('TOKYO', 'JAPAN', 'CAIRO', 'EGYPT'), ('TOKYO', 'JAPAN', 'PARIS', 'FRANCE'), ('ATHENS', 'GREECE', 'CAIRO', 'EGYPT'), ('BEIJING', 'CHINA', 'CAIRO', 'EGYPT'), ('KIEV', 

Starting with gensim_model_window7_vector_50 at 2022-02-21 13:32:56
0.2861877463274812
{'section': 'Total accuracy', 'correct': [('ATHENS', 'GREECE', 'ROME', 'ITALY'), ('BAGHDAD', 'IRAQ', 'PARIS', 'FRANCE'), ('BEIJING', 'CHINA', 'MOSCOW', 'RUSSIA'), ('BERLIN', 'GERMANY', 'PARIS', 'FRANCE'), ('LONDON', 'ENGLAND', 'PARIS', 'FRANCE'), ('LONDON', 'ENGLAND', 'BERLIN', 'GERMANY'), ('PARIS', 'FRANCE', 'BERLIN', 'GERMANY'), ('STOCKHOLM', 'SWEDEN', 'PARIS', 'FRANCE'), ('TOKYO', 'JAPAN', 'PARIS', 'FRANCE'), ('BUDAPEST', 'HUNGARY', 'KIEV', 'UKRAINE'), ('DHAKA', 'BANGLADESH', 'MOSCOW', 'RUSSIA'), ('DUBLIN', 'IRELAND', 'MOSCOW', 'RUSSIA'), ('KIEV', 'UKRAINE', 'MOSCOW', 'RUSSIA'), ('LONDON', 'ENGLAND', 'PARIS', 'FRANCE'), ('MOSCOW', 'RUSSIA', 'TALLINN', 'ESTONIA'), ('HOUSTON', 'TEXAS', 'CINCINNATI', 'OHIO'), ('PHOENIX', 'ARIZONA', 'CINCINNATI', 'OHIO'), ('AUSTIN', 'TEXAS', 'MIAMI', 'FLORIDA'), ('AUSTIN', 'TEXAS', 'TAMPA', 'FLORIDA'), ('AUSTIN', 'TEXAS', 'ORLANDO', 'FLORIDA'), ('BOSTON', 'MASSACHUSET

Starting with gensim_model_window7_vector_100 at 2022-02-21 14:02:48
0.38570404872805447
{'section': 'Total accuracy', 'correct': [('ATHENS', 'GREECE', 'MOSCOW', 'RUSSIA'), ('BAGHDAD', 'IRAQ', 'BERLIN', 'GERMANY'), ('BAGHDAD', 'IRAQ', 'PARIS', 'FRANCE'), ('BANGKOK', 'THAILAND', 'MOSCOW', 'RUSSIA'), ('BANGKOK', 'THAILAND', 'ATHENS', 'GREECE'), ('BEIJING', 'CHINA', 'MOSCOW', 'RUSSIA'), ('BEIJING', 'CHINA', 'TOKYO', 'JAPAN'), ('BERLIN', 'GERMANY', 'PARIS', 'FRANCE'), ('LONDON', 'ENGLAND', 'PARIS', 'FRANCE'), ('LONDON', 'ENGLAND', 'BERLIN', 'GERMANY'), ('MADRID', 'SPAIN', 'TOKYO', 'JAPAN'), ('PARIS', 'FRANCE', 'BERLIN', 'GERMANY'), ('STOCKHOLM', 'SWEDEN', 'PARIS', 'FRANCE'), ('TOKYO', 'JAPAN', 'PARIS', 'FRANCE'), ('BAGHDAD', 'IRAQ', 'BERLIN', 'GERMANY'), ('DUBLIN', 'IRELAND', 'MOSCOW', 'RUSSIA'), ('LONDON', 'ENGLAND', 'PARIS', 'FRANCE'), ('TALLINN', 'ESTONIA', 'ATHENS', 'GREECE'), ('VIENNA', 'AUSTRIA', 'CAIRO', 'EGYPT'), ('CHICAGO', 'ILLINOIS', 'ATLANTA', 'GEORGIA'), ('CHICAGO', 'ILLINOIS'

Starting with gensim_model_window7_vector_200 at 2022-02-21 14:31:56
0.4441060551773558
{'section': 'Total accuracy', 'correct': [('BAGHDAD', 'IRAQ', 'PARIS', 'FRANCE'), ('BANGKOK', 'THAILAND', 'ROME', 'ITALY'), ('BEIJING', 'CHINA', 'BERLIN', 'GERMANY'), ('BEIJING', 'CHINA', 'TOKYO', 'JAPAN'), ('BERLIN', 'GERMANY', 'PARIS', 'FRANCE'), ('LONDON', 'ENGLAND', 'PARIS', 'FRANCE'), ('LONDON', 'ENGLAND', 'BERLIN', 'GERMANY'), ('MADRID', 'SPAIN', 'PARIS', 'FRANCE'), ('MADRID', 'SPAIN', 'LONDON', 'ENGLAND'), ('MOSCOW', 'RUSSIA', 'STOCKHOLM', 'SWEDEN'), ('MOSCOW', 'RUSSIA', 'BERLIN', 'GERMANY'), ('MOSCOW', 'RUSSIA', 'CAIRO', 'EGYPT'), ('OSLO', 'NORWAY', 'CAIRO', 'EGYPT'), ('PARIS', 'FRANCE', 'BERLIN', 'GERMANY'), ('ROME', 'ITALY', 'STOCKHOLM', 'SWEDEN'), ('STOCKHOLM', 'SWEDEN', 'BERLIN', 'GERMANY'), ('STOCKHOLM', 'SWEDEN', 'CAIRO', 'EGYPT'), ('TOKYO', 'JAPAN', 'BERLIN', 'GERMANY'), ('BEIJING', 'CHINA', 'BERLIN', 'GERMANY'), ('BERLIN', 'GERMANY', 'BRUSSELS', 'BELGIUM'), ('LONDON', 'ENGLAND', 'PAR

Starting with gensim_model_window7_vector_300 at 2022-02-21 15:03:42
0.46855965603726263
{'section': 'Total accuracy', 'correct': [('ATHENS', 'GREECE', 'CAIRO', 'EGYPT'), ('ATHENS', 'GREECE', 'ROME', 'ITALY'), ('ATHENS', 'GREECE', 'STOCKHOLM', 'SWEDEN'), ('BAGHDAD', 'IRAQ', 'BERLIN', 'GERMANY'), ('BAGHDAD', 'IRAQ', 'CAIRO', 'EGYPT'), ('BAGHDAD', 'IRAQ', 'PARIS', 'FRANCE'), ('BANGKOK', 'THAILAND', 'CAIRO', 'EGYPT'), ('BANGKOK', 'THAILAND', 'ROME', 'ITALY'), ('BEIJING', 'CHINA', 'TOKYO', 'JAPAN'), ('BERLIN', 'GERMANY', 'LONDON', 'ENGLAND'), ('BERLIN', 'GERMANY', 'PARIS', 'FRANCE'), ('CAIRO', 'EGYPT', 'ATHENS', 'GREECE'), ('LONDON', 'ENGLAND', 'PARIS', 'FRANCE'), ('LONDON', 'ENGLAND', 'TOKYO', 'JAPAN'), ('LONDON', 'ENGLAND', 'BERLIN', 'GERMANY'), ('LONDON', 'ENGLAND', 'CAIRO', 'EGYPT'), ('MADRID', 'SPAIN', 'LONDON', 'ENGLAND'), ('MOSCOW', 'RUSSIA', 'STOCKHOLM', 'SWEDEN'), ('MOSCOW', 'RUSSIA', 'CAIRO', 'EGYPT'), ('OSLO', 'NORWAY', 'CAIRO', 'EGYPT'), ('PARIS', 'FRANCE', 'BERLIN', 'GERMANY')

Starting with gensim_model_window10_vector_50 at 2022-02-21 15:35:45
0.29442852024364025
{'section': 'Total accuracy', 'correct': [('BAGHDAD', 'IRAQ', 'BERLIN', 'GERMANY'), ('BAGHDAD', 'IRAQ', 'PARIS', 'FRANCE'), ('BAGHDAD', 'IRAQ', 'STOCKHOLM', 'SWEDEN'), ('BANGKOK', 'THAILAND', 'MOSCOW', 'RUSSIA'), ('BANGKOK', 'THAILAND', 'ATHENS', 'GREECE'), ('BEIJING', 'CHINA', 'MOSCOW', 'RUSSIA'), ('BEIJING', 'CHINA', 'ATHENS', 'GREECE'), ('BERLIN', 'GERMANY', 'LONDON', 'ENGLAND'), ('BERLIN', 'GERMANY', 'PARIS', 'FRANCE'), ('LONDON', 'ENGLAND', 'PARIS', 'FRANCE'), ('LONDON', 'ENGLAND', 'BERLIN', 'GERMANY'), ('MOSCOW', 'RUSSIA', 'ATHENS', 'GREECE'), ('PARIS', 'FRANCE', 'BERLIN', 'GERMANY'), ('STOCKHOLM', 'SWEDEN', 'PARIS', 'FRANCE'), ('BAGHDAD', 'IRAQ', 'BERLIN', 'GERMANY'), ('DHAKA', 'BANGLADESH', 'MOSCOW', 'RUSSIA'), ('DUBLIN', 'IRELAND', 'MOSCOW', 'RUSSIA'), ('LONDON', 'ENGLAND', 'PARIS', 'FRANCE'), ('VIENNA', 'AUSTRIA', 'ATHENS', 'GREECE'), ('PHOENIX', 'ARIZONA', 'AUSTIN', 'TEXAS'), ('PHOENIX',

0.39600501612325334
{'section': 'Total accuracy', 'correct': [('ATHENS', 'GREECE', 'MOSCOW', 'RUSSIA'), ('BAGHDAD', 'IRAQ', 'PARIS', 'FRANCE'), ('BANGKOK', 'THAILAND', 'ATHENS', 'GREECE'), ('BEIJING', 'CHINA', 'MOSCOW', 'RUSSIA'), ('BEIJING', 'CHINA', 'TOKYO', 'JAPAN'), ('BERLIN', 'GERMANY', 'LONDON', 'ENGLAND'), ('BERLIN', 'GERMANY', 'PARIS', 'FRANCE'), ('LONDON', 'ENGLAND', 'PARIS', 'FRANCE'), ('LONDON', 'ENGLAND', 'BERLIN', 'GERMANY'), ('MOSCOW', 'RUSSIA', 'STOCKHOLM', 'SWEDEN'), ('MOSCOW', 'RUSSIA', 'BEIJING', 'CHINA'), ('PARIS', 'FRANCE', 'BERLIN', 'GERMANY'), ('STOCKHOLM', 'SWEDEN', 'MOSCOW', 'RUSSIA'), ('TOKYO', 'JAPAN', 'PARIS', 'FRANCE'), ('COPENHAGEN', 'DENMARK', 'DUBLIN', 'IRELAND'), ('DUBLIN', 'IRELAND', 'MOSCOW', 'RUSSIA'), ('KIEV', 'UKRAINE', 'MOSCOW', 'RUSSIA'), ('LIMA', 'PERU', 'ROME', 'ITALY'), ('LONDON', 'ENGLAND', 'PARIS', 'FRANCE'), ('MOSCOW', 'RUSSIA', 'STOCKHOLM', 'SWEDEN'), ('TALLINN', 'ESTONIA', 'TOKYO', 'JAPAN'), ('TALLINN', 'ESTONIA', 'ATHENS', 'GREECE'), ('VI

Starting with gensim_model_window10_vector_200 at 2022-02-21 16:35:40
0.4511823719097098
{'section': 'Total accuracy', 'correct': [('ATHENS', 'GREECE', 'MOSCOW', 'RUSSIA'), ('BAGHDAD', 'IRAQ', 'BERLIN', 'GERMANY'), ('BAGHDAD', 'IRAQ', 'PARIS', 'FRANCE'), ('BANGKOK', 'THAILAND', 'CAIRO', 'EGYPT'), ('BANGKOK', 'THAILAND', 'ATHENS', 'GREECE'), ('BEIJING', 'CHINA', 'BERLIN', 'GERMANY'), ('BEIJING', 'CHINA', 'MOSCOW', 'RUSSIA'), ('BEIJING', 'CHINA', 'TOKYO', 'JAPAN'), ('BERLIN', 'GERMANY', 'LONDON', 'ENGLAND'), ('BERLIN', 'GERMANY', 'PARIS', 'FRANCE'), ('CAIRO', 'EGYPT', 'ATHENS', 'GREECE'), ('LONDON', 'ENGLAND', 'PARIS', 'FRANCE'), ('LONDON', 'ENGLAND', 'BERLIN', 'GERMANY'), ('MADRID', 'SPAIN', 'PARIS', 'FRANCE'), ('MADRID', 'SPAIN', 'LONDON', 'ENGLAND'), ('MOSCOW', 'RUSSIA', 'CAIRO', 'EGYPT'), ('OSLO', 'NORWAY', 'CAIRO', 'EGYPT'), ('PARIS', 'FRANCE', 'BERLIN', 'GERMANY'), ('PARIS', 'FRANCE', 'LONDON', 'ENGLAND'), ('ROME', 'ITALY', 'LONDON', 'ENGLAND'), ('STOCKHOLM', 'SWEDEN', 'CAIRO', 'EG

Starting with gensim_model_window10_vector_300 at 2022-02-21 17:06:41
0.4572733787173056
{'section': 'Total accuracy', 'correct': [('ATHENS', 'GREECE', 'CAIRO', 'EGYPT'), ('ATHENS', 'GREECE', 'MOSCOW', 'RUSSIA'), ('ATHENS', 'GREECE', 'ROME', 'ITALY'), ('ATHENS', 'GREECE', 'STOCKHOLM', 'SWEDEN'), ('BAGHDAD', 'IRAQ', 'BERLIN', 'GERMANY'), ('BAGHDAD', 'IRAQ', 'PARIS', 'FRANCE'), ('BAGHDAD', 'IRAQ', 'STOCKHOLM', 'SWEDEN'), ('BANGKOK', 'THAILAND', 'CAIRO', 'EGYPT'), ('BANGKOK', 'THAILAND', 'STOCKHOLM', 'SWEDEN'), ('BANGKOK', 'THAILAND', 'ATHENS', 'GREECE'), ('BEIJING', 'CHINA', 'BERLIN', 'GERMANY'), ('BEIJING', 'CHINA', 'TOKYO', 'JAPAN'), ('BERLIN', 'GERMANY', 'LONDON', 'ENGLAND'), ('BERLIN', 'GERMANY', 'PARIS', 'FRANCE'), ('LONDON', 'ENGLAND', 'MADRID', 'SPAIN'), ('LONDON', 'ENGLAND', 'PARIS', 'FRANCE'), ('LONDON', 'ENGLAND', 'STOCKHOLM', 'SWEDEN'), ('LONDON', 'ENGLAND', 'BERLIN', 'GERMANY'), ('LONDON', 'ENGLAND', 'CAIRO', 'EGYPT'), ('MADRID', 'SPAIN', 'PARIS', 'FRANCE'), ('MADRID', 'SPAIN

Finished at 2022-02-21 17:39:38


## Save ngrams applied to corpus

In [6]:
# Read the tokenized comments back from disk.
# NOTE: pickle.load can execute arbitrary code; only use it on trusted files.
with open('./data/tokenized_comments.p', 'rb') as pickle_file:
    comments = pickle.load(pickle_file)

In [7]:
# Peek at the first tokenized comment.
comments[0]

['additionally',
 'thank',
 'you',
 'so',
 'much',
 'for',
 'caring',
 'enough',
 'to',
 'ask',
 'in',
 'a',
 'genuinely',
 'curious',
 'and',
 'respectful',
 'way',
 'i',
 "'m",
 'curious',
 'to',
 'hear',
 'your',
 'thoughts']

In [8]:
# Apply the phrase model to the first comment to see which tokens get merged.
ngrams[comments[0]]

['additionally',
 'thank_you',
 'so',
 'much',
 'for',
 'caring',
 'enough',
 'to',
 'ask',
 'in',
 'a',
 'genuinely_curious',
 'and',
 'respectful',
 'way',
 'i',
 "'m_curious",
 'to',
 'hear',
 'your',
 'thoughts']

In [10]:
# Run every comment through the phrase model, keeping the original order.
tokenized_comments = [ngrams[comment] for comment in tqdm(comments)]

100%|██████████████████████████████████████████████████████████████████████| 2118317/2118317 [10:07<00:00, 3488.65it/s]


In [14]:
# Sanity check: show the first five phrase-merged comments.
print(tokenized_comments[:5])

[['additionally', 'thank_you', 'so', 'much', 'for', 'caring', 'enough', 'to', 'ask', 'in', 'a', 'genuinely_curious', 'and', 'respectful', 'way', 'i', "'m_curious", 'to', 'hear', 'your', 'thoughts'], ['i', 'am', 'watching', 'shape_of_water', 'it', 'says', 'some', 'stuff', 'in', 'russian', 'how', 'can', 'i', 'find', 'out', 'what', 'they', 'are', 'saying'], ['extremely', 'underwhelmed', 'by', 'the', 'film', 'it', 'just', 'felt', 'dull', 'the', "'romance", 'between', 'fish-dick', 'and', 'mute-chick', 'was', 'too', 'quick', 'and', 'i', 'never', 'felt', 'any', 'connection', 'or', 'empathy', 'for', 'the', "'asset", 'the', 'pie-shop', 'scenes', 'were', 'so', 'quick', 'and', 'short', 'and', 'ultimately', 'meant', 'nothing', 'i', 'want', 'to', 'get', 'to', 'know', 'you', 'ew', 'no', 'and', 'that', 'was', 'it', 'just', 'the', 'film', 'saying', 'hah', 'look', 'the', '40', "'s", 'hated', 'gays', 'the', 'strange', 'musical', 'imagination', 'scene', 'was', 'a', 'complete', 'tonal', 'change', 'from', 

In [9]:
# Persist the phrase model; the n-gram cell near the top of the notebook
# loads it from this same path.
ngrams.save('./data/ngrams')

In [11]:
# Store the phrase-merged comments so later sections can reload them.
with open('./data/ngram_comments.p', 'wb') as pickle_file:
    pickle.dump(tokenized_comments, pickle_file)

## Save Document-Term Matrix

For use with CMD. The matrix is saved in CSC format because that is the preferred sparse format in R.

In [3]:
# Reload the phrase-merged comments written earlier in the notebook.
# NOTE: pickle.load can execute arbitrary code; only use it on trusted files.
with open('./data/ngram_comments.p', 'rb') as pickle_file:
    tokenized_comments = pickle.load(pickle_file)

In [13]:
# Turn each token list into a single space-separated string.
body_list = [" ".join(comment) for comment in tqdm(tokenized_comments)]

100%|█████████████████████████████████████████████████████████████████████| 2118317/2118317 [00:26<00:00, 78656.60it/s]


In [14]:
# Load the comment table (semicolon-separated).
# NOTE(review): this rebinds `comments`, which earlier in the notebook held the
# list of tokenized comments — a distinct name would avoid hidden-state mix-ups.
comments = pd.read_csv('./data/comments.csv', sep=';')

In [15]:
# The table and the joined-token list must have the same number of entries
# before they are combined. This checks length only, not row alignment.
assert len(comments) == len(body_list)

In [24]:
# Eyeball the first three raw bodies next to their tokenized counterparts.
print(comments['body'][:3])
print(body_list[:3])

0    ADDITIONALLY-- thank you so much for caring en...
1    I am watching Shape of Water.  It says some st...
2    Extremely underwhelmed by the film.\n\nIt just...
Name: body, dtype: object
["additionally thank_you so much for caring enough to ask in a genuinely_curious and respectful way i 'm_curious to hear your thoughts", 'i am watching shape_of_water it says some stuff in russian how can i find out what they are saying', "extremely underwhelmed by the film it just felt dull the 'romance between fish-dick and mute-chick was too quick and i never felt any connection or empathy for the 'asset the pie-shop scenes were so quick and short and ultimately meant nothing i want to get to know you ew no and that was it just the film saying hah look the 40 's hated gays the strange musical imagination scene was a complete tonal change from the rest of the film and it seems the film just ignored all logic to tell the story the 'asset had no security at all no guards no cameras the cleaner was 

In [25]:
# Attach the joined token strings as a new column (mutates `comments` in place).
comments["tokenized_body"] = body_list

In [31]:
# Row count of the comment table after adding the new column.
len(comments)

2118317

In [29]:
# Preview the first rows, including the new tokenized_body column.
comments.head()

Unnamed: 0,comment_id,submission_id,body,author,score,created,tokenized_body
0,e0r6q9y,7llz2i,ADDITIONALLY-- thank you so much for caring en...,e-lutris,1,1529119335,additionally thank_you so much for caring enou...
1,e0enlht,7llz2i,I am watching Shape of Water. It says some st...,figshooting,1,1528587739,i am watching shape_of_water it says some stuf...
2,dz9lw9e,7llz2i,Extremely underwhelmed by the film.\n\nIt just...,Harry101UK,1,1526784495,extremely underwhelmed by the film it just fel...
3,dz9llo2,7llz2i,"""We need a quirky 'mute' girl. We need a gay g...",Harry101UK,1,1526784107,we need a quirky 'mute girl we need a gay guy ...
4,dwov5pc,7llz2i,But she wasn't either. She was part fish and ...,porkpie1028,1,1522697652,but she was n't either she was part fish and h...


In [30]:
# Write the table back to the same file it was read from, so the input CSV is
# overwritten and now carries the tokenized_body column. index=False leaves the
# row index out of the file.
comments.to_csv('./data/comments.csv', sep=';', index=False)

In [4]:
from sklearn.preprocessing import MultiLabelBinarizer

In [6]:
# Build a sparse document-by-term matrix from the token lists.
# NOTE(review): MultiLabelBinarizer encodes presence/absence, not term counts —
# confirm that a binary matrix is what the downstream CMD step expects.
# NOTE(review): the column vocabulary (presumably mlb.classes_) is not saved
# anywhere in the visible cells — verify the consumer can recover column names.
mlb = MultiLabelBinarizer(sparse_output=True)
df = mlb.fit_transform(tokenized_comments)

In [36]:
# Convert the matrix to CSC format before saving.
dx = csc_matrix(df)

In [37]:
# Write the CSC matrix to the path configured in `sparse_dtm`.
save_npz(sparse_dtm, dx)

## Save chosen model in matrix format

For use with CMD in R.

In [3]:
# Load the chosen model. The file was written with Word2Vec.save() in the
# training loop and its `.wv` attribute is read afterwards, so load it as a
# full Word2Vec model rather than through KeyedVectors.load.
currentmodel = Word2Vec.load('./models/gensim_model_window7_vector_300')

In [4]:
# Keep a handle on the model's word vectors for the export steps that follow.
wv = currentmodel.wv

In [5]:
# Put the embedding matrix into a DataFrame with one row per vocabulary entry.
# Row labels come from index_to_key, the list that is positionally aligned
# with wv.vectors, instead of relying on dict iteration order of key_to_index.
wvdf = pd.DataFrame(wv.vectors, index=wv.index_to_key)
# Show the entries with the largest values in dimension 0.
wvdf.sort_values(0, ascending=False).head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,290,291,292,293,294,295,296,297,298,299
greatest,5.440329,0.938576,-1.615892,-0.945932,-0.530272,-0.083419,-0.452763,-3.149378,1.045976,-1.28282,...,-1.855555,0.084996,0.208643,0.5169,1.990961,0.881314,-0.046081,-0.684844,-0.661305,-0.149191
best,5.055298,1.050507,-3.429342,-0.880544,-1.063668,-0.551393,-2.111573,-1.949505,1.465971,-0.952791,...,0.19892,-0.589248,0.05016,1.110237,1.128765,1.673347,1.866306,-0.346958,2.261786,-0.497793
favourite,5.036606,0.213984,-1.418238,-1.011131,-1.864969,-0.012331,-1.150541,-2.286463,0.300508,1.023595,...,-0.235299,-0.721494,-1.973386,-0.040152,-0.993226,-1.081784,-1.440037,1.129978,0.215867,-1.030312
me,4.607864,1.136046,1.968725,-2.610231,0.272721,0.970855,-0.18989,-3.52143,0.232322,-0.288308,...,0.598449,2.382849,1.095291,1.616596,1.415084,-0.903183,0.149645,-0.768922,4.014684,-1.586202
least_favorite,4.51535,0.998368,-4.022372,-0.663905,-2.834869,0.356608,-0.77639,-1.810024,-0.321186,1.507745,...,0.032043,-0.643435,-0.477781,0.559958,0.582203,0.529455,-0.10038,3.18779,-0.11678,-1.005851


In [7]:
# Export the word-vector table as CSV; the words (row index) go in the first column.
wvdf.to_csv('data/wvdf.csv')

In [44]:
# Discard? Can be used to read the model into R for word2vec library
# Export in binary word2vec format. Uses `wv`, the word vectors taken from the
# loaded model; the previously referenced name `vectors` is never defined in
# this notebook and raised a NameError on a fresh kernel.
wv.save_word2vec_format("./models/gensim_model_window7_vector_300_kv.w2v", binary=True)