In [5]:
import os
import pickle
from datetime import datetime

import gensim
import pandas as pd
from gensim.models import Word2Vec
from gensim.models.phrases import Phrases, ENGLISH_CONNECTOR_WORDS
from gensim.test.utils import datapath
from tqdm import tqdm

In [2]:
# Plain-text token corpus used as training input.
corpus_file = "./data/tokens.txt"
# Directory where trained models are written.
model_path = "./models/"

## Train and test embedding models

In [None]:
# Hyperparameter values taken from Arseniev-Koehler et al. (2021), appendix 2.
# They additionally use CBOW with negative sampling (negative=5).
window_size = [5, 7, 10]
vector_size = [50, 100, 200, 300]

In [None]:
# Train one Word2Vec model per (window, vector size) combination, save it,
# and print its scores on the benchmark files bundled with gensim.

# model.save() does not create missing directories; make sure the target
# exists up front so a long training run is not lost at save time.
if model_path:
    os.makedirs(model_path, exist_ok=True)

for window in window_size:
    for vector in vector_size:
        name = f"gensim_model_window{window}_vector_{vector}"
        print(f"Starting with {name} at {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
        # sg=0 (CBOW) and negative=5 are gensim's defaults; they are spelled
        # out because the intended setup is CBOW with negative sampling.
        model = Word2Vec(
            corpus_file=corpus_file,
            vector_size=vector,
            window=window,
            epochs=10,
            min_count=15,
            sg=0,
            negative=5,
        )

        # Save before evaluating so that a failure in the benchmarks below
        # cannot discard the freshly trained model.
        filename = os.path.join(model_path, name)
        model.save(filename)

        # Word-analogy benchmark: element 0 is the overall score, element 1
        # the per-section results (the last entry is printed as the summary).
        google_test = model.wv.evaluate_word_analogies(datapath('questions-words.txt'))
        print(google_test[0])
        print(google_test[1][-1])
        # Word-similarity benchmark against the WordSim-353 pairs file.
        similarities = model.wv.evaluate_word_pairs(datapath('wordsim353.tsv'))
        print(similarities)
        print()
        print("-"*80)

print(f"Finished at {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")

## Detect ngrams

In [3]:
# Fit a phrase (n-gram) detector on the token corpus.
# LineSentence iterates over the corpus file, one line per sentence.
sentences = gensim.models.word2vec.LineSentence(corpus_file)
ngrams = Phrases(
    sentences,
    min_count=15,
    connector_words=ENGLISH_CONNECTOR_WORDS,
)

In [6]:
# Read the pickled comments back from disk.
# NOTE: unpickling can execute arbitrary code -- only load files you trust.
with open('./data/tokenized_comments.p', 'rb') as pickle_file:
    comments = pickle.load(pickle_file)

In [7]:
# Inspect the first loaded comment.
comments[0]

['additionally',
 'thank',
 'you',
 'so',
 'much',
 'for',
 'caring',
 'enough',
 'to',
 'ask',
 'in',
 'a',
 'genuinely',
 'curious',
 'and',
 'respectful',
 'way',
 'i',
 "'m",
 'curious',
 'to',
 'hear',
 'your',
 'thoughts']

In [8]:
# Apply the learned phrase model to the first comment to see which tokens get joined.
ngrams[comments[0]]

['additionally',
 'thank_you',
 'so',
 'much',
 'for',
 'caring',
 'enough',
 'to',
 'ask',
 'in',
 'a',
 'genuinely_curious',
 'and',
 'respectful',
 'way',
 'i',
 "'m_curious",
 'to',
 'hear',
 'your',
 'thoughts']

In [10]:
# Run every comment through the phrase model; tqdm shows progress.
tokenized_comments = [ngrams[comment] for comment in tqdm(comments)]

100%|██████████████████████████████████████████████████████████████████████| 2118317/2118317 [10:07<00:00, 3488.65it/s]


In [14]:
# Sanity check: print the first five phrase-merged comments.
first_five = tokenized_comments[:5]
print(first_five)

[['additionally', 'thank_you', 'so', 'much', 'for', 'caring', 'enough', 'to', 'ask', 'in', 'a', 'genuinely_curious', 'and', 'respectful', 'way', 'i', "'m_curious", 'to', 'hear', 'your', 'thoughts'], ['i', 'am', 'watching', 'shape_of_water', 'it', 'says', 'some', 'stuff', 'in', 'russian', 'how', 'can', 'i', 'find', 'out', 'what', 'they', 'are', 'saying'], ['extremely', 'underwhelmed', 'by', 'the', 'film', 'it', 'just', 'felt', 'dull', 'the', "'romance", 'between', 'fish-dick', 'and', 'mute-chick', 'was', 'too', 'quick', 'and', 'i', 'never', 'felt', 'any', 'connection', 'or', 'empathy', 'for', 'the', "'asset", 'the', 'pie-shop', 'scenes', 'were', 'so', 'quick', 'and', 'short', 'and', 'ultimately', 'meant', 'nothing', 'i', 'want', 'to', 'get', 'to', 'know', 'you', 'ew', 'no', 'and', 'that', 'was', 'it', 'just', 'the', 'film', 'saying', 'hah', 'look', 'the', '40', "'s", 'hated', 'gays', 'the', 'strange', 'musical', 'imagination', 'scene', 'was', 'a', 'complete', 'tonal', 'change', 'from', 

In [9]:
# Save the trained Phrases model to disk.
ngrams.save('./data/ngrams')

In [11]:
# Write the phrase-merged comments to disk as a pickle.
with open('./data/ngram_comments.p', 'wb') as out_file:
    pickle.dump(tokenized_comments, out_file)