In [1]:
import re
import pandas as pd
from time import time
from collections import defaultdict
import spacy
import logging  
logging.basicConfig(format="%(levelname)s - %(asctime)s: %(message)s", datefmt= '%H:%M:%S', level=logging.INFO)

In [2]:
df = pd.read_csv("simpsons_dataset.csv")
df

Unnamed: 0,raw_character_text,spoken_words
0,Miss Hoover,"No, actually, it was a little of both. Sometim..."
1,Lisa Simpson,Where's Mr. Bergstrom?
2,Miss Hoover,I don't know. Although I'd sure like to talk t...
3,Lisa Simpson,That life is worth living.
4,Edna Krabappel-Flanders,The polls will be open from now until the end ...
...,...,...
158309,Miss Hoover,I'm back.
158310,Miss Hoover,"You see, class, my Lyme disease turned out to ..."
158311,Miss Hoover,Psy-cho-so-ma-tic.
158312,Ralph Wiggum,Does that mean you were crazy?


In [3]:
df.isnull().sum()

raw_character_text    17814
spoken_words          26459
dtype: int64

In [4]:
df = df.dropna().reset_index(drop=True)

In [5]:
nlp = spacy.load("en_core_web_sm", disable=["ner", "parser"])

def cleaning(doc):
    txt = [token.lemma_ for token in doc if not token.is_stop]

    if len(txt) > 2:
        return ' '.join(txt)


In [6]:
#removing non-alpabetic characters
brief_cleaning = (re.sub( "[^A-Za-z]", " ", str(row) ).lower() for row in df["spoken_words"] )

In [7]:
%%time

txt = [cleaning(doc) for doc in nlp.pipe(brief_cleaning, batch_size=5000, n_process=-1)]

CPU times: user 55 s, sys: 1.33 s, total: 56.3 s
Wall time: 1min 13s


In [8]:
df_clean = pd.DataFrame({"clean" : txt})
df_clean = df_clean.dropna().drop_duplicates()
df_clean.shape

(101440, 1)

In [9]:
from gensim.models.phrases import Phrases, Phraser
sent = [row.split() for row in df_clean['clean']]
phrases = Phrases(sent, min_count=30, progress_per=10000)

INFO - 18:32:37: collecting all words and their counts
INFO - 18:32:37: PROGRESS: at sentence #0, processed 0 words and 0 word types
INFO - 18:32:37: PROGRESS: at sentence #10000, processed 62487 words and 46791 word types
INFO - 18:32:37: PROGRESS: at sentence #20000, processed 129901 words and 88432 word types
INFO - 18:32:37: PROGRESS: at sentence #30000, processed 195229 words and 124610 word types
INFO - 18:32:37: PROGRESS: at sentence #40000, processed 255163 words and 156068 word types
INFO - 18:32:37: PROGRESS: at sentence #50000, processed 311914 words and 184134 word types
INFO - 18:32:37: PROGRESS: at sentence #60000, processed 375510 words and 216077 word types
INFO - 18:32:37: PROGRESS: at sentence #70000, processed 438052 words and 246129 word types
INFO - 18:32:38: PROGRESS: at sentence #80000, processed 501664 words and 276109 word types
INFO - 18:32:38: PROGRESS: at sentence #90000, processed 564332 words and 305369 word types
INFO - 18:32:38: PROGRESS: at sentence #10

In [10]:
bigram = Phraser(phrases)
sentences = bigram[sent]

INFO - 18:32:38: exporting phrases from Phrases<336799 vocab, min_count=30, threshold=10.0, max_vocab_size=40000000>
INFO - 18:32:38: FrozenPhrases lifecycle event {'msg': 'exported FrozenPhrases<138 phrases, min_count=30, threshold=10.0> from Phrases<336799 vocab, min_count=30, threshold=10.0, max_vocab_size=40000000> in 0.53s', 'datetime': '2023-11-21T18:32:38.700013', 'gensim': '4.3.2', 'python': '3.9.16 (main, Dec  7 2022, 01:12:08) \n[GCC 11.3.0]', 'platform': 'Linux-5.19.0-45-generic-x86_64-with-glibc2.35', 'event': 'created'}


In [11]:
word_freq = defaultdict(int)
for sent in sentences:
    for i in sent:
        word_freq[i] += 1
len(word_freq)

30035

In [12]:
sorted(word_freq, key=word_freq.get, reverse=True)[:10]

['s', 'm', 'oh', 'don_t', 'll', 'like', 'know', 'hey', 'think', 'right']

# Training

In [13]:
import multiprocessing
from gensim.models import Word2Vec

In [14]:
cores = multiprocessing.cpu_count()

In [15]:
%%time
w2v_model = Word2Vec(
    min_count=20,
    window=2,
    vector_size=300,
    sample=6e-5,
    alpha=0.03,
    min_alpha=0.0007,
    negative=20,
    workers=cores-1
)

INFO - 18:32:39: Word2Vec lifecycle event {'params': 'Word2Vec<vocab=0, vector_size=300, alpha=0.03>', 'datetime': '2023-11-21T18:32:39.279153', 'gensim': '4.3.2', 'python': '3.9.16 (main, Dec  7 2022, 01:12:08) \n[GCC 11.3.0]', 'platform': 'Linux-5.19.0-45-generic-x86_64-with-glibc2.35', 'event': 'created'}


CPU times: user 1.34 ms, sys: 66 µs, total: 1.41 ms
Wall time: 1.6 ms


In [16]:
%%time
w2v_model.build_vocab(sentences, progress_per=10000)

INFO - 18:32:39: collecting all words and their counts
INFO - 18:32:39: PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
INFO - 18:32:39: PROGRESS: at sentence #10000, processed 59382 words, keeping 8610 word types
INFO - 18:32:39: PROGRESS: at sentence #20000, processed 123594 words, keeping 13154 word types
INFO - 18:32:39: PROGRESS: at sentence #30000, processed 185852 words, keeping 16270 word types
INFO - 18:32:39: PROGRESS: at sentence #40000, processed 243123 words, keeping 18744 word types
INFO - 18:32:39: PROGRESS: at sentence #50000, processed 297306 words, keeping 20812 word types
INFO - 18:32:39: PROGRESS: at sentence #60000, processed 358133 words, keeping 23012 word types
INFO - 18:32:39: PROGRESS: at sentence #70000, processed 417950 words, keeping 24938 word types
INFO - 18:32:39: PROGRESS: at sentence #80000, processed 478855 words, keeping 26741 word types
INFO - 18:32:39: PROGRESS: at sentence #90000, processed 538870 words, keeping 28427 word types


CPU times: user 636 ms, sys: 16.2 ms, total: 652 ms
Wall time: 649 ms


In [17]:
w2v_model.train(sentences, total_examples=w2v_model.corpus_count, epochs=30, report_delay=1)

INFO - 18:32:39: Word2Vec lifecycle event {'msg': 'training model with 7 workers on 3426 vocabulary and 300 features, using sg=0 hs=0 sample=6e-05 negative=20 window=2 shrink_windows=True', 'datetime': '2023-11-21T18:32:39.939937', 'gensim': '4.3.2', 'python': '3.9.16 (main, Dec  7 2022, 01:12:08) \n[GCC 11.3.0]', 'platform': 'Linux-5.19.0-45-generic-x86_64-with-glibc2.35', 'event': 'train'}
INFO - 18:32:40: EPOCH 0: training on 609265 raw words (226420 effective words) took 0.9s, 260101 effective words/s
INFO - 18:32:41: EPOCH 1 - PROGRESS: at 79.02% examples, 177478 words/s, in_qsize 13, out_qsize 0
INFO - 18:32:42: EPOCH 1: training on 609265 raw words (226020 effective words) took 1.1s, 212795 effective words/s
INFO - 18:32:42: EPOCH 2: training on 609265 raw words (226113 effective words) took 1.0s, 236957 effective words/s
INFO - 18:32:44: EPOCH 3 - PROGRESS: at 87.23% examples, 195063 words/s, in_qsize 1, out_qsize 1
INFO - 18:32:44: EPOCH 3: training on 609265 raw words (226069

(6784368, 18277950)

In [18]:
w2v_model.wv.most_similar(positive=["homer"])

[('becky', 0.6330074071884155),
 ('marge', 0.6316094994544983),
 ('depressed', 0.6307206153869629),
 ('unno', 0.603551983833313),
 ('humiliate', 0.6029792428016663),
 ('homie', 0.5990012884140015),
 ('bongo', 0.5930894613265991),
 ('straighten', 0.5916837453842163),
 ('ohh', 0.5912268161773682),
 ('reverend_lovejoy', 0.5878326296806335)]