In [1]:
import re
import pandas as pd
from time import time
from collections import defaultdict
import spacy

In [2]:
import logging  # Setting up the loggings to monitor gensim
logging.basicConfig(format="%(levelname)s - %(asctime)s: %(message)s", datefmt= '%H:%M:%S', level=logging.INFO)

In [3]:
df = pd.read_csv('simpsons_dataset.csv')
df.shape

(158314, 2)

In [4]:
df.info

<bound method DataFrame.info of              raw_character_text  \
0                   Miss Hoover   
1                  Lisa Simpson   
2                   Miss Hoover   
3                  Lisa Simpson   
4       Edna Krabappel-Flanders   
5                 Martin Prince   
6       Edna Krabappel-Flanders   
7                  Bart Simpson   
8                           NaN   
9                  Lisa Simpson   
10                     Landlady   
11                 Lisa Simpson   
12                     Landlady   
13                 Lisa Simpson   
14                     Landlady   
15                 Lisa Simpson   
16                          NaN   
17                 Bart Simpson   
18                 Nelson Muntz   
19                 Bart Simpson   
20                 Terri/sherri   
21                 Bart Simpson   
22          Milhouse Van Houten   
23                 Bart Simpson   
24                 Bart Simpson   
25          Milhouse Van Houten   
26                 Bart

In [4]:
df.head()

Unnamed: 0,raw_character_text,spoken_words
0,Miss Hoover,"No, actually, it was a little of both. Sometim..."
1,Lisa Simpson,Where's Mr. Bergstrom?
2,Miss Hoover,I don't know. Although I'd sure like to talk t...
3,Lisa Simpson,That life is worth living.
4,Edna Krabappel-Flanders,The polls will be open from now until the end ...


In [5]:
df.isnull().sum()

raw_character_text    17814
spoken_words          26459
dtype: int64

In [6]:
df = df.dropna().reset_index(drop=True)
df.isnull().sum()

raw_character_text    0
spoken_words          0
dtype: int64

In [7]:
nlp = spacy.load('en', disable=['ner', 'parser']) # disabling Named Entity Recognition for speed

def cleaning(doc):
    # Lemmatizes and removes stopwords
    # doc needs to be a spacy Doc object
    txt = [token.lemma_ for token in doc if not token.is_stop]
    # Word2Vec uses context words to learn the vector representation of a target word,
    # if a sentence is only one or two words long,
    # the benefit for the training is very small
    if len(txt) > 2:
        return ' '.join(txt)

In [8]:
brief_cleaning = (re.sub("[^A-Za-z']+", ' ', str(row)).lower() for row in df['spoken_words'])

In [9]:
t = time()

txt = [cleaning(doc) for doc in nlp.pipe(brief_cleaning, batch_size=5000, n_threads=-1)]

print('Time to clean up everything: {} mins'.format(round((time() - t) / 60, 2)))

Time to clean up everything: 1.19 mins


In [10]:
df_clean = pd.DataFrame({'clean': txt})
df_clean = df_clean.dropna().drop_duplicates()
df_clean.shape

(85954, 1)

In [37]:
df_clean

Unnamed: 0,clean
0,actually little disease magazine news show nat...
2,know sure like talk touch lesson plan teach
3,life worth live
4,poll open end recess case decide thought final...
7,victory party slide
8,mr bergstrom mr bergstrom
9,hey hey move morning new job take copernicus c...
11,think take train capital city
12,train like traditional environmentally sound
13,yes backbone country leland stanford drive gol...


In [11]:
from gensim.models.phrases import Phrases, Phraser

In [12]:
sent = [row.split() for row in df_clean['clean']]

In [47]:
sent

['psy', 'cho', 'ma', 'tic']

In [13]:
phrases = Phrases(sent, min_count=30, progress_per=10000)

INFO - 17:58:52: collecting all words and their counts
INFO - 17:58:52: PROGRESS: at sentence #0, processed 0 words and 0 word types
INFO - 17:58:52: PROGRESS: at sentence #10000, processed 63557 words and 52796 word types
INFO - 17:58:52: PROGRESS: at sentence #20000, processed 130938 words and 99801 word types
INFO - 17:58:53: PROGRESS: at sentence #30000, processed 192959 words and 138413 word types
INFO - 17:58:53: PROGRESS: at sentence #40000, processed 249832 words and 172509 word types
INFO - 17:58:53: PROGRESS: at sentence #50000, processed 311271 words and 208406 word types
INFO - 17:58:53: PROGRESS: at sentence #60000, processed 373576 words and 243519 word types
INFO - 17:58:53: PROGRESS: at sentence #70000, processed 436427 words and 278547 word types
INFO - 17:58:53: PROGRESS: at sentence #80000, processed 497891 words and 311704 word types
INFO - 17:58:53: collected 330480 word types from a corpus of 537095 words (unigram + bigrams) and 85954 sentences
INFO - 17:58:53: us

In [14]:
bigram = Phraser(phrases)

INFO - 17:58:53: source_vocab length 330480
INFO - 17:58:56: Phraser built with 127 phrasegrams


In [15]:
sentences = bigram[sent] 

In [38]:
sentences

<gensim.interfaces.TransformedCorpus at 0x210ac487160>

In [16]:
word_freq = defaultdict(int)
for sent in sentences:
    for i in sent:
        word_freq[i] += 1
len(word_freq)


30242

In [17]:
sorted(word_freq, key=word_freq.get, reverse=True)[:10]

['oh', 'like', 'know', 'get', 'hey', 'think', 'come', 'right', 'look', 'want']

In [18]:
import multiprocessing

from gensim.models import Word2Vec

In [19]:
cores = multiprocessing.cpu_count()

In [20]:
the best dog in the world

8


In [None]:
Study

In [21]:
w2v_model = Word2Vec(min_count=20,
                     window=2, 
                     size=300,
                     sample=6e-5, 
                     alpha=0.03, 
                     min_alpha=0.0007, 
                     negative=20,
                     workers=cores-1)

In [22]:
t = time()

w2v_model.build_vocab(sentences, progress_per=10000)

print('Time to build vocab: {} mins'.format(round((time() - t) / 60, 2)))

INFO - 17:58:57: collecting all words and their counts
INFO - 17:58:57: PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
INFO - 17:58:58: PROGRESS: at sentence #10000, processed 61710 words, keeping 9572 word types
INFO - 17:58:58: PROGRESS: at sentence #20000, processed 127345 words, keeping 14535 word types
INFO - 17:58:58: PROGRESS: at sentence #30000, processed 187806 words, keeping 17660 word types
INFO - 17:58:58: PROGRESS: at sentence #40000, processed 243314 words, keeping 20424 word types
INFO - 17:58:58: PROGRESS: at sentence #50000, processed 303176 words, keeping 22934 word types
INFO - 17:58:59: PROGRESS: at sentence #60000, processed 363916 words, keeping 25246 word types
INFO - 17:58:59: PROGRESS: at sentence #70000, processed 425379 words, keeping 27467 word types
INFO - 17:58:59: PROGRESS: at sentence #80000, processed 485507 words, keeping 29350 word types
INFO - 17:58:59: collected 30242 word types from a corpus of 523616 raw words and 85954 sentence

Time to build vocab: 0.04 mins


In [23]:
t = time()

w2v_model.train(sentences, total_examples=w2v_model.corpus_count, epochs=30, report_delay=1)

print('Time to train the model: {} mins'.format(round((time() - t) / 60, 2)))

INFO - 17:59:00: training model with 7 workers on 3309 vocabulary and 300 features, using sg=0 hs=0 sample=6e-05 negative=20 window=2
INFO - 17:59:01: EPOCH 1 - PROGRESS: at 53.71% examples, 104450 words/s, in_qsize 0, out_qsize 0
INFO - 17:59:02: worker thread finished; awaiting finish of 6 more threads
INFO - 17:59:02: worker thread finished; awaiting finish of 5 more threads
INFO - 17:59:02: worker thread finished; awaiting finish of 4 more threads
INFO - 17:59:02: worker thread finished; awaiting finish of 3 more threads
INFO - 17:59:02: worker thread finished; awaiting finish of 2 more threads
INFO - 17:59:02: worker thread finished; awaiting finish of 1 more threads
INFO - 17:59:02: worker thread finished; awaiting finish of 0 more threads
INFO - 17:59:02: EPOCH - 1 : training on 523616 raw words (198345 effective words) took 1.9s, 104950 effective words/s
INFO - 17:59:03: EPOCH 2 - PROGRESS: at 55.69% examples, 106801 words/s, in_qsize 0, out_qsize 1
INFO - 17:59:04: worker thre

INFO - 17:59:21: EPOCH 12 - PROGRESS: at 53.71% examples, 102034 words/s, in_qsize 0, out_qsize 0
INFO - 17:59:22: worker thread finished; awaiting finish of 6 more threads
INFO - 17:59:22: worker thread finished; awaiting finish of 5 more threads
INFO - 17:59:22: worker thread finished; awaiting finish of 4 more threads
INFO - 17:59:22: worker thread finished; awaiting finish of 3 more threads
INFO - 17:59:22: worker thread finished; awaiting finish of 2 more threads
INFO - 17:59:22: worker thread finished; awaiting finish of 1 more threads
INFO - 17:59:22: worker thread finished; awaiting finish of 0 more threads
INFO - 17:59:22: EPOCH - 12 : training on 523616 raw words (198611 effective words) took 1.8s, 109061 effective words/s
INFO - 17:59:23: EPOCH 13 - PROGRESS: at 53.71% examples, 101511 words/s, in_qsize 0, out_qsize 1
INFO - 17:59:24: worker thread finished; awaiting finish of 6 more threads
INFO - 17:59:24: worker thread finished; awaiting finish of 5 more threads
INFO - 17

INFO - 17:59:42: worker thread finished; awaiting finish of 6 more threads
INFO - 17:59:42: worker thread finished; awaiting finish of 5 more threads
INFO - 17:59:42: worker thread finished; awaiting finish of 4 more threads
INFO - 17:59:42: worker thread finished; awaiting finish of 3 more threads
INFO - 17:59:42: worker thread finished; awaiting finish of 2 more threads
INFO - 17:59:42: worker thread finished; awaiting finish of 1 more threads
INFO - 17:59:42: worker thread finished; awaiting finish of 0 more threads
INFO - 17:59:42: EPOCH - 23 : training on 523616 raw words (198256 effective words) took 1.8s, 110265 effective words/s
INFO - 17:59:43: EPOCH 24 - PROGRESS: at 57.55% examples, 110545 words/s, in_qsize 0, out_qsize 0
INFO - 17:59:44: worker thread finished; awaiting finish of 6 more threads
INFO - 17:59:44: worker thread finished; awaiting finish of 5 more threads
INFO - 17:59:44: worker thread finished; awaiting finish of 4 more threads
INFO - 17:59:44: worker thread f

Time to train the model: 0.92 mins


In [25]:
w2v_model.wv.most_similar(positive=["homer"])

[('rude', 0.7770212888717651),
 ('sweetheart', 0.7760750651359558),
 ('gee', 0.7625381350517273),
 ('embarrassing', 0.7622367143630981),
 ('crummy', 0.7588742971420288),
 ('marge', 0.7517530918121338),
 ('hammock', 0.7325915098190308),
 ('happen', 0.7325051426887512),
 ('straighten', 0.7279349565505981),
 ('terrific', 0.716513991355896)]

In [26]:
w2v_model.wv.most_similar(positive=["marge"])

[('grownup', 0.764975368976593),
 ('ralphie', 0.7616559267044067),
 ('worry', 0.7569176554679871),
 ('sure', 0.7567306160926819),
 ('rude', 0.7535579800605774),
 ('homer', 0.7517530918121338),
 ('raccoon', 0.7514562606811523),
 ('sweetheart', 0.7497808337211609),
 ('worried', 0.7441363334655762),
 ('crummy', 0.7433844804763794)]

In [27]:
w2v_model.wv.most_similar(positive=["bart"])

[('lisa', 0.8640710115432739),
 ('hearing', 0.8371952772140503),
 ('convince', 0.8040638566017151),
 ('mom', 0.7893967628479004),
 ('homework', 0.788345217704773),
 ('pay_attention', 0.7860027551651001),
 ('strangle', 0.7801192402839661),
 ('jealous', 0.7796902060508728),
 ('surprised', 0.7708951234817505),
 ('muntz', 0.7708556056022644)]

In [31]:
w2v_model.wv.similarity("homer", 'tavern')

0.50523514

In [33]:
w2v_model.wv.similarity('maggie', 'baby')

0.734279

In [34]:
w2v_model.wv.doesnt_match(['jimbo', 'milhouse', 'kearney'])

  vectors = vstack(self.word_vec(word, use_norm=True) for word in used_words).astype(REAL)


'milhouse'

In [40]:
w2v_model.wv.doesnt_match(["nelson", "bart", "milhouse"])

  vectors = vstack(self.word_vec(word, use_norm=True) for word in used_words).astype(REAL)


'nelson'

In [41]:
vec = w2v_model.wv['homer']

In [1]:
vec

NameError: name 'vec' is not defined