In [1]:
import os
import re
from collections import defaultdict
from time import time

import pandas as pd

In [2]:
pip install spacy

Note: you may need to restart the kernel to use updated packages.


In [3]:
import spacy

In [4]:
import logging
logging.basicConfig(format="%(levelname)s - %(asctime)s: %(message)s", datefmt= '%H:%M:%S', level=logging.INFO)

In [5]:
# Location of the dialogue CSV. Set the SIMPSONS_CSV environment variable to
# point at your own copy; the original local path is kept as the fallback so
# existing runs behave exactly as before.
DATA_PATH = os.environ.get('SIMPSONS_CSV', '/Users/anthonyzippay/Desktop/simpsons_dataset.csv')
df = pd.read_csv(DATA_PATH)

In [6]:
df.head()

Unnamed: 0,raw_character_text,spoken_words
0,Miss Hoover,"No, actually, it was a little of both. Sometim..."
1,Lisa Simpson,Where's Mr. Bergstrom?
2,Miss Hoover,I don't know. Although I'd sure like to talk t...
3,Lisa Simpson,That life is worth living.
4,Edna Krabappel-Flanders,The polls will be open from now until the end ...


In [7]:
df.isnull().sum()

raw_character_text    17814
spoken_words          26459
dtype: int64

In [8]:
# Discard every row with a missing value in any column, then renumber the
# index so it is contiguous again.
df = df.dropna()
df = df.reset_index(drop=True)
# Per-column count of remaining nulls (shown as the cell output).
df.isna().sum()

raw_character_text    0
spoken_words          0
dtype: int64

In [9]:
# Load the English pipeline by its full package name: the 'en' shortcut link
# only exists on spaCy 2.x and was removed in spaCy 3.
# NOTE(review): assumes the small English model (en_core_web_sm) is the one
# installed — confirm, or substitute the model name you downloaded.
# NER and the dependency parser are disabled because only lemmas and stop-word
# flags are read from the tokens.
nlp = spacy.load('en_core_web_sm', disable=['ner', 'parser'])

def cleaning(doc):
    """Lemmatize a document and strip its stop words.

    Parameters
    ----------
    doc : iterable of tokens
        Each token must expose ``lemma_`` and ``is_stop``.

    Returns
    -------
    str or None
        The remaining lemmas joined by single spaces, or ``None`` when two or
        fewer lemmas are left after stop-word removal.
    """
    lemmas = []
    for token in doc:
        if token.is_stop:
            continue
        lemmas.append(token.lemma_)

    # Sentences with at most two remaining lemmas are dropped.
    if len(lemmas) <= 2:
        return None
    return ' '.join(lemmas)

In [10]:
# Replace every run of characters other than letters and apostrophes with a
# single space, then lower-case. Built as a list rather than a one-shot
# generator so that the cell consuming it can be re-run without silently
# iterating over an already-exhausted generator.
brief_cleaning = [re.sub("[^A-Za-z']+", ' ', str(row)).lower() for row in df['spoken_words']]

In [11]:
t = time()

# Stream the pre-cleaned lines through the spaCy pipeline in batches of 5000.
# The `n_threads` argument was dropped: it has been deprecated since spaCy 2.1
# and is no longer accepted by spaCy 3's `Language.pipe`.
txt = [cleaning(doc) for doc in nlp.pipe(brief_cleaning, batch_size=5000)]

print('Time to clean up everything: {} mins'.format(round((time() - t) / 60, 2)))

Time to clean up everything: 0.78 mins


In [12]:
# One cleaned sentence per row; entries that `cleaning` returned as None are
# dropped, and exact duplicate sentences are removed.
df_clean = (
    pd.DataFrame({'clean': txt})
    .dropna()
    .drop_duplicates()
)
df_clean.shape

(85954, 1)

# Bigrams: using the Gensim Phrases package to automatically detect common phrases

In [13]:
import numpy as np
import scipy as sy

In [14]:
pip install gensim

Note: you may need to restart the kernel to use updated packages.


In [15]:
from gensim.models.phrases import Phrases, Phraser


`Phrases()` takes a list of lists of words as input, so each cleaned sentence is first split into its words:

In [16]:
# Tokenize each cleaned sentence on whitespace, giving a list of word lists.
sent = list(map(str.split, df_clean['clean']))

Create the relevant phrases from the list of sentences:

In [17]:
# Scan the tokenized sentences and collect the word and bigram counts used for
# phrase detection; progress is logged every 10000 sentences.
phrases = Phrases(sent, min_count=30, progress_per=10000)

INFO - 11:24:07: collecting all words and their counts
INFO - 11:24:07: PROGRESS: at sentence #0, processed 0 words and 0 word types
INFO - 11:24:08: PROGRESS: at sentence #10000, processed 63557 words and 52796 word types
INFO - 11:24:08: PROGRESS: at sentence #20000, processed 130938 words and 99801 word types
INFO - 11:24:08: PROGRESS: at sentence #30000, processed 192959 words and 138413 word types
INFO - 11:24:08: PROGRESS: at sentence #40000, processed 249832 words and 172509 word types
INFO - 11:24:08: PROGRESS: at sentence #50000, processed 311271 words and 208406 word types
INFO - 11:24:08: PROGRESS: at sentence #60000, processed 373576 words and 243519 word types
INFO - 11:24:08: PROGRESS: at sentence #70000, processed 436427 words and 278547 word types
INFO - 11:24:08: PROGRESS: at sentence #80000, processed 497891 words and 311704 word types
INFO - 11:24:08: collected 330480 word types from a corpus of 537095 words (unigram + bigrams) and 85954 sentences
INFO - 11:24:08: us

The goal of Phraser() is to cut down memory consumption of Phrases(), by discarding model state not strictly needed for the bigram detection task:

transform the corpus based on the bigrams detected:

In [18]:
# Wrap the collected phrase statistics in a Phraser, which is then used to
# transform the corpus.
bigram = Phraser(phrases)

INFO - 11:24:08: source_vocab length 330480
INFO - 11:24:10: Phraser built with 127 phrasegrams


In [19]:
# Apply the bigram detector to the tokenized corpus; detected pairs show up
# downstream as single tokens joined by an underscore (e.g. 'homer_simpson').
sentences = bigram[sent]

In [20]:
# Count how often each token (including merged bigrams) occurs in the corpus.
# The loop variables are deliberately not called `sent`: reusing that name
# would overwrite the list of tokenized sentences that other cells rely on,
# making those cells misbehave when re-run after this one.
word_freq = defaultdict(int)
for sentence in sentences:
    for word in sentence:
        word_freq[word] += 1
len(word_freq)

30242

Most Frequent Words:
A quick sanity check of the effectiveness of the lemmatization, the removal of stop words, and the addition of bigrams.

In [21]:
sorted(word_freq, key=word_freq.get, reverse=True)[:10]

['oh', 'like', 'know', 'get', 'hey', 'think', 'come', 'right', 'look', 'want']

Gensim Word2Vec Implementation:
We use Gensim implementation of word2vec: https://radimrehurek.com/gensim/models/word2vec.html

In [22]:
import multiprocessing

from gensim.models import Word2Vec

In [23]:
cores = multiprocessing.cpu_count() # Count the number of cores in a computer

# The parameters:
min_count = int - Ignores all words with total absolute frequency lower than this - (2, 100)
window = int - The maximum distance between the current and predicted word within a sentence. E.g. window words on the left and window words on the right of our target - (2, 10)
size = int - Dimensionality of the feature vectors. - (50, 300)
sample = float - The threshold for configuring which higher-frequency words are randomly downsampled. Highly influential. - (0, 1e-5)
alpha = float - The initial learning rate - (0.01, 0.05)
min_alpha = float - Learning rate will linearly drop to min_alpha as training progresses. To set it: alpha - (min_alpha * epochs) ~ 0.00
negative = int - If > 0, negative sampling will be used; the int for negative specifies how many "noise words" should be drawn. If set to 0, no negative sampling is used. - (5, 20)
workers = int - Use these many worker threads to train the model (=faster training with multicore machines)


In [24]:
# Configure (but do not yet train) the Word2Vec model.
# NOTE(review): gensim >= 4.0 renamed the `size` argument to `vector_size`;
# confirm the installed gensim version before changing it.
w2v_model = Word2Vec(min_count=20,
                     window=2,
                     size=300,
                     sample=6e-5,
                     alpha=0.03,
                     min_alpha=0.0007,
                     negative=20,
                     # Leave one core free, but never request fewer than one
                     # worker (cores - 1 is 0 on a single-core machine).
                     workers=max(1, cores - 1))

Word2Vec requires us to build the vocabulary table (simply digesting all the words and filtering out the unique words, and doing some basic counts on them):

In [25]:
t = time()  # start the timer for the vocabulary pass

# Build the model's vocabulary from the bigram-transformed corpus; progress is
# logged every 10000 sentences.
w2v_model.build_vocab(sentences, progress_per=10000)

# Report the elapsed wall-clock time in minutes, rounded to two decimals.
print('Time to build vocab: {} mins'.format(round((time() - t) / 60, 2)))

INFO - 11:24:12: collecting all words and their counts
INFO - 11:24:12: PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
INFO - 11:24:12: PROGRESS: at sentence #10000, processed 61710 words, keeping 9572 word types
INFO - 11:24:12: PROGRESS: at sentence #20000, processed 127345 words, keeping 14535 word types
INFO - 11:24:12: PROGRESS: at sentence #30000, processed 187806 words, keeping 17660 word types
INFO - 11:24:12: PROGRESS: at sentence #40000, processed 243314 words, keeping 20424 word types
INFO - 11:24:13: PROGRESS: at sentence #50000, processed 303176 words, keeping 22934 word types
INFO - 11:24:13: PROGRESS: at sentence #60000, processed 363916 words, keeping 25246 word types
INFO - 11:24:13: PROGRESS: at sentence #70000, processed 425379 words, keeping 27467 word types
INFO - 11:24:13: PROGRESS: at sentence #80000, processed 485507 words, keeping 29350 word types
INFO - 11:24:13: collected 30242 word types from a corpus of 523616 raw words and 85954 sentence

Time to build vocab: 0.03 mins


In [26]:
# Start a fresh timer so the figure printed below covers only this training
# run, not the time elapsed since `t` was last set by an earlier cell.
t = time()

# Train for 30 epochs over the corpus, logging progress roughly every second.
w2v_model.train(sentences, total_examples=w2v_model.corpus_count, epochs=30, report_delay=1)

print('Time to train the model: {} mins'.format(round((time() - t) / 60, 2)))

INFO - 11:24:14: training model with 7 workers on 3309 vocabulary and 300 features, using sg=0 hs=0 sample=6e-05 negative=20 window=2
INFO - 11:24:15: EPOCH 1 - PROGRESS: at 63.32% examples, 123398 words/s, in_qsize 0, out_qsize 0
INFO - 11:24:15: worker thread finished; awaiting finish of 6 more threads
INFO - 11:24:15: worker thread finished; awaiting finish of 5 more threads
INFO - 11:24:15: worker thread finished; awaiting finish of 4 more threads
INFO - 11:24:15: worker thread finished; awaiting finish of 3 more threads
INFO - 11:24:15: worker thread finished; awaiting finish of 2 more threads
INFO - 11:24:15: worker thread finished; awaiting finish of 1 more threads
INFO - 11:24:15: worker thread finished; awaiting finish of 0 more threads
INFO - 11:24:15: EPOCH - 1 : training on 523616 raw words (198345 effective words) took 1.5s, 129467 effective words/s
INFO - 11:24:16: EPOCH 2 - PROGRESS: at 65.22% examples, 123762 words/s, in_qsize 0, out_qsize 1
INFO - 11:24:17: worker thre

INFO - 11:24:31: EPOCH 12 - PROGRESS: at 63.32% examples, 122583 words/s, in_qsize 0, out_qsize 0
INFO - 11:24:32: worker thread finished; awaiting finish of 6 more threads
INFO - 11:24:32: worker thread finished; awaiting finish of 5 more threads
INFO - 11:24:32: worker thread finished; awaiting finish of 4 more threads
INFO - 11:24:32: worker thread finished; awaiting finish of 3 more threads
INFO - 11:24:32: worker thread finished; awaiting finish of 2 more threads
INFO - 11:24:32: worker thread finished; awaiting finish of 1 more threads
INFO - 11:24:32: worker thread finished; awaiting finish of 0 more threads
INFO - 11:24:32: EPOCH - 12 : training on 523616 raw words (198611 effective words) took 1.5s, 130457 effective words/s
INFO - 11:24:33: EPOCH 13 - PROGRESS: at 63.32% examples, 123151 words/s, in_qsize 0, out_qsize 0
INFO - 11:24:33: worker thread finished; awaiting finish of 6 more threads
INFO - 11:24:33: worker thread finished; awaiting finish of 5 more threads
INFO - 11

INFO - 11:24:48: worker thread finished; awaiting finish of 6 more threads
INFO - 11:24:48: worker thread finished; awaiting finish of 5 more threads
INFO - 11:24:48: worker thread finished; awaiting finish of 4 more threads
INFO - 11:24:48: worker thread finished; awaiting finish of 3 more threads
INFO - 11:24:48: worker thread finished; awaiting finish of 2 more threads
INFO - 11:24:48: worker thread finished; awaiting finish of 1 more threads
INFO - 11:24:48: worker thread finished; awaiting finish of 0 more threads
INFO - 11:24:48: EPOCH - 23 : training on 523616 raw words (198256 effective words) took 1.4s, 141571 effective words/s
INFO - 11:24:49: EPOCH 24 - PROGRESS: at 65.22% examples, 126969 words/s, in_qsize 0, out_qsize 0
INFO - 11:24:50: worker thread finished; awaiting finish of 6 more threads
INFO - 11:24:50: worker thread finished; awaiting finish of 5 more threads
INFO - 11:24:50: worker thread finished; awaiting finish of 4 more threads
INFO - 11:24:50: worker thread f

Time to train the model: 0.78 mins


In [27]:
# Precompute the L2 norms of the word vectors (see the log line below).
# NOTE(review): with replace=True the un-normalised vectors are presumably
# overwritten, which would rule out further training — confirm against the
# gensim docs. `init_sims` is also reported as deprecated in gensim 4.x; verify
# before upgrading.
w2v_model.init_sims(replace=True)

INFO - 11:24:59: precomputing L2-norms of word weight vectors


Exploring the model
Most similar to:
Here, we will ask our model to find the word most similar to some of the most iconic characters of the Simpsons!

In [28]:
w2v_model.wv.most_similar(positive=["homer"])

[('sweetheart', 0.7791968584060669),
 ('rude', 0.7722176313400269),
 ('embarrassing', 0.765440821647644),
 ('marge', 0.7588808536529541),
 ('gee', 0.7577983140945435),
 ('hammock', 0.7476896047592163),
 ('crummy', 0.742877721786499),
 ('good_friend', 0.7285328507423401),
 ('duh', 0.719735324382782),
 ('terrific', 0.7162286043167114)]

In [29]:
w2v_model.wv.most_similar(positive=["bart"])

[('lisa', 0.8628568053245544),
 ('hearing', 0.8464859127998352),
 ('convince', 0.8102348446846008),
 ('upset', 0.7956368327140808),
 ('homework', 0.7915209531784058),
 ('strangle', 0.7873169779777527),
 ('mom', 0.7869598865509033),
 ('exploit', 0.7823977470397949),
 ('surprised', 0.7791073322296143),
 ('grownup', 0.7773650884628296)]

In [30]:
w2v_model.wv.most_similar(positive=["homer_simpson"])

[('council', 0.7904050350189209),
 ('montgomery_burns', 0.7875198125839233),
 ('pleased', 0.7829771041870117),
 ('robert', 0.7803866863250732),
 ('waylon', 0.7780139446258545),
 ('select', 0.7718239426612854),
 ('easily', 0.7672803997993469),
 ('governor', 0.7651355266571045),
 ('threat', 0.763095498085022),
 ('recent', 0.7572838068008423)]

Similarities:

In [31]:
w2v_model.wv.similarity("moe", 'tavern')

0.8871527

In [32]:
w2v_model.wv.similarity("maggie", 'baby')

0.70427865

In [33]:
w2v_model.wv.similarity("milhouse", 'bart')

0.74568856

Finding those that don't match

In [34]:
w2v_model.wv.doesnt_match(['jimbo', 'milhouse', 'kearney'])

  vectors = vstack(self.word_vec(word, use_norm=True) for word in used_words).astype(REAL)


'jimbo'

In [35]:
w2v_model.wv.doesnt_match(["nelson", "bart", "milhouse"])

'nelson'

In [36]:
w2v_model.wv.doesnt_match(["patty", "selma", "homer"])

'homer'

Analogy difference

In [37]:
w2v_model.wv.most_similar(positive=["woman", "homer"], negative=["marge"], topn=3)

[('admire', 0.6712377071380615),
 ('rude', 0.6696723699569702),
 ('attractive', 0.6556844711303711)]

This query was meant to find the word that is to "woman" what "homer" is to "marge". The expected top result is "man", but it does not appear among the top three results above.

In [39]:
w2v_model.wv.most_similar(positive=["woman", "bart"], negative=["man"], topn=3)

[('lisa', 0.7707328796386719),
 ('pregnant', 0.7573974132537842),
 ('upset', 0.6969519853591919)]

t-SNE visualizations:
t-SNE is a non-linear dimensionality reduction algorithm that attempts to represent high-dimensional data and the underlying relationships between vectors in a lower-dimensional space.

In [40]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
 
import seaborn as sns
sns.set_style("darkgrid")

from sklearn.decomposition import PCA
from sklearn.manifold import TSNE