In [5]:
import re
import pandas as pd
from time import time
from collections import defaultdict
import spacy

In [None]:
# Import PyDrive and associated libraries.
# This only needs to be done once per notebook.
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

# Authenticate and create the PyDrive client.
# This only needs to be done once per notebook.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

# Download a file based on its file ID.
#
# A file ID looks like: laggVyWshwcyP6kEI-y_W3P8D26sz
file_id = 'REPLACE_WITH_YOUR_FILE_ID'
downloaded = drive.CreateFile({'id': file_id})
print('Downloaded content "{}"'.format(downloaded.GetContentString()))

In [4]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [6]:
import logging  # Setting up the loggings to monitor gensim
logging.basicConfig(format="%(levelname)s - %(asctime)s: %(message)s", datefmt= '%H:%M:%S', level=logging.INFO)

In [11]:
%cd /content/drive/MyDrive/GenismFiles

[Errno 2] No such file or directory: '/content/drive/MyDrive/GenismFiles'
/content/drive/MyDrive


In [14]:
df = pd.read_csv('simpsons_dataset.csv')
df.shape

(158314, 2)

In [15]:
df.isnull().sum()

raw_character_text    17814
spoken_words          26459
dtype: int64

In [16]:
df = df.dropna().reset_index(drop=True)
df.isnull().sum()

INFO - 11:34:56: NumExpr defaulting to 2 threads.


raw_character_text    0
spoken_words          0
dtype: int64

In [17]:
nlp = spacy.load('en', disable=['ner', 'parser']) # disabling Named Entity Recognition for speed

def cleaning(doc):
    # Lemmatizes and removes stopwords
    # doc needs to be a spacy Doc object
    txt = [token.lemma_ for token in doc if not token.is_stop]
    # Word2Vec uses context words to learn the vector representation of a target word,
    # if a sentence is only one or two words long,
    # the benefit for the training is very small
    if len(txt) > 2:
        return ' '.join(txt)

In [18]:
brief_cleaning = (re.sub("[^A-Za-z']+", ' ', str(row)).lower() for row in df['spoken_words'])

In [19]:
t = time()

txt = [cleaning(doc) for doc in nlp.pipe(brief_cleaning, batch_size=5000, n_threads=-1)]

print('Time to clean up everything: {} mins'.format(round((time() - t) / 60, 2)))

Time to clean up everything: 1.22 mins


In [20]:
df_clean = pd.DataFrame({'clean': txt})
df_clean = df_clean.dropna().drop_duplicates()
df_clean.shape

(85964, 1)

In [21]:
df_clean.head()

Unnamed: 0,clean
0,actually little disease magazine news show nat...
2,know sure like talk touch lesson plan teach
3,life worth live
4,poll open end recess case decide thought final...
7,victory party slide


In [22]:
!pip install gensim



In [23]:
from gensim.models.phrases import Phrases, Phraser

INFO - 11:36:20: 'pattern' package not found; tag filters are not available for English


In [24]:
sent = [row.split() for row in df_clean['clean']]

In [25]:
phrases = Phrases(sent, min_count=30, progress_per=10000)

INFO - 11:36:20: collecting all words and their counts
INFO - 11:36:20: PROGRESS: at sentence #0, processed 0 words and 0 word types
INFO - 11:36:20: PROGRESS: at sentence #10000, processed 63561 words and 52816 word types
INFO - 11:36:20: PROGRESS: at sentence #20000, processed 130943 words and 99866 word types
INFO - 11:36:21: PROGRESS: at sentence #30000, processed 192972 words and 138532 word types
INFO - 11:36:21: PROGRESS: at sentence #40000, processed 249842 words and 172659 word types
INFO - 11:36:21: PROGRESS: at sentence #50000, processed 311265 words and 208566 word types
INFO - 11:36:21: PROGRESS: at sentence #60000, processed 373588 words and 243702 word types
INFO - 11:36:21: PROGRESS: at sentence #70000, processed 436441 words and 278740 word types
INFO - 11:36:21: PROGRESS: at sentence #80000, processed 497829 words and 311886 word types
INFO - 11:36:21: collected 330804 word types from a corpus of 537160 words (unigram + bigrams) and 85964 sentences
INFO - 11:36:21: us

In [26]:
bigram = Phraser(phrases)

INFO - 11:36:21: source_vocab length 330804
INFO - 11:36:25: Phraser built with 126 phrasegrams


In [27]:
sentences = bigram[sent]

In [28]:
for sent in sentences:
  print(sent)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
['right', 'numbskull', 'son', 'fool', 'town']
['fix', 'drive', 'car']
['good', 'get', 'feed', 'reality', 'show']
['wish', 'fix', 'faucet']
['okay', 'homer', 'speak', 'sound', 'whistle']
['wife', 'look', 'hope', 'idea', 'destroy', 'life', 'know']
['smile', 'nod', 'stall', 'stall', 'stall']
['change', 'mind']
['right', 'try', 'mean', 'sex']
['okay', 'homer', 'moment', 'truth', 'get_to', 'tell', 'marge', 'want', 'kid']
['uh', 'oh', 'pick', 'name', 'put', 'thought', 'have', 'baby', 'stall', 'stall', 'stall']
['fat', 'pathetic', 'slob', 'kid']
['oh', 'right', 'dad', 'phew', 'sweet', 'maybe', 'thing', 'kid']
['blow', 'surprise', 'pretend', 'forget']
['uh', 'oh', 'honest', 'tell', 'train']
['homer_simpson', 'want', 'cheat', 'friend', 'lie', 'wife', 'avoid', 'kid', 'jesus']
['simulation', 'bring', 'brain', 'subsidiary', 'penis']
['marge', 'think', 'flander', 'annoy', 'marriage', 'get', 'interesting']
['wow', 'bart', 'feeling', 'm

In [29]:
import multiprocessing

from gensim.models import Word2Vec

In [30]:
cores = multiprocessing.cpu_count()

In [31]:
w = list(range(2,11))

In [32]:
d = list(range(10, 100, 10)) + list(range(100,600,100))

In [33]:
d

[10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 200, 300, 400, 500]

In [34]:
count = 0
for window in w:
  for size in d:
    count += 1
print(count)

126


In [46]:
w2v_test = Word2Vec(min_count=20,
                     iter = 100,
                     window=4, #just to set as an init value
                     size=4, #just to set as an init value
                     sample=6e-5, 
                     alpha=0.03, 
                     min_alpha=0.0007, 
                     negative=20,
                     workers=cores-1)

In [43]:
t = time()
w2v_test.build_vocab(sentences, progress_per=10000, update = True)
print('Time to build vocab: {} mins'.format(round((time() - t) / 60, 2)))

t = time()
w2v_test.train(sentences, total_examples=w2v_test.corpus_count, epochs=2, report_delay=1)
print('Time to train the model: {} mins'.format(round((time() - t) / 60, 2)))


INFO - 11:56:03: collecting all words and their counts
INFO - 11:56:03: PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
INFO - 11:56:03: PROGRESS: at sentence #10000, processed 61718 words, keeping 9558 word types
INFO - 11:56:03: PROGRESS: at sentence #20000, processed 127351 words, keeping 14506 word types
INFO - 11:56:03: PROGRESS: at sentence #30000, processed 187829 words, keeping 17619 word types
INFO - 11:56:04: PROGRESS: at sentence #40000, processed 243332 words, keeping 20385 word types
INFO - 11:56:04: PROGRESS: at sentence #50000, processed 303182 words, keeping 22878 word types
INFO - 11:56:04: PROGRESS: at sentence #60000, processed 363940 words, keeping 25200 word types
INFO - 11:56:04: PROGRESS: at sentence #70000, processed 425408 words, keeping 27401 word types
INFO - 11:56:04: PROGRESS: at sentence #80000, processed 485464 words, keeping 29275 word types
INFO - 11:56:05: collected 30178 word types from a corpus of 523700 raw words and 85964 sentence

Time to build vocab: 0.03 mins


INFO - 11:56:06: EPOCH 1 - PROGRESS: at 39.38% examples, 78084 words/s, in_qsize 0, out_qsize 0
INFO - 11:56:07: EPOCH 1 - PROGRESS: at 80.39% examples, 78788 words/s, in_qsize 0, out_qsize 0
INFO - 11:56:07: worker thread finished; awaiting finish of 0 more threads
INFO - 11:56:07: EPOCH - 1 : training on 523700 raw words (198820 effective words) took 2.6s, 75190 effective words/s
INFO - 11:56:08: EPOCH 2 - PROGRESS: at 39.38% examples, 78906 words/s, in_qsize 0, out_qsize 0
INFO - 11:56:09: EPOCH 2 - PROGRESS: at 80.39% examples, 79379 words/s, in_qsize 0, out_qsize 0
INFO - 11:56:10: worker thread finished; awaiting finish of 0 more threads
INFO - 11:56:10: EPOCH - 2 : training on 523700 raw words (199218 effective words) took 2.5s, 80228 effective words/s
INFO - 11:56:10: training on a 1047400 raw words (398038 effective words) took 5.1s, 77340 effective words/s
INFO - 11:56:10: saving Word2Vec object under mydemo.mdl, separately None
INFO - 11:56:10: not storing attribute vectors_

Time to train the model: 0.09 mins


In [44]:
w2v_test.save("mydemo.mdl")

INFO - 11:56:44: saving Word2Vec object under mydemo.mdl, separately None
INFO - 11:56:44: not storing attribute vectors_norm
INFO - 11:56:44: not storing attribute cum_table
INFO - 11:56:44: saved mydemo.mdl


In [36]:
w2v_test.window = 4

In [37]:
print(w2v_test.window)

4


###Week 11 Experimentation on window size (w) and vector size (d)**

In [51]:
model_name = " "
w2v = Word2Vec(min_count=20,
                     iter = 100,
                     window=2, #just to set as an init value
                     size=2, #just to set as an init value
                     sample=6e-5, 
                     alpha=0.03, 
                     min_alpha=0.0007, 
                     negative=20,
                     workers=cores-1)
w2v.build_vocab(sentences, progress_per=10000)
for window in w:
  for size in d:
    model_name = "w2v_model_window-{}_size-{}.mdl".format(window,size)
    #os.makedir("/content/.../w2v_model_window-{}_size-{}")
    #w2v.reset()
    w2v.windows = window
    w2v.size = size
    #make sure that the model start training from scratch and does not continue on previous iteration!
    w2v.train(sentences, total_examples=w2v.corpus_count, epochs=2, report_delay=1)
    w2v.save(model_name)
    
    

INFO - 12:13:41: collecting all words and their counts
INFO - 12:13:41: PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
INFO - 12:13:41: PROGRESS: at sentence #10000, processed 61718 words, keeping 9558 word types
INFO - 12:13:42: PROGRESS: at sentence #20000, processed 127351 words, keeping 14506 word types
INFO - 12:13:42: PROGRESS: at sentence #30000, processed 187829 words, keeping 17619 word types
INFO - 12:13:42: PROGRESS: at sentence #40000, processed 243332 words, keeping 20385 word types
INFO - 12:13:42: PROGRESS: at sentence #50000, processed 303182 words, keeping 22878 word types
INFO - 12:13:43: PROGRESS: at sentence #60000, processed 363940 words, keeping 25200 word types
INFO - 12:13:43: PROGRESS: at sentence #70000, processed 425408 words, keeping 27401 word types
INFO - 12:13:43: PROGRESS: at sentence #80000, processed 485464 words, keeping 29275 word types
INFO - 12:13:43: collected 30178 word types from a corpus of 523700 raw words and 85964 sentence

In [56]:
model = Word2Vec.load("w2v_model_window-2_size-300.mdl")

INFO - 12:36:04: loading Word2Vec object from w2v_model_window-2_size-300.mdl
INFO - 12:36:04: loading wv recursively from w2v_model_window-2_size-300.mdl.wv.* with mmap=None
INFO - 12:36:04: setting ignored attribute vectors_norm to None
INFO - 12:36:04: loading vocabulary recursively from w2v_model_window-2_size-300.mdl.vocabulary.* with mmap=None
INFO - 12:36:04: loading trainables recursively from w2v_model_window-2_size-300.mdl.trainables.* with mmap=None
INFO - 12:36:04: setting ignored attribute cum_table to None
INFO - 12:36:04: loaded w2v_model_window-2_size-300.mdl


In [66]:
model.wv.most_similar(positive=["grownup"])

[('happen', 0.9999999403953552),
 ('feed', 0.9999998211860657),
 ('pull', 0.9999998211860657),
 ('television', 0.9999997615814209),
 ('arnie', 0.9999995827674866),
 ('valentine', 0.9999995827674866),
 ('cook', 0.999999463558197),
 ('rack', 0.9999994039535522),
 ('forward', 0.9999988675117493),
 ('hugh', 0.9999979734420776)]

In [65]:
model.wv.similarity("marge", 'grownup')

0.9761697

In [None]:
for model in model_var:
  name = model + ".bin"
  model = Word2Vec(min_count=20,
                     window=windows, 
                     size=dimensions,
                     sample=6e-5, 
                     alpha=0.03, 
                     min_alpha=0.0007, 
                     negative=20,
                     workers=cores-1)

                     
  t = time()
  model.build_vocab(sentences, progress_per=10000)
  print('Time to build vocab: {} mins'.format(round((time() - t) / 60, 2)))

  t = time()
  model.train(sentences, total_examples=w2v_model.corpus_count, epochs=2, report_delay=1)
  print('Time to train the model: {} mins'.format(round((time() - t) / 60, 2)))

  model.save(name)
      

INFO - 12:33:04: collecting all words and their counts
INFO - 12:33:04: PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
INFO - 12:33:04: PROGRESS: at sentence #10000, processed 61718 words, keeping 9558 word types
INFO - 12:33:04: PROGRESS: at sentence #20000, processed 127351 words, keeping 14506 word types
INFO - 12:33:04: PROGRESS: at sentence #30000, processed 187829 words, keeping 17619 word types
INFO - 12:33:04: PROGRESS: at sentence #40000, processed 243332 words, keeping 20385 word types
INFO - 12:33:05: PROGRESS: at sentence #50000, processed 303182 words, keeping 22878 word types
INFO - 12:33:05: PROGRESS: at sentence #60000, processed 363940 words, keeping 25200 word types
INFO - 12:33:05: PROGRESS: at sentence #70000, processed 425408 words, keeping 27401 word types
INFO - 12:33:05: PROGRESS: at sentence #80000, processed 485464 words, keeping 29275 word types
INFO - 12:33:05: collected 30178 word types from a corpus of 523700 raw words and 85964 sentence

Time to build vocab: 0.04 mins


Exception in thread Thread-12:
Traceback (most recent call last):
  File "/usr/lib/python3.6/threading.py", line 916, in _bootstrap_inner
    self.run()
  File "/usr/lib/python3.6/threading.py", line 864, in run
    self._target(*self._args, **self._kwargs)
  File "/usr/local/lib/python3.6/dist-packages/gensim/models/base_any2vec.py", line 270, in _job_producer
    epoch_progress = 1.0 * pushed_words / total_words
TypeError: unsupported operand type(s) for /: 'float' and 'NoneType'



KeyboardInterrupt: ignored

In [None]:
model = Word2Vec(min_count=20,
                     window=windows, 
                     size=dimensions,
                     sample=6e-5, 
                     alpha=0.03, 
                     min_alpha=0.0007, 
                     negative=20,
                     workers=cores-1)
      model.save(name)

In [None]:
w2v_model = Word2Vec(min_count=20,
                     window=2, 
                     size=300,
                     sample=6e-5, 
                     alpha=0.03, 
                     min_alpha=0.0007, 
                     negative=20,
                     workers=cores-1)

In [None]:
import tempfile

with tempfile.NamedTemporaryFile(prefix='gensim-model-', delete=False) as tmp:
    temporary_filepath = tmp.name
    w2v_model.save("w2v_model")

INFO - 12:15:19: saving Word2Vec object under w2v_model, separately None
INFO - 12:15:19: not storing attribute vectors_norm
INFO - 12:15:19: not storing attribute cum_table
INFO - 12:15:19: saved w2v_model
