In [None]:
# https://www.kaggle.com/pierremegret/gensim-word2vec-tutorial

In [None]:
import re  # For preprocessing
import pandas as pd  # For data handling
from time import time  # To time our operations
from collections import defaultdict  # For word frequency

import spacy  # For preprocessing

import logging  # Setting up the loggings to monitor gensim
# Root logger: show INFO and above, formatted as "LEVEL - HH:MM:SS: message".
logging.basicConfig(
    level=logging.INFO,
    format="%(levelname)s - %(asctime)s: %(message)s",
    datefmt='%H:%M:%S',
)

In [None]:
# Colab-only step: mount Google Drive at /content/drive so the dataset stored there
# can be read through an ordinary file path.
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [None]:
# Location of the dialogue CSV on the mounted Drive, kept in one place so it is easy to change.
DATA_PATH = '/content/drive/My Drive/Colab Notebooks/Gensim Word2Vect/simpsons_dataset.csv'

# Load the dataset and display its (rows, columns) shape.
df = pd.read_csv(DATA_PATH)
df.shape

(158314, 2)

In [None]:
# Preview the first rows: one column holds the speaker, the other the spoken line.
df.head()

Unnamed: 0,raw_character_text,spoken_words
0,Miss Hoover,"No, actually, it was a little of both. Sometim..."
1,Lisa Simpson,Where's Mr. Bergstrom?
2,Miss Hoover,I don't know. Although I'd sure like to talk t...
3,Lisa Simpson,That life is worth living.
4,Edna Krabappel-Flanders,The polls will be open from now until the end ...


In [None]:
# Count the missing values in each column.
df.isnull().sum()

raw_character_text    17814
spoken_words          26459
dtype: int64

In [None]:
# Discard every row that has a missing value, then renumber the index from 0.
df = df.dropna()
df = df.reset_index(drop=True)

# Confirm that no missing values remain.
df.isnull().sum()

INFO - 14:03:35: NumExpr defaulting to 2 threads.


raw_character_text    0
spoken_words          0
dtype: int64

In [None]:
# Each line of dialogue is lemmatized (e.g. "played" -> "play"), and its stopwords and
# non-alphabetic characters are removed.

# Load the English model by package name: the 'en' shortcut link only exists in spaCy 2.x
# and was removed in spaCy 3, whereas the package name is accepted by both.
# NOTE(review): assumes the 'en' shortcut pointed at en_core_web_sm (its usual target) -
# confirm if a different model was linked in this environment.
# Named Entity Recognition and the parser are disabled for speed.
nlp = spacy.load('en_core_web_sm', disable=['ner', 'parser'])

def cleaning(doc):
    """Lemmatize one line of dialogue and drop its stopwords.

    Args:
        doc: an iterable of tokens exposing ``lemma_`` and ``is_stop``
            (intended to be a spaCy Doc).

    Returns:
        The remaining lemmas joined by single spaces, or None when two or
        fewer lemmas are left.
    """
    lemmas = []
    for token in doc:
        if token.is_stop:
            continue
        lemmas.append(token.lemma_)

    # Word2Vec learns a word's vector from its context words, so a sentence of
    # only one or two words contributes very little to training: skip it.
    if len(lemmas) <= 2:
        return None
    return ' '.join(lemmas)


In [None]:
# Replace every run of characters other than ASCII letters and apostrophes with a single
# space, then lowercase the line.
# Built as a list rather than a generator: a generator is exhausted after one pass, so
# re-running the spaCy cleaning cell on its own would silently process zero documents.
brief_cleaning = [re.sub("[^A-Za-z']+", ' ', str(row)).lower() for row in df['spoken_words']]

In [None]:
# Stream the pre-cleaned lines through spaCy's .pipe() method, which processes texts in
# batches and is faster than calling nlp() on each line separately.

t = time()

# n_threads is not passed: it has had no effect since spaCy 2.1 and is no longer an
# accepted argument of Language.pipe() in spaCy 3.
txt = [cleaning(doc) for doc in nlp.pipe(brief_cleaning, batch_size=5000)]

print('Time to clean up everything: {} mins'.format(round((time() - t) / 60, 2)))

In [None]:
# Collect the cleaned lines in a DataFrame, then drop the missing entries
# (lines rejected by cleaning()) and the duplicated lines.
df_clean = (
    pd.DataFrame({'clean': txt})
    .dropna()
    .drop_duplicates()
)
df_clean.shape


In [None]:
# Bigrams:
# We are using Gensim Phrases package to automatically detect common phrases (bigrams) from a list of sentences. 
# https://radimrehurek.com/gensim/models/phrases.html

from gensim.models.phrases import Phrases, Phraser


In [None]:
# Phrases() takes a list of lists of words as input,
# so split every cleaned line on whitespace.

sent = list(map(str.split, df_clean['clean']))

In [None]:
# Creates the relevant phrases from the list of sentences.
# NOTE(review): presumably min_count=30 is the minimum frequency for a word/bigram to be
# considered and progress_per=10000 the logging interval in sentences - confirm against
# the gensim Phrases documentation.

phrases = Phrases(sent, min_count=30, progress_per=10000)

