In [2]:
import re
import pandas as pd
import numpy as np
from time import time
from collections import defaultdict

import spacy

import logging
logging.basicConfig(format="%(levelname)s - %(asctime)s: %(message)s", datefmt= '%H:%M:%S', level=logging.INFO)

In [4]:
# Load the raw dataset from a CSV file located relative to the working directory.
# The outputs below show it has two columns: raw_character_text and spoken_words.
df = pd.read_csv('simpsons_dataset.csv')

In [6]:
# (rows, columns) of the raw frame, before any cleaning.
df.shape

(158314, 2)

In [7]:
# Count the missing values in each column.
df.isnull().sum()

raw_character_text    17814
spoken_words          26459
dtype: int64

In [8]:
# Discard every row that has a missing value in any column, then renumber the
# index from 0 so it has no gaps left by the removed rows.
df = df.dropna().reset_index(drop=True)
# Re-display the per-column null counts to confirm that none remain.
df.isnull().sum()

raw_character_text    0
spoken_words          0
dtype: int64

### Cleaning

In [10]:
# Disable unnecessary features to lighten the model: only lemmas and the
# stop-word flag are read from the tokens, so NER and the parser are skipped.
# The model is loaded by its full package name because the 'en' shortcut link
# was removed in spaCy v3 (assumes 'en' was linked to the default small English
# model, en_core_web_sm — adjust if a different model was linked).
nlp = spacy.load('en_core_web_sm', disable=['ner', 'parser'])

In [11]:
def cleaning(doc):
    """Return the space-joined lemmas of the non-stop-word tokens in `doc`.

    `doc` is any iterable of tokens exposing `lemma_` and `is_stop`.
    Returns None when two or fewer lemmas are left after stop-word removal,
    so that very short lines can be filtered out afterwards.
    """
    lemmas = []
    for token in doc:
        if token.is_stop:
            continue
        lemmas.append(token.lemma_)

    # Too little content left to be useful: signal it with None.
    if len(lemmas) <= 2:
        return None
    return ' '.join(lemmas)

In [12]:
# Replace every run of characters other than ASCII letters and apostrophes with
# a single space, then lowercase the line. The result is a lazy generator, so
# nothing is computed until it is iterated and it can be consumed only once.
non_letters = re.compile("[^A-Za-z']+")
brief_cleaning = (non_letters.sub(' ', str(line)).lower() for line in df['spoken_words'])

In [13]:
# Taking advantage of spaCy's pipe() method, which processes the texts in
# batches, to speed up the cleaning process.
# `n_threads` is intentionally not passed: it has been deprecated with no effect
# since spaCy 2.1 and is no longer accepted by nlp.pipe() in spaCy 3.
# NOTE: brief_cleaning is a one-shot generator. Re-run the cell that creates it
# before re-running this cell, otherwise `txt` comes back empty.

t = time()
txt = [cleaning(doc) for doc in nlp.pipe(brief_cleaning, batch_size=5000)]

print("Time to clean up everything: {} mins".format(round((time() - t)/60, 2)))

Time to clean up everything: 3.2 mins


In [14]:
# Put the cleaned lines into a one-column frame, then remove the rows for which
# cleaning() returned None (too short) and any repeated lines.
df_clean = (
    pd.DataFrame({'clean': txt})
    .dropna()
    .drop_duplicates()
)
df_clean.shape

(92412, 1)

#### Bigrams

In [15]:
from gensim.models.phrases import Phrases, Phraser

In [16]:
# Split every cleaned line on whitespace: one list of tokens per sentence.
sent = list(map(str.split, df_clean['clean']))

In [17]:
# Collect word and bigram counts over the tokenised corpus. Going by the log
# output, progress_per=10000 emits one progress line every 10,000 sentences.
# min_count=30 presumably ignores words/bigrams seen fewer than 30 times in
# total — confirm against the gensim Phrases documentation.
phrases = Phrases(sent, min_count=30, progress_per=10000)

INFO - 18:16:00: collecting all words and their counts
INFO - 18:16:00: PROGRESS: at sentence #0, processed 0 words and 0 word types
INFO - 18:16:00: PROGRESS: at sentence #10000, processed 67396 words and 50551 word types
INFO - 18:16:00: PROGRESS: at sentence #20000, processed 140465 words and 95808 word types
INFO - 18:16:01: PROGRESS: at sentence #30000, processed 207950 words and 132011 word types
INFO - 18:16:01: PROGRESS: at sentence #40000, processed 270207 words and 164407 word types
INFO - 18:16:02: PROGRESS: at sentence #50000, processed 334085 words and 196195 word types
INFO - 18:16:02: PROGRESS: at sentence #60000, processed 400877 words and 228659 word types
INFO - 18:16:03: PROGRESS: at sentence #70000, processed 467802 words and 260712 word types
INFO - 18:16:03: PROGRESS: at sentence #80000, processed 534361 words and 292095 word types
INFO - 18:16:03: PROGRESS: at sentence #90000, processed 602037 words and 321944 word types
INFO - 18:16:04: collected 328658 word typ

The goal of Phraser() is to cut down the memory consumption of Phrases() by discarding model state that is not strictly needed for the bigram detection task.

In [18]:
# Wrap the fitted Phrases model in a Phraser, the object used from here on to
# transform sentences.
bigram = Phraser(phrases)

INFO - 18:17:55: source_vocab length 328658
INFO - 18:18:06: Phraser built with 127 phrasegrams


In [19]:
# Apply the bigram model to the whole tokenised corpus (presumably merging each
# detected bigram into a single token — TODO confirm against the gensim docs).
sentences = bigram[sent]

In [24]:
# Spot-check one transformed sentence.
sentences[2]

['not',
 'know',
 'would',
 'sure',
 'like',
 'talk',
 'not',
 'touch',
 'lesson',
 'plan',
 'teach']