In [1]:
import pandas as pd
import numpy as np

import re
from nltk.corpus import stopwords
from nltk.util import ngrams
from nltk.collections import Counter

# English stopword list shipped with NLTK.
STOP_WORDS = stopwords.words('english')
# Same list with the negations 'no' / 'not' left out, for analyses where negation must survive.
STOP_WORDS_KEEP_NOT = [word for word in STOP_WORDS if word not in ('no', 'not')]

# Frequent filler words in the transcripts that carry no topical meaning.
# Kept as a list because basic_text_cleaner concatenates it with STOP_WORDS.
ADD_TO_STOPWORDS = ['oh', 'like', 'okay', 'would', 'get', 'hey', 'go', 'one', 'going', 'want', 'yes',
                    'uh', 'let', 'ok', 'gonna', 'come', 'see', 'could', 'back', 'um', 'take', 'make',
                    'us', 'got', 'know', 'man', 'think', 'two', 'way']

In [2]:
# Read the raw transcript table straight from the zipped CSV.
data = pd.read_csv('../../data/The-Office-Lines-V4.csv.zip', compression = 'zip')
# Name the axis through `columns=`: the positional `axis` argument of DataFrame.drop
# was deprecated in pandas 1.3 and removed in 2.0. Reassigning instead of using
# inplace keeps the cell safe to re-run after a fresh load.
data = data.drop(columns = 'Unnamed: 6')
data.head(3)

Unnamed: 0,season,episode,title,scene,speaker,line
0,1,1,Pilot,1,Michael,All right Jim. Your quarterlies look very good...
1,1,1,Pilot,1,Jim,"Oh, I told you. I couldn't close it. So..."
2,1,1,Pilot,1,Michael,So you've come to the master for guidance? Is ...


In [3]:
# scene_count = data.groupby(['season', 'episode']).agg({'scene' : 'nunique', 'title' : 'first'}).reset_index().sort_values('scene')

### Clean up speakers

In [4]:
# Normalise speaker labels in one pass. Raw strings are used for the regex patterns
# so that `\s` is not parsed as an (invalid) string escape.
data['speaker'] = (
    data['speaker']
    .str.replace(r'[^a-zA-Z]', ' ', regex = True)  # everything except ASCII letters becomes a space
    .str.replace(r'\s+', ' ', regex = True)        # collapse whitespace runs
    .str.replace(' and ', ' ', regex = False)      # "A and B" -> "A B"; plain literal, no regex needed
    .str.strip()
)

In [5]:
# Misspelled or abbreviated speaker labels -> canonical character name.
# Keys are matched against the whole label (exact value replacement, not substring).
mapper = {'Angel' : 'Angela', 'Angels' : 'Angela', 'Anglea' : 'Angela',
          'Dacvid Walalce' : 'David Wallace', 'Dacvid Wallace' : 'David Wallace', 'David' : 'David Wallace',
          'Darrly' : 'Darryl', 'Darry' : 'Darryl', 'Daryl' : 'Darryl',
          'DeAgnelo' : 'DeAngelo', 'Deangelo' : 'DeAngelo', 'Denagelo' : 'DeAngelo',
          'Dight' : 'Dwight', 'DwightKSchrute' : 'Dwight',
          'JIm' : 'Jim', 'JIM' : 'Jim', 'Jo' : 'Jo Bennett',
          'M ichael' : 'Michael', 'MIchael' : 'Michael', 'Micael' : 'Michael','Michae' : 'Michael', 'Mihael' : 'Michael',
          'Micahel' : 'Michael', 'Michal' : 'Michael', 'Micheal' : 'Michael','Michel' : 'Michael', 'Miichael' : 'Michael',
          'Meridith' : 'Meredith', 'Nellie' : 'Nellie Bertram',
          'Phylis' : 'Phyllis', 'Phyliss' : 'Phyllis', 'Robert' : 'Robert California',
          'Stanely' : 'Stanley', 'Todd' : 'Todd Packer', 'abe' : 'Gabe', 'sAndy' : 'Andy'}

# Assign the result back instead of calling replace(..., inplace = True) on the
# selected column: the in-place form acts on an intermediate object and does not
# update `data` when pandas Copy-on-Write is active.
data['speaker'] = data['speaker'].replace(mapper)

In [6]:
# Number of distinct (non-missing) speaker labels.
data['speaker'].nunique(dropna = True)

660

In [7]:
# Rows labelled 'Clark' in season 6 get their own, more specific speaker label.
data.loc[
    (data['season'] == 6) & (data['speaker'] == 'Clark'),
    'speaker',
] = 'Clark the Hospital Nurse'

In [8]:
# Line counts of the 20 most frequent speakers.
(
    data['speaker']
    .value_counts()
    .head(20)
)

Michael           10876
Dwight             6794
Jim                6262
Pam                5005
Andy               3734
Angela             1555
Kevin              1542
Erin               1429
Oscar              1356
Darryl             1210
Ryan               1187
Phyllis             972
Kelly               826
Toby                814
Jan                 805
Stanley             674
Meredith            559
Holly               555
Nellie Bertram      528
David Wallace       490
Name: speaker, dtype: int64

In [9]:
# Share of all rows that belong to the 20 most frequent speakers.
data['speaker'].value_counts().head(20).sum() / len(data)

0.8635631384322484

The top 20 speakers account for roughly 86% of all lines. I will drop the remaining characters only when analyzing the characters' sentiments toward each other; until then, all lines are kept.

### Clean up lines

1. Just basic modifications: --> wordclouds, ngrams, tf-idf
    - drop numbers and special characters
    - convert to lowercase
    - normalize contractions
    
    
2. Drop stopwords --> wordclouds, ngrams, tf-idf, sentiment scores by words / lines
    - english stopwords
    - frequently used but meaningless words
    
    
3. [MAYBE] lemmatization --> sentiment scores by lines

#### 1. Basic text cleaner

In [10]:
# Lookup table: lowercase English contraction -> expanded form.
# basic_text_cleaner consults it per whitespace-separated token via
# cList.get(word, word), so a key only matches when it equals the whole token.
cList = {
    "ain't": "am not",
    "aren't": "are not",
    "can't've": "cannot have",
    "can't": "cannot",
    "'cause": "because",
    "could've": "could have",
    "couldn't've": "could not have",
    "couldn't": "could not",
    "didn't": "did not",
    "doesn't": "does not",
    "don't": "do not",
    "hadn't've": "had not have",
    "hadn't": "had not",    
    "hasn't": "has not",
    "haven't": "have not",
    "he'd've": "he would have",
    "he'd": "he would",  
    "he'll've": "he will have",
    "he'll": "he will",    
    "he's": "he is",
    "how'd'y": "how do you",
    "how'd": "how did",  
    "how'll": "how will",
    "how's": "how is",
    "i'd've": "i would have",
    "i'd": "i would",
    "i'll've": "i will have",
    "i'll": "i will",    
    "i'm": "i am",
    "i've": "i have",
    "isn't": "is not",
    "it'd've": "it would have",
    "it'd": "it would",
    "it'll've": "it will have",  
    "it'll": "it will",  
    "it's": "it is",
    "let's": "let us",
    "ma'am": "madam",
    "mayn't": "may not",
    "might've": "might have",
    "mightn't've": "might not have",
    "mightn't": "might not",    
    "must've": "must have",
    "mustn't've": "must not have",
    "mustn't": "must not",
    "needn't've": "need not have",  
    "needn't": "need not",  
    "oughtn't've": "ought not have",
    "oughtn't": "ought not",
    "shan't've": "shall not have",
    "shan't": "shall not",
    "sha'n't": "shall not",
    "she'd've": "she would have",
    "she'd": "she would",
    "she'll've": "she will have",
    "she'll": "she will",  
    "she's": "she is",
    "should've": "should have",
    "shouldn't've": "should not have",
    "shouldn't": "should not",  
    "so've": "so have",
    "so's": "so is",
    "that'd've": "that would have",
    "that'd": "that would",  
    "that's": "that is",
    "there'd've": "there would have",
    # NOTE(review): expands to "had" while the other 'd entries use "would" - confirm this is intended.
    "there'd": "there had",  
    "there's": "there is",
    "they'd've": "they would have",
    "they'd": "they would",
    "they'll've": "they will have",
    "they'll": "they will",  
    "they're": "they are",
    "they've": "they have",
    "to've": "to have",
    "wasn't": "was not",
    "we'd've": "we would have",
    "we'd": "we would",
    "we'll've": "we will have",  
    "we'll": "we will",  
    "we're": "we are",
    "we've": "we have",
    "weren't": "were not",
    "what'll've": "what will have",
    "what'll": "what will",    
    "what're": "what are",
    "what's": "what is",
    "what've": "what have",
    "when's": "when is",
    "when've": "when have",
    "where'd": "where did",
    "where's": "where is",
    "where've": "where have",
    "who'll've": "who will have",
    "who'll": "who will",    
    "who's": "who is",
    "who've": "who have",
    "why's": "why is",
    "why've": "why have",
    "will've": "will have",
    "won't've": "will not have",
    "won't": "will not",    
    "would've": "would have",
    "wouldn't've": "would not have",
    "wouldn't": "would not",
    "y'all'd've": "you all would have",
    "y'all'd": "you all would",
    "y'all're": "you all are",
    "y'all've": "you all have",
    "y'all": "you all",
    "y'alls": "you all",  
    "you'd've": "you would have",
    # NOTE(review): expands to "had" while the other 'd entries use "would" - confirm this is intended.
    "you'd": "you had",
    "you'll've": "you will have",
    "you'll": "you will",  
    "you're": "you are",
    "you've": "you have"
}

In [11]:
def decontracted(phrase):
    """Expand common English contractions in ``phrase``.

    The patterns are lowercase, so the input is expected to be lowercased
    already. Suffix rules fire only when the apostrophe is attached to a
    preceding word character and the suffix ends the word, so an opening
    quote such as ``'stop'`` is left alone.

    Parameters
    ----------
    phrase : str
        Lowercase text that may contain contractions.

    Returns
    -------
    str
        The text with contractions replaced by their expanded forms.
        ``'s`` is always expanded to " is", so possessives ("jim's")
        come out as "jim is".
    """
    # Typographic apostrophes would otherwise slip past every pattern below.
    phrase = phrase.replace("\u2019", "'")

    # Irregular forms first: the generic suffix rules would mangle them
    # ("shan't" -> "sha not", "let's" -> "let is").
    irregular = (
        (r"\bwon't\b", "will not"),
        (r"\bcan't\b", "can not"),
        (r"\bshan't\b", "shall not"),
        (r"\bain't\b", "am not"),
        (r"\blet's\b", "let us"),
    )

    # Regular suffixes. "n't" has to run before the bare "'t" rule.
    suffixes = (
        (r"(?<=\w)n't\b", " not"),
        (r"(?<=\w)'re\b", " are"),
        (r"(?<=\w)'s\b", " is"),
        (r"(?<=\w)'d\b", " would"),
        (r"(?<=\w)'ll\b", " will"),
        (r"(?<=\w)'t\b", " not"),
        (r"(?<=\w)'ve\b", " have"),
        (r"(?<=\w)'m\b", " am"),
    )

    for pattern, expansion in irregular + suffixes:
        phrase = re.sub(pattern, expansion, phrase)

    return phrase

In [12]:
def basic_text_cleaner(text, treat_contractions = False, remove_stopwords = False):
    """Normalise one transcript line to lowercase, space-separated a-z words.

    Steps: lowercase, collapse whitespace, optionally expand contractions,
    drop ordinal numbers ("5th", "2nd", "3rd", "21st"), replace every
    remaining non-letter with a space, collapse whitespace again and
    optionally filter stopwords.

    Parameters
    ----------
    text : str
        Raw line. Any non-string value (e.g. NaN from a missing cell)
        yields an empty string.
    treat_contractions : bool, default False
        Expand contractions with ``decontracted`` and then the ``cList``
        lookup table before punctuation is stripped.
    remove_stopwords : bool, default False
        Drop every word found in ``STOP_WORDS`` or ``ADD_TO_STOPWORDS``.
        STOP_WORDS appears to contain 'no' and 'not' (STOP_WORDS_KEEP_NOT
        filters them out), so negations are removed here as well.

    Returns
    -------
    str
        The cleaned text; may be empty.
    """
    # Missing values reach .apply() as float NaN / None rather than str.
    if not isinstance(text, str):
        return ''

    # lower chars and treat whitespaces
    text = re.sub(r'\s+', ' ', text.lower()).strip()

    # if want to expand contractions
    if treat_contractions:
        text = decontracted(text)
        text = ' '.join(cList.get(word, word) for word in text.split())  # if anything remains

    # Remove ordinals together with their digits. Doing it here, while the digits
    # are still present, avoids deleting a free-standing "st"/"nd"/"th" token and
    # gluing its neighbours together, and also catches a suffix at the end of the text.
    text = re.sub(r'\d+(?:st|nd|rd|th)\b', ' ', text)

    # get rid of special chars and re-check whitespaces
    text = re.sub(r'[^a-z]', ' ', text)
    text = re.sub(r'\s+', ' ', text).strip()

    # if want to remove stopwords
    if remove_stopwords:
        # Build the lookup once per call as a set instead of concatenating two lists per word.
        blocked = set(STOP_WORDS).union(ADD_TO_STOPWORDS)
        text = ' '.join(word for word in text.split() if word not in blocked)

    return text

In [13]:
# Basic cleaning of every raw line: contractions expanded, stopwords kept.
data['cleaned'] = data['line'].apply(
    lambda raw_line: basic_text_cleaner(raw_line, remove_stopwords = False, treat_contractions = True)
)

In [14]:
# Quick look at the first rows with the new 'cleaned' column.
data.head(n = 3)

Unnamed: 0,season,episode,title,scene,speaker,line,cleaned
0,1,1,Pilot,1,Michael,All right Jim. Your quarterlies look very good...,all right jim your quarterlies look very good ...
1,1,1,Pilot,1,Jim,"Oh, I told you. I couldn't close it. So...",oh i told you i could not close it so
2,1,1,Pilot,1,Michael,So you've come to the master for guidance? Is ...,so you have come to the master for guidance is...


#### 2. Remove stopwords and frequent, meaningless words

In [15]:
# 'cleaned' already holds only lowercase a-z words separated by single spaces, so it
# cannot contain an apostrophe and the contraction pass would never match anything.
# Only the stopword filter is needed here; the result is the same, with less work per row.
data['cleaned_sw_rem'] = data['cleaned'].apply(
    lambda x: basic_text_cleaner(x, treat_contractions = False, remove_stopwords = True)
)

In [16]:
# Quick look at the first rows with the stopword-free column added.
data.head(n = 3)

Unnamed: 0,season,episode,title,scene,speaker,line,cleaned,cleaned_sw_rem
0,1,1,Pilot,1,Michael,All right Jim. Your quarterlies look very good...,all right jim your quarterlies look very good ...,right jim quarterlies look good things library
1,1,1,Pilot,1,Jim,"Oh, I told you. I couldn't close it. So...",oh i told you i could not close it so,told close
2,1,1,Pilot,1,Michael,So you've come to the master for guidance? Is ...,so you have come to the master for guidance is...,master guidance saying grasshopper


Check the most frequent words in both the cleaned and the stopword-free columns. If any of them carry no meaning, add them to the stopword list and re-run the stopword removal.

In [17]:
# Persist the frame, including both cleaned text columns, without the row index.
data.to_csv(path_or_buf = '../../data/transcripts_cleaned.csv', index = False)