In [1]:
import pandas as pd
import re
import numpy as np

import string

import multiprocessing as mp


# Data Cleaning

In [None]:
# script roughly follows the following format

CUT TO: Washington Monument - MID DAY

CHARACTER1
Lorem ipsum 

Character1 walks across office in some sort of stage direction

CHARACTER2 [audio direction]
[delivery instruction of some kind] Lorem ipsum

In [2]:
# odd-man-out formatting corrections that affect the regularity of the script and thus effectiveness of subsequent slice-and-dice
def clean_script(script):
    script = script.lstrip('<pre>').split('\nTHE END')[0]
    script = script.replace('McGARRY', 'MCGARRY')
    script = script.replace('CUT TO:\n\n', 'CUT TO: ')
    script_ = re.sub(r'([A-Z])(\nCUT TO:)', r'\1\n\2',  script)
    script = script_.replace('FADE OUT.\nEND', '')
    script = script.replace('FADE OUT.\n\nEND ', '')
    script = script.rstrip('\n')
     
    return script


In [3]:
# split script into line elements (line headers and spoken dialogue, stage directions, scene direction, etc.)
def make_script_df(script, episode_num):
    bits = script.split('\n\n')
    return pd.DataFrame({'elements':bits, 'episode': [episode_num for ik in range(len(bits))]})

In [4]:
# label stage directions 
def parse_stage_dir(script_df):
    stage_dir = []
    for ik, bit in enumerate(script_df.elements):
        if (bit.split('\n')[0].isupper() == False) and ('[' not in bit.split('\n')[0]):
            stage_dir.append(1)
            script_df.loc[ik, 'elements'] = bit.replace('\n', ' ')
        else:
            stage_dir.append(0)
            
    script_df['stage_dir'] = stage_dir
    return script_df

In [5]:
# label scene specifications
def parse_scene_set(script_df):
    scene_set = []
    for elem in script_df.elements:
        lines = elem.split('\n')
        if ('-' in lines[0]) and (lines[0].isupper()):
            scene_set.append(1)
        else:
            scene_set.append(0)

    script_df['scene_set'] = scene_set
    return script_df

In [6]:
# Split out character attribution/line and audio instructions (e.g. VO- voice over) and line delivery instructions (e.g. crossing to pick up the phone)
def parse_lines_characters(script_df):
    line = []
    character = []
    deliv_dir = []
    audio_dir = []

    for ik in range(len(script_df.elements)):
        if (script_df.iloc[ik]['scene_set'] == 0) and (script_df.iloc[ik]['stage_dir'] == 0):
            tmp_elem  = script_df.iloc[ik]['elements'].split('\n', 1)
            
            if len(tmp_elem)>1:
                tmp_elem[1] = tmp_elem[1].replace('\n', ' ')

                # character name & audio delivery notes
                tmp_audio_dir = tmp_elem[0].split(' [')
                if len(tmp_audio_dir)>1:
                    audio_dir.append(tmp_audio_dir[1].strip(']'))
                    character.append(tmp_audio_dir[0])
                else:
                    if (tmp_audio_dir[0].isupper() == True):
                        audio_dir.append(np.nan)
                        character.append(tmp_audio_dir[0])
                    else:
                        character.append(' ')
                        audio_dir.append(np.nan)

                # line & acting delivery notes
                tmp_deliv_dir = tmp_elem[1].split('] ')
                if len(tmp_deliv_dir)>1:
                    deliv_dir.append(tmp_deliv_dir[0].strip('['))
                    line.append(tmp_deliv_dir[1])
                else:
                    line.append(tmp_deliv_dir[0])
                    deliv_dir.append(np.nan)
            else:
                line.append(np.nan)
                character.append('Drop')
                deliv_dir.append(np.nan)
                audio_dir.append(np.nan)

        else:
            line.append(np.nan)
            character.append(' ')
            deliv_dir.append(np.nan)
            audio_dir.append(np.nan)

    script_df['line'] = line
    script_df['character'] = character
    script_df['deliv_dir'] = deliv_dir
    script_df['audio_dir'] = audio_dir
    
    return script_df

In [7]:
# load scraped scripts
ww_df = pd.read_json('WW.json')

In [8]:
# pull apart scripts
script_dfs = []
print('loading ', len(ww_df), ' scripts')

for ik, text in enumerate(ww_df['text']):
    ww_script = text
    script_dfs.append(parse_lines_characters(parse_scene_set(parse_stage_dir(make_script_df(clean_script(ww_script),ik+1)))))

# concatenate scripts
ww_dfs = pd.concat(script_dfs, axis=0, join='outer')

loading  59  scripts


In [9]:
# drop rows with character names that aren't characters
for char in ['DISSOLVE TO', 'FADE OUT.', 'SMASH CUT TO: MAIN TITLES.', 'THE WEST WING', 'ACT ONE', 'ACT TWO', 'END TEASER', 'Drop', ' ']:
    ww_dfs = ww_dfs[~ww_dfs['character'].str.contains( char)]


In [10]:
# standardize names 
names_d = {'C.J.': 'C.J. CREGG', 'DONNA':'DONNA MOSS', 'LEO':'LEO MCGARRY', 'JOSH': 'JOSH LYMAN', 'BILLY': 'BILLY KENWORTHY', 'MARY': 'MARY MARSH', 'SAM': 'SAM SEABORN', 'TOBY': 'TOBY ZIEGLER', 'PRESIDENT JED BARTLET': 'BARTLET' }

for key, value in names_d.items():
    ww_dfs['character'] = ww_dfs['character'].str.replace(value, key)
    ww_dfs['character'] = ww_dfs['character'].str.replace(key, value)

## Filter data for quotes from main characters  

I am limiting the character list to six for the purpose of this experiment.

In [11]:
# filter dataframe so it is limited to primary characters
# main characters
# main_characters = ['C.J. CREGG', 'AINSLEY', 'DONNA MOSS', 'LEO MCGARRY','JOSH LYMAN', 'CHARLIE', 'SAM SEABORN', 'TOBY ZIEGLER', 'BARTLET']
main_characters = ['C.J. CREGG', 'DONNA MOSS', 'LEO MCGARRY','SAM SEABORN', 'TOBY ZIEGLER', 'BARTLET']

filtered_ww = ww_dfs
filtered_ww = filtered_ww[filtered_ww['character'].isin(main_characters)]


### Filter lines to only included longer lines (longer than one word)

In [12]:
# add word count so that text can be filtered for longer lines
filtered_ww['word_ct'] = [len(s.translate(str.maketrans('', '', string.punctuation)).split(' ')) for s in filtered_ww['line']]

filtered_ww_longlines = filtered_ww[filtered_ww.word_ct>1]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [13]:

def text_cleaner(text):
    # Visual inspection identifies a form of punctuation spaCy does not
    # recognize: the double dash '--'.  Better get rid of it now!
    text = re.sub(r'--',' ',text)
    text = re.sub("[\[].*?[\]]", "", text)
    text = ' '.join(text.split())
    return text

filtered_ww_longlines['line'] = filtered_ww_longlines['line'].apply(lambda x: text_cleaner(x))


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # Remove the CWD from sys.path while we load stuff.


# Modeling with different types of word vectors

## Bag of Words approach  

My first approach was using a Bag of Words strategy.  As usual, there were multiple avenues for building a feature set...

1.) single words or ngrams (n = 2): In hindsight I probably should have used CountVectorizer, but I didn't remember about that tool until I was fairly far in so I stayed the course and wrote by hand.  `make_common_grams` allows the user to pass `two_grams = True` to specify that 2-grams should be added to the word list.  

2.) words only or words + sentence statistics: The simplest version uses strictly words as features, but passing `sent_stat = True` to bow_features prompts the function to include information like `comma_ct`, `sent_ct`, and counts of part of speech and repeated phrases.   

In [14]:
import spacy
nlp = spacy.load('en')

In [16]:
from collections import Counter
import string

# Utility function to create a list of the 3000 most common words from a block of text.
def bag_of_words(text, **kwargs):
    
    # Filter out punctuation and stop words.
    if ('no_stop_words' in kwargs) and (kwargs['no_stop_words'] == True):
#         print('bag_of_words: no_stop_words')
        allwords = [str(token.lemma_).translate(str.maketrans('', '', string.punctuation))
                for token in text
                if not token.is_punct
                  and not token.is_stop
                  and token.is_alpha]
    else:
#         print('bag_of_words: stop_words')
        allwords = [str(token.lemma_).translate(str.maketrans('', '', string.punctuation))
                for token in text
                if not token.is_punct
                  and token.is_alpha]
    
    # Return the most common words.
    return [item[0] for item in Counter(allwords).most_common(2000)]
    

In [17]:
import inspect
# make composite list of ngrams and words
def make_common_grams(df, **kwargs):
    common_grams = []
    all_words = []
    all_ngrams = []
    if ('no_stop_words' in kwargs) and (kwargs['no_stop_words'] == True):
#         print(inspect.stack()[0][3], 'no_stop_words')
        nsw = kwargs['no_stop_words']
    else:
#         print(inspect.stack()[0][3], 'stop_words')
        nsw = False

    for char in main_characters:
        char_text = df['line'][df.character == char]
        _text = ' '.join(char_text)
        _words = bag_of_words(nlp(_text), no_stop_words = nsw)
        
        _ngrams = []
        if ('two_grams' in kwargs) and (kwargs['two_grams'] == True):
#             print(inspect.stack()[0][3], 'two_grams')
            # create 2-grams
            for line in char_text:
                grams = [token.orth_ for token in nlp(line) if (not token.is_punct)]
                ngrams = [' '.join([grams[ik].lower(), grams[ik+1].lower()]) for ik in range(len(grams)-1)]
                _ngrams = _ngrams+ ngrams
        
        all_ngrams = all_ngrams + [item[0] for item in Counter(_ngrams).most_common(2000)]
        all_words = all_words +_words
    
    common_grams = list(set(all_words))+ list(set(all_ngrams))
    print('length of word list:', len(all_words), 'length if ngram list:', len(all_ngrams), 'length of common word list:', len(common_grams))
    return common_grams

In [18]:
word_setups = [
    ['commonwords_twograms_nstop', filtered_ww_longlines, {'two_grams': True, 'no_stop_words': True}],
    ['commonwords_onegrams_nstop', filtered_ww_longlines, {'two_grams': False, 'no_stop_words': True}],
    ['commonwords_twograms_stop', filtered_ww_longlines, {'two_grams': True, 'no_stop_words': False}],
    ['commonwords_onegrams_stop', filtered_ww_longlines, {'two_grams': False, 'no_stop_words': False}]
]

In [19]:
def make_word_unit(word_setup):
    return {'label': word_setup[0], 'common_words': make_common_grams(word_setup[1], **word_setup[2]), 'params': word_setup[2]}

In [20]:
pool = mp.Pool(processes=4)
word_lists = pool.map(make_word_unit, iter(word_setups))
pool.close()

word_list_d = {word_lists[ik]['label']: word_lists[ik] for ik in range(len(word_lists))}

length of word list: 12000 length if ngram list: 0 length of common word list: 5192
length of word list: 12000 length if ngram list: 0 length of common word list: 4990
length of word list: 12000 length if ngram list: 12000 length of common word list: 10689
length of word list: 12000 length if ngram list: 12000 length of common word list: 10487


Process ForkPoolWorker-4:
Process ForkPoolWorker-2:
Process ForkPoolWorker-1:
Process ForkPoolWorker-3:
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
  File "/usr/local/Cellar/python/3.6.5_1/Frameworks/Python.framework/Versions/3.6/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/usr/local/Cellar/python/3.6.5_1/Frameworks/Python.framework/Versions/3.6/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/usr/local/Cellar/python/3.6.5_1/Frameworks/Python.framework/Versions/3.6/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/usr/local/Cellar/python/3.6.5_1/Frameworks/Python.framework/Versions/3.6/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/usr/local/Cellar/python/3.6.5_1/Frameworks/Python.framework/Versions/3.6/lib/python3.6/multiprocessing/

In [21]:
# returns a dataframe of lines and characters where each line is an nlp document
def make_quotelist(df):
    return  pd.DataFrame([[nlp(df.iloc[ik]['line']), df.iloc[ik]['character'], df.iloc[ik]['word_ct']] for ik in range(len(df))])

In [25]:
quotes_df = make_quotelist(filtered_ww_longlines)


In [26]:
# Creates a data frame with features for each word in our common word set.
# Each value is the count of the times the word appears in each sentence.

# @question: I feel like [from sklearn.feature_extraction.text import CountVectorizer] should be useful here...
# POS dict

# print(inspect.stack()[0][3], 'no_stop_words') #useful for printing function name with a tag
pos_d = {'VERB':'verb_ct', 'NOUN':'noun_ct', 'ADV':'adv_ct', 'ADP':'adp_ct', 
         'PROPN':'propn_ct', 'ADJ':'adj_ct', 'DET':'det_ct', 'PUNCT':'punct_ct'}

def bow_features(quotes, **kwargs):
    common_words = kwargs['common_words']
    print(len(quotes))
    
    # sentence stats
    sent_stats = ['comma_ct', 'sent_ct', 'word_ct', 'repeats_ct','question_ct', 'comma_freq', 'adv_ct', 'adp_ct', 'propn_ct', 'adj_ct', 'punct_ct','verb_ct', 'noun_ct','det_ct']
    if ('sent_stats' in kwargs) and (kwargs['sent_stats'] == True):
        cols = list(common_words) + sent_stats
    else:
        cols = list(common_words)
    
    df = pd.DataFrame({col: np.zeros(len(quotes[0])) for col in cols})
    df['line'] = quotes[0] 
    df['character'] = quotes[1]
    df['word_ct'] = quotes[2]
    
#     # Process each row, counting the occurrence of words in each sentence.
    for i, sentence in enumerate(df['line']):
        # Convert the sentence to lemmas, then filter out punctuation,
        # stop words, and uncommon words.
        if ('no_stop_words' in kwargs) and (kwargs['no_stop_words'] == True):
            words = [token.lemma_
                     for token in sentence
                     if (
                         not token.is_punct
                         and not token.is_stop
                         and token.lemma_ in common_words
                     )]
        else:
            words = [token.lemma_
                     for token in sentence
                     if (
                         not token.is_punct
                         and token.lemma_ in common_words
                     )]
        if ('two_grams' in kwargs) and (kwargs['two_grams'] == True):  
            grams = [token.orth_ for token in sentence if (not token.is_punct)]
            ngrams = [' '.join([grams[ik].lower(), grams[ik+1].lower()]) for ik in range(len(grams)-1)]
            words = words+ngrams
        
        # Populate the row with word counts.
        for word in words:
            try:
                df.loc[i, word] += 1
            except:
                pass
        
        # add sentence features
        if ('sent_stats' in kwargs) and (kwargs['sent_stats'] == True):
            commas = 0
            questions = 0
            for token in sentence:
                if token.orth_ == ',':
                    commas += 1
                elif token.orth_ == '?':
                    questions +=1
            df.loc[i, 'comma_ct'] = commas
            df.loc[i, 'question_ct'] = questions
                    
            # repeated sentece structure
            try: ngrams
            except NameError:
                grams = [token.orth_ for token in sentence if (not token.is_punct)]
                ngrams = [' '.join([grams[ik].lower(), grams[ik+1].lower()]) for ik in range(len(grams)-1)]
            repeated_phrases = Counter(ngrams)
            num_repeats = 0
            for phrase in repeated_phrases.keys():
                if repeated_phrases[phrase]>1:
                    num_repeats +=1
            df.loc[i, 'repeats_ct'] = num_repeats
                            
            # parts of speech count
            c = Counter([token.pos_ for token in sentence])
            for key in pos_d.keys():
                if key in c.keys():
                    df.loc[i, pos_d[key]] = c[key]
                    
            if df.loc[i, 'word_ct'] >0:
                df.loc[i, 'comma_freq'] = commas/df.loc[i, 'word_ct']
            else:
                df.loc[i, 'comma_freq'] = 0
            df.loc[i, 'sent_ct'] = len([sent for sent in sentence.sents])

        # This counter is just to make sure the kernel didn't hang.
        if i % 5000 == 0:
            print("Processing row {}".format(i))
            
    return df

In [27]:
# Prep features without sentence stats
from sklearn.model_selection import train_test_split
def make_test_train(quotes_df, **kwargs):
    word_counts2 = bow_features(quotes_df, **kwargs)
        
    print('done')
    word_counts2 = word_counts2.dropna()

    Y = word_counts2['character']
    X = word_counts2.iloc[:, ~word_counts2.columns.isin(['character','line'])]

    X_train, X_test, y_train, y_test = train_test_split(X, 
                                                        Y,
                                                        test_size=0.3,
                                                        random_state=0)
    
    return {'X_train': X_train, 'X_test': X_test, 'y_train':y_train, 'y_test':y_test}

In [28]:
def make_data_unit(model_setup):
    return {'label':model_setup[0], 'data': make_test_train(model_setup[1], **model_setup[2]), 'params': model_setup[2]}

In [29]:
model_setups = [
    ['noss_twograms_stop', quotes_df, {'common_words': word_list_d['commonwords_twograms_stop']['common_words'], 'sent_stats': False, 'no_stop_words': False, 'two_grams': True}], 
    ['ss_twograms_stop', quotes_df, {'common_words': word_list_d['commonwords_twograms_stop']['common_words'], 'sent_stats': True, 'no_stop_words': False, 'two_grams': True}], 
    ['ss_twograms_nstop', quotes_df, {'common_words': word_list_d['commonwords_twograms_nstop']['common_words'], 'sent_stats': True, 'no_stop_words': True, 'two_grams': True}],
    ['noss_twograms_nstop', quotes_df, {'common_words': word_list_d['commonwords_twograms_nstop']['common_words'], 'sent_stats': False, 'no_stop_words': True, 'two_grams': True}],
    ['ss_onegrams_stop', quotes_df, {'common_words': word_list_d['commonwords_onegrams_stop']['common_words'], 'sent_stats': True, 'no_stop_words': False, 'two_grams': False}],
    ['noss_onegrams_stop', quotes_df, {'common_words': word_list_d['commonwords_onegrams_stop']['common_words'], 'sent_stats': False, 'no_stop_words': False, 'two_grams': False}],
    ['ss_onegrams_nstop', quotes_df, {'common_words': word_list_d['commonwords_onegrams_nstop']['common_words'], 'sent_stats': True, 'no_stop_words': True, 'two_grams': False}],
    ['noss_onegrams_nstop', quotes_df, {'common_words': word_list_d['commonwords_onegrams_nstop']['common_words'], 'sent_stats': False, 'no_stop_words': True, 'two_grams': False}]
]

In [30]:
import sys
sys.stdout = open('/dev/stdout', 'w')

pool = mp.Pool(processes=3)
results = pool.map(make_data_unit, iter(model_setups))
pool.close()

18207
Processing row 0
18207
Processing row 0
18207
Processing row 0
Processing row 5000
Processing row 5000
Processing row 5000
Processing row 10000
Processing row 10000
Processing row 10000
Processing row 15000
Processing row 15000
Processing row 15000
done
18207
Processing row 0
done
done
18207
Processing row 0
18207
Processing row 0
Processing row 5000
Processing row 5000
Processing row 10000
Processing row 15000
Processing row 10000
done
18207
Processing row 0
Processing row 15000
Processing row 5000
done
18207
Processing row 0
Processing row 10000
Processing row 5000
Processing row 5000
Processing row 10000
Processing row 15000
Processing row 15000
done
done
Processing row 10000
Processing row 15000
done


<bound method Pool.close of <multiprocessing.pool.Pool object at 0x11e6ce8d0>>

Process ForkPoolWorker-5:
Process ForkPoolWorker-6:
Process ForkPoolWorker-7:
Traceback (most recent call last):
Traceback (most recent call last):
  File "/usr/local/Cellar/python/3.6.5_1/Frameworks/Python.framework/Versions/3.6/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/usr/local/Cellar/python/3.6.5_1/Frameworks/Python.framework/Versions/3.6/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/usr/local/Cellar/python/3.6.5_1/Frameworks/Python.framework/Versions/3.6/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/usr/local/Cellar/python/3.6.5_1/Frameworks/Python.framework/Versions/3.6/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/usr/local/Cellar/python/3.6.5_1/Frameworks/Python.framework/Versions/3.6/lib/python3.6/multiprocessing/pool.py", line 108, in worker
    task

### Modeling

In [51]:
# Logistic Regression
from sklearn.linear_model import LogisticRegression
def train_test_log_reg(d):
    data = d['data']
    gs_params = {'C': 1, 'penalty': 'l2', 'solver': 'newton-cg'}
    lr = LogisticRegression(multi_class = 'auto', solver = 'lbfgs')
    lr.fit(data['X_train'], data['y_train'])
    d['model'] = lr
    d['train_score'] = lr.score(data['X_train'], data['y_train'])
    d['test_score'] = lr.score(data['X_test'], data['y_test'])
    print(d['label'], 'Training set score:',d['train_score'], 'Test set score:', d['test_score'] )

    y_pred = lr.predict(data['X_test'])
#     pd.crosstab(y_pred, data['y_test'], dropna=False)
    return d


In [55]:
import warnings

from sklearn.exceptions import ConvergenceWarning
warnings.filterwarnings(action='ignore', category=ConvergenceWarning)
warnings.filterwarnings(action='ignore', category=DeprecationWarning)

pool = mp.Pool(processes=3)
results = pool.map(train_test_log_reg, iter(results))
pool.close()

results_d = {results[ik]['label']:results[ik] for ik in range(len(results))}

noss_twograms_stop Training set score: 0.47073132454488387 Test set score: 0.271279516749039
ss_twograms_stop Training set score: 0.4331450094161959 Test set score: 0.26578802855573863
ss_twograms_nstop Training set score: 0.40952605147520404 Test set score: 0.26285923485264506
ss_onegrams_stop Training set score: 0.3312146892655367 Test set score: 0.24693391909207396
noss_twograms_nstop Training set score: 0.4971751412429379 Test set score: 0.2736591616328025
noss_onegrams_stop Training set score: 0.3597771500313873 Test set score: 0.26578802855573863
ss_onegrams_nstop Training set score: 0.3401600753295669 Test set score: 0.25846604429800474
noss_onegrams_nstop Training set score: 0.3828468298807282 Test set score: 0.2535237049240344


The case to beat is:
    `ss_onegrams_stop Training set score: 0.3312146892655367 Test set score: 0.24693391909207396`

In [56]:
scores = [[label, results_d[label]['test_score']] for label in results_d.keys()]
label_ind = np.array([scores[ik][1] for ik in range(len(scores))]).argmax()
label = scores[label_ind][0]

In [57]:
# test alternate models
data = results_d[label]['data']

In [67]:
from sklearn.svm import SVC

svc = SVC()
svc.fit(data['X_train'], data['y_train'])
print(label, 'Training set score:',svc.score(data['X_train'], data['y_train']), 'Test set score:', svc.score(data['X_test'], data['y_test']) )

y_pred = svc.predict(data['X_test'])
pd.crosstab(y_pred, data['y_test'], dropna=False)



noss_twograms_nstop Training set score: 0.21720025109855617 Test set score: 0.233205198608823


character,BARTLET,C.J. CREGG,DONNA MOSS,LEO MCGARRY,SAM SEABORN,TOBY ZIEGLER
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
BARTLET,1274,952,534,933,952,818


In [68]:
from sklearn.ensemble import GradientBoostingClassifier

gbc = GradientBoostingClassifier()
gbc.fit(data['X_train'], data['y_train'])
print(label, 'Training set score:',gbc.score(data['X_train'], data['y_train']), 'Test set score:', gbc.score(data['X_test'], data['y_test']) )

y_pred = gbc.predict(data['X_test'])
pd.crosstab(y_pred, data['y_test'], dropna=False)

noss_twograms_nstop Training set score: 0.38292529817953547 Test set score: 0.2756727073036793


character,BARTLET,C.J. CREGG,DONNA MOSS,LEO MCGARRY,SAM SEABORN,TOBY ZIEGLER
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
BARTLET,1130,735,430,713,757,635
C.J. CREGG,50,124,23,64,48,50
DONNA MOSS,14,12,41,17,14,14
LEO MCGARRY,35,29,11,73,17,25
SAM SEABORN,20,23,14,28,68,24
TOBY ZIEGLER,25,29,15,38,48,70


In [63]:
from sklearn.model_selection import GridSearchCV
parameters = {
            'penalty':['l2'],
            'C':[1,25],
            'solver': ['lbfgs'],#,'newton-cg',  'liblinear', 'sag'],
            'class_weight': ['balanced', None],
            'max_iter': [100],
            'multi_class':['auto']
        }
lr = LogisticRegression()
GS = GridSearchCV(lr, parameters,cv=5,verbose=5, n_jobs = -1)
GS.fit(data['X_train'], data['y_train'])

new_params = GS.best_params_
print(new_params)
print(GS.score(data['X_test'], data['y_test']))

Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:  7.1min
[Parallel(n_jobs=-1)]: Done  18 out of  20 | elapsed: 11.8min remaining:  1.3min
[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed: 11.9min finished


{'C': 25, 'class_weight': None, 'max_iter': 100, 'multi_class': 'auto', 'penalty': 'l2', 'solver': 'lbfgs'}
0.2791506498261029


In [65]:
from sklearn.model_selection import GridSearchCV
parameters = {
            'penalty':['l2'],
            'C':[25],
            'solver': ['lbfgs','liblinear', 'sag'],
            'class_weight': [None],
            'max_iter': [100],
            'multi_class':['auto']
        }
lr = LogisticRegression()
GS2 = GridSearchCV(lr, parameters,cv=5,verbose=5, n_jobs = -1)
GS2.fit(data['X_train'], data['y_train'])

new_params2 = GS2.best_params_
print(new_params2)
print(GS2.score(data['X_test'], data['y_test']))

Fitting 5 folds for each of 3 candidates, totalling 15 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  12 out of  15 | elapsed: 11.5min remaining:  2.9min
[Parallel(n_jobs=-1)]: Done  15 out of  15 | elapsed: 15.9min finished


{'C': 25, 'class_weight': None, 'max_iter': 100, 'multi_class': 'auto', 'penalty': 'l2', 'solver': 'liblinear'}
0.29452681676734394


In the end I did manage a 5% bump from the lowest score to the highest score by picking the feature set and model/model parameters exhaustively.  

# TF-IDF  
Now let's consider a TF-IDF approach

In [45]:
from sklearn.feature_extraction.text import TfidfVectorizer

X_train, X_test, y_train, y_test = train_test_split(filtered_ww_longlines['line'], filtered_ww_longlines['character'], test_size=0.3, random_state=0)

vectorizer = TfidfVectorizer(max_df=0.6, # drop words that occur in more than half the paragraphs
                             min_df=3, # only use words that appear at least twice
                             stop_words='english', 
                             ngram_range = (1,2),
                             lowercase=True, #convert everything to lower case (since Alice in Wonderland has the HABIT of CAPITALIZING WORDS for EMPHASIS)
                             use_idf=True,#we definitely want to use inverse document frequencies in our weighting
                             norm=u'l2', #Applies a correction factor so that longer paragraphs and shorter paragraphs get treated equally
                             smooth_idf=True #Adds 1 to all document frequencies, as if an extra document existed that used every word once.  Prevents divide-by-zero errors
                            )


#Applying the vectorizer
ww_tfidf=vectorizer.fit_transform(filtered_ww_longlines['line'])
print("Number of features: %d" % ww_tfidf.get_shape()[1])

#splitting into training and test sets
X_train_tfidf, X_test_tfidf, y_train, y_test = train_test_split(ww_tfidf,filtered_ww_longlines['character'],  test_size=0.3, random_state=0)



Number of features: 6307


In [72]:
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer

#Our SVD data reducer.  We are going to reduce the feature space from 1379 to 130.
svd= TruncatedSVD(500)
lsa = make_pipeline(svd, Normalizer(copy=False))
# Run SVD on the training data, then project the training data.
X_train_lsa = lsa.fit_transform(X_train_tfidf)
X_test_lsa = lsa.fit_transform(X_test_tfidf)

variance_explained=svd.explained_variance_ratio_
total_variance = variance_explained.sum()
print("Percent variance captured by all components:",total_variance*100)

#Looking at what sorts of paragraphs our solution considers similar, for the first five identified topics
paras_by_component=pd.DataFrame(X_train_lsa,index=X_train)
for i in range(25, 50,1):
    print('Component {}:'.format(i))
    print(paras_by_component.loc[:,i].sort_values(ascending=False)[:10])

Percent variance captured by all components: 58.67301306043237
Component 25:
line
No, sir.                               0.575166
I will, sir.                           0.575166
No, sir.                               0.575166
No, sir.                               0.575166
Sir, I...                              0.575166
This is take five, sir.                0.575166
What, were you inconvenienced, Sir?    0.575166
I have my concerns, sir.               0.575166
No, sir.                               0.575166
No, sir.                               0.575166
Name: 25, dtype: float64
Component 26:
line
Please, tell me it's not...                                 0.890899
Tell her where you are.                                     0.890899
No, but I have to tell them.                                0.890899
Tell me about yourself.                                     0.890899
Tell him it's done.                                         0.890899
Tell me when it's done.                          

In [47]:
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression()
train = lr.fit(X_train_lsa, y_train)
print(X_train.shape, y_train.shape)
print('Training set score:', lr.score(X_train_lsa, y_train))
print('\nTest set score:', lr.score(X_test_lsa, y_test))

y_pred = train.predict(X_test_lsa)
pd.crosstab(y_pred, y_test, dropna=False)



(12744,) (12744,)
Training set score: 0.33976773383553044

Test set score: 0.19897492220391727


character,BARTLET,C.J. CREGG,DONNA MOSS,LEO MCGARRY,SAM SEABORN,TOBY ZIEGLER
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
BARTLET,444,304,165,306,323,259
C.J. CREGG,189,175,86,172,142,155
DONNA MOSS,47,45,25,22,30,39
LEO MCGARRY,156,128,64,137,125,99
SAM SEABORN,214,151,93,146,166,126
TOBY ZIEGLER,224,149,101,150,166,140


In the end, the LSA features only did slightly better than naive guessing.  Indeed, unlike with the Bag of Words approach, in each instance the model guessed a given line belonged to Bartlet more often than it did the actual character.  

I suspect LSA is most useful when author voices are quite distinct, or the corpus represents a number of very distinct topics.  Neither this case, nor the exercises using Jane Austen prose make an argument for its usefulness in a one author scenario.  (Small sample size, but I'm starting to suspect that differentiation is in delivery, not in gross syntax or grammer.  That said, with more time, I suspect I could develop a richer feature set that would capture more of differences between character voices.)

As an exercise, I implemented some published code from a blog post about Gensim.  It yielded very little, but I leave it here for reference.  

# Gensim

In [159]:
import pandas as pd
import numpy as np
from tqdm import tqdm
tqdm.pandas(desc="progress-bar")
from gensim.models import Doc2Vec
from sklearn import utils
from sklearn.model_selection import train_test_split
import gensim
from sklearn.linear_model import LogisticRegression
from gensim.models.doc2vec import TaggedDocument
import re
import seaborn as sns
import matplotlib.pyplot as plt

In [169]:
import nltk
from nltk.corpus import stopwords
def tokenize_text(text):
    tokens = []
    for sent in nltk.sent_tokenize(text):
        for word in nltk.word_tokenize(sent):
            if len(word) < 2:
                continue
            tokens.append(word.lower())
    return tokens

In [167]:
train = pd.DataFrame({'line': X_train, 'character': y_train})
test = pd.DataFrame({'line': X_test, 'character': y_test})


In [170]:
train_tagged = train.apply(
    lambda r: TaggedDocument(words=tokenize_text(r['line']), tags=[r.character]), axis=1)
test_tagged = test.apply(
    lambda r: TaggedDocument(words=tokenize_text(r['line']), tags=[r.character]), axis=1)

In [172]:
import multiprocessing

cores = multiprocessing.cpu_count()

In [173]:
model_dbow = Doc2Vec(dm=0, vector_size=300, negative=5, hs=0, min_count=2, sample = 0, workers=cores)
model_dbow.build_vocab([x for x in tqdm(train_tagged.values)])


100%|██████████| 14851/14851 [00:00<00:00, 584368.66it/s]


In [174]:
%%time
for epoch in range(30):
    model_dbow.train(utils.shuffle([x for x in tqdm(train_tagged.values)]), total_examples=len(train_tagged.values), epochs=1)
    model_dbow.alpha -= 0.002
    model_dbow.min_alpha = model_dbow.alpha

100%|██████████| 14851/14851 [00:00<00:00, 802711.49it/s]
100%|██████████| 14851/14851 [00:00<00:00, 2253522.26it/s]
100%|██████████| 14851/14851 [00:00<00:00, 1979395.87it/s]
100%|██████████| 14851/14851 [00:00<00:00, 1611799.64it/s]
100%|██████████| 14851/14851 [00:00<00:00, 2160433.15it/s]
100%|██████████| 14851/14851 [00:00<00:00, 1975942.42it/s]
100%|██████████| 14851/14851 [00:00<00:00, 2022455.56it/s]
100%|██████████| 14851/14851 [00:00<00:00, 986437.92it/s]
100%|██████████| 14851/14851 [00:00<00:00, 2368876.54it/s]
100%|██████████| 14851/14851 [00:00<00:00, 1905230.58it/s]
100%|██████████| 14851/14851 [00:00<00:00, 2134448.44it/s]
100%|██████████| 14851/14851 [00:00<00:00, 2356598.39it/s]
100%|██████████| 14851/14851 [00:00<00:00, 1764478.18it/s]
100%|██████████| 14851/14851 [00:00<00:00, 2089273.79it/s]
100%|██████████| 14851/14851 [00:00<00:00, 2124909.90it/s]
100%|██████████| 14851/14851 [00:00<00:00, 2041880.57it/s]
100%|██████████| 14851/14851 [00:00<00:00, 2173474.60it/s]

CPU times: user 21.3 s, sys: 4.87 s, total: 26.2 s
Wall time: 18.6 s


In [175]:
def vec_for_learning(model, tagged_docs):
    sents = tagged_docs.values
    targets, regressors = zip(*[(doc.tags[0], model.infer_vector(doc.words, steps=20)) for doc in sents])
    return targets, regressors

In [176]:
y_train, X_train = vec_for_learning(model_dbow, train_tagged)
y_test, X_test = vec_for_learning(model_dbow, test_tagged)

In [177]:
logreg = LogisticRegression(n_jobs=1, C=1e5)
logreg.fit(X_train, y_train)
y_pred = logreg.predict(X_test)



In [178]:
from sklearn.metrics import accuracy_score, f1_score

print('Testing accuracy %s' % accuracy_score(y_test, y_pred))
print('Testing F1 score: {}'.format(f1_score(y_test, y_pred, average='weighted')))


Testing accuracy 0.17609173735469683
Testing F1 score: 0.151498149323639


In [179]:
model_dmm = Doc2Vec(dm=1, dm_mean=1, vector_size=300, window=10, negative=5, min_count=1, workers=5, alpha=0.065, min_alpha=0.065)
model_dmm.build_vocab([x for x in tqdm(train_tagged.values)])

100%|██████████| 14851/14851 [00:00<00:00, 1134022.88it/s]


In [180]:
%%time
for epoch in range(30):
    model_dmm.train(utils.shuffle([x for x in tqdm(train_tagged.values)]), total_examples=len(train_tagged.values), epochs=1)
    model_dmm.alpha -= 0.002
    model_dmm.min_alpha = model_dmm.alpha

100%|██████████| 14851/14851 [00:00<00:00, 1626615.36it/s]
100%|██████████| 14851/14851 [00:00<00:00, 1644089.23it/s]
100%|██████████| 14851/14851 [00:00<00:00, 1763828.65it/s]
100%|██████████| 14851/14851 [00:00<00:00, 1816394.27it/s]
100%|██████████| 14851/14851 [00:00<00:00, 1673147.51it/s]
100%|██████████| 14851/14851 [00:00<00:00, 1487619.62it/s]
100%|██████████| 14851/14851 [00:00<00:00, 2244023.66it/s]
100%|██████████| 14851/14851 [00:00<00:00, 2127668.01it/s]
100%|██████████| 14851/14851 [00:00<00:00, 1695878.27it/s]
100%|██████████| 14851/14851 [00:00<00:00, 1596350.81it/s]
100%|██████████| 14851/14851 [00:00<00:00, 2248235.35it/s]
100%|██████████| 14851/14851 [00:00<00:00, 1471419.67it/s]
100%|██████████| 14851/14851 [00:00<00:00, 1372804.00it/s]
100%|██████████| 14851/14851 [00:00<00:00, 2215608.19it/s]
100%|██████████| 14851/14851 [00:00<00:00, 2234925.50it/s]
100%|██████████| 14851/14851 [00:00<00:00, 1777925.18it/s]
100%|██████████| 14851/14851 [00:00<00:00, 1669783.63it/

CPU times: user 33.7 s, sys: 12.3 s, total: 46 s
Wall time: 33.7 s


In [181]:
y_train, X_train = vec_for_learning(model_dmm, train_tagged)
y_test, X_test = vec_for_learning(model_dmm, test_tagged)

logreg.fit(X_train, y_train)
y_pred = logreg.predict(X_test)

print('Testing accuracy %s' % accuracy_score(y_test, y_pred))
print('Testing F1 score: {}'.format(f1_score(y_test, y_pred, average='weighted')))



Testing accuracy 0.20750863964813068
Testing F1 score: 0.20835794709831132


In [182]:
model_dbow.delete_temporary_training_data(keep_doctags_vectors=True, keep_inference=True)
model_dmm.delete_temporary_training_data(keep_doctags_vectors=True, keep_inference=True)

In [184]:
from gensim.test.test_doc2vec import ConcatenatedDoc2Vec
new_model = ConcatenatedDoc2Vec([model_dbow, model_dmm])

In [185]:
def get_vectors(model, tagged_docs):
    sents = tagged_docs.values
    targets, regressors = zip(*[(doc.tags[0], model.infer_vector(doc.words, steps=20)) for doc in sents])
    return targets, regressors

In [186]:
y_train, X_train = get_vectors(new_model, train_tagged)
y_test, X_test = get_vectors(new_model, test_tagged)

In [187]:
logreg.fit(X_train, y_train)
y_pred = logreg.predict(X_test)

print('Testing accuracy %s' % accuracy_score(y_test, y_pred))
print('Testing F1 score: {}'.format(f1_score(y_test, y_pred, average='weighted')))




Testing accuracy 0.20562362551052465
Testing F1 score: 0.20666847253973306


In [326]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(ngram_range = (2,2))
vectorizer.fit_transform([filtered_ww_longlines['line'].iloc[3]])
print(vectorizer.get_feature_names())
print(filtered_ww_longlines['line'].iloc[3])

['all shouting', 'at once', 'at the', 'be full', 'briefing tonight', 'folks listen', 'full briefing', 'get you', 'going to', 'in moment', 'is going', 'listen up', 'll be', 'moment or', 'my name', 'name at', 'once there', 'or two', 'president is', 'say something', 'shouting my', 'something that', 'sure to', 'that sure', 'the president', 'the white', 'there ll', 'to get', 'to say', 'tonight at', 'two the', 'up in', 'white house', 'you all']
Folks, listen up. In a moment or two, the President is going to say something that's sure to get you all shouting my name at once. There'll be a full briefing tonight at the White House.
