In [1]:
import pandas as pd
import re

In [361]:
def clean_script(script):
    script = script.lstrip('<pre>').split('\nTHE END')[0]
    script = script.replace('McGARRY', 'MCGARRY')
    script = script.replace('CUT TO:\n\n', 'CUT TO: ')
    script_ = re.sub(r'([A-Z])(\nCUT TO:)', r'\1\n\2',  script)
    script = script_.replace('FADE OUT.\nEND', '')
    script = script_.replace('FADE OUT.\n\nEND ', '')
    script = script.rstrip('\n')
     
    return script#' '.join([act.split('\n\n',1)[1] for act in script.split('ACT ')[1:]])


In [190]:
def make_script_df(script, episode_num):
    bits = script.split('\n\n')
    return pd.DataFrame({'elements':bits, 'episode': [episode_num for ik in range(len(bits))]})

In [204]:
def parse_stage_dir(script_df):
    stage_dir = []
    for ik, bit in enumerate(script_df.elements):
        if (bit.split('\n')[0].isupper() == False) and ('[' not in bit.split('\n')[0]):
            stage_dir.append(1)
            script_df.loc[ik, 'elements'] = bit.replace('\n', ' ')
        else:
            stage_dir.append(0)
            
    script_df['stage_dir'] = stage_dir
    return script_df

In [187]:
def parse_scene_set(script_df):
    scene_set = []
    for elem in script_df.elements:
        lines = elem.split('\n')
        if ('-' in lines[0]) and (lines[0].isupper()):
            scene_set.append(1)
        else:
            scene_set.append(0)

    script_df['scene_set'] = scene_set
    return script_df

In [373]:
import numpy as np

def parse_lines_characters(script_df):
    line = []
    character = []
    deliv_dir = []
    audio_dir = []

    for ik in range(len(script_df.elements)):
        if (script_df.iloc[ik]['scene_set'] == 0) and (script_df.iloc[ik]['stage_dir'] == 0):
            tmp_elem  = script_df.iloc[ik]['elements'].split('\n')
            
            if len(tmp_elem)>1:
                # character name & audio delivery notes
                tmp_audio_dir = tmp_elem[0].split(' [')
                if len(tmp_audio_dir)>1:
                    audio_dir.append(tmp_audio_dir[1].strip(']'))
                    character.append(tmp_audio_dir[0])
                else:
                    if (tmp_audio_dir[0].isupper() == True):
                        audio_dir.append(np.nan)
                        character.append(tmp_audio_dir[0])
                    else:
                        character.append(' ')
                        audio_dir.append(np.nan)

                # line & acting delivery notes
                tmp_deliv_dir = tmp_elem[1].split('] ')
                if len(tmp_deliv_dir)>1:
                    deliv_dir.append(tmp_deliv_dir[0].strip('['))
                    line.append(tmp_deliv_dir[1])
                else:
                    line.append(tmp_deliv_dir[0])
                    deliv_dir.append(np.nan)
            else:
                line.append(np.nan)
                character.append('Drop')
                deliv_dir.append(np.nan)
                audio_dir.append(np.nan)

        else:
            line.append(np.nan)
            character.append(' ')
            deliv_dir.append(np.nan)
            audio_dir.append(np.nan)

    script_df['line'] = line
    script_df['character'] = character
    script_df['deliv_dir'] = deliv_dir
    script_df['audio_dir'] = audio_dir
    
    return script_df

In [530]:
# load scraped scripts
ww_df = pd.read_json('WW.json')

In [531]:
# pull apart scripts
script_dfs = []
for ik, text in enumerate(ww_df.text):
    ww_script = text
    script_dfs.append(parse_lines_characters(parse_scene_set(parse_stage_dir(make_script_df(clean_script(ww_script),ik+1)))))

# concatenate scripts
ww_dfs = pd.concat(script_dfs, axis=0, join='outer')

In [533]:
# drop rows with character names that aren't characters
for char in ['DISSOLVE TO', 'FADE OUT.', 'SMASH CUT TO: MAIN TITLES.', 'THE WEST WING', 'ACT ONE', 'ACT TWO', 'END TEASER', 'Drop']:
    ww_dfs = ww_dfs[~ww_dfs['character'].str.contains( char)]


In [534]:
# standardize names 
names_d = {'C.J.': 'C.J. CREGG', 'DONNA':'DONNA MOSS', 'LEO':'LEO MCGARRY', 'JOSH': 'JOSH LYMAN', 'BILLY': 'BILLY KENWORTHY', 'MARY': 'MARY MARSH', 'SAM': 'SAM SEABORN', 'TOBY': 'TOBY ZIEGLER', 'PRESIDENT JED BARTLET': 'BARTLET' }

for key, value in names_d.items():
    ww_dfs['character'] = ww_dfs['character'].str.replace(value, key)
    ww_dfs['character'] = ww_dfs['character'].str.replace(key, value)

In [535]:
# check character lists
character_grps = ww_dfs.groupby('character')

for grp in character_grps:
    print(grp[0], len(grp[1]))


  1664
ABBEY 176
ADVISOR 4
AGENT 10
AIDES 2
AL 14
AL KIEFER 1
ALL 2
ANNOUNCER 1
ARMY GUY 1 3
ARMY GUY 2 3
ARMY GUY 3 1
BAMBANG 12
BARTLET 1690
BILL 1
BILLY KENWORTHY 27
BOB 13
BOBBI 5
BOBBY 29
BONNIE 31
BRUCE 7
BRUNO 17
BURNS 12
BUTTERFIELD 8
C.J 2
C.J. CREGG 1454
CABINET OFFICERS 2
CALDWELL 23
CANDY 3
CAPTAIN 5
CARL 12
CARMINE 1
CAROL 52
CATHY 62
CHARLIE 348
CHINESE AMBASSADOR 7
CHRIS 11
CIA DIRECTOR 5
CLAYPOOL 42
CONGRESSMAN 7
CONGRESSMAN 1 6
CONGRESSMAN 2 3
CONGRESSWOMAN 1
CROSSFIELD 1
CROUCH 18
CROWD 3
CUT TO: 1
CUT TO: INT. AIR FORCE ONE 1
DAISY 40
DANNY 392
DAVID 1
DAVID HASSELHOFF 1
DIRECTOR 4
DONNA MOSS 558
DONNIE 1
DRUMM 7
ED 13
EDGAR DRUMM 1
EMMA 1
EVERYONE 3
FADE OUT 1
FATHER CAVANAUGH 24
FEMALE AIDE 4
FITZWALLACE 83
FLIGHT ATTENDANT 1 1
FLIGHT ATTENDANT 2 5
FLIGHT ATTENDANT 3 1
FRED 3
FRENCHMAN 1
GENERAL 2
GEORGE 16
GINA 59
GINA TOSCANO 1
GINGER 17
GIRL 3
GIRLS 5
GIRLS IN THE CROWD 1
GLADMAN 16
GOMEZ 6
GRANT 3
GUY 2
GUY 1 19
GUY 2 5
GUY 3 3
GUYS 1
HACKETT 12
HAROLD 12
HAROL

In [536]:
# main characters
main_characters = ['C.J. CREGG', 'DONNA MOSS', 'LEO MCGARRY','JOSH LYMAN', 'CHARLIE', 'SAM SEABORN', 'TOBY ZIEGLER', 'BARTLET']
filtered_ww = ww_dfs
# for char in main_characters:
#     filtered_ww = filtered_ww[filtered_ww['character'] == char]

filtered_ww = filtered_ww[filtered_ww['character'].isin(main_characters)]

In [537]:
import spacy
nlp = spacy.load('en')

In [417]:
from collections import Counter
# Utility function to create a list of the 2000 most common words.
def bag_of_words(text):
    
    # Filter out punctuation and stop words.
    allwords = [token.lemma_
                for token in text
                if not token.is_punct
                and not token.is_stop]
    
    # Return the most common words.
    return [item[0] for item in Counter(allwords).most_common(2000)]
    

In [538]:
commonwords = []

for char in main_characters:
    _text = ' '.join(filtered_ww['line'][filtered_ww.character == char])
    _words = bag_of_words(nlp(_text))
    commonwords = list(set(commonwords+_words))

In [540]:
quotes = [[nlp(filtered_ww.iloc[ik]['line']), filtered_ww.iloc[ik]['character']] for ik in range(len(filtered_ww))]

In [541]:
quotes_df = pd.DataFrame(quotes)

In [545]:
# Creates a data frame with features for each word in our common word set.
# Each value is the count of the times the word appears in each sentence.
def bow_features(quotes, common_words, **kwargs):
    print(len(quotes))
#     df2 = {'line': [sentence[0] for sentence in sentences], 'character': [sentence[1] for sentence in sentences]}
    
    # sentence stats
    sent_stats = ['comma_ct', 'word_ct', 'adv_ct', 'adp_ct', 'propn_ct', 'adj_ct', 'punct_ct'] #'verb_ct', 'noun_ct','det_ct',
    if 'sent_stats' in kwargs:
        df = pd.DataFrame(columns=list(common_words) + sent_stats)
#         df.loc[:, sent_stats] = 0
    else:
        df = pd.DataFrame(columns=common_words)
    
    df.loc[:, common_words] = 0
    for col in df.columns:
        df[col] = np.zeros(len(quotes[0]))
    df['line'] = quotes[0] 
    df['character'] = quotes[1]
    
    print('made it to the loop')
    # Process each row, counting the occurrence of words in each sentence.
    for i, sentence in enumerate(df['line']):
        # Convert the sentence to lemmas, then filter out punctuation,
        # stop words, and uncommon words.
        words = [token.lemma_
                 for token in sentence
                 if (
                     not token.is_punct
                     and not token.is_stop
                     and token.lemma_ in common_words
                 )]
        
        # Populate the row with word counts.
        for word in words:
            try:
                df.loc[i, word] += 1
            except:
                print(word)
        
        # add sentence features
        if 'sent_stats' in kwargs:
            commas = 0
            for token in sentence:
                if token.orth_ == ',':
                    commas += 1
            df.loc[i, 'comma_ct'] = commas
                    
            c = Counter([token.pos_ for token in sentence])
            for key in c.keys():
                if key in pos_d.keys():
                    df.loc[i, pos_d[key]] = c[key]
            
            df.loc[i, 'word_ct'] = len([token for token in sentence if (not token.is_punct)])

        # This counter is just to make sure the kernel didn't hang.
        if i % 1000 == 0:
            print("Processing row {}".format(i))
            
    return df

In [546]:
word_counts2 = bow_features(quotes_df, words, kwargs = {'sent_stats' : True})

10474
made it to the loop
Processing row 0
line
line
line
Processing row 1000
line
line
line
Processing row 2000
line
Processing row 3000
line
Processing row 4000
line
Processing row 5000
Processing row 6000
line
line
line
line
line
line
Processing row 7000
Processing row 8000
Processing row 9000
line
line
line
Processing row 10000


In [547]:
from sklearn.model_selection import train_test_split

Y = word_counts2['character']
X = word_counts2.iloc[:, ~word_counts2.columns.isin(['character','line'])]

X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    Y,
                                                    test_size=0.3,
                                                    random_state=0)

In [548]:
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression()
train = lr.fit(X_train, y_train)
print(X_train.shape, y_train.shape)
print('Training set score:', lr.score(X_train, y_train))
print('\nTest set score:', lr.score(X_test, y_test))

y_pred = train.predict(X_test)
pd.crosstab(y_pred, y_test, dropna=False)



(7331, 1751) (7331,)
Training set score: 0.4442777247305961

Test set score: 0.23639834552974864


character,BARTLET,C.J. CREGG,CHARLIE,DONNA MOSS,JOSH LYMAN,LEO MCGARRY,SAM SEABORN,TOBY ZIEGLER
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
BARTLET,172,70,15,14,106,71,63,70
C.J. CREGG,27,58,7,12,49,23,43,33
CHARLIE,2,7,8,0,7,4,1,7
DONNA MOSS,3,4,1,12,9,4,12,3
JOSH LYMAN,190,212,51,92,330,191,232,176
LEO MCGARRY,50,34,13,19,50,67,40,38
SAM SEABORN,38,29,11,9,45,27,59,21
TOBY ZIEGLER,23,23,10,7,45,27,30,37
