In [3]:
import pandas as pd
import pickle
import numpy as np
import re

# Simpsons Kaggle

## Cleaning Data

In [4]:
%store -r df_nation

In [5]:
# df_nation.rename(columns={'levels':'1000-level'}, inplace=True)
conditions_frequency = [(df_nation['levels'] <= 3),
             (df_nation['levels'] == 4),
             (df_nation['levels'] >= 5) & (df_nation['levels']<= 14)]
values_frequency = ['1-3', '4', '5-14']
df_nation['levels_frequency'] = np.select(conditions_frequency, values_frequency)

conditions_coverage = [(df_nation['levels']<=14),
                        (df_nation['levels']>=15) & (df_nation['levels']<=25),
                      (df_nation['levels']>=26)]
values_coverage = [df_nation['levels'], 15, df_nation['levels']]

df_nation['levels_coverage'] = np.select(conditions_coverage, values_coverage)

In [6]:
df_simpsons = pd.read_csv('simpsons_script_lines.csv')

  interactivity=interactivity, compiler=compiler, result=result)


In [7]:
from cleaning import clean_transcripts

round1 = lambda x: clean_transcripts(x)

df_simpsons['spoken_words'] = df_simpsons['spoken_words'].astype(str).apply(round1)
df_simpsons['spoken_words'] = df_simpsons['spoken_words'].apply(lambda x:x+' ')
not_str = df_simpsons[df_simpsons['character_id'].str.contains('[^\d]', na=False)].index.values
df_simpsons.drop(not_str, axis=0, inplace=True)
df_simpsons.dropna(subset=['character_id'], inplace=True)
df_simpsons['character_id'] = df_simpsons['character_id'].astype(int)

### Simpsons characteres by total number of lines

In [300]:
df_simpsons_lines = df_simpsons.groupby('raw_character_text').count().sort_values(by=['raw_text'], ascending=False)[['raw_text']][:20]
df_simpsons_lines

Unnamed: 0_level_0,raw_text
raw_character_text,Unnamed: 1_level_1
Homer Simpson,29842
Marge Simpson,14159
Bart Simpson,13777
Lisa Simpson,11502
C. Montgomery Burns,3172
Moe Szyslak,2864
Seymour Skinner,2443
Ned Flanders,2145
Grampa Simpson,1886
Milhouse Van Houten,1862


### Total number of words

In [8]:
df_simpsons_analysis = df_simpsons[['character_id', 'spoken_words']].groupby('character_id').sum()
df_simpsons_analysis = df_simpsons_analysis.reset_index()
df_simpsons_analysis.sort_values(by='character_id')
df_simpsons_analysis['tokens'] = df_simpsons_analysis['spoken_words'].apply(lambda x:len(x.split()))
#Top 100 characters
df_simpsons_analysis = df_simpsons_analysis.sort_values('tokens', ascending=False)[:100]
df_simpsons_analysis = df_simpsons_analysis.reset_index(drop=True)
df_simpsons_analysis

Unnamed: 0,character_id,spoken_words,tokens
0,2,never thrown a party what about that big bash ...,288596
1,1,lisa tell your father homer you are not allowe...,134192
2,8,victory party under the slide hey thanks for y...,117468
3,9,where s mr bergstrom that life is worth living...,106054
4,15,must turn over got to greet dignitaries absolu...,38266
...,...,...,...
95,238,that s right michael jackson the thriller h...,777
96,6,ew a bug nan it s your turn jimbo hm someone s...,769
97,2065,hey guys c amon shut up oh yes believe me my g...,765
98,2304,i am an orphan from capitol city and those who...,737


In [303]:
df_character = df_simpsons[['character_id', 'raw_character_text']]
df_character = df_character.drop_duplicates('character_id').reset_index(drop=True).sort_values('character_id')
df_character = pd.merge(df_simpsons_analysis, df_character, on='character_id')[['character_id', 'raw_character_text', 'tokens']][:100]
df_character = df_character.reset_index()
df_character['index'] = df_character['index'].apply(lambda x:'index'+str(x))
df_character

Unnamed: 0,index,character_id,raw_character_text,tokens
0,index0,2,Homer Simpson,288596
1,index1,1,Marge Simpson,134192
2,index2,8,Bart Simpson,117468
3,index3,9,Lisa Simpson,106054
4,index4,15,C. Montgomery Burns,38266
...,...,...,...,...
95,index95,238,Marty,777
96,index96,6,Dewey Largo,769
97,index97,2065,Drederick Tatum,765
98,index98,2304,Armin,737


# Exploratory Data Analysis

### Document-Term Matrix

In [10]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction import text 
import scipy.sparse
cv_transcripts = CountVectorizer()#only misleading words excluded for the analysis
cv_matrix_transcripts = cv_transcripts.fit_transform(df_simpsons_analysis['spoken_words'])
df_dtm = pd.DataFrame.sparse.from_spmatrix(cv_matrix_transcripts, index=df_simpsons_analysis.index,
                                           columns=cv_transcripts.get_feature_names())

In [11]:
df_dtm = df_dtm.T

### Formulas

In [12]:
#behaviour
def get_frames(indices, dtm, thousand_level=None): #, thousand_level ['4'], ['5-14']
    """1. The df_dtm gives the number of times a word (all_forms) shows up in an episode/movie
       2. Then its joined to get the level and base_forms of each word
       3. The final df has the words counted for each episode so it can be seen base_forms repetition in the final df"""
    frames = []
    for index in indices:
        df_count_words = dtm[[index]].loc[dtm[index]!=0]
        df_count_words = df_count_words.reset_index()
        df_count_words.rename(columns={"index": "all_forms", index: "count"}, inplace=True)

        df_count_words_level = pd.merge(df_count_words, df_nation[['base_forms', 'all_forms',
                                'levels_frequency', 'levels', 'levels_coverage']], how='left', on='all_forms')
        df_count_words_level = df_count_words_level.dropna() ##INNER JOIN
        if thousand_level is not None:
            df_count_words_level = df_count_words_level[df_count_words_level['levels_frequency'].isin(thousand_level)]
        frames.append(df_count_words_level)
    df_count_words_level = pd.concat(frames)
    return df_count_words_level

def get_count(df_movie_nation):
    """Grouping by base_forms of word and summing their number of encounters
    Note: the 'count' column type in most cases is 'SPARSE NUMPY INT'. Then I have to use the 
    first 2 lines in the 'try' block to get 'int'. However, for some reason some columns after
    concatenating (in def get_frames)  are 'int'values. They go straigt to the 'except' block """
    try:
        int_value = [i.item() for i in df_movie_nation['count']]
        df_movie_nation['count'] = int_value
        df_count = df_movie_nation.groupby(by=['base_forms']).sum().reset_index()
        df_count = df_count.rename(columns={'count':'base_forms_encounters'})
        df_count = df_count.sort_values(by=['base_forms_encounters'], ascending=False)
    except:
        df_count = df_movie_nation.groupby(by=['base_forms']).sum().reset_index()
        df_count = df_count.rename(columns={'count':'base_forms_encounters'})
        df_count = df_count.sort_values(by=['base_forms_encounters'], ascending=False)
    return df_count

def get_statistics(df_movie_nation, df_count):
    """We merge and only get the root encounters to have a clean df when presenting data"""
    df_movie_nation.drop_duplicates(['base_forms'], inplace=True)
    df_movie_nation.drop(['all_forms','count'], axis=1, inplace=True)#Dropping columns from the df created in get_frames
    df_movie_nation = pd.merge(df_movie_nation, df_count, on='base_forms')[['base_forms_encounters']]

    conditions = [
    (df_movie_nation['base_forms_encounters'] == 1),
    (df_movie_nation['base_forms_encounters'] == 2),
    (df_movie_nation['base_forms_encounters'] >= 3) & (df_movie_nation['base_forms_encounters'] <= 4),
    (df_movie_nation['base_forms_encounters'] >= 5) & (df_movie_nation['base_forms_encounters'] <= 7),
    (df_movie_nation['base_forms_encounters'] >= 8) & (df_movie_nation['base_forms_encounters'] <= 9),
    (df_movie_nation['base_forms_encounters'] >= 10)
    ]
    values = ['1', '2', '3-4', '5-7', '8-9', '10+'] ###
    df_movie_nation['range_encounters'] = np.select(conditions, values)
    
    df_statistics = df_movie_nation.groupby('range_encounters').count()
    total = np.sum(df_statistics['base_forms_encounters'])
    #### APPLYING ON ROOT_encounter (shouldn't be no blanks in that column)
    df_statistics['%'] = df_statistics['base_forms_encounters'].apply(lambda x: round((int(x)/int(total))*100))
    df_statistics.sort_values(by=['%'], ascending=False)
    
    df_total_word_families = pd.DataFrame(df_statistics.sum()).T.rename(index={0:'Total word families'})
    df_statistics = df_statistics.append(df_total_word_families)
    df_tokens = pd.DataFrame(df_movie_nation[['base_forms_encounters']].sum(), columns=['base_forms_encounters'])
    df_tokens.rename(index={'base_forms_encounters':'Tokens'}, inplace=True)
    df_statistics = df_statistics.append(df_tokens)
    df_statistics = df_statistics.fillna(0)
    return df_statistics

### What the Top 10 characters say?

In [485]:
dict_simpsons = {}
levels = ['4', '5-14']
for level in levels:
    for index in df_simpsons_analysis.index.values[:10]:
        df_character_vocabulary = get_frames([index], df_dtm, [level])
        df_count= get_count(df_character_vocabulary)
        dict_simpsons['index%s_lvl%s' % (index, level)] = df_count

### What each character repeat the most (all levels)

In [265]:
df_vocabulary_simpsons = pd.concat(dict_simpsons)[['base_forms', 'base_forms_encounters']].reset_index()
df_vocabulary_simpsons.drop('level_1', axis=1, inplace=True)
df_vocabulary_simpsons['lvl'] = df_vocabulary_simpsons['level_0'].apply(lambda x:x.split('_')[1])
df_vocabulary_simpsons['level_0'] = df_vocabulary_simpsons['level_0'].apply(lambda x:x.split('_')[0])
df_vocabulary_simpsons.rename(columns={'level_0':'index'}, inplace=True)
df_vocabulary_simpsons = pd.merge(df_vocabulary_simpsons, df_character[['index', 'raw_character_text']])
for i in df_vocabulary_simpsons.values:
    print(i[1]+'|'+str(i[2])+'|'+i[3]+'|'+i[4])

jerk |72|lvl4|Homer Simpson
thou |46|lvl4|Homer Simpson
robot |38|lvl4|Homer Simpson
hug |38|lvl4|Homer Simpson
precious |36|lvl4|Homer Simpson
fake |33|lvl4|Homer Simpson
genius |30|lvl4|Homer Simpson
couch |28|lvl4|Homer Simpson
miracle |24|lvl4|Homer Simpson
sane |23|lvl4|Homer Simpson
poke |22|lvl4|Homer Simpson
rocked |19|lvl4|Homer Simpson
gamble |19|lvl4|Homer Simpson
grease |19|lvl4|Homer Simpson
lick |18|lvl4|Homer Simpson
baseball |18|lvl4|Homer Simpson
corn |17|lvl4|Homer Simpson
anniversary |17|lvl4|Homer Simpson
bubble |16|lvl4|Homer Simpson
humiliate |16|lvl4|Homer Simpson
handsome |16|lvl4|Homer Simpson
aye |16|lvl4|Homer Simpson
whale |16|lvl4|Homer Simpson
herb |15|lvl4|Homer Simpson
thumb |15|lvl4|Homer Simpson
wreck |14|lvl4|Homer Simpson
nightmare |13|lvl4|Homer Simpson
belly |13|lvl4|Homer Simpson
cartoon |13|lvl4|Homer Simpson
horn |13|lvl4|Homer Simpson
rocket |13|lvl4|Homer Simpson
dolphin |12|lvl4|Homer Simpson
parade |12|lvl4|Homer Simpson
celebrity |12|lvl4|H

jab |2|lvl5-14|Homer Simpson
juror |2|lvl5-14|Homer Simpson
pagan |2|lvl5-14|Homer Simpson
den |2|lvl5-14|Homer Simpson
ovulate |2|lvl5-14|Homer Simpson
rye |2|lvl5-14|Homer Simpson
hover |2|lvl5-14|Homer Simpson
sabbath |2|lvl5-14|Homer Simpson
cockney |2|lvl5-14|Homer Simpson
parlour |2|lvl5-14|Homer Simpson
moose |2|lvl5-14|Homer Simpson
humane |2|lvl5-14|Homer Simpson
safari |2|lvl5-14|Homer Simpson
parasite |2|lvl5-14|Homer Simpson
paranoid |2|lvl5-14|Homer Simpson
saga |2|lvl5-14|Homer Simpson
thump |2|lvl5-14|Homer Simpson
carpenter |2|lvl5-14|Homer Simpson
salon |2|lvl5-14|Homer Simpson
hydrant |2|lvl5-14|Homer Simpson
hotline |2|lvl5-14|Homer Simpson
hospitality |2|lvl5-14|Homer Simpson
tomb |2|lvl5-14|Homer Simpson
casa |2|lvl5-14|Homer Simpson
hock |2|lvl5-14|Homer Simpson
aspirin |2|lvl5-14|Homer Simpson
dignify |2|lvl5-14|Homer Simpson
rugged |2|lvl5-14|Homer Simpson
homicide |2|lvl5-14|Homer Simpson
cobble |2|lvl5-14|Homer Simpson
diarrhoea |2|lvl5-14|Homer Simpson
paste 

roost |1|lvl5-14|Homer Simpson
rogue |1|lvl5-14|Homer Simpson
runway |1|lvl5-14|Homer Simpson
ruse |1|lvl5-14|Homer Simpson
rusk |1|lvl5-14|Homer Simpson
roe |1|lvl5-14|Homer Simpson
casing |1|lvl5-14|Homer Simpson
rodent |1|lvl5-14|Homer Simpson
cartilage |1|lvl5-14|Homer Simpson
autopsy |1|lvl5-14|Homer Simpson
roach |1|lvl5-14|Homer Simpson
sabotage |1|lvl5-14|Homer Simpson
sacks |1|lvl5-14|Homer Simpson
sacrament |1|lvl5-14|Homer Simpson
risotto |1|lvl5-14|Homer Simpson
ripple |1|lvl5-14|Homer Simpson
sacrilege |1|lvl5-14|Homer Simpson
sadist |1|lvl5-14|Homer Simpson
ripe |1|lvl5-14|Homer Simpson
rink |1|lvl5-14|Homer Simpson
cappuccino |1|lvl5-14|Homer Simpson
retch |1|lvl5-14|Homer Simpson
ravage |1|lvl5-14|Homer Simpson
sever |1|lvl5-14|Homer Simpson
semantic |1|lvl5-14|Homer Simpson
semester |1|lvl5-14|Homer Simpson
camouflage |1|lvl5-14|Homer Simpson
redundant |1|lvl5-14|Homer Simpson
sensory |1|lvl5-14|Homer Simpson
cashew |1|lvl5-14|Homer Simpson
sensual |1|lvl5-14|Homer Sim

protagonist |1|lvl5-14|Homer Simpson
prostate |1|lvl5-14|Homer Simpson
potpourri |1|lvl5-14|Homer Simpson
potion |1|lvl5-14|Homer Simpson
posture |1|lvl5-14|Homer Simpson
possum |1|lvl5-14|Homer Simpson
plantation |1|lvl5-14|Homer Simpson
plankton |1|lvl5-14|Homer Simpson
plank |1|lvl5-14|Homer Simpson
planed |1|lvl5-14|Homer Simpson
plagiarise |1|lvl5-14|Homer Simpson
ambush |1|lvl5-14|Homer Simpson
pixie |1|lvl5-14|Homer Simpson
amidst |1|lvl5-14|Homer Simpson
piranha |1|lvl5-14|Homer Simpson
clench |1|lvl5-14|Homer Simpson
piper |1|lvl5-14|Homer Simpson
pint |1|lvl5-14|Homer Simpson
pinpoint |1|lvl5-14|Homer Simpson
pinnacle |1|lvl5-14|Homer Simpson
pined |1|lvl5-14|Homer Simpson
clerical |1|lvl5-14|Homer Simpson
pimp |1|lvl5-14|Homer Simpson
piggyback |1|lvl5-14|Homer Simpson
pigeon |1|lvl5-14|Homer Simpson
clink |1|lvl5-14|Homer Simpson
pier |1|lvl5-14|Homer Simpson
plasma |1|lvl5-14|Homer Simpson
platinum |1|lvl5-14|Homer Simpson
plaza |1|lvl5-14|Homer Simpson
polaroid |1|lvl5-14

wrestle |3|lvl5-14|Marge Simpson
smother |3|lvl5-14|Marge Simpson
beep |3|lvl5-14|Marge Simpson
appetite |3|lvl5-14|Marge Simpson
stove |3|lvl5-14|Marge Simpson
boogie |3|lvl5-14|Marge Simpson
pornography |3|lvl5-14|Marge Simpson
rag |3|lvl5-14|Marge Simpson
quilt |3|lvl5-14|Marge Simpson
subway |3|lvl5-14|Marge Simpson
mayonnaise |3|lvl5-14|Marge Simpson
vow |3|lvl5-14|Marge Simpson
usurp |3|lvl5-14|Marge Simpson
astronaut |3|lvl5-14|Marge Simpson
kidney |3|lvl5-14|Marge Simpson
lad |3|lvl5-14|Marge Simpson
flatter |3|lvl5-14|Marge Simpson
sour |2|lvl5-14|Marge Simpson
shoplift |2|lvl5-14|Marge Simpson
gopher |2|lvl5-14|Marge Simpson
eel |2|lvl5-14|Marge Simpson
sofa |2|lvl5-14|Marge Simpson
soothe |2|lvl5-14|Marge Simpson
shred |2|lvl5-14|Marge Simpson
squabble |2|lvl5-14|Marge Simpson
spurt |2|lvl5-14|Marge Simpson
gloat |2|lvl5-14|Marge Simpson
glum |2|lvl5-14|Marge Simpson
goggle |2|lvl5-14|Marge Simpson
shorts |2|lvl5-14|Marge Simpson
smear |2|lvl5-14|Marge Simpson
sober |2|lvl5-

pester |1|lvl5-14|Marge Simpson
whine |1|lvl5-14|Marge Simpson
pew |1|lvl5-14|Marge Simpson
pharmacy |1|lvl5-14|Marge Simpson
phenomenal |1|lvl5-14|Marge Simpson
whimsy |1|lvl5-14|Marge Simpson
sponge |1|lvl5-14|Marge Simpson
wheat |1|lvl5-14|Marge Simpson
splurge |1|lvl5-14|Marge Simpson
piazza |1|lvl5-14|Marge Simpson
whatsoever |1|lvl5-14|Marge Simpson
splendid |1|lvl5-14|Marge Simpson
pier |1|lvl5-14|Marge Simpson
splay |1|lvl5-14|Marge Simpson
pigeon |1|lvl5-14|Marge Simpson
wench |1|lvl5-14|Marge Simpson
piggyback |1|lvl5-14|Marge Simpson
wee |1|lvl5-14|Marge Simpson
pike |1|lvl5-14|Marge Simpson
pilfer |1|lvl5-14|Marge Simpson
racy |1|lvl5-14|Marge Simpson
swivel |1|lvl5-14|Marge Simpson
radiator |1|lvl5-14|Marge Simpson
screenplay |1|lvl5-14|Marge Simpson
scrub |1|lvl5-14|Marge Simpson
tiara |1|lvl5-14|Marge Simpson
thyme |1|lvl5-14|Marge Simpson
supple |1|lvl5-14|Marge Simpson
thug |1|lvl5-14|Marge Simpson
thrift |1|lvl5-14|Marge Simpson
tether |1|lvl5-14|Marge Simpson
tetanus

armour |3|lvl4|Bart Simpson
romance |3|lvl4|Bart Simpson
hammer |3|lvl4|Bart Simpson
beard |3|lvl4|Bart Simpson
monopoly |3|lvl4|Bart Simpson
parade |3|lvl4|Bart Simpson
rack |3|lvl4|Bart Simpson
choke |3|lvl4|Bart Simpson
costume |3|lvl4|Bart Simpson
cave |3|lvl4|Bart Simpson
wit |3|lvl4|Bart Simpson
mighty |3|lvl4|Bart Simpson
pearl |3|lvl4|Bart Simpson
upgrade |3|lvl4|Bart Simpson
candle |3|lvl4|Bart Simpson
pilgrim |3|lvl4|Bart Simpson
rattle |3|lvl4|Bart Simpson
cult |3|lvl4|Bart Simpson
dam |3|lvl4|Bart Simpson
cabin |3|lvl4|Bart Simpson
stripe |3|lvl4|Bart Simpson
conscience |3|lvl4|Bart Simpson
pledge |3|lvl4|Bart Simpson
pope |3|lvl4|Bart Simpson
click |3|lvl4|Bart Simpson
preach |3|lvl4|Bart Simpson
thigh |3|lvl4|Bart Simpson
obey |2|lvl4|Bart Simpson
warehouse |2|lvl4|Bart Simpson
ash |2|lvl4|Bart Simpson
homosexual |2|lvl4|Bart Simpson
herd |2|lvl4|Bart Simpson
arrow |2|lvl4|Bart Simpson
patriot |2|lvl4|Bart Simpson
needle |2|lvl4|Bart Simpson
activate |2|lvl4|Bart Simpson


ranch |1|lvl5-14|Bart Simpson
trudge |1|lvl5-14|Bart Simpson
trough |1|lvl5-14|Bart Simpson
rash |1|lvl5-14|Bart Simpson
rand |1|lvl5-14|Bart Simpson
yap |1|lvl5-14|Bart Simpson
yelp |1|lvl5-14|Bart Simpson
trolley |1|lvl5-14|Bart Simpson
possum |1|lvl5-14|Bart Simpson
yen |1|lvl5-14|Bart Simpson
underhand |1|lvl5-14|Bart Simpson
willow |1|lvl5-14|Bart Simpson
undersea |1|lvl5-14|Bart Simpson
vin |1|lvl5-14|Bart Simpson
pornography |1|lvl5-14|Bart Simpson
whitewash |1|lvl5-14|Bart Simpson
pong |1|lvl5-14|Bart Simpson
prehistoric |1|lvl5-14|Bart Simpson
prude |1|lvl5-14|Bart Simpson
vine |1|lvl5-14|Bart Simpson
poky |1|lvl5-14|Bart Simpson
whittle |1|lvl5-14|Bart Simpson
werewolf |1|lvl5-14|Bart Simpson
vicious |1|lvl5-14|Bart Simpson
puberty |1|lvl5-14|Bart Simpson
portable |1|lvl5-14|Bart Simpson
preen |1|lvl5-14|Bart Simpson
venom |1|lvl5-14|Bart Simpson
whiz |1|lvl5-14|Bart Simpson
vengeful |1|lvl5-14|Bart Simpson
poo |1|lvl5-14|Bart Simpson
whirl |1|lvl5-14|Bart Simpson
poodle |1|l

humble |3|lvl4|Lisa Simpson
shallow |3|lvl4|Lisa Simpson
hurricane |3|lvl4|Lisa Simpson
axe |3|lvl4|Lisa Simpson
hybrid |3|lvl4|Lisa Simpson
forbid |3|lvl4|Lisa Simpson
immigrate |3|lvl4|Lisa Simpson
flick |3|lvl4|Lisa Simpson
dignity |3|lvl4|Lisa Simpson
dim |3|lvl4|Lisa Simpson
scholarship |3|lvl4|Lisa Simpson
petition |3|lvl4|Lisa Simpson
trophy |3|lvl4|Lisa Simpson
curse |3|lvl4|Lisa Simpson
organism |3|lvl4|Lisa Simpson
optic |3|lvl4|Lisa Simpson
tuck |3|lvl4|Lisa Simpson
deprive |3|lvl4|Lisa Simpson
outfit |3|lvl4|Lisa Simpson
opt |3|lvl4|Lisa Simpson
ancestor |3|lvl4|Lisa Simpson
tribute |3|lvl4|Lisa Simpson
prey |3|lvl4|Lisa Simpson
pill |3|lvl4|Lisa Simpson
tick |3|lvl4|Lisa Simpson
stack |3|lvl4|Lisa Simpson
tragic |3|lvl4|Lisa Simpson
monopoly |3|lvl4|Lisa Simpson
colonel |3|lvl4|Lisa Simpson
campus |3|lvl4|Lisa Simpson
strap |3|lvl4|Lisa Simpson
temple |3|lvl4|Lisa Simpson
cemetery |3|lvl4|Lisa Simpson
pinch |3|lvl4|Lisa Simpson
outlet |2|lvl4|Lisa Simpson
veterinarian |2|l

relish |1|lvl5-14|Lisa Simpson
remnant |1|lvl5-14|Lisa Simpson
vinegar |1|lvl5-14|Lisa Simpson
auditorium |1|lvl5-14|Lisa Simpson
vole |1|lvl5-14|Lisa Simpson
revolve |1|lvl5-14|Lisa Simpson
renounce |1|lvl5-14|Lisa Simpson
vintage |1|lvl5-14|Lisa Simpson
asylum |1|lvl5-14|Lisa Simpson
voila |1|lvl5-14|Lisa Simpson
visa |1|lvl5-14|Lisa Simpson
vixen |1|lvl5-14|Lisa Simpson
replica |1|lvl5-14|Lisa Simpson
revamp |1|lvl5-14|Lisa Simpson
reunion |1|lvl5-14|Lisa Simpson
repo |1|lvl5-14|Lisa Simpson
repulse |1|lvl5-14|Lisa Simpson
reservoir |1|lvl5-14|Lisa Simpson
agitate |1|lvl5-14|Lisa Simpson
resurrect |1|lvl5-14|Lisa Simpson
resilience |1|lvl5-14|Lisa Simpson
relinquish |1|lvl5-14|Lisa Simpson
relic |1|lvl5-14|Lisa Simpson
relent |1|lvl5-14|Lisa Simpson
reek |1|lvl5-14|Lisa Simpson
recollect |1|lvl5-14|Lisa Simpson
rink |1|lvl5-14|Lisa Simpson
rectangle |1|lvl5-14|Lisa Simpson
restaurateur |1|lvl5-14|Lisa Simpson
voyage |1|lvl5-14|Lisa Simpson
rile |1|lvl5-14|Lisa Simpson
viceroy |1|lvl

icon |1|lvl5-14|Lisa Simpson
hype |1|lvl5-14|Lisa Simpson
brawn |1|lvl5-14|Lisa Simpson
hurl |1|lvl5-14|Lisa Simpson
hunker |1|lvl5-14|Lisa Simpson
hummus |1|lvl5-14|Lisa Simpson
jug |1|lvl5-14|Lisa Simpson
bourbon |1|lvl5-14|Lisa Simpson
bouquet |1|lvl5-14|Lisa Simpson
lubricate |1|lvl5-14|Lisa Simpson
majesty |1|lvl5-14|Lisa Simpson
magnate |1|lvl5-14|Lisa Simpson
maestro |1|lvl5-14|Lisa Simpson
lunatic |1|lvl5-14|Lisa Simpson
luminous |1|lvl5-14|Lisa Simpson
luggage |1|lvl5-14|Lisa Simpson
ludicrous |1|lvl5-14|Lisa Simpson
lozenge |1|lvl5-14|Lisa Simpson
bobby |1|lvl5-14|Lisa Simpson
boggle |1|lvl5-14|Lisa Simpson
lotion |1|lvl5-14|Lisa Simpson
loot |1|lvl5-14|Lisa Simpson
loophole |1|lvl5-14|Lisa Simpson
lollipop |1|lvl5-14|Lisa Simpson
logo |1|lvl5-14|Lisa Simpson
lofty |1|lvl5-14|Lisa Simpson
malaria |1|lvl5-14|Lisa Simpson
manna |1|lvl5-14|Lisa Simpson
bosom |1|lvl5-14|Lisa Simpson
mascot |1|lvl5-14|Lisa Simpson
meaner |1|lvl5-14|Lisa Simpson
meagre |1|lvl5-14|Lisa Simpson
meado

twitch |1|lvl5-14|C. Montgomery Burns
twirl |1|lvl5-14|C. Montgomery Burns
twilight |1|lvl5-14|C. Montgomery Burns
twain |1|lvl5-14|C. Montgomery Burns
turtle |1|lvl5-14|C. Montgomery Burns
satan |1|lvl5-14|C. Montgomery Burns
satchel |1|lvl5-14|C. Montgomery Burns
satire |1|lvl5-14|C. Montgomery Burns
transpire |1|lvl5-14|C. Montgomery Burns
toddle |1|lvl5-14|C. Montgomery Burns
serene |1|lvl5-14|C. Montgomery Burns
sewer |1|lvl5-14|C. Montgomery Burns
shack |1|lvl5-14|C. Montgomery Burns
shampoo |1|lvl5-14|C. Montgomery Burns
shanty |1|lvl5-14|C. Montgomery Burns
titular |1|lvl5-14|C. Montgomery Burns
shark |1|lvl5-14|C. Montgomery Burns
sheikh |1|lvl5-14|C. Montgomery Burns
titan |1|lvl5-14|C. Montgomery Burns
shenanigan |1|lvl5-14|C. Montgomery Burns
shepherd |1|lvl5-14|C. Montgomery Burns
shin |1|lvl5-14|C. Montgomery Burns
tincture |1|lvl5-14|C. Montgomery Burns
shire |1|lvl5-14|C. Montgomery Burns
shoo |1|lvl5-14|C. Montgomery Burns
shortcut |1|lvl5-14|C. Montgomery Burns
shorts

dregs |1|lvl5-14|Moe Szyslak
dreary |1|lvl5-14|Moe Szyslak
dodge |1|lvl5-14|Moe Szyslak
dispense |1|lvl5-14|Moe Szyslak
dislocate |1|lvl5-14|Moe Szyslak
dimple |1|lvl5-14|Moe Szyslak
dice |1|lvl5-14|Moe Szyslak
dentist |1|lvl5-14|Moe Szyslak
delta |1|lvl5-14|Moe Szyslak
debonair |1|lvl5-14|Moe Szyslak
daisy |1|lvl5-14|Moe Szyslak
dabble |1|lvl5-14|Moe Szyslak
dab |1|lvl5-14|Moe Szyslak
chateau |1|lvl5-14|Moe Szyslak
cereal |1|lvl5-14|Moe Szyslak
cavern |1|lvl5-14|Moe Szyslak
awesome |1|lvl5-14|Moe Szyslak
appetite |1|lvl5-14|Moe Szyslak
apron |1|lvl5-14|Moe Szyslak
apse |1|lvl5-14|Moe Szyslak
aquarium |1|lvl5-14|Moe Szyslak
armpit |1|lvl5-14|Moe Szyslak
arson |1|lvl5-14|Moe Szyslak
bachelor |1|lvl5-14|Moe Szyslak
beau |1|lvl5-14|Moe Szyslak
badge |1|lvl5-14|Moe Szyslak
bail |1|lvl5-14|Moe Szyslak
barb |1|lvl5-14|Moe Szyslak
barbecue |1|lvl5-14|Moe Szyslak
barber |1|lvl5-14|Moe Szyslak
bash |1|lvl5-14|Moe Szyslak
appease |1|lvl5-14|Moe Szyslak
apostrophe |1|lvl5-14|Moe Szyslak
anti |1|l

hum |1|lvl5-14|Seymour Skinner
hurdle |1|lvl5-14|Seymour Skinner
hush |1|lvl5-14|Seymour Skinner
intermission |1|lvl5-14|Seymour Skinner
intoxicate |1|lvl5-14|Seymour Skinner
hay |1|lvl5-14|Seymour Skinner
macaroni |1|lvl5-14|Seymour Skinner
lite |1|lvl5-14|Seymour Skinner
loafed |1|lvl5-14|Seymour Skinner
lousy |1|lvl5-14|Seymour Skinner
luggage |1|lvl5-14|Seymour Skinner
luminary |1|lvl5-14|Seymour Skinner
luncheon |1|lvl5-14|Seymour Skinner
lure |1|lvl5-14|Seymour Skinner
majesty |1|lvl5-14|Seymour Skinner
levitate |1|lvl5-14|Seymour Skinner
malign |1|lvl5-14|Seymour Skinner
mall |1|lvl5-14|Seymour Skinner
mandatory |1|lvl5-14|Seymour Skinner
mar |1|lvl5-14|Seymour Skinner
marijuana |1|lvl5-14|Seymour Skinner
marital |1|lvl5-14|Seymour Skinner
marshal |1|lvl5-14|Seymour Skinner
liquor |1|lvl5-14|Seymour Skinner
lettuce |1|lvl5-14|Seymour Skinner
inventory |1|lvl5-14|Seymour Skinner
keg |1|lvl5-14|Seymour Skinner
ivory |1|lvl5-14|Seymour Skinner
ivy |1|lvl5-14|Seymour Skinner
jelly |

ukelele |1|lvl5-14|Krusty the Clown
uphold |1|lvl5-14|Krusty the Clown
upstage |1|lvl5-14|Krusty the Clown
yoo |1|lvl5-14|Krusty the Clown
yoga |1|lvl5-14|Krusty the Clown
yank |1|lvl5-14|Krusty the Clown
writhe |1|lvl5-14|Krusty the Clown
woo |1|lvl5-14|Krusty the Clown
watt |1|lvl5-14|Krusty the Clown
warranty |1|lvl5-14|Krusty the Clown
wallet |1|lvl5-14|Krusty the Clown
wacky |1|lvl5-14|Krusty the Clown
vulture |1|lvl5-14|Krusty the Clown
volt |1|lvl5-14|Krusty the Clown
vodka |1|lvl5-14|Krusty the Clown
viral |1|lvl5-14|Krusty the Clown
ventriloquist |1|lvl5-14|Krusty the Clown
vendetta |1|lvl5-14|Krusty the Clown
vend |1|lvl5-14|Krusty the Clown
vase |1|lvl5-14|Krusty the Clown
vampire |1|lvl5-14|Krusty the Clown
vacuum |1|lvl5-14|Krusty the Clown
utterly |1|lvl5-14|Krusty the Clown
usurp |1|lvl5-14|Krusty the Clown
swab |1|lvl5-14|Krusty the Clown
superhero |1|lvl5-14|Krusty the Clown
rancid |1|lvl5-14|Krusty the Clown
rife |1|lvl5-14|Krusty the Clown
scroll |1|lvl5-14|Krusty th

### Who has the highest % hard word in their lines?

In [304]:
df_vocabulary_character = pd.concat([
    df_character.set_index('raw_character_text'),
    df_vocabulary_simpsons.groupby('raw_character_text').sum(),
    df_simpsons_lines
], axis=1)[['raw_text', 'tokens', 'base_forms_encounters']] #total lines, total tokens, tokens lvl4+

In [473]:
df_vocabulary_character['%'] = round(100 * (df_vocabulary_character['base_forms_encounters']/
                                                   df_vocabulary_character['tokens']), 2)
df_vocabulary_character.sort_values('%', ascending=False)[:10]
for i in df_vocabulary_character.sort_values('%', ascending=False).reset_index()[:10].values:
    print(i[0] + ',' + f'{i[1]:.0f}'',' + f'{i[2]:.0f}'',' + f'{i[3]:.0f}'',' + f'{i[4]:.2f}')

C. Montgomery Burns,3172,38266,2260,5.91
Seymour Skinner,2443,29601,1461,4.94
Krusty the Clown,1772,21873,1049,4.80
Ned Flanders,2145,24224,1099,4.54
Bart Simpson,13777,117468,5136,4.37
Grampa Simpson,1886,21125,868,4.11
Lisa Simpson,11502,106054,4328,4.08
Homer Simpson,29842,288596,11667,4.04
Moe Szyslak,2864,34627,1400,4.04
Marge Simpson,14159,134192,4413,3.29
