# *Frankenstein* by Mary Shelley

In [1]:
import pandas as pd
import numpy as np

title = "Frankenstein"
unfiltered = "Frankenstein.csv"
filtered = "Frankenstein_filtered.csv"

top_unfiltered = pd.read_csv(unfiltered, names=["Word", "Count"]).sort_values(["Count", "Word"], ascending=False).reset_index(drop=True)
print(f"The top five words in {title}:")
top_unfiltered.head()

The top five words in Frankenstein:


Unnamed: 0,Word,Count
0,The,4194
1,And,2976
2,I,2850
3,Of,2642
4,To,2094


### Top hundred ranked words (unfiltered list)

In [2]:
one_hundred = top_unfiltered[:100]
hundred_total = one_hundred["Count"].sum()/ top_unfiltered["Count"].sum()
print(f'{str(hundred_total * 100)[:5]}%')

55.33%


In [3]:
print(f'Percentage overall of the 100 words: {str(((len(one_hundred)/len(top_unfiltered))*100))[:4]}%')

Percentage overall of the 100 words: 1.43%


### Words needed for ~80% comprehension

In [4]:
eighty = top_unfiltered[:900]
words = 900
eighty_total = eighty["Count"].sum()/ top_unfiltered["Count"].sum()
print(f'It takes {words} words (rounded to nearest 25) to reach {str(eighty_total * 100)[:5]}% comprehension.')
eighty.tail()

It takes 900 words (rounded to nearest 25) to reach 80.12% comprehension.


Unnamed: 0,Word,Count
895,Garden,10
896,Forms,10
897,Forget,10
898,Follow,10
899,Five,10


In [5]:
print(f'Percentage of the {words} words in book: {str((len(eighty)/len(top_unfiltered))*100)[:4]}%')

Percentage of the 900 words in book: 12.8%


### Now to transfer that to our filtered list...

In [6]:
top = pd.read_csv(filtered, names=["Word", "Count"]).sort_values(["Count", "Word"], ascending=False).reset_index(drop=True)
print(top.loc[top['Word']=='Five'])
vocab = top[:872]
print(vocab.tail())
vocab_hundred = vocab[:100]

     Word  Count
871  Five     10
           Word  Count
867      Gloomy     10
868  Gentleness     10
869      Forget     10
870      Follow     10
871        Five     10


### Comparisons to Modern English words (according to Google's n-gram)

In [7]:
n_gram = pd.read_csv('edited_n_gram.csv', names = ["drop", "Word", "Count"])
n_gram.drop(["drop"], axis=1, inplace=True)
n_gram.drop(index=[0], inplace=True)
n_hundred= n_gram[:100]
x = vocab_hundred.assign(hundred_in=vocab_hundred.Word.isin(n_hundred.Word).astype(int))
h = x[x["hundred_in"] == 0]
h = list(h["Word"].unique())
print(f'Top 100 {title} words NOT in 100 Top Modern English Words: \n\n{h}')

Top 100 Frankenstein words NOT in 100 Top Modern English Words: 

['Had', 'Her', 'She', 'Him', 'Could', 'Should', 'Before', 'Myself', 'Father', 'Man', 'Them', 'Upon', 'Into', 'Friend', 'Being', 'Yet', 'Did', 'Thought', 'Life', 'Then', 'Might', 'Own', 'Eyes', 'Every', 'Feeling', 'Said', 'Shall', 'Toward', 'Saw', 'Night', 'Mind', 'Those', 'Most', 'Found', 'Heart', 'Ever']


In [8]:
y = vocab.assign(vocab_in=vocab.Word.isin(n_gram.Word).astype(int))
t = y[y["vocab_in"] == 0]
t = list(t["Word"].unique())

print(f'{title} Vocab List Words NOT in Top 1000 Modern Words (names/locations excluded): \n\n {t}')

Frankenstein Vocab List Words NOT in Top 1000 Modern Words (names/locations excluded): 

 ['Myself', 'Father', 'Eyes', 'Feeling', 'Toward', 'Saw', 'Mind', 'Whom', 'Felt', 'Passed', 'Dear', 'Spirit', 'Misery', 'Miserable', 'Heard', 'Became', 'Soon', 'Hour', 'Appeared', 'Mountain', 'Moment', 'Indeed', 'Horror', 'Cottage', 'Whose', 'Thus', 'Manner', 'Hope', 'Happiness', 'Fear', 'Affection', 'Despair', 'Voice', 'Cannot', 'Happy', 'Sensation', 'Scene', 'Creature', 'Alone', 'Ice', 'Joy', 'Companion', 'Came', 'Letter', 'Soul', 'Countenance', 'Wood', 'Poor', 'Entered', 'Therefore', 'Tear', 'Sometimes', 'Pleasure', 'Morning', 'Himself', 'Almost', 'Took', 'Seemed', 'Rest', 'Possessed', 'Kind', 'Journey', 'Earth', 'Delight', 'Wind', 'Gentle', 'Remain', 'Discovered', 'Continued', 'Cause', 'Object', 'Mother', 'Monster', 'Idea', 'Fiend', 'Fellow', 'Tale', 'Sea', 'Remained', 'Lost', 'Labour', 'Knew', 'Gave', 'Believe', 'Strange', 'Sight', 'Returned', 'Resolved', 'Nor', 'Heaven', 'Existence', 'Enemy',

In [9]:
print(f'Length of vocab list: {len(t)}')

Length of vocab list: 542


In [10]:
print(f'Overall word count of book: {top_unfiltered["Count"].sum()}')

Overall word count of book: 75265


In [11]:
print(f'{str(round(hundred_total * 100, 2))}', f'{str(round((len(one_hundred)/len(top_unfiltered))*100, 2))}', words, f'{str(round(eighty_total * 100, 2))}', f'{len(t)}', f'{top_unfiltered["Count"].sum()}')

55.33 1.43 900 80.13 542 75265


In [2]:
body = ["Mouth", "Lips", "Head", "Face", "Hair", "Eyes", "Eye", "Ear", "Ears", "Butt", "Ass", "Feet", "Foot", "Leg", "Legs", "Hand", "Hands"]

for part in body:
    try:
        x = top_unfiltered.loc[top_unfiltered['Word']==part]
        print(part, x["Count"])
    except:
        continue

Mouth Series([], Name: Count, dtype: int64)
Lips 502    18
Name: Count, dtype: int64
Head 352    25
Name: Count, dtype: int64
Face 444    20
Name: Count, dtype: int64
Hair 1109    8
Name: Count, dtype: int64
Eyes 80    104
Name: Count, dtype: int64
Eye 509    18
Name: Count, dtype: int64
Ear 2132    4
Name: Count, dtype: int64
Ears 1127    8
Name: Count, dtype: int64
Butt Series([], Name: Count, dtype: int64)
Ass 3921    2
Name: Count, dtype: int64
Feet 686    13
Name: Count, dtype: int64
Foot 1723    5
Name: Count, dtype: int64
Leg Series([], Name: Count, dtype: int64)
Legs Series([], Name: Count, dtype: int64)
Hand 334    26
Name: Count, dtype: int64
Hands 207    38
Name: Count, dtype: int64


In [None]:
frankenstein = {"Mouth/Lips": 18, "Head": 25, "Face": 20, "Hair": 8, "Eye(s)": 122, "Ear(s)": 12, "Butt/Ass": 2, "Foot/Feet": 18, "Hand(s)": 64 }