# *Pride and Prejudice* by Jane Austen

In [4]:
import pandas as pd
import numpy as np

title = "Pride and Prejudice"
unfiltered = "Pride_prejudice.csv"
filtered = "Pride_filtered.csv"

top_unfiltered = pd.read_csv(unfiltered, names=["Word", "Count"]).sort_values(["Count", "Word"], ascending=False).reset_index(drop=True)
print(f"The top five words in {title}:")
top_unfiltered.head()

The top five words in Pride and Prejudice:


Unnamed: 0,Word,Count
0,The,4332
1,To,4163
2,Of,3612
3,And,3584
4,Her,2225


### Top 100 ranked words (unfiltered list)

In [2]:
one_hundred = top_unfiltered[:100]
hundred_total = one_hundred["Count"].sum()/ top_unfiltered["Count"].sum()
print(f'{str(hundred_total * 100)[:5]}%')

59.02%


In [3]:
print(f'Percentage overall of the 100 words: {str(((len(one_hundred)/len(top_unfiltered))*100))[:4]}%')

Percentage overall of the 100 words: 1.59%


### How many words does it take to get to ~80% comprehension?

In [4]:
eighty = top_unfiltered[:525]
words = 525
eighty_total = eighty["Count"].sum()/ top_unfiltered["Count"].sum()
print(f'It takes {words} words (rounded to nearest 25) to reach {str(eighty_total * 100)[:5]}% comprehension.')
eighty.tail()

It takes 525 words (rounded to nearest 25) to reach 80.53% comprehension.


Unnamed: 0,Word,Count
520,Forward,28
521,Early,28
522,Circumstances,28
523,Call,28
524,Assured,28


In [5]:
print(f'Percentage of the {words} words: {str((len(eighty)/len(top_unfiltered))*100)[:4]}%')

Percentage of the 525 words: 8.38%


### Now to transfer that to our filtered list...

In [6]:
top = pd.read_csv(filtered, names=["Word", "Count"]).sort_values(["Count", "Word"], ascending=False).reset_index(drop=True)
top.loc[top['Word']=='Assured']
vocab = top[:496]
vocab_hundred = vocab[:100]
#print(vocab_hundred["Word"].unique())

### Comparisons to Modern English words (according to Google's n-gram)

In [7]:
n_gram = pd.read_csv('edited_n_gram.csv', names = ["drop", "Word", "Count"])
n_gram.drop(["drop"], axis=1, inplace=True)
n_gram.drop(index=[0], inplace=True)
n_hundred= n_gram[:100]
x = vocab_hundred.assign(hundred_in=vocab_hundred.Word.isin(n_hundred.Word).astype(int))
h = x[x["hundred_in"] == 0]
h = list(h["Word"].unique())
print(f'Top 100 {title} words NOT in 100 Top Modern English Words: \n\n{h}')

Top 100 Pride and Prejudice words NOT in 100 Top Modern English Words: 

['Her', 'She', 'Had', 'Him', 'Could', 'Very', 'Them', 'Said', 'Such', 'Much', 'Must', 'Sister', 'Did', 'Should', 'Know', 'Herself', 'Before', 'Think', 'Soon', 'Never', 'Though', 'Might', 'Well', 'Little', 'Good', 'Most', 'Own', 'Every', 'Then', 'Being', 'Again', 'Friend', 'Without', 'Make', 'Nothing', 'After', 'Room']


In [8]:
y = vocab.assign(vocab_in=vocab.Word.isin(n_gram.Word).astype(int))
t = y[y["vocab_in"] == 0]
t = list(t["Word"].unique())

print(f'{title} Vocab List Words NOT in Top 1000 Modern Words (names/locations excluded): \n\n {t}')

Pride and Prejudice Vocab List Words NOT in Top 1000 Modern Words (names/locations excluded): 

 ['Sister', 'Herself', 'Soon', 'Nothing', 'Dear', 'Manner', 'Mother', 'Father', 'Daughter', 'Letter', 'Lady', 'Himself', 'Feeling', 'Hope', 'Cannot', 'Saw', 'Marry', 'Felt', 'Wish', 'Quite', 'Myself', 'Attention', 'Pleasure', 'Cry', 'Came', 'Till', 'Heard', 'Aunt', 'Whom', 'Believe', 'Morning', 'Moment', 'Happy', 'Brother', 'Anything', 'Evening', 'Opinion', 'Indeed', 'Therefore', 'Looked', 'Ill', 'Hear', 'Happiness', 'Toward', 'Uncle', 'Told', 'Speak', 'Went', 'Character', 'Seemed', 'Nor', 'Marriage', 'Answer', 'Hour', 'Certainly', 'Leave', 'Kind', 'Gone', 'Conversation', 'Affection', 'Woman', 'Passed', 'Mind', 'Coming', 'Seeing', 'Immediately', 'Cousin', 'Whose', 'Began', 'Knew', 'Almost', 'Perhap', 'Reason', 'Rather', 'Object', 'Behaviour', 'Walk', 'Husband', 'Gave', 'Certain', 'Took', 'Eyes', 'Acquaintance', 'Continued', 'Yourself', 'Idea', 'Civility', 'Suppose', 'Spirit', 'Settled', 'Reg

In [9]:
print(f'Length of vocab list: {len(t)}')

Length of vocab list: 221


In [10]:
print(f'Overall word count of book: {top_unfiltered["Count"].sum()}')

Overall word count of book: 122757


In [11]:
print(f'{str(round(hundred_total * 100, 2))}', f'{str(round((len(one_hundred)/len(top_unfiltered))*100, 2))}', words, f'{str(round(eighty_total * 100, 2))}', f'{len(t)}', f'{top_unfiltered["Count"].sum()}')

59.02 1.6 525 80.54 221 122757


In [5]:
body = ["Mouth", "Lips", "Head", "Face", "Hair", "Eyes", "Eye", "Ear", "Ears", "Butt", "Ass", "Feet", "Foot", "Leg", "Legs", "Hand", "Hands"]

for part in body:
    try:
        x = top_unfiltered.loc[top_unfiltered['Word']==part]
        print(part, x["Count"])
    except:
        continue

Mouth 4901    1
Name: Count, dtype: int64
Lips 1253    9
Name: Count, dtype: int64
Head 464    32
Name: Count, dtype: int64
Face 538    27
Name: Count, dtype: int64
Hair 2738    3
Name: Count, dtype: int64
Eyes 306    52
Name: Count, dtype: int64
Eye 868    15
Name: Count, dtype: int64
Ear 5567    1
Name: Count, dtype: int64
Ears Series([], Name: Count, dtype: int64)
Butt Series([], Name: Count, dtype: int64)
Ass Series([], Name: Count, dtype: int64)
Feet 5429    1
Name: Count, dtype: int64
Foot 5392    1
Name: Count, dtype: int64
Leg Series([], Name: Count, dtype: int64)
Legs Series([], Name: Count, dtype: int64)
Hand 569    25
Name: Count, dtype: int64
Hands 1533    7
Name: Count, dtype: int64


In [None]:
pride = {"Mouth/Lips": 10, "Head": 32, "Face": 27, "Hair": 3, "Eye(s)": 67, "Ear(s)": 1, "Foot/Feet": 2, "Hand(s)": 32 }