# *A Tale of Two Cities* by Charles Dickens

In [1]:
import pandas as pd
import numpy as np

title = "A Tale of Two Cities"
unfiltered = "Two_cities.csv"
filtered = "Two_cities_filtered.csv"

top_unfiltered = pd.read_csv(unfiltered, names=["Word", "Count"]).sort_values(["Count", "Word"], ascending=False).reset_index(drop=True)
print(f"The top five words in {title}:")
top_unfiltered.head()

The top five words in A Tale of Two Cities:


Unnamed: 0,Word,Count
0,The,7996
1,And,4999
2,Of,4002
3,To,3567
4,A,2937


### Top hundred ranked words (unfiltered list)

In [2]:
one_hundred = top_unfiltered[:100]
hundred_total = one_hundred["Count"].sum()/ top_unfiltered["Count"].sum()
print(f'{str(hundred_total * 100)[:5]}%')

55.63%


In [3]:
print(f'Percentage overall of the 100 words: {str(((len(one_hundred)/len(top_unfiltered))*100))[:4]}%')

Percentage overall of the 100 words: 1.03%


### Words needed for ~80% comprehension

In [4]:
eighty = top_unfiltered[:750]
words = 750
eighty_total = eighty["Count"].sum()/ top_unfiltered["Count"].sum()
print(f'It takes {words} words (rounded to nearest 25) to reach {str(eighty_total * 100)[:5]}% comprehension.')
eighty.tail()

It takes 750 words (rounded to nearest 25) to reach 80.24% comprehension.


Unnamed: 0,Word,Count
745,Surely,18
746,Suppose,18
747,Stand,18
748,Soul,18
749,Son,18


In [5]:
print(f'Percentage of the {words} words in book: {str((len(eighty)/len(top_unfiltered))*100)[:4]}%')

Percentage of the 750 words in book: 7.74%


### Now to transfer that to our filtered list...

In [6]:
top = pd.read_csv(filtered, names=["Word", "Count"]).sort_values(["Count", "Word"], ascending=False).reset_index(drop=True)
#print(top.loc[top['Word']=='Suppose'])
vocab = top[:690]
#print(vocab.tail())
vocab_hundred = vocab[:100]

### Comparisons to Modern English words (according to Google's n-gram)

In [7]:
n_gram = pd.read_csv('edited_n_gram.csv', names = ["drop", "Word", "Count"])
n_gram.drop(["drop"], axis=1, inplace=True)
n_gram.drop(index=[0], inplace=True)
n_hundred= n_gram[:100]
x = vocab_hundred.assign(hundred_in=vocab_hundred.Word.isin(n_hundred.Word).astype(int))
h = x[x["hundred_in"] == 0]
h = list(h["Word"].unique())
print(f'Top 100 {title} words NOT in 100 Top Modern English Words: \n\n{h}')

Top 100 A Tale of Two Cities words NOT in 100 Top Modern English Words: 

['Had', 'Her', 'Him', 'Said', 'She', 'Them', 'Hand', 'Into', 'Man', 'Could', 'Upon', 'Little', 'Then', 'Know', 'Down', 'Night', 'Himself', 'Before', 'Again', 'Head', 'Very', 'Day', 'Looked', 'Two', 'Father', 'Face', 'Way', 'Made', 'Long', 'Much', 'Over', 'Never', 'Good', 'Prisoner', 'Old', 'Eyes', 'Door']


In [8]:
y = vocab.assign(vocab_in=vocab.Word.isin(n_gram.Word).astype(int))
t = y[y["vocab_in"] == 0]
t = list(t["Word"].unique())

print(f'{title} Vocab List Words NOT in Top 1000 Modern Words (names/locations excluded): \n\n {t}')

A Tale of Two Cities Vocab List Words NOT in Top 1000 Modern Words (names/locations excluded): 

 ['Himself', 'Looked', 'Father', 'Prisoner', 'Eyes', 'Door', 'Came', 'Nothing', 'Mind', 'Wine', 'Turned', 'Dear', 'Arm', 'Went', 'Took', 'Don', 'Sat', 'Wife', 'Husband', 'Sir', 'Manner', 'Hour', 'Stood', 'Moment', 'Prison', 'Dark', 'Asked', 'Saw', 'Poor', 'Cry', 'Knew', 'Voice', 'Hope', 'Brought', 'Returned', 'Woman', 'Gentleman', 'Daughter', 'Gone', 'Whom', 'Passed', 'Lay', 'Wall', 'Brother', 'Strong', 'Round', 'Madame', 'Hair', 'Corner', 'Quite', 'Heard', 'Stopped', 'Spy', 'Lady', 'Seemed', 'Myself', 'Dead', 'Toward', 'Carry', 'Mother', 'Breast', 'Speak', 'Morning', 'Held', 'Anything', 'Faces', 'Coach', 'Behind', 'Finger', 'Bed', 'Answered', 'Answer', 'Laid', 'Alone', 'Village', 'Struck', 'Stone', 'Carriage', 'Soon', 'Hold', 'Hear', 'Fellow', 'Shadow', 'Opened', 'Itself', 'Coming', 'Usual', 'Mender', 'Began', 'Lip', 'Monde', 'Ll', 'Evrã', 'Doubt', 'Whose', 'Told', 'Passenger', 'Letter', '

In [9]:
print(f'Length of vocab list: {len(t)}')

Length of vocab list: 350


In [10]:
print(f'Overall word count of book: {top_unfiltered["Count"].sum()}')

Overall word count of book: 138015


In [11]:
print(f'{str(round(hundred_total * 100, 2))}', f'{str(round((len(one_hundred)/len(top_unfiltered))*100, 2))}', words, f'{str(round(eighty_total * 100, 2))}', f'{len(t)}', f'{top_unfiltered["Count"].sum()}')

55.64 1.03 750 80.24 350 138015


In [2]:
body = ["Mouth", "Lips", "Head", "Face", "Hair", "Eyes", "Eye", "Ear", "Ears", "Butt", "Ass", "Feet", "Foot", "Leg", "Legs", "Hand", "Hands"]

for part in body:
    try:
        x = top_unfiltered.loc[top_unfiltered['Word']==part]
        print(part, x["Count"])
    except:
        continue

Mouth 663    21
Name: Count, dtype: int64
Lips 353    46
Name: Count, dtype: int64
Head 107    177
Name: Count, dtype: int64
Face 98    187
Name: Count, dtype: int64
Hair 266    63
Name: Count, dtype: int64
Eyes 110    165
Name: Count, dtype: int64
Eye 456    34
Name: Count, dtype: int64
Ear 1112    12
Name: Count, dtype: int64
Ears 622    23
Name: Count, dtype: int64
Butt 5294    2
Name: Count, dtype: int64
Ass Series([], Name: Count, dtype: int64)
Feet 370    44
Name: Count, dtype: int64
Foot 599    24
Name: Count, dtype: int64
Leg 3504    3
Name: Count, dtype: int64
Legs 1280    10
Name: Count, dtype: int64
Hand 70    249
Name: Count, dtype: int64
Hands 165    113
Name: Count, dtype: int64


In [None]:
two_cities = {"Mouth/Lips": 67, "Head": 177, "Face": 187, "Hair": 63, "Eye(s)": 199, "Ear(s)": 35, "Butt/Ass": 2, "Foot/Feet": 68, "Leg(s)": 13, "Hand(s)": 362 }