# Shakespearean Cannon

In [1]:
import pandas as pd
import numpy as np

title = "Shakespearean Cannon"
unfiltered = "Shakespeare.csv"
filtered = "Shakespeare_filtered.csv"

top_unfiltered = pd.read_csv(unfiltered, names=["Word", "Count"]).sort_values(["Count", "Word"], ascending=False).reset_index(drop=True)
#top_unfiltered.drop(top_unfiltered.index[0], inplace=True)

print(f"The top five words in {title}:")
top_unfiltered.head()

The top five words in Shakespearean Cannon:


Unnamed: 0,Word,Count
0,The,29141
1,And,28028
2,I,23671
3,To,21015
4,Of,18368


### Top hundred ranked words (unfiltered list)


In [15]:
one_hundred = top_unfiltered[:100]
hundred_total = one_hundred["Count"].sum()/ top_unfiltered["Count"].sum()
print(f'{str(hundred_total * 100)[:5]}%')

52.62%


In [16]:
print(f'Percentage overall of the 100 words: {str(((len(one_hundred)/len(top_unfiltered))*100))[:4]}%')

Percentage overall of the 100 words: 0.38%


### How many words for 80% comprehension

In [17]:
eighty = top_unfiltered[:1025]
words = 1025
eighty_total = eighty["Count"].sum()/ top_unfiltered["Count"].sum()
print(f'It takes {words} words (rounded to nearest 25) to reach {str(eighty_total * 100)[:5]}% comprehension.')
eighty.tail()

It takes 1025 words (rounded to nearest 25) to reach 80.20% comprehension.


Unnamed: 0,Word,Count
1020,Virtuous,102
1021,Treason,102
1022,Title,102
1023,Seal,102
1024,Harm,102


In [18]:
print(f'Percentage of the {words} words in book: {str((len(eighty)/len(top_unfiltered))*100)[:4]}%')

Percentage of the 1025 words in book: 3.97%


### Now to transfer that to our filtered list...

In [19]:
top = pd.read_csv(filtered, names=["Word", "Count"]).sort_values(["Count", "Word"], ascending=False).reset_index(drop=True)
print(top.loc[top['Word']=='Title'])
vocab = top[:778]
print(vocab.tail())
vocab_hundred = vocab[:100]

      Word  Count
777  Title    101
       Word  Count
773  Health    102
774    Fine    102
775     Buy    102
776   Birth    102
777   Title    101


### Compare to modern English word rankings (from Google's n-gram)

In [20]:
n_gram = pd.read_csv('edited_n_gram.csv', names = ["drop", "Word", "Count"])
n_gram.drop(["drop"], axis=1, inplace=True)
n_gram.drop(index=[0], inplace=True)
n_hundred= n_gram[:100]
x = vocab_hundred.assign(hundred_in=vocab_hundred.Word.isin(n_hundred.Word).astype(int))
h = x[x["hundred_in"] == 0]
h = list(h["Word"].unique())
h.remove("Ll")
# h.remove("Ve")
print(f'Top 100 {title} words NOT in 100 Top Modern English Words: \n\n{h}')

Top 100 Shakespearean Cannon words NOT in 100 Top Modern English Words: 

['Him', 'Thou', 'Her', 'Thy', 'Thee', 'Shall', 'She', 'O', 'Then', 'Good', 'Lord', 'Love', 'Them', 'Man', 'Know', 'Sir', 'Well', 'Hath', 'Come', 'Say', 'Make', 'Why', 'Let', 'Upon', 'Should', 'Did', 'Must', 'Where', 'Had', 'Such', 'Too', 'Heart', 'Yet', 'Hand', 'Go', 'St']


In [21]:
y = vocab.assign(vocab_in=vocab.Word.isin(n_gram.Word).astype(int))
t = y[y["vocab_in"] == 0]
t = list(t["Word"].unique())
t.remove("Ll")

print(f'{title} Vocab List Words NOT in Top 1000 Modern Words (names/locations excluded): \n\n {t}')

Shakespearean Cannon Vocab List Words NOT in Top 1000 Modern Words (names/locations excluded): 

 ['Thou', 'Thy', 'Thee', 'O', 'Lord', 'Sir', 'Hath', 'Speak', 'Mine', 'Father', 'Th', 'Doth', 'Hear', 'Son', 'Honour', 'Tis', 'Eyes', 'Fair', 'Cannot', 'Heaven', 'Lady', 'Fear', 'Sweet', 'Blood', 'Er', 'Brother', 'Nor', 'Leave', 'Thus', 'Stand', 'Bear', 'Noble', 'Comes', 'Master', 'Soul', 'Pray', 'Poor', 'Nothing', 'Dead', 'Hast', 'Daughter', 'Myself', 'Wife', 'Fool', 'Eye', 'Die', 'Hold', 'Gone', 'Hour', 'Thine', 'Bring', 'Himself', 'Woman', 'Mind', 'Till', 'Mother', 'Arm', 'Tongue', 'Answer', 'Dear', 'Fall', 'Whose', 'Sword', 'Thank', 'Peace', 'Forth', 'Matter', 'Rest', 'Fellow', 'Spirit', 'Ear', 'Tear', 'Unto', 'Letter', 'Hope', 'Therefore', 'Husband', 'Heard', 'Gentle', 'Reason', 'Indeed', 'Follow', 'Whom', 'Wrong', 'Crown', 'Kill', 'Boy', 'Wit', 'Thousand', 'Bed', 'Cause', 'Marry', 'Bid', 'Dost', 'Cry', 'Truth', 'Seem', 'Faith', 'Madam', 'Shame', 'Earth', 'Came', 'Hence', 'Grace', 'Ere

In [22]:
print(f'Length of vocab list: {len(t)}')

Length of vocab list: 434


In [23]:
print(f'Overall word count of book: {top_unfiltered["Count"].sum()}')

Overall word count of book: 971373


In [24]:
print(f'{str(round(hundred_total * 100, 2))}', f'{str(round((len(one_hundred)/len(top_unfiltered))*100, 2))}', words, f'{str(round(eighty_total * 100, 2))}', f'{len(t)}', f'{top_unfiltered["Count"].sum()}')

52.62 0.39 1025 80.21 434 971373


In [2]:
body = ["Mouth", "Lips", "Head", "Face", "Hair", "Eyes", "Eye", "Ear", "Ears", "Butt", "Ass", "Feet", "Foot", "Leg", "Legs", "Hand", "Hands"]

for part in body:
    try:
        x = top_unfiltered.loc[top_unfiltered['Word']==part]
        print(part, x["Count"])
    except:
        continue

Mouth 749    146
Name: Count, dtype: int64
Lips 720    153
Name: Count, dtype: int64
Head 207    560
Name: Count, dtype: int64
Face 254    468
Name: Count, dtype: int64
Hair 855    125
Name: Count, dtype: int64
Eyes 153    773
Name: Count, dtype: int64
Eye 237    494
Name: Count, dtype: int64
Ear 519    215
Name: Count, dtype: int64
Ears 643    173
Name: Count, dtype: int64
Butt 4090    17
Name: Count, dtype: int64
Ass 1099    94
Name: Count, dtype: int64
Feet 1613    59
Name: Count, dtype: int64
Foot 562    197
Name: Count, dtype: int64
Leg 2189    40
Name: Count, dtype: int64
Legs 1360    73
Name: Count, dtype: int64
Hand 136    910
Name: Count, dtype: int64
Hands 328    336
Name: Count, dtype: int64


In [None]:
gatsby = {"Mouth/Lips": 299, "Head": 560, "Face": 468, "Hair": 125, "Eye(s)": 1267, "Ear(s)": 388, "Butt/Ass": 111, "Foot/Feet": 256, "Leg(s)": 113, "Hand(s)": 1246 }