# *The Great Gatsby* by F. Scott Fitzgerald

In [1]:
import pandas as pd
import numpy as np

# Book title plus the two word-count CSVs used throughout the notebook.
title = "The Great Gatsby"
unfiltered = "Gatsby.csv"
filtered = "Gatsby_filtered.csv"

# Rank every word by count (ties broken by word, both descending) and
# renumber the rows so the index doubles as the rank.
ranked_unfiltered = (
    pd.read_csv(unfiltered, names=["Word", "Count"])
    .sort_values(["Count", "Word"], ascending=False)
    .reset_index(drop=True)
)
# The top-ranked row is discarded, so the remaining ranks start at 1.
# NOTE(review): presumably that row is a header/non-word entry -- confirm against Gatsby.csv.
top_unfiltered = ranked_unfiltered.drop(index=ranked_unfiltered.index[0])

print(f"The top five words in {title}:")
top_unfiltered.head()

The top five words in The Great Gatsby:


Unnamed: 0,Word,Count
1,The,2400
2,And,1566
3,A,1411
4,I,1193
5,To,1130


### Top hundred ranked words (unfiltered list)

In [2]:
# Share of all word occurrences in the book covered by the 100 most frequent words.
one_hundred = top_unfiltered[:100]
hundred_total = one_hundred["Count"].sum() / top_unfiltered["Count"].sum()
# Format with a real rounding spec: slicing str(value)[:5] truncated the number
# (53.86) and disagreed with the rounded figure in the summary cell (53.87).
print(f'{hundred_total * 100:.2f}%')

53.86%


In [3]:
# Share of the book's distinct words that the top-100 list represents.
# Uses a fixed two-decimal format instead of slicing the string repr, whose
# precision depended on how many digits precede the decimal point.
hundred_vocab_share = len(one_hundred) / len(top_unfiltered) * 100
print(f'Percentage overall of the 100 words: {hundred_vocab_share:.2f}%')

Percentage overall of the 100 words: 1.67%


### How many words for 80% comprehension

In [4]:
# Number of top-ranked words needed to cover ~80% of all word occurrences.
# Defined once and reused for the slice so the two can never drift apart.
words = 725
eighty = top_unfiltered[:words]
eighty_total = eighty["Count"].sum() / top_unfiltered["Count"].sum()
# Two-decimal rounding replaces the earlier str(...)[:5] truncation.
print(f'It takes {words} words (rounded to nearest 25) to reach {eighty_total * 100:.2f}% comprehension.')
eighty.tail()

It takes 725 words (rounded to nearest 25) to reach 80.19% comprehension.


Unnamed: 0,Word,Count
721,Music,8
722,Meet,8
723,Lady,8
724,June,8
725,Invited,8


In [5]:
# Share of the book's distinct words that the 80%-coverage list represents.
# A fixed two-decimal format is used; slicing the string repr to four
# characters left a two-digit percentage with only one decimal ("12.1").
eighty_vocab_share = len(eighty) / len(top_unfiltered) * 100
print(f'Percentage of the {words} words in book: {eighty_vocab_share:.2f}%')

Percentage of the 725 words in book: 12.1%


### Now to transfer that to our filtered list...

In [6]:
# Rank the filtered word list the same way as the unfiltered one.
filtered_counts = pd.read_csv(filtered, names=["Word", "Count"])
top = filtered_counts.sort_values(["Count", "Word"], ascending=False).reset_index(drop=True)

# Locate 'Invited' (the last word of the unfiltered 80% list) in the filtered ranking.
invited_row = top.loc[top['Word'] == 'Invited']
print(invited_row)

# Keep the first 660 rows, i.e. everything up to and including row 659 ('Invited').
vocab = top.iloc[:660]
print(vocab.tail())
vocab_hundred = vocab.iloc[:100]

        Word  Count
659  Invited      8
          Word  Count
655       Noon      8
656  Neighbour      8
657      Music      8
658       Meet      8
659    Invited      8


### Comparisons to Modern English words (according to Google's n-gram)

In [7]:
# Load the modern-English reference list without mutating it in place:
# discard the unnamed leading column and the row labelled 0.
# NOTE(review): row 0 is presumably the CSV's own header line -- confirm.
n_gram = (
    pd.read_csv('edited_n_gram.csv', names=["drop", "Word", "Count"])
    .drop(columns=["drop"])
    .drop(index=[0])
)
n_hundred = n_gram[:100]

# Words from the book's top 100 that do not appear in the reference top 100.
missing_from_hundred = ~vocab_hundred["Word"].isin(n_hundred["Word"])
# Tokens left out of the printed list (presumably contraction fragments such as
# 'll / 've -- TODO confirm). Filtering with a set does not raise when a token
# is absent, unlike list.remove().
excluded_tokens = {"Ll", "Ve"}
h = [
    word
    for word in vocab_hundred.loc[missing_from_hundred, "Word"].unique()
    if word not in excluded_tokens
]
print(f'Top 100 {title} words NOT in 100 Top Modern English Words: \n\n{h}')

Top 100 The Great Gatsby words NOT in 100 Top Modern English Words: 

['She', 'Had', 'Her', 'Him', 'Said', 'Into', 'Then', 'Over', 'Down', 'Man', 'Came', 'Know', 'Little', 'Hand', 'House', 'Went', 'Just', 'Eyes', 'Car', 'Before', 'Looked', 'Got', 'Didn', 'Them', 'Old', 'Room', 'Night', 'Don', 'Way', 'Away', 'Moment', 'Door', 'Around', 'Think', 'Two', 'Girl', 'Want', 'Going', 'After']


In [8]:
# Words from the book's vocab list that do not appear anywhere in the reference list.
missing_from_n_gram = ~vocab["Word"].isin(n_gram["Word"])
# Tokens left out of the printed list (presumably contraction fragments such as
# 'll / 've / 're -- TODO confirm). Filtering with a set does not raise when a
# token is absent, unlike list.remove().
excluded_tokens = {"Ll", "Ve", "Re"}
t = [
    word
    for word in vocab.loc[missing_from_n_gram, "Word"].unique()
    if word not in excluded_tokens
]
print(f'{title} Vocab List Words NOT in Top 1000 Modern Words (names/locations excluded): \n\n {t}')

The Great Gatsby Vocab List Words NOT in Top 1000 Modern Words (names/locations excluded): 

 ['Came', 'Went', 'Eyes', 'Looked', 'Didn', 'Don', 'Moment', 'Door', 'Turned', 'Voice', 'Knew', 'Saw', 'Toward', 'Began', 'Sport', 'Took', 'Hour', 'Told', 'Heard', 'Afternoon', 'Asked', 'Seemed', 'Cry', 'Wasn', 'Minute', 'Nothing', 'Alone', 'Suddenly', 'Sat', 'Clock', 'Morning', 'Himself', 'Stood', 'Garage', 'Couldn', 'Arm', 'Wanted', 'Lawn', 'O', 'Moved', 'Inquired', 'Gave', 'Marry', 'Dark', 'Across', 'Rather', 'Felt', 'Demanded', 'Anything', 'Wife', 'Walked', 'Matter', 'Hear', 'Gone', 'Feet', 'Dog', 'Broke', 'Hair', 'Wouldn', 'Woman', 'Rose', 'Hadn', 'Chair', 'Beside', 'Stopped', 'Met', 'Loved', 'Butler', 'Trying', 'Train', 'Telephone', 'Summer', 'Idea', 'Evening', 'Dress', 'Shoulder', 'Passed', 'Ought', 'Itself', 'Inside', 'Hundred', 'Happened', 'Grey', 'Dozen', 'Brought', 'Behind', 'Talked', 'Suit', 'Slowly', 'Shook', 'Rain', 'Opened', 'Myself', 'Fell', 'Except', 'Coat', 'Answered', 'Almost

In [9]:
# Number of vocab-list words that were not found in the reference list.
vocab_gap_count = len(t)
print(f'Length of vocab list: {vocab_gap_count}')

Length of vocab list: 335


In [10]:
# Total number of word occurrences across the unfiltered list.
total_word_count = top_unfiltered["Count"].sum()
print(f'Overall word count of book: {total_word_count}')

Overall word count of book: 49921


In [11]:
# Space-separated summary of the figures computed in the cells above:
# top-100 coverage %, top-100 share of vocabulary %, word cutoff,
# cutoff coverage %, unmatched vocab words, total word count.
summary_row = (
    round(hundred_total * 100, 2),
    round((len(one_hundred) / len(top_unfiltered)) * 100, 2),
    words,
    round(eighty_total * 100, 2),
    len(t),
    top_unfiltered["Count"].sum(),
)
print(*summary_row)

53.87 1.67 725 80.19 335 49921


In [2]:
# Body-part words whose frequencies are looked up in the unfiltered ranking.
body = ["Mouth", "Lips", "Head", "Face", "Hair", "Eyes", "Eye", "Ear", "Ears", "Butt", "Ass", "Feet", "Foot", "Leg", "Legs", "Hand", "Hands"]

# Print the matching Count entries for each word. A word that is not in the
# book simply yields an empty Series, so no exception handling is needed; the
# former bare `except: continue` only served to hide genuine errors (such as
# running this cell before `top_unfiltered` has been defined).
for part in body:
    part_rows = top_unfiltered.loc[top_unfiltered['Word'] == part]
    print(part, part_rows["Count"])

Mouth 448    14
Name: Count, dtype: int64
Lips 597    10
Name: Count, dtype: int64
Head 177    40
Name: Count, dtype: int64
Face 135    55
Name: Count, dtype: int64
Hair 292    22
Name: Count, dtype: int64
Eyes 76    88
Name: Count, dtype: int64
Eye 1702    3
Name: Count, dtype: int64
Ear 675    9
Name: Count, dtype: int64
Ears 5057    1
Name: Count, dtype: int64
Butt Series([], Name: Count, dtype: int64)
Ass Series([], Name: Count, dtype: int64)
Feet 284    23
Name: Count, dtype: int64
Foot 735    8
Name: Count, dtype: int64
Leg 2282    2
Name: Count, dtype: int64
Legs 2280    2
Name: Count, dtype: int64
Hand 122    63
Name: Count, dtype: int64
Hands 188    38
Name: Count, dtype: int64


In [None]:
# Body-part word counts for the book, combined from the per-word lookups
# printed by the previous cell (singular and plural / related forms summed).
gatsby = {
    "Mouth/Lips": 24,   # Mouth 14 + Lips 10
    "Head": 40,
    "Face": 55,
    "Hair": 22,
    "Eye(s)": 91,       # Eyes 88 + Eye 3
    "Ear(s)": 10,       # Ear 9 + Ears 1
    "Foot/Feet": 31,    # Feet 23 + Foot 8
    "Leg(s)": 4,        # Leg 2 + Legs 2
    "Hand(s)": 101,     # Hand 63 + Hands 38
}