# *Alice's Adventures in Wonderland* by Lewis Carroll

In [1]:
import pandas as pd
import numpy as np

title = "Alice's Adventures in Wonderland"
unfiltered = "Alice.csv"
filtered = "Alice_filtered.csv"

top_unfiltered = pd.read_csv(unfiltered, names=["Word", "Count"]).sort_values(["Count", "Word"], ascending=False).reset_index(drop=True)
print(f"The top five words in {title}:")
top_unfiltered.head()

The top five words in Alice's Adventures in Wonderland:


Unnamed: 0,Word,Count
0,The,1630
1,And,869
2,To,724
3,A,624
4,It,595


### Top hundred ranked words (unfiltered list)

In [4]:
one_hundred = top_unfiltered[:100]
hundred_total = one_hundred["Count"].sum()/ top_unfiltered["Count"].sum()
print(f'{str(hundred_total * 100)[:5]}%')

61.03%


In [5]:
print(f'Percentage overall of the 100 words: {str(((len(one_hundred)/len(top_unfiltered))*100))[:4]}%')

Percentage overall of the 100 words: 3.92%


### Words needed for ~80% comprehension

In [6]:
eighty = top_unfiltered[:350]
words = 350
eighty_total = eighty["Count"].sum()/ top_unfiltered["Count"].sum()
print(f'It takes {words} words (rounded to nearest 25) to reach {str(eighty_total * 100)[:5]}% comprehension.')
eighty.tail()

It takes 350 words (rounded to nearest 25) to reach 80.28% comprehension.


Unnamed: 0,Word,Count
345,Word,10
346,Witness,10
347,Walked,10
348,Those,10
349,Tears,10


In [7]:
print(f'Percentage of the {words} words in book: {str((len(eighty)/len(top_unfiltered))*100)[:4]}%')

Percentage of the 350 words in book: 13.7%


### Now to transfer that to our filtered list...

In [8]:
top = pd.read_csv(filtered, names=["Word", "Count"]).sort_values(["Count", "Word"], ascending=False).reset_index(drop=True)
#print(top.loc[top['Word']=='Walked'])
vocab = top[:316]
print(vocab.tail())
vocab_hundred = vocab[:100]

         Word  Count
311      Bird     11
312       Ask     11
313  Yourself     10
314   Witness     10
315    Walked     10


### Comparisons to Modern English words (according to Google's n-gram)

In [9]:
n_gram = pd.read_csv('edited_n_gram.csv', names = ["drop", "Word", "Count"])
n_gram.drop(["drop"], axis=1, inplace=True)
n_gram.drop(index=[0], inplace=True)
n_hundred= n_gram[:100]
x = vocab_hundred.assign(hundred_in=vocab_hundred.Word.isin(n_hundred.Word).astype(int))
h = x[x["hundred_in"] == 0]
h = list(h["Word"].unique())
h.remove("Ll")
h.remove("Ve")
print(f'Top 100 {title} words NOT in 100 Top Modern English Words: \n\n{h}')

Top 100 Alice's Adventures in Wonderland words NOT in 100 Top Modern English Words: 

['She', 'Said', 'Her', 'Had', 'Very', 'Little', 'Down', 'Then', 'Know', 'Them', 'Could', 'Went', 'Herself', 'Again', 'Thing', 'Thought', 'Did', 'Into', 'Off', 'Head', 'Began', 'Way', 'Quite', 'Think', 'Don', 'Say', 'Much', 'Voice', 'Go', 'Looked', 'Just', 'Got', 'Never', 'Must', 'Him', 'Round', 'Why']


In [10]:
y = vocab.assign(vocab_in=vocab.Word.isin(n_gram.Word).astype(int))
t = y[y["vocab_in"] == 0]
t = list(t["Word"].unique())
t.remove("Ll")
t.remove("Ve")
print(f'{title} Vocab List Words NOT in Top 1000 Modern Words (names/locations excluded): \n\n {t}')

Alice's Adventures in Wonderland Vocab List Words NOT in Top 1000 Modern Words (names/locations excluded): 

 ['Went', 'Herself', 'Began', 'Quite', 'Don', 'Voice', 'Looked', 'Round', 'Tone', 'Came', 'Re', 'Door', 'Moment', 'Heard', 'Dear', 'Nothing', 'Eyes', 'Seemed', 'Won', 'Rather', 'Poor', 'Took', 'Soon', 'Cry', 'Cat', 'Felt', 'Hurry', 'Wish', 'Till', 'Minute', 'Arm', 'Jury', 'Feet', 'Eat', 'Curious', 'Anything', 'Wonder', 'Tea', 'Spoke', 'Sat', 'Asked', 'Turned', 'Talking', 'Ran', 'Hastily', 'Doesn', 'Begin', 'Saying', 'Pig', 'Knew', 'Indeed', 'Idea', 'Gave', 'Trying', 'Sea', 'Saw', 'Perhap', 'Ought', 'Mouth', 'Mad', 'Itself', 'Hear', 'Beginning', 'Anxiously', 'Speak', 'Seem', 'Remark', 'Lesson', 'Kept', 'Happen', 'Grow', 'Gone', 'Dance', 'Cook', 'Certainly', 'Behind', 'Turning', 'Tail', 'Suppose', 'Suddenly', 'Queer', 'Mouse', 'Finished', 'Deal', 'Afraid', 'Wasn', 'Waited', 'Soldier', 'Sister', 'Silence', 'Matter', 'Hardly', 'Growing', 'Gloves', 'Glad', 'Ear', 'Conversation', 'Bir

In [11]:
print(f'Length of vocab list: {len(t)}')

Length of vocab list: 101


In [12]:
print(f'Overall word count of book: {top_unfiltered["Count"].sum()}')

Overall word count of book: 27675


In [26]:
print(f'{str(round(hundred_total * 100, 2))}', f'{str(round((len(one_hundred)/len(top_unfiltered))*100, 2))}', words, f'{str(round(eighty_total * 100, 2))}', f'{len(t)}', f'{top_unfiltered["Count"].sum()}')

61.04 3.92 350 80.29 101 27675


In [31]:
body = ["Mouth", "Lips", "Head", "Face", "Hair", "Eyes", "Eye", "Ear", "Ears", "Butt", "Ass", "Feet", "Foot", "Leg", "Legs", "Hand", "Hands"]

for part in body:
    try:
        x = top_unfiltered.loc[top_unfiltered['Word']==part]
        print(part, x["Count"])
    except:
        continue


Mouth 364    10
Name: Count, dtype: int64
Lips 1981    1
Name: Count, dtype: int64
Head 98    50
Name: Count, dtype: int64
Face 248    15
Name: Count, dtype: int64
Hair 509    7
Name: Count, dtype: int64
Eyes 157    29
Name: Count, dtype: int64
Eye 520    7
Name: Count, dtype: int64
Ear 584    6
Name: Count, dtype: int64
Ears 667    5
Name: Count, dtype: int64
Butt Series([], Name: Count, dtype: int64)
Ass Series([], Name: Count, dtype: int64)
Feet 206    19
Name: Count, dtype: int64
Foot 403    9
Name: Count, dtype: int64
Leg Series([], Name: Count, dtype: int64)
Legs 942    3
Name: Count, dtype: int64
Hand 197    21
Name: Count, dtype: int64
Hands 314    12
Name: Count, dtype: int64


In [None]:
alice = {"Mouth/Lips": 11, "Head": 50, "Face": 15, "Hair": 7, "Eye(s)": 36, "Ear(s)": 11, "Foot/Feet": 28, "Leg(s)": 3, "Hand(s)": 33 }