In [602]:
import re
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer

from nltk.corpus import stopwords
from nltk.corpus import wordnet as wn
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import TreebankWordTokenizer, WhitespaceTokenizer

### Data preprocessing

In [513]:
file = open('data/alice.txt', 'r')
text = str(file.read())

In [514]:
text[:1000]

'Alice’s Adventures in Wonderland\n\nby Lewis Carroll\n\nTHE MILLENNIUM FULCRUM EDITION 3.0\n\nContents\n\n CHAPTER I.     Down the Rabbit-Hole\n CHAPTER II.    The Pool of Tears\n CHAPTER III.   A Caucus-Race and a Long Tale\n CHAPTER IV.    The Rabbit Sends in a Little Bill\n CHAPTER V.     Advice from a Caterpillar\n CHAPTER VI.    Pig and Pepper\n CHAPTER VII.   A Mad Tea-Party\n CHAPTER VIII.  The Queen’s Croquet-Ground\n CHAPTER IX.    The Mock Turtle’s Story\n CHAPTER X.     The Lobster Quadrille\n CHAPTER XI.    Who Stole the Tarts?\n CHAPTER XII.   Alice’s Evidence\n\n\n\n\nCHAPTER I.\nDown the Rabbit-Hole\n\n\nAlice was beginning to get very tired of sitting by her sister on the\nbank, and of having nothing to do: once or twice she had peeped into\nthe book her sister was reading, but it had no pictures or\nconversations in it, “and what is the use of a book,” thought Alice\n“without pictures or conversations?”\n\nSo she was considering in her own mind (as well as she could, 

In [515]:
# Converting to lower case
text = text.lower()

In [516]:
# Removing numbers, non-alphabetic characters
for symb in '0123456789()[];,:?!.@#$%^&*"—”“_':
    text = text.replace(symb, '')

In [517]:
# Removing '\n' symbol
text = text.replace('\n', ' ')

In [518]:
# Removing ’s symbols
text = text.replace('’s', '')

In [519]:
text[:1000]

'alice adventures in wonderland  by lewis carroll  the millennium fulcrum edition   contents   chapter i     down the rabbit-hole  chapter ii    the pool of tears  chapter iii   a caucus-race and a long tale  chapter iv    the rabbit sends in a little bill  chapter v     advice from a caterpillar  chapter vi    pig and pepper  chapter vii   a mad tea-party  chapter viii  the queen croquet-ground  chapter ix    the mock turtle story  chapter x     the lobster quadrille  chapter xi    who stole the tarts  chapter xii   alice evidence     chapter i down the rabbit-hole   alice was beginning to get very tired of sitting by her sister on the bank and of having nothing to do once or twice she had peeped into the book her sister was reading but it had no pictures or conversations in it and what is the use of a book thought alice without pictures or conversations  so she was considering in her own mind as well as she could for the hot day made her feel very sleepy and stupid whether the pleasu

In [520]:
words = re.findall(r"\w+", text)

In [521]:
words[:25]

['alice',
 'adventures',
 'in',
 'wonderland',
 'by',
 'lewis',
 'carroll',
 'the',
 'millennium',
 'fulcrum',
 'edition',
 'contents',
 'chapter',
 'i',
 'down',
 'the',
 'rabbit',
 'hole',
 'chapter',
 'ii',
 'the',
 'pool',
 'of',
 'tears',
 'chapter']

In [522]:
# Create lemmatizer
lemmatizer = WordNetLemmatizer()

In [523]:
chapters_nums = ['i', 'ii', 'iii', 'iv', 'v', 'vi', 'vii', 'viii', 'ix', 'x', 'xi', 'xii']

In [524]:
# Removing stop words and lemmatization
proceed_words = []

stop_words = stopwords.words("english")

for word in words:
    if word not in stop_words + chapters_nums:
        proceed_words.append(lemmatizer.lemmatize(word))

In [525]:
# Proceed verbs to infinitive form
for i in range(len(proceed_words)):
    proceed_words[i] = lemmatizer.lemmatize(proceed_words[i], 'v')

In [526]:
# Initial number of words
len(words)

27066

In [527]:
# Number of words after lemmatization and removing stop words
len(proceed_words)

12262

In [529]:
proceed_words[:25]

['alice',
 'adventure',
 'wonderland',
 'lewis',
 'carroll',
 'millennium',
 'fulcrum',
 'edition',
 'content',
 'chapter',
 'rabbit',
 'hole',
 'chapter',
 'pool',
 'tear',
 'chapter',
 'caucus',
 'race',
 'long',
 'tale',
 'chapter',
 'rabbit',
 'send',
 'little',
 'bill']

### Split text into the chapters

In [530]:
# Number of chapters in the book
chapters_num = 12

In [576]:
# Split text to the chapters
chapters = [[] for _ in range(chapters_num)]

i_word = 0
i_chapter = 0

for i in range(len(proceed_words)):
    if proceed_words[i] == 'chapter':
        i_chapter += 1
        
        part = proceed_words[i_word+1:i]
        i_word = i
        if i_chapter >= 13:
            chapters[i_chapter-14] = part
            
chapters[11] = proceed_words[i_word+1:]

In [577]:
# Removing 'alice'
for chapter in chapters:
    while 'alice' in chapter:
        ind = chapter.index('alice')
        chapter.pop(ind)

### Top 10 words from each chapter  (TF-IDF)

In [584]:
# Set a TF-IDF vectorizer object
vectorizer = TfidfVectorizer()

In [585]:
# Join words for every chapter into one text for every chapter
chapters_texts = [' '.join(words) for words in chapters] 

In [586]:
X = vectorizer.fit_transform(chapters_texts).toarray()

In [587]:
vocab = vectorizer.vocabulary_
reverse_vocab = {v:k for k,v in vocab.items()}

In [588]:
idx = X.argsort(axis=1)
tfidf_max10 = idx[:,-10:]

In [589]:
# Create dataframe TF-IDF
feature_names = vectorizer.get_feature_names()
df_tfidf = pd.DataFrame(X, columns = feature_names)
df_tfidf['top10'] = [[reverse_vocab.get(item) for item in row] for row in tfidf_max10]

In [590]:
df_tfidf

Unnamed: 0,abide,able,absence,absurd,acceptance,accident,accidentally,account,accusation,accustom,...,yetit,yetoh,youall,youare,youcome,young,youth,zealand,zigzag,top10
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.030767,0.0,"[key, rabbit, door, get, bat, go, little, say,..."
1,0.0,0.030907,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.030907,0.0,0.0,0.0,0.0,0.0,"[cry, swim, dear, think, cat, little, pool, sa..."
2,0.0,0.0,0.0,0.025644,0.029859,0.0,0.0,0.0,0.0,0.029859,...,0.0,0.0,0.0,0.0,0.029859,0.020332,0.0,0.0,0.0,"[bird, thimble, dry, know, lory, race, prize, ..."
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.025106,0.025106,0.025106,0.0,0.0,0.0,0.0,0.0,0.0,"[get, go, fan, grow, say, puppy, rabbit, windo..."
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.015832,0.139503,0.0,0.02325,"[little, father, think, size, egg, youth, serp..."
5,0.021287,0.0,0.0,0.018282,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"[wow, sneeze, grin, duchess, go, mad, baby, ca..."
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.01355,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.012162,0.0,0.0,0.0,"[draw, time, go, twinkle, tea, march, hare, do..."
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.016091,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"[five, cat, soldier, look, gardener, go, king,..."
8,0.0,0.0,0.019678,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.026798,0.0,0.0,0.0,"[school, think, go, queen, moral, duchess, gry..."
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.013343,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"[join, soooop, soup, beautiful, lobster, dance..."


In [597]:
# Get top-10 words for every chapter
for i in range(chapters_num):
    print(f'Top-10 words in chapter # {i+1}: {", ".join(df_tfidf["top10"][i])}')

Top-10 words in chapter # 1: key, rabbit, door, get, bat, go, little, say, eat, think
Top-10 words in chapter # 2: cry, swim, dear, think, cat, little, pool, say, go, mouse
Top-10 words in chapter # 3: bird, thimble, dry, know, lory, race, prize, dodo, mouse, say
Top-10 words in chapter # 4: get, go, fan, grow, say, puppy, rabbit, window, little, bill
Top-10 words in chapter # 5: little, father, think, size, egg, youth, serpent, pigeon, caterpillar, say
Top-10 words in chapter # 6: wow, sneeze, grin, duchess, go, mad, baby, cat, footman, say
Top-10 words in chapter # 7: draw, time, go, twinkle, tea, march, hare, dormouse, say, hatter
Top-10 words in chapter # 8: five, cat, soldier, look, gardener, go, king, hedgehog, say, queen
Top-10 words in chapter # 9: school, think, go, queen, moral, duchess, gryphon, mock, turtle, say
Top-10 words in chapter # 10: join, soooop, soup, beautiful, lobster, dance, say, gryphon, mock, turtle
Top-10 words in chapter # 11: juror, officer, queen, jury, w

### Top 10 most used verbs in sentences with Alice

To find top 10 most used verbs the idea is following: 

1. Find all verbs after 'alice'
2. Filter verbs from these words
3. Find most used words from filtered verbs

In [618]:
# Removing stop words and lemmatization
proceed_words = []

stop_words = stopwords.words("english")

for word in words:
    if word not in stop_words + chapters_nums:
        proceed_words.append(lemmatizer.lemmatize(word, 'v'))

In [697]:
items_after_alice = []
items_verbs = []

for i in range(len(tagged_tokens)):
    if tagged_tokens[i][0] == 'alice':
        if not tagged_tokens[i+1][1] in ('NN', 'NNS', 'PRP', 'JJ', 'WP'):
            items_after_alice.append([tagged_tokens[i+1], tagged_tokens[i+2]])
            if tagged_tokens[i+1][1] in ('VBD', 'VBP', 'VB'):
                if tagged_tokens[i+2][1] not in ('VBD', 'VBP', 'VBN', 'VBG'):
                    items_verbs.append([tagged_tokens[i+1]])
                else:
                    items_verbs.append([tagged_tokens[i+1], tagged_tokens[i+2]])
            elif tagged_tokens[i+2][1] in ('VBD', 'VBP', 'VBN', 'VBG'):
                items_verbs.append([tagged_tokens[i+2]])

In [698]:
len(items_after_alice)

324

In [713]:
items_verbs[:25]

[[('was', 'VBD'), ('beginning', 'VBG')],
 [('think', 'VB')],
 [('started', 'VBD')],
 [('had', 'VBD')],
 [('had', 'VBD')],
 [('had', 'VBD')],
 [('began', 'VBD')],
 [('began', 'VBD')],
 [('was', 'VBD')],
 [('had', 'VBD'), ('been', 'VBN')],
 [('thought', 'VBD')],
 [('opened', 'VBD')],
 [('had', 'VBD'), ('begun', 'VBN')],
 [('was', 'VBD')],
 [('had', 'VBD'), ('got', 'VBN')],
 [('felt', 'VBD')],
 [('took', 'VBD')],
 [('had', 'VBD'), ('been', 'VBN')],
 [('thought', 'VBD')],
 [('had', 'VBD')],
 [('afraid', 'VBP')],
 [('went', 'VBD')],
 [('went', 'VBD')],
 [('thought', 'VBD')],
 [('led', 'VBD')]]

In [709]:
# Reformat data into phrases with verbs
phrases = []
for item_pair in items_verbs:
    if len(item_pair) == 2:
        phrases.append([item_pair[0][0], item_pair[1][0]])
    else:
        phrases.append([item_pair[0][0]])

In [714]:
phrases[:25]

[['was', 'beginning'],
 ['think'],
 ['started'],
 ['had'],
 ['had'],
 ['had'],
 ['began'],
 ['began'],
 ['was'],
 ['had', 'been'],
 ['thought'],
 ['opened'],
 ['had', 'begun'],
 ['was'],
 ['had', 'got'],
 ['felt'],
 ['took'],
 ['had', 'been'],
 ['thought'],
 ['had'],
 ['afraid'],
 ['went'],
 ['went'],
 ['thought'],
 ['led']]

In [715]:
# Remove useless words and 
proceed_phrases = []

for phrase in phrases:
    new_phrase = []
    for word in phrase:
        if word not in stop_words:
            new_phrase.append(word)
    if new_phrase:
        proceed_phrases.append(new_phrase)

In [717]:
proceed_phrases[:25]

[['beginning'],
 ['think'],
 ['started'],
 ['began'],
 ['began'],
 ['thought'],
 ['opened'],
 ['begun'],
 ['got'],
 ['felt'],
 ['took'],
 ['thought'],
 ['afraid'],
 ['went'],
 ['went'],
 ['thought'],
 ['led'],
 ['kept'],
 ['thought'],
 ['sighing'],
 ['replied'],
 ['began'],
 ['said'],
 ['went'],
 ['knew']]

In [721]:
# Get words into infinitive form
verbs = []

for i in range(len(proceed_phrases)):
    word = proceed_phrases[i][0]
    verbs.append(lemmatizer.lemmatize(word, 'v'))

In [723]:
verbs[:25]

['begin',
 'think',
 'start',
 'begin',
 'begin',
 'think',
 'open',
 'begin',
 'get',
 'felt',
 'take',
 'think',
 'afraid',
 'go',
 'go',
 'think',
 'lead',
 'keep',
 'think',
 'sigh',
 'reply',
 'begin',
 'say',
 'go',
 'know']

In [728]:
len(verbs)

114

In [724]:
frequency = nltk.FreqDist(verbs)

In [727]:
frequency

FreqDist({'say': 13, 'think': 12, 'reply': 11, 'begin': 10, 'look': 10, 'felt': 5, 'go': 5, 'hear': 3, 'wait': 3, 'get': 2, ...})

Top-10 verbs with Alice are: say, think, reply, begin, look, felt, go, hear, wait, get.