In [1]:
import re
from collections import Counter
from pprint import pprint

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import TreebankWordTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer
from tqdm.auto import tqdm

In [2]:
nltk.download('stopwords')
nltk.download("punkt")
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/alexander.semiletov/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/alexander.semiletov/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/alexander.semiletov/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

1.	Download Alice in Wonderland by Lewis Carroll from Project Gutenberg's website http://www.gutenberg.org/files/11/11-0.txt

In [3]:
with open('11-0.txt', 'r', encoding='utf-8') as file:
    text = file.readlines()
text = text[54:-355]
print(f"Lines in text - {len(text)}")

Lines in text - 3352


2.	Perform any necessary preprocessing on the text, including converting to lower case, removing stop words, numbers / non-alphabetic characters, lemmatization.

In [4]:
text = " ".join([line.strip() for line in text])
text = re.sub("[^A-Za-z\s]", "", text)
text = re.sub("\s+", " ", text)
print(f"Characters in text - {len(text)}")

Characters in text - 134048


In [5]:
chapters = [chapter.strip() for chapter in text.split("CHAPTER") if chapter.strip() not in [""]]
print(f"Chapters in text - {len(chapters)}")

Chapters in text - 12


In [6]:
chapters = [chapter.lower() for chapter in chapters]
lemmatizer = WordNetLemmatizer()
chapters = [[lemmatizer.lemmatize(token) for token in TreebankWordTokenizer().tokenize(chapter)] for chapter in
            chapters]
stop_words = stopwords.words("english")
chapters = [[word for word in chapter if not word in stop_words + ["wa"]] for chapter in chapters]

In [7]:
words_count = {i + 1: len(chapter) for i, chapter in enumerate(chapters)}
print("Words in chapters: ")
pprint(words_count)

Words in chapters: 
{1: 976,
 2: 1000,
 3: 824,
 4: 1224,
 5: 1037,
 6: 1218,
 7: 1116,
 8: 1175,
 9: 1116,
 10: 1014,
 11: 899,
 12: 996}


3.	Find Top 10 most important (for example, in terms of TF-IDF metric) words from each chapter in the text (not "Alice"); how would you name each chapter according to the identified tokens?

In [8]:
chapters = [" ".join(chapter) for chapter in chapters]

In [9]:
tfidf_model = TfidfVectorizer()
X_tfidf = tfidf_model.fit_transform(chapters)

In [10]:
id_to_word = dict(zip(tfidf_model.vocabulary_.values(), tfidf_model.vocabulary_.keys()))

In [11]:
for i in range(len(chapters)):
    print(f"Chapter {i + 1}")
    top_words = sorted(
        [(id_to_word[id_], tfidf) for id_, tfidf in zip(X_tfidf.toarray()[i].argsort(), sorted(X_tfidf.toarray()[i]))],
        key=lambda x: x[1], reverse=True)
    pprint(top_words[:10])

Chapter 1
[('alice', 0.29615517945409664),
 ('little', 0.1645306552522759),
 ('bat', 0.162315221600624),
 ('door', 0.14664694458776423),
 ('key', 0.14338221364669432),
 ('eat', 0.13614724395895683),
 ('think', 0.12065581385166899),
 ('like', 0.12065581385166899),
 ('way', 0.12065581385166899),
 ('either', 0.11669763767910585)]
Chapter 2
[('mouse', 0.2945290373216973),
 ('alice', 0.24915185841865883),
 ('pool', 0.18093864706387253),
 ('little', 0.17648256637988333),
 ('im', 0.1575024176206621),
 ('swam', 0.1490655936220378),
 ('cat', 0.14726451866084864),
 ('dear', 0.14415518447613193),
 ('said', 0.12457592920932942),
 ('foot', 0.12115570586204778)]
Chapter 3
[('mouse', 0.38649334391012735),
 ('said', 0.35289572334457703),
 ('dodo', 0.3071858131423766),
 ('alice', 0.23872357755662565),
 ('prize', 0.17884353644474438),
 ('lory', 0.1535929065711883),
 ('dry', 0.1356772995621639),
 ('thimble', 0.11922902429649625),
 ('know', 0.1141721457879514),
 ('bird', 0.11042666968860781)]
Chapter 4
[(

4.	Find the Top 10 most used verbs in sentences with Alice. What does Alice do most often?

In [12]:
with open('11-0.txt', 'r', encoding='utf-8') as file:
    text = file.readlines()
text = text[54:-355]
text = " ".join([line.strip() for line in text])

In [13]:
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
sentences = tokenizer.tokenize(text)
lemmatizer = WordNetLemmatizer()
stop_words = stopwords.words("english")

In [14]:
for i in tqdm(range(len(sentences))):
    sentences[i] = re.sub("[^A-Za-z\s]", "", sentences[i])
    sentences[i] = re.sub("\s+", " ", sentences[i])
    sentences[i] = sentences[i].lower()
    sentences[i] = [lemmatizer.lemmatize(token) for token in TreebankWordTokenizer().tokenize(sentences[i])]
    sentences[i] = [word for word in sentences[i] if not word in stop_words + ["wa"]]

  0%|          | 0/973 [00:00<?, ?it/s]

In [15]:
alice_words = []
for sentence in tqdm(sentences):
    if "alice" in sentence:
        alice_words.extend(sentence)

  0%|          | 0/973 [00:00<?, ?it/s]

In [16]:
alice_words = [word for word, tag in nltk.pos_tag(alice_words) if tag[0] == "V"]

In [17]:
Counter(alice_words).most_common(10)

[('said', 256),
 ('went', 41),
 ('thought', 35),
 ('say', 34),
 ('know', 33),
 ('looked', 30),
 ('got', 26),
 ('go', 26),
 ('began', 24),
 ('see', 22)]