## Load the Book

In [1]:
# Read the whole book into a single string; the encoding is passed explicitly
# rather than left to the platform default.
with open("miracle_in_the_andes.txt", mode="r", encoding="utf8") as file:
    book = file.read()

## The most used non-stopword words

In [2]:
import re

In [3]:
# Tokenise the lower-cased book into runs of ASCII letters, so punctuation,
# digits and whitespace all act as separators.
pattern = re.compile("[a-zA-Z]+")
findings = pattern.findall(book.lower())
print(findings[:5])
len(findings)

['chapter', 'before', 'it', 'was', 'friday']


86798

In [4]:
# Tally how many times each word occurs in the book.
d = {}
for word in findings:
    # dict.get with a default of 0 covers the first sighting of a word,
    # so no separate "already seen?" branch is needed.
    d[word] = d.get(word, 0) + 1
# Spot-check a single entry.
d["chapter"]

11

In [5]:
# Build (count, word) pairs so that sorting orders primarily by count, with the
# word itself breaking ties; the largest counts come first.
d_list = [(count, word) for word, count in d.items()]
d_list.sort(reverse=True)
d_list[:20]

[(5346, 'the'),
 (2795, 'and'),
 (2729, 'i'),
 (2400, 'to'),
 (2060, 'of'),
 (1566, 'a'),
 (1430, 'was'),
 (1419, 'in'),
 (1226, 'we'),
 (1169, 'my'),
 (1001, 'that'),
 (946, 'he'),
 (941, 'had'),
 (800, 'it'),
 (705, 'for'),
 (700, 'as'),
 (679, 'but'),
 (632, 'with'),
 (617, 'me'),
 (576, 'on')]

In [6]:
# Show which Python version this kernel is running.
from platform import python_version
python_version()

'3.11.3'

In [7]:
import nltk

from nltk.corpus import stopwords

# Fetch the stopword corpus only when it is missing, so the notebook can run
# top to bottom in a fresh environment without manually uncommenting a
# download line. nltk.data.find raises LookupError when the resource is absent.
try:
    nltk.data.find("corpora/stopwords")
except LookupError:
    nltk.download("stopwords")

In [8]:
# Load NLTK's English stopword list and peek at the first ten entries.
english_stopwords = stopwords.words("english")
english_stopwords[:10]

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"]

In [9]:
# Keep only the (count, word) pairs whose word is not an English stopword.
# Membership is tested against a set so each lookup is constant-time instead
# of a scan through the whole stopword list; the order of d_list is preserved.
stopword_set = set(english_stopwords)
filtered_words = [
    (count, word) for count, word in d_list if word not in stopword_set
]

In [10]:
# Ten most frequent non-stopwords; d_list was sorted by descending count and
# the filter kept that order.
filtered_words[:10]

[(575, 'would'),
 (519, 'us'),
 (292, 'said'),
 (284, 'roberto'),
 (252, 'could'),
 (249, 'one'),
 (227, 'snow'),
 (183, 'mountain'),
 (182, 'time'),
 (165, 'like')]

## Sentiment analysis: which chapters are the most positive and the most negative?

In [11]:
from nltk.sentiment import SentimentIntensityAnalyzer

# Fetch the VADER lexicon only when it is missing, so the notebook can run
# top to bottom in a fresh environment without manually uncommenting a
# download line. The lookup uses the zipped resource path; nltk.data.find
# raises LookupError when the resource is absent.
try:
    nltk.data.find("sentiment/vader_lexicon.zip")
except LookupError:
    nltk.download("vader_lexicon")

In [14]:
# Create the analyzer that the later sentiment cells reuse.
analyzer = SentimentIntensityAnalyzer()
# Exploratory: list the analyzer's attributes (polarity_scores is the one used below).
dir(analyzer)

['__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_amplify_ep',
 '_amplify_qm',
 '_but_check',
 '_idioms_check',
 '_least_check',
 '_never_check',
 '_punctuation_emphasis',
 '_sift_sentiment_scores',
 'constants',
 'lexicon',
 'lexicon_file',
 'make_lex_dict',
 'polarity_scores',
 'score_valence',
 'sentiment_valence']

In [15]:
# Sanity check on a clearly positive example sentence.
analyzer.polarity_scores("He, look how beautiful the trees are. I love them.")

{'neg': 0.0, 'neu': 0.464, 'pos': 0.536, 'compound': 0.8442}

In [24]:
# Mixed example (praise followed by "hate"); the result is kept in `scores`
# for the label check further down.
scores = analyzer.polarity_scores("He, look how beautiful the trees are. I hate them.")

In [25]:
# Sanity check on a clearly negative example sentence.
analyzer.polarity_scores("He, look how bad the trees are. I hate them. They are horrible.")

{'neg': 0.543, 'neu': 0.457, 'pos': 0.0, 'compound': -0.8934}

In [27]:
# Label the example sentence by comparing the positive and negative
# proportions returned by polarity_scores. Equal proportions (for example a
# fully neutral sentence, where both are 0.0) are reported as neutral instead
# of falling through to the negative branch.
if scores["pos"] > scores["neg"]:
    print("this is a happy sentence")
elif scores["pos"] < scores["neg"]:
    print("this is a negative sentence")
else:
    print("this is a neutral sentence")
print(scores)

this is a happy sentence
{'neg': 0.253, 'neu': 0.479, 'pos': 0.267, 'compound': 0.0516}


In [28]:
# Score the entire book in one call.
# NOTE(review): for very long inputs the compound score appears to saturate
# near +/-1, so the neg/neu/pos proportions are probably the more useful part
# of this result - confirm against the VADER documentation.
analyzer.polarity_scores(book)

{'neg': 0.116, 'neu': 0.76, 'pos': 0.125, 'compound': 1.0}

## Chapter sentiment analysis

In [34]:
import re
# Chapter headings have the form "Chapter <number>"; splitting on them leaves
# the chapter bodies.
pattern = re.compile("Chapter [0-9]+")
# The first piece is whatever precedes the first heading, so it is dropped.
chapters = pattern.split(book)[1:]

In [36]:
# Score every chapter separately and print it next to its 1-based chapter number.
# The per-chapter result gets its own name so that the `scores` value computed
# for the example sentence earlier in the notebook is not overwritten, which
# would silently change that cell's result if it were re-run.
for nr, chapter in enumerate(chapters, start=1):
    chapter_scores = analyzer.polarity_scores(chapter)
    print(nr, chapter_scores)

1 {'neg': 0.061, 'neu': 0.779, 'pos': 0.16, 'compound': 1.0}
2 {'neg': 0.12, 'neu': 0.726, 'pos': 0.154, 'compound': 0.9991}
3 {'neg': 0.145, 'neu': 0.751, 'pos': 0.105, 'compound': -0.9999}
4 {'neg': 0.141, 'neu': 0.721, 'pos': 0.138, 'compound': -0.9963}
5 {'neg': 0.118, 'neu': 0.742, 'pos': 0.141, 'compound': 0.9997}
6 {'neg': 0.124, 'neu': 0.761, 'pos': 0.115, 'compound': -0.9979}
7 {'neg': 0.136, 'neu': 0.761, 'pos': 0.103, 'compound': -0.9999}
8 {'neg': 0.12, 'neu': 0.786, 'pos': 0.094, 'compound': -0.9998}
9 {'neg': 0.097, 'neu': 0.824, 'pos': 0.079, 'compound': -0.9996}
10 {'neg': 0.086, 'neu': 0.733, 'pos': 0.181, 'compound': 1.0}
