# Loading the book

In [10]:
with open("miracle_in_the_andes.txt", "r") as file:
    book = file.read()

In [11]:
type(book)

str

# Number of chapters

### With string method

In [12]:
book.count("Chapter")

11

### With regular expression (regex)

In [13]:
import re

In [14]:
pattern = re.compile("Chapter [0-9]")

In [15]:
re.findall(pattern, book)

['Chapter 1',
 'Chapter 2',
 'Chapter 3',
 'Chapter 4',
 'Chapter 5',
 'Chapter 6',
 'Chapter 7',
 'Chapter 8',
 'Chapter 9',
 'Chapter 1']

In [16]:
pattern = re.compile("Chapter [0-9]+")

In [17]:
re.findall(pattern,book)

['Chapter 1',
 'Chapter 2',
 'Chapter 3',
 'Chapter 4',
 'Chapter 5',
 'Chapter 6',
 'Chapter 7',
 'Chapter 8',
 'Chapter 9',
 'Chapter 10']

In [18]:
len(re.findall(pattern, book))

10

# Sentences where "love" was used

In [19]:
# start with the basics
pattern = re.compile("love")
findings = re.findall(pattern, book)

In [20]:
# add spaces and letters before and after "love"
pattern = re.compile("[a-zA-Z] love [a-zA-Z]")
findings = re.findall(pattern, book)

In [21]:
# there can be words before and after so we add * to include them
pattern = re.compile("[a-zA-Z]* love [a-zA-Z]*")
findings = re.findall(pattern, book)

In [22]:
# there are spaces before and after words so we have to count them too.
pattern = re.compile("[ a-zA-Z]* love [a-zA-Z ]*")
findings = re.findall(pattern, book)

In [23]:
# we have to consider fullstops and commas too before/after words and before/after love.
pattern = re.compile("[a-zA-Z ,.]* love [a-zA-Z ,.]*")
findings = re.findall(pattern, book)

In [24]:
# all that is a hassle which is why we use negation
pattern = re.compile("[^.]* love [^.]*") # include everything except fullstop (period)
findings = re.findall(pattern, book)

In [25]:
# add a period at the end of the sentence
pattern = re.compile("[^.]* love [^.]*.")
findings = re.findall(pattern, book)
len(findings)

49

In [26]:
# now we have to consider other cases where there are commas and such stuff after love (no alpbhabets)
pattern = re.compile("[^.]*[^a-zA-Z]+love[^a-zA-Z]+[^.]*.") # + means one or more occurrences of the preceding character or group
findings = re.findall(pattern, book)
len(findings)

67

In [27]:
# the sentence should start with a capital letter
pattern = re.compile("[A-Z]{1}[^.]*[^a-zA-Z]+love[^a-zA-Z]+[^.]*.") # {1} one time only
findings = re.findall(pattern, book)
findings

['As a young man, of course, I could not put these things into words, but I knew, and my teammates knew, that there was something special about the game, and under the guidance of the Christian Brothers we developed a passionate love for the sport that shaped our friendships and our lives.',
 'Guido and I grew up together, playing soccer and sharing a love of motorcycles, cars, and auto racing.',
 'Under the guidance of the Christian Brothers, both of us grew to love the game of rugby with a consuming passion.',
 'That rowdiness came to an abrupt end for Guido in 1969, when he met and fell in love with the beautiful daughter of a Chilean diplomat.',
 'I believe he had a great hunger for the love and comforts of a family that was happy and whole.',
 'He shared, with my father and me, a love for cars and driving, and he loved going with us to auto races.',
 'The house had a beautiful view of the sea, and this more than anything made my mother love it.',
 'She was a true tower of strength

# Most Common Words

In [28]:
pattern = re.compile("[a-zA-Z]+")
findings = re.findall(pattern, book.lower())


In [29]:
d = {}
for word in findings:
    if word in d.keys():
        d[word] = d[word] + 1
    else:
        d[word] = 1

In [30]:
d_list = [(value, key) for (key, value) in d.items()]
d_list = sorted(d_list, reverse=True)
d_list[:10]

[(5346, 'the'),
 (2795, 'and'),
 (2729, 'i'),
 (2400, 'to'),
 (2060, 'of'),
 (1566, 'a'),
 (1430, 'was'),
 (1419, 'in'),
 (1226, 'we'),
 (1169, 'my')]

In [31]:
# we use nltk library to exclude stopwords
!pip install nltk


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m24.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [32]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /home/jay/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [41]:
from nltk.corpus import stopwords
english_stopwords = stopwords.words("english")


In [52]:
filtered_words = []
for count, word in d_list:
    if word not in english_stopwords:
        filtered_words.append((word, count))

filtered_words[:10]

[('would', 575),
 ('us', 519),
 ('said', 292),
 ('roberto', 284),
 ('could', 252),
 ('one', 249),
 ('snow', 227),
 ('mountain', 183),
 ('time', 182),
 ('like', 165)]

# Paragraphs where "love" was used

In [34]:
pattern = re.compile("[^\n]+love[^\n]+")
findings = re.findall(pattern, book)
len(findings)

60

# Extracting chapter titles

In [35]:
pattern = re.compile("[a-zA-Z ]+\n\n")
findings = re.findall(pattern, book)
for item in findings:
    print(item.strip("\n\n"))

Before
Everything Precious
A Promise
Breathe Once More
Abandoned
Tomb
East
The Opposite of Death
I See a Man
After


In [36]:
pattern = re.compile("([a-zA-Z ]+)\n\n")    # only include what is mentioned inside parentheses
findings = re.findall(pattern, book)
findings

['Before',
 'Everything Precious',
 'A Promise',
 'Breathe Once More',
 'Abandoned',
 'Tomb',
 'East',
 'The Opposite of Death',
 'I See a Man',
 'After']

# Function that finds the occurrence of any word

In [37]:
def find(word):
    pattern = re.compile("[a-zA-Z]+")
    findings = re.findall(pattern, book.lower())
    d = {}
    for item in findings:
        if item in d.keys():
            d[item] = d[item] + 1
        else:
            d[item] = 1
    try:
        return d[word]
    except:
        return f"The book does no contain the word '{word}'"

In [38]:
find("love")

83

In [39]:
find("hate")

"The book does no contain the word 'hate'"

# Sentiment Analysis: Positive and Negative chapter

In [83]:
# we need a sentiment analysizer class which comes with nlkt
from nltk.sentiment import SentimentIntensityAnalyzer

In [84]:
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /home/jay/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [85]:
analyzer = SentimentIntensityAnalyzer()

In [88]:
scores = analyzer.polarity_scores(book)
scores

{'neg': 0.116, 'neu': 0.76, 'pos': 0.125, 'compound': 1.0}

In [87]:
if scores["pos"] > scores["neg"]:
    print("It is a positive text")
else:
    print("It is a negative text")

It is a positive text


In [97]:
import re
pattern = re.compile("Chapter [0-9]+")
chapters = re.split(pattern, book)
chapters

['',
 '\n\nEverything Precious\n\n\n“HERE, NANDO, ARE you thirsty?”\n\nIt was my teammate Gustavo Zerbino crouching beside me, pressing a ball of snow to my lips. The snow was cold and it burned my throat as I swallowed, but my body was so parched I gobbled it in lumps and begged for more. Several hours had passed since I woke from the coma. My mind was clearer now, and I was full of questions. When I finished with the snow, I motioned Gustavo closer.\n\n“Where is my mother?” I asked. “Where is Susy? Are they all right?”\n\nGustavo’s face betrayed no emotion. “Get some rest,” he said. “You’re still very weak.” He walked away, and for a while the others kept their distance. Again and again I pleaded with them to give me some news of my loved ones, but my voice was just a whisper and it was easy for them to pretend they didn’t hear.\n\nI lay shivering on the cold floor of the fuselage as the others bustled around me, listening for the sound of my sister’s voice and glancing about for a g

In [98]:
chapters = chapters[1:]

In [102]:
for no, chapter in enumerate(chapter):
    score = analyzer.polarity_scores(chapter)
    print(no, score)

0 {'neg': 0.0, 'neu': 0.0, 'pos': 0.0, 'compound': 0.0}
1 {'neg': 0.0, 'neu': 0.0, 'pos': 0.0, 'compound': 0.0}
2 {'neg': 0.0, 'neu': 0.0, 'pos': 0.0, 'compound': 0.0}
3 {'neg': 0.0, 'neu': 0.0, 'pos': 0.0, 'compound': 0.0}
4 {'neg': 0.0, 'neu': 0.0, 'pos': 0.0, 'compound': 0.0}
5 {'neg': 0.0, 'neu': 0.0, 'pos': 0.0, 'compound': 0.0}
6 {'neg': 0.0, 'neu': 0.0, 'pos': 0.0, 'compound': 0.0}
7 {'neg': 0.0, 'neu': 0.0, 'pos': 0.0, 'compound': 0.0}
8 {'neg': 0.0, 'neu': 0.0, 'pos': 0.0, 'compound': 0.0}
9 {'neg': 0.0, 'neu': 0.0, 'pos': 0.0, 'compound': 0.0}
10 {'neg': 0.0, 'neu': 0.0, 'pos': 0.0, 'compound': 0.0}
11 {'neg': 0.0, 'neu': 0.0, 'pos': 0.0, 'compound': 0.0}
12 {'neg': 0.0, 'neu': 0.0, 'pos': 0.0, 'compound': 0.0}
13 {'neg': 0.0, 'neu': 0.0, 'pos': 0.0, 'compound': 0.0}
14 {'neg': 0.0, 'neu': 0.0, 'pos': 0.0, 'compound': 0.0}
15 {'neg': 0.0, 'neu': 0.0, 'pos': 0.0, 'compound': 0.0}
16 {'neg': 0.0, 'neu': 0.0, 'pos': 0.0, 'compound': 0.0}
17 {'neg': 0.0, 'neu': 0.0, 'pos': 0.0, '