# Load the book

In [1]:
# Read the whole book into memory as a single UTF-8 decoded string.
with open('miracle_in_the_andes.txt', encoding='utf8') as file:
    book = file.read()

In [4]:
# Preview the first 1000 characters to check that the text loaded as expected.
book[:1000]

'Chapter 1\n\nBefore\n\n\nIT WAS FRIDAY, the thirteenth of October. We joked about that—flying over the Andes on such an unlucky day, but young men make those kinds of jokes so easily. Our flight had originated one day earlier in Montevideo, my hometown, its destination Santiago, Chile. It was a chartered flight on a Fairchild twin-engine turboprop carrying my rugby team—the Old Christians Rugby Club—to play an exhibition match against a top Chilean squad. There were forty-five people aboard, including four crew members—pilot, copilot, mechanic, and steward. Most of the passengers were my teammates, but we were also accompanied by friends, family members, and other supporters of the team, including my mother, Eugenia, and my younger sister, Susy, who were sitting across the aisle and one row in front of me. Our original itinerary was to fly nonstop to Santiago, a trip of about three and a half hours. But after just a few hours of flying, reports of bad weather in the mountains ahead fo

# How many chapters?

### With string methods

In [5]:
# str.count tallies every non-overlapping occurrence of the substring, so any
# use of "Chapter" in running text is counted along with the chapter headings.
book.count('Chapter')

11

### With regex

In [5]:
import re

In [12]:
# Match the word "Chapter" followed by a space and one or more digits,
# e.g. "Chapter 1", then show the matches and how many there are.
pattern = re.compile('Chapter [0-9]+')
findings = pattern.findall(book)
print(findings)
len(findings)

['Chapter 1', 'Chapter 2', 'Chapter 3', 'Chapter 4', 'Chapter 5', 'Chapter 6', 'Chapter 7', 'Chapter 8', 'Chapter 9', 'Chapter 10']


10

# Which sentences use the word "love"?

In [15]:
# A "sentence" here starts at a capital letter and runs to the next full stop
# (only "." ends a sentence). It must contain lowercase "love" as a standalone
# word: the lookarounds reject a letter on either side without consuming the
# neighbouring character, so a full stop right after "love" still ends the match
# instead of being swallowed and pulling in the following sentence.
# The final full stop is escaped because a bare "." would match any character.
pattern = re.compile(r'[A-Z][^.]*(?<![a-zA-Z])love(?![a-zA-Z])[^.]*\.')
findings = pattern.findall(book)
len(findings)

67

# What are the most used words?

In [23]:
# Split the lower-cased text into runs of ASCII letters.
# The class must be "a-zA-Z": the range "A-z" would also accept the symbols
# [ \ ] ^ _ ` that sit between "Z" and "a" in ASCII.
pattern = re.compile('[a-zA-Z]+')
findings = pattern.findall(book.lower())
findings[:20]

['chapter',
 'before',
 'it',
 'was',
 'friday',
 'the',
 'thirteenth',
 'of',
 'october',
 'we',
 'joked',
 'about',
 'that',
 'flying',
 'over',
 'the',
 'andes',
 'on',
 'such',
 'an']

In [24]:
# Tally the number of occurrences of each word, keyed by the word itself.
d = {}
for word in findings:
    # Words not seen yet start from 0.
    d[word] = d.get(word, 0) + 1

In [27]:
# Pair every count with its word, count first, so that sorting the tuples
# orders them by frequency (equal counts fall back to comparing the words).
d_list = list(zip(d.values(), d.keys()))
sorted(d_list, reverse=True)[:20]

[(5346, 'the'),
 (2795, 'and'),
 (2729, 'i'),
 (2400, 'to'),
 (2060, 'of'),
 (1566, 'a'),
 (1430, 'was'),
 (1419, 'in'),
 (1226, 'we'),
 (1169, 'my'),
 (1001, 'that'),
 (946, 'he'),
 (941, 'had'),
 (800, 'it'),
 (705, 'for'),
 (700, 'as'),
 (679, 'but'),
 (632, 'with'),
 (617, 'me'),
 (576, 'on')]

# Extract the paragraphs where "love" was used

In [42]:
# A paragraph is treated as one line of text (a run of non-newline characters).
# "love" must stand alone: the lookarounds reject a letter on either side but
# consume nothing, so the match can never cross a newline into a neighbouring
# paragraph, and "love" at the very start or end of a paragraph is still found.
pattern = re.compile(r'[^\n]*(?<![a-zA-Z])love(?![a-zA-Z])[^\n]*')
findings = pattern.findall(book)
len(findings)

39

# Extract the chapter titles

### Method 1

In [8]:
# A title is a run of letters, spaces and commas followed by two newlines.
pattern = re.compile('[a-zA-Z ,]+\n\n')
# Each match includes the trailing newlines, so trim them from every hit.
findings = [item.strip('\n\n') for item in pattern.findall(book)]
findings

['Before',
 'Everything Precious',
 'A Promise',
 'Breathe Once More',
 'Abandoned',
 'Tomb',
 'East',
 'The Opposite of Death',
 'I See a Man',
 'After']

### Method 2

In [71]:
# With a capture group, findall returns only the text inside the parentheses,
# so the trailing newlines are matched but left out of the results.
pattern = re.compile('([a-zA-Z ,]+)\n\n')
findings = pattern.findall(book)
findings

['Before',
 'Everything Precious',
 'A Promise',
 'Breathe Once More',
 'Abandoned',
 'Tomb',
 'East',
 'The Opposite of Death',
 'I See a Man',
 'After']

# Function that counts the occurrences of any word

### My method

In [9]:
def find_my_method(word, text=None):
    """Count whole-word occurrences of ``word`` in a text, ignoring case.

    An occurrence counts only when it is not directly preceded or followed
    by an ASCII letter, so "love" is not found inside "glove" or "lovely".
    The check uses lookarounds, which consume no characters: occurrences at
    the very start or end of the text, and occurrences separated by a single
    delimiter, are all counted.

    Args:
        word: The word to search for. It is lower-cased and regex-escaped,
            so it is always matched literally.
        text: The text to search. Defaults to the notebook's global ``book``.

    Returns:
        The number of occurrences (int) when at least one is found; otherwise
        a message string saying the book does not contain the word.
    """
    if text is None:
        text = book
    pattern = re.compile(
        rf'(?<![a-zA-Z]){re.escape(word.lower())}(?![a-zA-Z])'
    )
    count = len(pattern.findall(text.lower()))
    if count > 0:
        return count
    return f'The book does not contain the word {word}'

In [10]:
# Example: count how often "love" appears in the book.
find_my_method('love')

83

In [11]:
# Example: a word with no matches returns the "does not contain" message instead of a count.
find_my_method('hate')

'The book does not contain the word hate'

### Another method: constructing a dictionary of words first

In [85]:
def find(w, text=None):
    """Count occurrences of the word ``w`` in a text, ignoring case.

    The text is lower-cased and split into runs of ASCII letters; every run
    is tallied in a dictionary and the tally for ``w`` is looked up.

    Args:
        w: The word to look up (a string). It is lower-cased before the
            lookup so that it can match the lower-cased text.
        text: The text to search. Defaults to the notebook's global ``book``.

    Returns:
        The number of occurrences (int) when the word is present; otherwise
        a message string saying the book does not contain the word.
    """
    if text is None:
        text = book
    counts = {}
    # "a-zA-Z", not "A-z": the latter would also treat [ \ ] ^ _ ` as letters.
    for word in re.findall('[a-zA-Z]+', text.lower()):
        counts[word] = counts.get(word, 0) + 1
    try:
        return counts[w.lower()]
    except KeyError:
        # Only a missing word is expected here; other errors should surface.
        return f'The book does not contain the word {w}'

In [83]:
# Example: count how often "love" appears in the book.
find('love')

83

In [86]:
# Example: a word that is absent returns the "does not contain" message instead of a count.
find('hate')

'The book does not contain the word hate'