## Extract sentences from a book

In [156]:
import re
SHERLOCK = 'sher'
METAMORPHOSIS = 'meta'
PRIDE = 'prid'
chosen_book = SHERLOCK
book_files = {
    SHERLOCK: 'words_data/sherlock/the_adventures_of_sherlock_holmes-arthur_conan_doyle.txt',
    PRIDE: 'words_data/pride_and_prejudice/pride_and_prejudice-jane_austen.txt'
}
sentences = []
sentence_regex = re.compile(r'([A-Z][^\.!?]*[\.!?])', re.M)
with open(book_files[chosen_book], 'r') as bookfile:
    whole_book = bookfile.read()
    sentences = re.findall(sentence_regex, whole_book)

In [157]:
len(sentences)

7141

In [158]:
sentences[0:10]

['I.',
 'A SCANDAL IN BOHEMIA\n\n\nI.',
 'To Sherlock Holmes she is always _the_ woman.',
 'I have seldom heard him\nmention her under any other name.',
 'In his eyes she eclipses and\npredominates the whole of her sex.',
 'It was not that he felt any emotion\nakin to love for Irene Adler.',
 'All emotions, and that one particularly,\nwere abhorrent to his cold, precise but admirably balanced mind.',
 'He\nwas, I take it, the most perfect reasoning and observing machine that\nthe world has seen, but as a lover he would have placed himself in a\nfalse position.',
 'He never spoke of the softer passions, save with a gibe\nand a sneer.',
 'They were admirable things for the observer—excellent for\ndrawing the veil from men’s motives and actions.']

In [159]:
starting_sentence = {
    SHERLOCK: 0,
    METAMORPHOSIS: 8,
    PRIDE: 0
}
messages = []
for sentence in sentences[starting_sentence[chosen_book]:]:
    if len(sentence.split(' ')) >= 2:
        messages.append(sentence)
messages[0:10]

['A SCANDAL IN BOHEMIA\n\n\nI.',
 'To Sherlock Holmes she is always _the_ woman.',
 'I have seldom heard him\nmention her under any other name.',
 'In his eyes she eclipses and\npredominates the whole of her sex.',
 'It was not that he felt any emotion\nakin to love for Irene Adler.',
 'All emotions, and that one particularly,\nwere abhorrent to his cold, precise but admirably balanced mind.',
 'He\nwas, I take it, the most perfect reasoning and observing machine that\nthe world has seen, but as a lover he would have placed himself in a\nfalse position.',
 'He never spoke of the softer passions, save with a gibe\nand a sneer.',
 'They were admirable things for the observer—excellent for\ndrawing the veil from men’s motives and actions.',
 'But for the trained\nreasoner to admit such intrusions into his own delicate and finely\nadjusted temperament was to introduce a distracting factor which might\nthrow a doubt upon all his mental results.']

In [160]:
len(messages)

6757

## Sanitize messages

In [161]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /Users/piotrm/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [162]:
# Split into tokens (words+punctuation)
from nltk.tokenize import word_tokenize
tokens = [word_tokenize(row) for row in messages]
tokens[:2]

[['A', 'SCANDAL', 'IN', 'BOHEMIA', 'I', '.'],
 ['To', 'Sherlock', 'Holmes', 'she', 'is', 'always', '_the_', 'woman', '.']]

In [163]:
# Replace puntuation with empty string
import string
table = str.maketrans('','',string.punctuation)
words = [[word.translate(table) for word in line] for line in tokens]
words[:2]

[['A', 'SCANDAL', 'IN', 'BOHEMIA', 'I', ''],
 ['To', 'Sherlock', 'Holmes', 'she', 'is', 'always', 'the', 'woman', '']]

In [164]:
# Remove non-alphanumeric and normalize case
words = [[word.lower() for word in line if word.isalnum()] for line in words]
words[:2]

[['a', 'scandal', 'in', 'bohemia', 'i'],
 ['to', 'sherlock', 'holmes', 'she', 'is', 'always', 'the', 'woman']]

In [165]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/piotrm/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [166]:
# Filter out stop words
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
words = [[word for word in line if word not in stop_words] for line in words]
words[:2]

[['scandal', 'bohemia'], ['sherlock', 'holmes', 'always', 'woman']]

In [167]:
# Stem words (fishing, fisher -> fish)
from nltk.stem.porter import PorterStemmer
porter = PorterStemmer()
words = [[porter.stem(word) for word in line] for line in words]
words[:2]

[['scandal', 'bohemia'], ['sherlock', 'holm', 'alway', 'woman']]

## Create most common words list

In [168]:
most_common_cnt = 2000

In [169]:
most_common_words_files = {
    SHERLOCK: f'words_data/sherlock/most_common_words_{most_common_cnt}.csv',
    PRIDE: f'words_data/pride_and_prejudice/most_common_words_{most_common_cnt}.csv'
}

In [170]:
word_occurences = {}
for row in words:
    for word in row:
        if word_occurences.get(word) is None:
            word_occurences[word] = 0
        word_occurences[word] += 1

In [171]:
sorted_word_occurences = sorted(word_occurences.items(), key=lambda x: x[1], reverse=True)
len(sorted_word_occurences)

5735

In [172]:
sorted_word_occurences[:10]

[('upon', 454),
 ('said', 438),
 ('holm', 420),
 ('one', 378),
 ('would', 327),
 ('man', 296),
 ('could', 286),
 ('mr', 270),
 ('littl', 268),
 ('see', 251)]

In [173]:
import csv
with open(most_common_words_files[chosen_book], mode='w') as dict_csv:
    csvwriter = csv.writer(dict_csv)
    csvwriter.writerows(sorted_word_occurences[:most_common_cnt])

## Encode words to integers

In [174]:
import csv
dictionary_arr = []
dictionary_dict = {}
with open(most_common_words_files[chosen_book], mode='r') as dict_csv:
    reader = csv.reader(dict_csv)
    for index, row in enumerate(reader):
        dictionary_arr.append(row[0])
        dictionary_dict[row[0]] = index
len(dictionary_dict)

2000

In [175]:
encoded_rows = []
for line in words:
    encoded_line = []
    for word in line:
        if word in dictionary_dict:
            encoded_line.append(dictionary_dict[word])
    encoded_rows.append(encoded_line)
encoded_rows[:2]

[[737, 844], [59, 2, 135, 107]]

## Save encoded messages to CSV

In [176]:
encoded_words_files = {
    SHERLOCK: f'words_data/sherlock/encoded_words_{most_common_cnt}_common.csv',
    PRIDE: f'words_data/pride_and_prejudice/encoded_words_{most_common_cnt}_common.csv'
}
minimum_sequence_length = 8

In [177]:
with open(encoded_words_files[chosen_book], mode='w') as csvfile:
    writer = csv.writer(csvfile)
    for row in encoded_rows:
        if len(row) >= minimum_sequence_length:
            writer.writerow(row)