## Extract sentences from a book

In [95]:
import re
# Short identifiers for the supported books; they key the per-book lookup tables.
SHERLOCK = 'sher'
METAMORPHOSIS = 'meta'
PRIDE = 'prid'
# Book processed by the rest of the notebook.
chosen_book = PRIDE
# Source text of each book.
# NOTE: METAMORPHOSIS has no entry here, so choosing it raises KeyError below.
book_files = {
    SHERLOCK: 'words_data/sherlock/the_adventures_of_sherlock_holmes-arthur_conan_doyle.txt',
    PRIDE: 'words_data/pride_and_prejudice/pride_and_prejudice-jane_austen.txt'
}
# A sentence candidate starts at a capital letter and runs up to and including
# the next '.', '!' or '?'. Abbreviations ending in a period (e.g. "Mr.")
# therefore terminate a match early.
sentence_regex = re.compile(r'([A-Z][^\.!?]*[\.!?])')
# Decode explicitly instead of relying on the platform default encoding: the
# text contains non-ASCII characters such as curly quotes.
# Assumes the book files are UTF-8 encoded -- TODO confirm.
with open(book_files[chosen_book], 'r', encoding='utf-8') as bookfile:
    whole_book = bookfile.read()
# With a single capture group, findall returns the list of matched strings.
sentences = sentence_regex.findall(whole_book)

In [96]:
len(sentences)

7019

In [97]:
sentences[0:10]

['Chapter 1\n\n      It is a truth universally acknowledged, that a single man in\n      possession of a good fortune, must be in want of a wife.',
 'However little known the feelings or views of such a man may be\n      on his first entering a neighbourhood, this truth is so well\n      fixed in the minds of the surrounding families, that he is\n      considered the rightful property of some one or other of their\n      daughters.',
 'My dear Mr.',
 'Bennet,” said his lady to him one day, “have you\n      heard that Netherfield Park is let at last?',
 'Mr.',
 'Bennet replied that he had not.',
 'But it is,” returned she; “for Mrs.',
 'Long has just been here, and\n      she told me all about it.',
 'Mr.',
 'Bennet made no answer.']

In [98]:
# Index of the first extracted sentence to keep, per book.
starting_sentence = {
    SHERLOCK: 0,
    METAMORPHOSIS: 8,
    PRIDE: 0
}
# Keep sentences that split into two or more pieces on a single space,
# which is the same as saying they contain at least one space character.
messages = [
    candidate
    for candidate in sentences[starting_sentence[chosen_book]:]
    if ' ' in candidate
]
messages[0:10]

['Chapter 1\n\n      It is a truth universally acknowledged, that a single man in\n      possession of a good fortune, must be in want of a wife.',
 'However little known the feelings or views of such a man may be\n      on his first entering a neighbourhood, this truth is so well\n      fixed in the minds of the surrounding families, that he is\n      considered the rightful property of some one or other of their\n      daughters.',
 'My dear Mr.',
 'Bennet,” said his lady to him one day, “have you\n      heard that Netherfield Park is let at last?',
 'Bennet replied that he had not.',
 'But it is,” returned she; “for Mrs.',
 'Long has just been here, and\n      she told me all about it.',
 'Bennet made no answer.',
 'Do you not want to know who has taken it?',
 'You_ want to tell me, and I have no objection to hearing it.']

In [99]:
len(messages)

6543

## Sanitize messages

In [100]:
import nltk
# Fetch the 'punkt' resource through the NLTK downloader
# (presumably required by word_tokenize, which the next cell uses -- confirm).
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/piotrm/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [101]:
# Split into tokens (words+punctuation)
from nltk.tokenize import word_tokenize
# One token list per message, in the same order as `messages`.
tokens = list(map(word_tokenize, messages))
tokens[:2]

[['Chapter',
  '1',
  'It',
  'is',
  'a',
  'truth',
  'universally',
  'acknowledged',
  ',',
  'that',
  'a',
  'single',
  'man',
  'in',
  'possession',
  'of',
  'a',
  'good',
  'fortune',
  ',',
  'must',
  'be',
  'in',
  'want',
  'of',
  'a',
  'wife',
  '.'],
 ['However',
  'little',
  'known',
  'the',
  'feelings',
  'or',
  'views',
  'of',
  'such',
  'a',
  'man',
  'may',
  'be',
  'on',
  'his',
  'first',
  'entering',
  'a',
  'neighbourhood',
  ',',
  'this',
  'truth',
  'is',
  'so',
  'well',
  'fixed',
  'in',
  'the',
  'minds',
  'of',
  'the',
  'surrounding',
  'families',
  ',',
  'that',
  'he',
  'is',
  'considered',
  'the',
  'rightful',
  'property',
  'of',
  'some',
  'one',
  'or',
  'other',
  'of',
  'their',
  'daughters',
  '.']]

In [102]:
# Replace punctuation with empty string
import string
# Translation table that deletes every character listed in string.punctuation.
table = str.maketrans('', '', string.punctuation)
# Tokens made only of punctuation become empty strings here; they are not removed yet.
words = []
for sentence_tokens in tokens:
    words.append([token.translate(table) for token in sentence_tokens])
words[:2]

[['Chapter',
  '1',
  'It',
  'is',
  'a',
  'truth',
  'universally',
  'acknowledged',
  '',
  'that',
  'a',
  'single',
  'man',
  'in',
  'possession',
  'of',
  'a',
  'good',
  'fortune',
  '',
  'must',
  'be',
  'in',
  'want',
  'of',
  'a',
  'wife',
  ''],
 ['However',
  'little',
  'known',
  'the',
  'feelings',
  'or',
  'views',
  'of',
  'such',
  'a',
  'man',
  'may',
  'be',
  'on',
  'his',
  'first',
  'entering',
  'a',
  'neighbourhood',
  '',
  'this',
  'truth',
  'is',
  'so',
  'well',
  'fixed',
  'in',
  'the',
  'minds',
  'of',
  'the',
  'surrounding',
  'families',
  '',
  'that',
  'he',
  'is',
  'considered',
  'the',
  'rightful',
  'property',
  'of',
  'some',
  'one',
  'or',
  'other',
  'of',
  'their',
  'daughters',
  '']]

In [103]:
# Drop tokens that consist solely of digits (e.g. chapter numbers).
words = [
    [token for token in sentence_tokens if not token.isdigit()]
    for sentence_tokens in words
]
words[:2]

[['Chapter',
  'It',
  'is',
  'a',
  'truth',
  'universally',
  'acknowledged',
  '',
  'that',
  'a',
  'single',
  'man',
  'in',
  'possession',
  'of',
  'a',
  'good',
  'fortune',
  '',
  'must',
  'be',
  'in',
  'want',
  'of',
  'a',
  'wife',
  ''],
 ['However',
  'little',
  'known',
  'the',
  'feelings',
  'or',
  'views',
  'of',
  'such',
  'a',
  'man',
  'may',
  'be',
  'on',
  'his',
  'first',
  'entering',
  'a',
  'neighbourhood',
  '',
  'this',
  'truth',
  'is',
  'so',
  'well',
  'fixed',
  'in',
  'the',
  'minds',
  'of',
  'the',
  'surrounding',
  'families',
  '',
  'that',
  'he',
  'is',
  'considered',
  'the',
  'rightful',
  'property',
  'of',
  'some',
  'one',
  'or',
  'other',
  'of',
  'their',
  'daughters',
  '']]

In [104]:
# Keep purely alphanumeric tokens and lowercase them. This also discards the
# empty strings left behind by punctuation stripping ('' is not alphanumeric).
words = [
    [token.lower() for token in sentence_tokens if token.isalnum()]
    for sentence_tokens in words
]
words[:2]

[['chapter',
  'it',
  'is',
  'a',
  'truth',
  'universally',
  'acknowledged',
  'that',
  'a',
  'single',
  'man',
  'in',
  'possession',
  'of',
  'a',
  'good',
  'fortune',
  'must',
  'be',
  'in',
  'want',
  'of',
  'a',
  'wife'],
 ['however',
  'little',
  'known',
  'the',
  'feelings',
  'or',
  'views',
  'of',
  'such',
  'a',
  'man',
  'may',
  'be',
  'on',
  'his',
  'first',
  'entering',
  'a',
  'neighbourhood',
  'this',
  'truth',
  'is',
  'so',
  'well',
  'fixed',
  'in',
  'the',
  'minds',
  'of',
  'the',
  'surrounding',
  'families',
  'that',
  'he',
  'is',
  'considered',
  'the',
  'rightful',
  'property',
  'of',
  'some',
  'one',
  'or',
  'other',
  'of',
  'their',
  'daughters']]

In [105]:
# Fetch the 'stopwords' corpus, which the next cell loads through nltk.corpus.stopwords.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /home/piotrm/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [106]:
# Filter out stop words
from nltk.corpus import stopwords
# A set gives constant-time membership checks while filtering.
stop_words = set(stopwords.words('english'))
filtered_sentences = []
for sentence_tokens in words:
    filtered_sentences.append(
        [token for token in sentence_tokens if token not in stop_words]
    )
words = filtered_sentences
words[:2]

[['chapter',
  'truth',
  'universally',
  'acknowledged',
  'single',
  'man',
  'possession',
  'good',
  'fortune',
  'must',
  'want',
  'wife'],
 ['however',
  'little',
  'known',
  'feelings',
  'views',
  'man',
  'may',
  'first',
  'entering',
  'neighbourhood',
  'truth',
  'well',
  'fixed',
  'minds',
  'surrounding',
  'families',
  'considered',
  'rightful',
  'property',
  'one',
  'daughters']]

In [107]:
# Stem words (fishing, fisher -> fish)
from nltk.stem.porter import PorterStemmer
porter = PorterStemmer()
# Replace every remaining token with its Porter stem, sentence by sentence.
words = [list(map(porter.stem, sentence_tokens)) for sentence_tokens in words]
words[:2]

[['chapter',
  'truth',
  'univers',
  'acknowledg',
  'singl',
  'man',
  'possess',
  'good',
  'fortun',
  'must',
  'want',
  'wife'],
 ['howev',
  'littl',
  'known',
  'feel',
  'view',
  'man',
  'may',
  'first',
  'enter',
  'neighbourhood',
  'truth',
  'well',
  'fix',
  'mind',
  'surround',
  'famili',
  'consid',
  'right',
  'properti',
  'one',
  'daughter']]

## Create most common words list

In [108]:
# Initial vocabulary size; a later cell reassigns this to the number of words
# that remain after the frequency filter.
most_common_cnt = 2000

In [109]:
# Count how many times each stemmed word appears across all sentences.
word_occurences = {}
for sentence_tokens in words:
    for token in sentence_tokens:
        word_occurences[token] = word_occurences.get(token, 0) + 1

In [110]:
# (word, count) pairs ordered from most to least frequent; the sort is stable,
# so words with equal counts keep their first-seen order.
sorted_word_occurences = list(word_occurences.items())
sorted_word_occurences.sort(key=lambda entry: entry[1], reverse=True)
len(sorted_word_occurences)

3951

In [111]:
# Keep only words that occur at least `min_word_occurences` times in the entire
# book; rarer words are dropped from the vocabulary.
min_word_occurences = 5
sorted_word_occurences = [
    (word, cnt)
    for word, cnt in sorted_word_occurences
    if cnt >= min_word_occurences
]

In [112]:
# Use every word that survived the frequency filter instead of limiting the
# vocabulary to the N most common ones; this overwrites the earlier value.
most_common_cnt = len(sorted_word_occurences)
most_common_cnt

1573

In [113]:
# Per-book path of the word-frequency CSV; the file name embeds the current
# value of most_common_cnt.
# NOTE: there is no entry for METAMORPHOSIS, so looking it up raises KeyError.
most_common_words_files = {
    SHERLOCK: f'words_data/sherlock/most_common_words_{most_common_cnt}.csv',
    PRIDE: f'words_data/pride_and_prejudice/most_common_words_{most_common_cnt}.csv'
}

In [114]:
sorted_word_occurences[:10]

[('mr', 846),
 ('elizabeth', 622),
 ('could', 523),
 ('would', 461),
 ('darci', 383),
 ('said', 368),
 ('bennet', 323),
 ('much', 319),
 ('must', 304),
 ('bingley', 300)]

In [115]:
import csv
# Persist the (word, count) pairs, most frequent first, one pair per CSV row.
# newline='' hands line-ending control to the csv module; without it, platforms
# that translate '\n' on write (Windows) produce an extra blank row after each record.
with open(most_common_words_files[chosen_book], mode='w', newline='') as dict_csv:
    csvwriter = csv.writer(dict_csv)
    csvwriter.writerows(sorted_word_occurences[:most_common_cnt])

## Encode words to integers

In [116]:
import csv
# Rebuild the vocabulary from the word-frequency CSV.
# dictionary_arr maps integer code -> word, dictionary_dict maps word -> integer code;
# a word's code is its position among the non-empty rows of the file.
dictionary_arr = []
dictionary_dict = {}
# newline='' is what the csv module expects for files handed to csv.reader.
with open(most_common_words_files[chosen_book], mode='r', newline='') as dict_csv:
    for row in csv.reader(dict_csv):
        # Blank lines parse as empty rows (e.g. a file written without
        # newline='' on Windows); skip them instead of failing on row[0].
        if not row:
            continue
        dictionary_dict[row[0]] = len(dictionary_arr)
        dictionary_arr.append(row[0])
len(dictionary_dict)

1573

In [117]:
# Translate each sentence into the integer codes of its words; words that are
# not in the dictionary are silently dropped.
encoded_rows = [
    [dictionary_dict[token] for token in sentence_tokens if token in dictionary_dict]
    for sentence_tokens in words
]
print(len(encoded_rows))
encoded_rows[:2]

6543


[[196, 457, 1210, 370, 740, 49, 675, 28, 213, 8, 117, 288],
 [56,
  29,
  224,
  38,
  402,
  49,
  27,
  51,
  214,
  412,
  457,
  22,
  458,
  198,
  44,
  164,
  313,
  1105,
  13,
  57]]

In [118]:
# Discard encoded sentences that ended up with fewer codes than this threshold.
minimum_sequence_length = 2
encoded_rows = list(
    filter(lambda encoded: len(encoded) >= minimum_sequence_length, encoded_rows)
)
len(encoded_rows)

6156

## Save encoded messages to CSV

In [119]:
# Per-book path of the CSV holding the integer-encoded sentences; the file
# name embeds the current value of most_common_cnt.
# NOTE: there is no entry for METAMORPHOSIS, so looking it up raises KeyError.
encoded_words_files = {
    SHERLOCK: f'words_data/sherlock/encoded_words_{most_common_cnt}_common.csv',
    PRIDE: f'words_data/pride_and_prejudice/encoded_words_{most_common_cnt}_common.csv'
}

In [120]:
# Write one encoded sentence per CSV row.
# newline='' hands line-ending control to the csv module; without it, platforms
# that translate '\n' on write (Windows) produce an extra blank row after each record.
with open(encoded_words_files[chosen_book], mode='w', newline='') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerows(encoded_rows)

In [121]:
# Map each row's position in encoded_rows to the number of codes it holds.
row_lengths = {position: len(encoded) for position, encoded in enumerate(encoded_rows)}
row_lengths[0], row_lengths[1], row_lengths[2]


(12, 20, 2)

In [122]:
# (row index, length) pairs ordered from shortest to longest row; the sort is
# stable, so rows of equal length stay in index order.
sorted_row_lengths = list(row_lengths.items())
sorted_row_lengths.sort(key=lambda entry: entry[1])
sorted_row_lengths[:10]

[(2, 2),
 (4, 2),
 (5, 2),
 (6, 2),
 (10, 2),
 (14, 2),
 (18, 2),
 (21, 2),
 (22, 2),
 (24, 2)]

In [123]:
# (row index, length) pair of the longest encoded row; on ties max() returns
# the first such pair in sorted_row_lengths.
max_length_row = max(sorted_row_lengths, key=lambda x:x[1])
max_length_row

(2078, 53)

In [124]:
def _group_rows(rows_by_length, max_total_length):
    """Pack (row_index, row_length) pairs into consecutive groups.

    Rows are consumed in the order given. The current group is closed as soon
    as adding the next row would push its summed length above
    ``max_total_length``, and that row then starts the next group.

    Args:
        rows_by_length: iterable of (row_index, row_length) pairs, expected to
            be ordered from shortest to longest row.
        max_total_length: largest summed row length allowed in one group.

    Returns:
        A list of groups, each a non-empty list of the input pairs. A single
        row longer than ``max_total_length`` is placed in a group of its own
        instead of being dropped.
    """
    groups = []
    current_group = []
    current_length = 0
    for row in rows_by_length:
        row_length = row[1]
        # Close the running group once the next row no longer fits into it.
        if current_group and current_length + row_length > max_total_length:
            groups.append(current_group)
            current_group = []
            current_length = 0
        current_group.append(row)
        current_length += row_length
    # Flush the last group; skipping this when it is empty avoids emitting a
    # trailing empty group (and thus a blank line in the output file).
    if current_group:
        groups.append(current_group)
    return groups


# Combine short rows so that every group holds at most as many codes as the
# longest single row.
grouped_rows = _group_rows(sorted_row_lengths, max_length_row[1])

grouped_rows[:2]

[[(2, 2),
  (4, 2),
  (5, 2),
  (6, 2),
  (10, 2),
  (14, 2),
  (18, 2),
  (21, 2),
  (22, 2),
  (24, 2),
  (27, 2),
  (33, 2),
  (34, 2),
  (40, 2),
  (46, 2),
  (47, 2),
  (48, 2),
  (49, 2),
  (51, 2),
  (59, 2),
  (62, 2),
  (69, 2),
  (70, 2),
  (71, 2),
  (73, 2),
  (77, 2)],
 [(78, 2),
  (82, 2),
  (86, 2),
  (92, 2),
  (94, 2),
  (96, 2),
  (97, 2),
  (101, 2),
  (111, 2),
  (122, 2),
  (133, 2),
  (141, 2),
  (158, 2),
  (168, 2),
  (170, 2),
  (191, 2),
  (192, 2),
  (208, 2),
  (209, 2),
  (213, 2),
  (226, 2),
  (228, 2),
  (233, 2),
  (236, 2),
  (246, 2),
  (247, 2)]]

In [125]:
# For every group, concatenate the encoded rows it references, in group order.
grouped_rows_data = [
    [code for row_index, _ in group for code in encoded_rows[row_index]]
    for group in grouped_rows
]
print(len(grouped_rows_data[0]))
grouped_rows_data[:2]

52


[[50,
  0,
  6,
  62,
  80,
  0,
  69,
  167,
  225,
  90,
  70,
  740,
  50,
  0,
  427,
  215,
  902,
  78,
  21,
  252,
  50,
  582,
  132,
  99,
  164,
  57,
  260,
  173,
  787,
  350,
  1106,
  50,
  677,
  193,
  832,
  40,
  14,
  461,
  196,
  0,
  176,
  53,
  55,
  0,
  69,
  173,
  60,
  395,
  5,
  0,
  29,
  787],
 [862,
  1310,
  741,
  535,
  308,
  0,
  123,
  61,
  87,
  1426,
  71,
  581,
  24,
  327,
  1311,
  0,
  22,
  140,
  196,
  0,
  41,
  0,
  6,
  71,
  188,
  716,
  174,
  43,
  679,
  3,
  171,
  0,
  6,
  153,
  71,
  147,
  685,
  314,
  385,
  465,
  188,
  657,
  82,
  318,
  235,
  1223,
  50,
  97,
  53,
  243,
  174,
  51]]

In [126]:
# Per-book path of the plain-text file holding the grouped integer sequences.
# Plain string literals: these paths contain no placeholders, so no f-string is needed.
# NOTE: there is no entry for METAMORPHOSIS, so looking it up raises KeyError.
encoded_number_files = {
    SHERLOCK: 'number_data/sherlock.txt',
    PRIDE: 'number_data/pride_and_prejudice.txt'
}

In [127]:
# Write each group as one line of space-separated integers, then preview the
# first five lines.
with open(encoded_number_files[chosen_book], mode='w') as outputfile:
    lines = [" ".join(map(str, row)) + '\n' for row in grouped_rows_data]
    outputfile.writelines(lines)
    print(lines[:5])

['50 0 6 62 80 0 69 167 225 90 70 740 50 0 427 215 902 78 21 252 50 582 132 99 164 57 260 173 787 350 1106 50 677 193 832 40 14 461 196 0 176 53 55 0 69 173 60 395 5 0 29 787\n', '862 1310 741 535 308 0 123 61 87 1426 71 581 24 327 1311 0 22 140 196 0 41 0 6 71 188 716 174 43 679 3 171 0 6 153 71 147 685 314 385 465 188 657 82 318 235 1223 50 97 53 243 174 51\n', '398 490 152 136 71 86 4 93 434 282 922 0 224 535 632 420 373 0 4 89 398 72 26 144 20 135 4 692 11 9 69 523 31 742 196 0 180 0 321 1227 50 0 39 19 6 1030 117 0 9 571 351 995\n', '415 550 385 32 29 420 103 1328 902 39 169 0 2 87 351 0 62 1144 31 27 69 21 431 53 614 1470 195 16 442 1471 381 443 8 54 6 370 468 551 263 274 181 281 313 479 71 0 121 409 414 117 114 0\n', '819 314 364 24 552 546 56 88 385 0 9 0 64 89 251 238 63 280 351 47 127 1476 43 21 1481 0 167 0 1332 7 3 0 88 0 1428 275 103 226 385 1 92 0 714 221 102 46 3 714 948 0 4 34\n']
