In [39]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [6]:
import re
import os
from lib import get_stop_words, get_text, get_lem_words
from multiprocessing import Pool
from itertools import repeat, chain
import pickle

In [2]:
NUM_PROCESSES = 20

In [3]:
# project gutenberg text has a lot of extra stuff at the beginning and end
def get_text_no_gutenberg(raw_text):
    return re.split('^\*\*\*(.*)\*\*\*$', raw_text, flags=re.MULTILINE)[2]
    
def remove_single_letter(text):
    text = re.sub(r'\b\w\b', ' ', text)
    return text

# split the raw text into chapters
def split_chapters(text, roman_numeral=True):
    if roman_numeral:
        return re.split(r'CHAPTER [IVXLCDM]+', text, flags=re.IGNORECASE)
    else:
        return re.split(r'CHAPTER \d+', text, flags=re.IGNORECASE)

# sometimes the contents lists chapters that are in table of contents. here we just remove chapters that are too short.
# here we also delete the first chapter, which is just the table of contents and preface
def chapter_longer_than(raw_chapters, n=150):
    return list(filter(lambda c: len(c) > n, raw_chapters))

# split remove stopwords
def remove_stopwords(text, stopwords):
    return re.sub(r'\b(' + '|'.join(stopwords) + r')\b', '', text)

# theres a ton of whitespace that we dont want
def remove_extra_spaces(text):
    return re.sub(r'\s+', ' ', text)

def replace_common_dots(text):
    text =  re.sub(r'dr\.', 'dr ', text)
    text =  re.sub(r'mr\. ', 'mr ', text)
    return text

def compress_abbr(text):
    pattern = r'\b(?:[a-zA-Z]\.){2,}\b.*?'
    matches = re.finditer(pattern, text, re.MULTILINE | re.IGNORECASE)
    abbr = set(map(lambda m: m.group(), matches))
    for a in abbr:
        text = text.replace(a, a.replace('.', ''))
    return text

def get_lem(fn):
    lem = {}
    lem_regex = []
    with open(fn, 'r', encoding='utf-8-sig') as f:
        for i in f:
            text = [j for j in i.lower().split()]
            if text[0] not in lem:
                lem[text[0]] = [text[1]]
            else:
                lem[text[0]].append(text[1])
    for word, lemmas in lem.items():
        lem_regex.append([word, fr'\b({"|".join(lemmas)})\b'])
    return lem_regex
def remove_punctuation(text):
    for i in range(len(text)):
        text[i] = re.sub(r'[^\w\s]', lambda m: "." if m.group(0) == "." else " ", text[i])
    return text

def split_sentences(text):
    return re.split(r"[.]", text)

def lemmatization(text, lem):
    for lemma, lem_regex in lem:
        text = re.sub(lem_regex, lemma, text)
    return text

def trim(text):
    for i in range(len(text)):
        text[i] = text[i].strip()
    return text

In [4]:
books = [
    {
        'fn': 'a-study-in-scarlet.txt',
        'roman_numeral': True
    },
    {
        'fn': 'arsene-lupin-vs-sherlock-holmes.txt',
        'roman_numeral': True
    },
    {
        'fn': 'the-hound-of-the-baskervilles.txt',
        'roman_numeral': False
    },
    {
        'fn':  'the-mystery-of-cloomber.txt',
        'roman_numeral': True
    },
    {
        'fn': 'the-sign-of-four.txt',
        'roman_numeral': True
    },
]
basedir = '../books/'
stop_words = get_stop_words('stopwords.txt')
lem = get_lem('lemmatization-en.txt')
book_i = 2
book_fn = books[book_i]['fn']
book_rom_num = books[book_i]['roman_numeral']

def clean_all_text(book_fn, base_dir, stop_words, lem, book_rom_num):

    # get the raw text and make it all lower case
    raw_text = get_text(book_fn, base_dir)
    raw_text = raw_text.lower()
    raw_text = get_text_no_gutenberg(raw_text)
    raw_text = remove_stopwords(raw_text, stop_words)
    raw_text = remove_extra_spaces(raw_text)
    raw_text = replace_common_dots(raw_text)
    raw_text = compress_abbr(raw_text)

    raw_chapters = split_chapters(raw_text, book_rom_num)
    raw_chapters = chapter_longer_than(raw_chapters)[1:]
    raw_chapters = list(map(lambda c: remove_stopwords(c, stop_words), raw_chapters))
    raw_chapters = list(map(remove_extra_spaces, raw_chapters))
    raw_chapters  = remove_punctuation(raw_chapters)
    raw_chapters = list(map(remove_extra_spaces, raw_chapters))

    # lemmatization by chapter so we can parallelize it
    with Pool(NUM_PROCESSES) as pool:
        raw_chapters = pool.starmap(lemmatization, zip(raw_chapters, repeat(lem)))

    def sentence_helper(text):
        s = split_sentences(text)
        s = trim(s)
        s = list(filter(lambda x: x != '', s))
        return s
    raw_chapter_sentences = list(map(sentence_helper, raw_chapters))

    raw_text = '. '.join(chain(*raw_chapter_sentences))
    raw_text = remove_extra_spaces(raw_text)

    # here we are seperating each sentence by new line and appending the chapter and sentence number to the beginning of each sentence
    marked_text = ''
    for i in range(len(raw_chapter_sentences)):
        for j in range(len(raw_chapter_sentences[i])):
            marked_text += f'((({i};;;{j}))) {raw_chapter_sentences[i][j]}\n'
    return raw_text, raw_chapters, raw_chapter_sentences, marked_text



In [11]:
# save the cleaned text
def save_cleaned_text(book_fn, base_dir, raw_text, raw_chapters, raw_chapter_sentences, marked_text):
    with open(os.path.join(base_dir, book_fn + '_cleaned.txt'), 'w', encoding='utf-8') as f:
        f.write(raw_text)
    with open(os.path.join(base_dir, book_fn + '_cleaned_chapters.pkl'), 'wb') as f:
        pickle.dump(raw_chapters, f)
    with open(os.path.join(base_dir, book_fn + '_cleaned_chapter_sentences.pkl'), 'wb') as f:
        pickle.dump(raw_chapter_sentences, f)
    with open(os.path.join(base_dir, book_fn + '_cleaned_marked.txt'), 'w', encoding='utf-8') as f:
        f.write(marked_text)

# read in the cleaned text
def read_cleaned_text(book_fn, base_dir):
    with open(os.path.join(base_dir, book_fn + '_cleaned.txt'), 'r', encoding='utf-8') as f:
        raw_text = f.read()
    with open(os.path.join(base_dir, book_fn + '_cleaned_chapters.pkl'), 'rb') as f:
        raw_chapters = pickle.load(f)
    with open(os.path.join(base_dir, book_fn + '_cleaned_chapter_sentences.pkl'), 'rb') as f:
        raw_chapter_sentences = pickle.load(f)
    with open(os.path.join(base_dir, book_fn + '_cleaned_marked.txt'), 'r', encoding='utf-8') as f:
        marked_text = f.read()

    return raw_text, raw_chapters, raw_chapter_sentences, marked_text

In [10]:
# go through the process of cleaning the text then write out the cleaned text to files
basedir = '../books/'
stop_words = get_stop_words('stopwords.txt')
lem = get_lem('lemmatization-en.txt')

for book_i in range(len(books)):
    book_fn = books[book_i]['fn']
    book_rom_num = books[book_i]['roman_numeral']
    print(f'Cleaning {book_fn}')
    raw_text, raw_chapters, raw_chapter_sentences, marked_text = clean_all_text(book_fn, basedir, stop_words, lem, book_rom_num)
    print(f'Saving {book_fn}')
    save_cleaned_text(book_fn, 'clean-data', raw_text, raw_chapters, raw_chapter_sentences, marked_text)


Cleaning a-study-in-scarlet.txt
Saving a-study-in-scarlet.txt
Cleaning arsene-lupin-vs-sherlock-holmes.txt
Saving arsene-lupin-vs-sherlock-holmes.txt
Cleaning the-hound-of-the-baskervilles.txt
Saving the-hound-of-the-baskervilles.txt
Cleaning the-mystery-of-cloomber.txt
Saving the-mystery-of-cloomber.txt
Cleaning the-sign-of-four.txt
Saving the-sign-of-four.txt


In [13]:
# example of reading in cleaned text
book_i = 2
book_fn = books[book_i]['fn']
raw_text, raw_chapters, raw_chapter_sentences, marked_text = read_cleaned_text(book_fn, 'clean-data')

In [14]:
raw_text[:1000]

'mr sherlock holmes mr sherlock holmes usually late morning save infrequent occasion night seat breakfast table. stand hearth rug pick stick visitor leave night. fine thick piece wood bulbous head sort know penang lawyer. head broad silver band nearly inch. james mortimer mrcs. friend cch. engrave date 1884. stick old fashion family practitioner carry dignify solid reassure. watson holmes sit give sign occupation. know believe eye head. polish silver plate coffee pot say. tell watson visitor stick unfortunate miss notion errand accidental souvenir importance. let hear reconstruct man examination. think say follow far method companion dr mortimer successful elderly medical man esteem know mark appreciation. good say holmes. excellent think probability favour country practitioner great deal visit foot. stick originally handsome knock hardly imagine town practitioner carry. thick iron ferrule wear evident great walk. perfectly sound say holmes. friend cch. guess hunt local hunt member pos

In [15]:
raw_chapters[0][:1000]

'. mr sherlock holmes mr sherlock holmes usually late morning save infrequent occasion night seat breakfast table. stand hearth rug pick stick visitor leave night . fine thick piece wood bulbous head sort know penang lawyer. head broad silver band nearly inch . james mortimer mrcs. friend cch. engrave date 1884. stick old fashion family practitioner carry dignify solid reassure. watson holmes sit give sign occupation. know believe eye head. polish silver plate coffee pot say . tell watson visitor stick unfortunate miss notion errand accidental souvenir importance. let hear reconstruct man examination . think say follow far method companion dr mortimer successful elderly medical man esteem know mark appreciation. good say holmes. excellent think probability favour country practitioner great deal visit foot. stick originally handsome knock hardly imagine town practitioner carry . thick iron ferrule wear evident great walk . perfectly sound say holmes. friend cch. guess hunt local hunt me

In [16]:
raw_chapter_sentences[0][:10]

['mr sherlock holmes mr sherlock holmes usually late morning save infrequent occasion night seat breakfast table',
 'stand hearth rug pick stick visitor leave night',
 'fine thick piece wood bulbous head sort know penang lawyer',
 'head broad silver band nearly inch',
 'james mortimer mrcs',
 'friend cch',
 'engrave date 1884',
 'stick old fashion family practitioner carry dignify solid reassure',
 'watson holmes sit give sign occupation',
 'know believe eye head']

In [17]:
marked_text[:1000]

'(((0;;;0))) mr sherlock holmes mr sherlock holmes usually late morning save infrequent occasion night seat breakfast table\n(((0;;;1))) stand hearth rug pick stick visitor leave night\n(((0;;;2))) fine thick piece wood bulbous head sort know penang lawyer\n(((0;;;3))) head broad silver band nearly inch\n(((0;;;4))) james mortimer mrcs\n(((0;;;5))) friend cch\n(((0;;;6))) engrave date 1884\n(((0;;;7))) stick old fashion family practitioner carry dignify solid reassure\n(((0;;;8))) watson holmes sit give sign occupation\n(((0;;;9))) know believe eye head\n(((0;;;10))) polish silver plate coffee pot say\n(((0;;;11))) tell watson visitor stick unfortunate miss notion errand accidental souvenir importance\n(((0;;;12))) let hear reconstruct man examination\n(((0;;;13))) think say follow far method companion dr mortimer successful elderly medical man esteem know mark appreciation\n(((0;;;14))) good say holmes\n(((0;;;15))) excellent think probability favour country practitioner great deal vi

In [165]:
def get_chapter_sentence(marked_text, word):
    pattern = rf'^\(\(\((\d+;;;\d+)\)\)\).*?\b({word})\b.*?$'
    matches = re.finditer(pattern, marked_text, re.MULTILINE | re.IGNORECASE)
    return list(matches)

matches = get_chapter_sentence(marked_text, 'sherlock')

In [166]:
matches

[<re.Match object; span=(0, 122), match='(((0;;;0))) mr sherlock holmes mr sherlock holmes>,
 <re.Match object; span=(3652, 3746), match='(((0;;;59))) laugh incredulously sherlock holmes >,
 <re.Match object; span=(5358, 5506), match='(((0;;;86))) dr james mortimer man science ask sh>,
 <re.Match object; span=(6549, 6614), match='(((0;;;107))) presume mr sherlock holmes address >,
 <re.Match object; span=(6972, 7028), match='(((0;;;114))) sherlock holmes wave strange visito>,
 <re.Match object; span=(13262, 13366), match='(((1;;;67))) dr mortimer finish read singular nar>,
 <re.Match object; span=(17163, 17257), match='(((1;;;112))) thank say sherlock holmes call atte>,
 <re.Match object; span=(21976, 22039), match='(((2;;;33))) sherlock holmes strike hand knee imp>,
 <re.Match object; span=(30584, 30683), match='(((3;;;5))) yes say strange thing mr sherlock hol>,
 <re.Match object; span=(31966, 32006), match='(((3;;;29))) promise say sherlock holmes'>,
 <re.Match object; span=(36858, 

In [None]:
def porters_alg(text):
    # https://vijinimallawaarachchi.com/2017/05/09/porter-stemming-algorithm/#:~:text=The%20Porter%20Stemming%20algorithm%20(or,of%20Information%20Retrieval%20(IR).
    # https://tartarus.org/martin/PorterStemmer/
    m = re.match(r'(\w+?)(?=ly|es|(?<!s)s|y)', text)
    print(m.groups())
porters_alg('caresses days cates')

In [None]:
raw_chapters[0]

In [None]:
raw_chapters[1]

In [None]:
list(filter(lambda c: len(c) > 50, raw_chapters))

In [None]:
from utils.regex_utils import *

In [None]:
textloc = "books/a-study-in-scarlet.txt"
stopwordloc = 'project1\stopwords.txt'
text = GetTextFromFile(textloc)
stopwords = GetStopWords(stopwordloc)
clean_text = GetCleanText(text, stopwords)
wordlist = GetUniqueWordList(clean_text)
chapters = GetChapterTextList(text, stopwords)
chapters_wordlist = GetChapterWordList(chapters)

In [None]:
textloc = "books/a-study-in-scarlet.txt"
stopwordloc = 'project1\stopwords.txt'
text = GetTextFromFile(textloc)
stopwords = GetStopWords(stopwordloc)
clean_text = GetCleanText(text, stopwords)
wordlist = GetUniqueWordList(clean_text)
chapters = GetChapterTextList(text, stopwords)
chapters_wordlist = GetChapterWordList(chapters)

In [None]:
import spacy
nlp = spacy.load("en_core_web_sm")

In [None]:
with open('stopwords.txt', 'w',encoding='utf-8') as f:
    for w in nlp.Defaults.stop_words:
        f.write(w + '\n')


In [None]:
len(nlp.Defaults.stop_words)