In [5]:
%load_ext autoreload
%autoreload 2

In [6]:
import re
from lib import get_stop_words, get_text

In [80]:
# One entry per book: 'fn' is the file name that is later passed to get_text,
# 'roman_numeral' is the flag that is later passed to split_chapters
# (True -> Roman-numeral chapter headings, False -> digit headings).
books = [
    dict(fn=file_name, roman_numeral=uses_roman)
    for file_name, uses_roman in (
        ('a-study-in-scarlet.txt', True),
        ('the-great-boer-war.txt', False),
        ('the-hound-of-the-baskervilles.txt', False),
        ('the-lost-world.txt', True),
        ('the-sign-of-four.txt', True),
    )
]
basedir = '../books/'
stop_words = get_stop_words('stopwords.txt')

# index into `books` choosing which book this run processes
book_i = 1
book_fn, book_rom_num = (books[book_i][key] for key in ('fn', 'roman_numeral'))

# load the raw text and lower-case it in a single step
raw_text = get_text(book_fn, basedir).lower()

# project gutenberg text has a lot of extra stuff at the beginning and end
def get_text_no_gutenberg(raw_text):
    """Strip the Project Gutenberg header (and footer) from a book text.

    The text is split on every line that starts and ends with ``***``.
    The chunk after the first such marker line is returned, i.e. the text
    between the first and second marker lines, or everything after the first
    marker when there is only one.

    Args:
        raw_text: full text of the book as a single string.

    Returns:
        The chunk following the first marker line. If the text contains no
        marker line at all there is nothing to strip, so ``raw_text`` is
        returned unchanged (previously this case raised ``IndexError``).
    """
    # Raw string: in a normal string literal '\*' is an invalid escape sequence.
    # No capture group is needed, so the wanted chunk sits at index 1.
    parts = re.split(r'^\*\*\*.*\*\*\*$', raw_text, flags=re.MULTILINE)
    if len(parts) < 2:
        return raw_text
    return parts[1]
# keep only the text that follows the first '*** ... ***' marker line
raw_text = get_text_no_gutenberg(raw_text)

# split the raw text into chapters
def split_chapters(text, roman_numeral=True):
    """Split a book text on its chapter headings.

    Args:
        text: the book text to split.
        roman_numeral: when True, headings look like ``CHAPTER IV``;
            otherwise they look like ``CHAPTER 4``. Matching is
            case-insensitive in both modes.

    Returns:
        A list of strings. The first element is whatever precedes the first
        heading (possibly ``''``); each later element is the text that
        follows one heading. The headings themselves are dropped.
    """
    if roman_numeral:
        # The trailing \b makes the numeral a whole word. Without it, and with
        # IGNORECASE, ordinary prose such as "chapter deals" was split on
        # "chapter d" because 'd' is a valid Roman-numeral letter.
        heading = r'CHAPTER [IVXLCDM]+\b'
    else:
        heading = r'CHAPTER \d+'
    return re.split(heading, text, flags=re.IGNORECASE)
# book_rom_num selects the Roman-numeral or the digit heading pattern
raw_chapters = split_chapters(raw_text, book_rom_num)

# sometimes the table of contents also lists the chapter headings, which produces very short bogus chapters. here we just remove chapters that are too short.
# here we also delete the first remaining chapter, which is just the table of contents and preface
def chapter_longer_than(raw_chapters, n=150):
    """Return a new list with only the chapters whose ``len`` is greater than ``n``.

    The original order is preserved and the input is not modified.
    """
    return [chapter for chapter in raw_chapters if len(chapter) > n]
# filter with the default length threshold, then [1:] discards the first chunk that survives
raw_chapters = chapter_longer_than(raw_chapters)[1:]


# remove stopwords from the full text and from each chapter
def remove_stopwords(text, stopwords):
    """Delete every whole-word occurrence of the given stop words from ``text``.

    Args:
        text: the string to clean.
        stopwords: iterable of stop words, matched literally and
            case-sensitively.

    Returns:
        ``text`` with each stop word replaced by the empty string. The
        surrounding whitespace is left in place, so runs of spaces can
        remain. If there are no (non-empty) stop words the text is returned
        unchanged.
    """
    # Escape each word so regex metacharacters in the stop list are matched
    # literally (unescaped, 'a.b' would also delete 'axb'). Empty entries are
    # skipped because they would only add an empty alternative.
    alternatives = [re.escape(word) for word in stopwords if word]
    if not alternatives:
        return text
    return re.sub(r'\b(?:' + '|'.join(alternatives) + r')\b', '', text)
raw_text = remove_stopwords(raw_text, stop_words)
raw_chapters = [remove_stopwords(chapter, stop_words) for chapter in raw_chapters]

# there's a ton of whitespace that we don't want
def remove_extra_spaces(text):
    """Collapse every run of whitespace (spaces, tabs, newlines) into one space.

    Leading and trailing whitespace is collapsed to a single space as well,
    not stripped.
    """
    whitespace_run = re.compile(r'\s+')
    return whitespace_run.sub(' ', text)
raw_text = remove_extra_spaces(raw_text)
raw_chapters = [remove_extra_spaces(chapter) for chapter in raw_chapters]


In [81]:
raw_chapters

[". boer nations. community dutchmen type defended years power spain time spain greatest power world. intermix strain inflexible french huguenots gave home fortune left country time revocation edict nantes. product obviously rugged, virile, unconquerable races seen earth. formidable people train seven generations constant warfare savage men ferocious beasts, circumstances weakling survive, place acquire exceptional skill weapons horsemanship, country eminently suited tactics huntsman, marksman, rider. , finally, finer temper military qualities dour fatalistic old testament religion ardent consuming patriotism. combine qualities impulses individual, modern boer-- formidable antagonist crossed path imperial britain. military history largely consisted conflicts france, napoleon veterans treated roughly hard-bitten farmers ancient theology inconveniently modern rifles. look map south africa, , centre british possessions, like stone peach, lies great stretch republics, mighty domain small p

In [91]:
def porters_alg(text):
    """Experimental first step towards a Porter-style stemmer (work in progress).

    Matches at the very start of ``text`` only: it takes the shortest run of
    word characters that is immediately followed by one of the suffixes
    ``ly``, ``es``, ``s`` (not preceded by another ``s``) or ``y``, and
    prints the captured groups as a tuple. Nothing is returned.

    Raises:
        AttributeError: if the start of ``text`` does not match, because
            the match object is then ``None``.
    """
    # References:
    # https://vijinimallawaarachchi.com/2017/05/09/porter-stemming-algorithm/#:~:text=The%20Porter%20Stemming%20algorithm%20(or,of%20Information%20Retrieval%20(IR).
    # https://tartarus.org/martin/PorterStemmer/
    stem_pattern = re.compile(r'(\w+?)(?=ly|es|(?<!s)s|y)')
    stem_match = stem_pattern.match(text)
    print(stem_match.groups())
# manual check: re.match anchors at the start of the string, so only the first word is examined
porters_alg('caresses days cates')

('car',)


79
['. the boer nations. ', '. the cause of quarrel. ']
[False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False]
41


In [127]:
raw_chapters[0]

'\ufeffthe project gutenberg ebook of the great boer war, by arthur conan doyle this ebook is for the use of anyone anywhere at no cost and with almost no restrictions whatsoever. you may copy it, give it away or re-use it under the terms of the project gutenberg license included with this ebook or online at www.gutenberg.org title: the great boer war author: arthur conan doyle posting date: february 1, 2009 [ebook #3069] release date: feb, 2002 last updated: september 30, 2016 language: english character set encoding: utf-8 *** start of this project gutenberg ebook the great boer war *** produced by robert laing, and sue asscher the great boer war by arthur conan doyle contents. '

In [128]:
raw_chapters[1]

". the end. preface to the final edition. during the course of the war some sixteen editions of this work have appeared, each of which was, i hope, a little more full and accurate than that which preceded it. i may fairly claim, however, that the absolute mistakes made have been few in number, and that i have never had occasion to reverse, and seldom to modify, the judgments which i have formed. in this final edition the early text has been carefully revised and all fresh available knowledge has been added within the limits of a single volume narrative. of the various episodes in the latter half of the war it is impossible to say that the material is available for a complete and final chronicle. by the aid, however, of the official dispatches, of the newspapers, and of many private letters, i have done my best to give an intelligible and accurate account of the matter. the treatment may occasionally seem too brief but some proportion must be observed between the battles of 1899-1900 an

In [129]:
[chapter for chapter in raw_chapters if len(chapter) > 50]

['\ufeffthe project gutenberg ebook of the great boer war, by arthur conan doyle this ebook is for the use of anyone anywhere at no cost and with almost no restrictions whatsoever. you may copy it, give it away or re-use it under the terms of the project gutenberg license included with this ebook or online at www.gutenberg.org title: the great boer war author: arthur conan doyle posting date: february 1, 2009 [ebook #3069] release date: feb, 2002 last updated: september 30, 2016 language: english character set encoding: utf-8 *** start of this project gutenberg ebook the great boer war *** produced by robert laing, and sue asscher the great boer war by arthur conan doyle contents. ',
 ". the end. preface to the final edition. during the course of the war some sixteen editions of this work have appeared, each of which was, i hope, a little more full and accurate than that which preceded it. i may fairly claim, however, that the absolute mistakes made have been few in number, and that i 

In [16]:
from utils.regex_utils import *

In [19]:
# Alternative pipeline built on the helpers star-imported from utils.regex_utils.
# NOTE(review): the helpers' return values are not visible in this notebook; the
# step descriptions below are inferred from their names only — confirm against
# utils/regex_utils.py.
textloc = './data/a-study-in-scarlet.txt'
stopwordloc = './data/stopwords.txt'
# presumably the raw book text and the stop-word collection read from the two paths above
text = GetTextFromFile(textloc)
stopwords = GetStopWords(stopwordloc)
# whole-book products, derived from `text` and `stopwords`
clean_text = GetCleanText(text, stopwords)
wordlist = GetUniqueWordList(clean_text)
# per-chapter products; note these start again from the uncleaned `text`
chapters = GetChapterTextList(text, stopwords)
chapters_wordlist = GetChapterWordList(chapters)

In [2]:
import spacy
# load the spaCy pipeline; the visible cells only use its default stop-word list (nlp.Defaults.stop_words)
nlp = spacy.load("en_core_web_sm")

In [6]:
# Write spaCy's default stop words to stopwords.txt, one word per line.
# nlp.Defaults.stop_words is a set, so its iteration order is not guaranteed to
# be the same from run to run; sorting keeps the generated file stable.
# NOTE(review): an earlier cell reads 'stopwords.txt' via get_stop_words, so
# this cell presumably has to run first on a fresh checkout — confirm.
with open('stopwords.txt', 'w', encoding='utf-8') as f:
    for w in sorted(nlp.Defaults.stop_words):
        f.write(w + '\n')


In [10]:
len(nlp.Defaults.stop_words)

326