In [39]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [66]:
import re
from lib import get_stop_words, get_text, get_lem_words
from multiprocessing import Pool
from itertools import repeat, chain

In [67]:
# Number of worker processes handed to multiprocessing.Pool in clean_all_text.
NUM_PROCESSES = 20

In [68]:
def get_text_no_gutenberg(raw_text):
    """Return the body of a Project Gutenberg text, without header and footer.

    The text is split on lines that consist entirely of a ``*** ... ***``
    marker. The pattern has one capture group, so with two marker lines the
    split result is ``[before, marker_1, between, marker_2, after]`` and
    index 2 is the text between the two markers. With a single marker line,
    index 2 is everything after it.

    Raises:
        IndexError: if ``raw_text`` contains no ``*** ... ***`` marker line.
    """
    # Raw string: in a normal string literal ``\*`` is an invalid escape
    # sequence, which newer Python versions warn about.
    return re.split(r'^\*\*\*(.*)\*\*\*$', raw_text, flags=re.MULTILINE)[2]
    
def remove_single_letter(text):
    """Replace every stand-alone single word character in ``text`` with a space."""
    lone_char = re.compile(r'\b\w\b')
    return lone_char.sub(' ', text)

def split_chapters(text, roman_numeral=True):
    """Split ``text`` at every chapter heading and return the pieces as a list.

    A heading is the word "chapter" (any letter case), one space, and then a
    run of Roman-numeral letters when ``roman_numeral`` is true, or a run of
    digits otherwise. The headings themselves are not part of the result.
    """
    heading = r'CHAPTER [IVXLCDM]+' if roman_numeral else r'CHAPTER \d+'
    return re.split(heading, text, flags=re.IGNORECASE)

def chapter_longer_than(raw_chapters, n=150):
    """Return a list of the chapters whose length is strictly greater than ``n``.

    Splitting on chapter headings also splits on the headings listed in a
    table of contents, which yields very short pieces; this filter discards
    them. Only the length filter is applied here: nothing is dropped by
    position, so removing a leading contents/preface piece is up to the caller.
    """
    return [chapter for chapter in raw_chapters if len(chapter) > n]

def remove_stopwords(text, stopwords):
    """Delete every whole-word occurrence of the given stop words from ``text``.

    Args:
        text: string to clean.
        stopwords: iterable of stop-word strings; each is matched literally
            and case-sensitively.

    Returns:
        ``text`` with each match replaced by the empty string. Whitespace
        around a removed word is left in place.
    """
    # Escape each word so that regex metacharacters inside a stop word
    # (e.g. ".") are matched literally instead of acting as a pattern.
    escaped = [re.escape(word) for word in stopwords if word]
    if not escaped:
        # Nothing to remove.
        return text
    return re.sub(r'\b(' + '|'.join(escaped) + r')\b', '', text)

def remove_extra_spaces(text):
    """Collapse each run of whitespace in ``text`` into a single space."""
    whitespace_run = re.compile(r'\s+')
    return whitespace_run.sub(' ', text)

# def get_lem(fn):
#     text = []
#     lem = []
#     word = []
#     with open(fn, 'r', encoding='utf-8-sig') as f:
#         for i in f:
#             text.append([j for j in i.split()])
#     for i in range (len(text)):
#         lem.append(text[i][0])
#         word.append(text[i][1])
#     return text, lem, word
def get_lem(fn):
    """Load a lemmatization table and build one regex per lemma.

    Every line is lower-cased and split on whitespace. The first token is the
    lemma (the replacement text) and the second token is a word form that
    should be mapped to it. Lines with fewer than two tokens are skipped.

    Args:
        fn: path of the lemmatization file (opened with ``utf-8-sig``).

    Returns:
        A list of ``[lemma, pattern]`` pairs in first-seen order, where
        ``pattern`` is a regex string matching any of the lemma's forms as a
        whole word.
    """
    forms_by_lemma = {}
    with open(fn, 'r', encoding='utf-8-sig') as f:
        for line in f:
            tokens = line.lower().split()
            if len(tokens) < 2:
                # Blank or malformed line; indexing it would raise IndexError.
                continue
            forms_by_lemma.setdefault(tokens[0], []).append(tokens[1])

    lem_regex = []
    for lemma, forms in forms_by_lemma.items():
        # re.escape keeps characters such as "." in a form from acting as
        # regex syntax.
        alternatives = "|".join(re.escape(form) for form in forms)
        lem_regex.append([lemma, fr'\b({alternatives})\b'])
    return lem_regex
def remove_punctuation(text):
    """Strip punctuation from every string of the list ``text``, in place.

    Each character that is neither a word character nor whitespace becomes a
    space, except "." which is kept. The same list object is returned.
    """
    def keep_period(match):
        return "." if match.group(0) == "." else " "

    for idx, chunk in enumerate(text):
        text[idx] = re.sub(r'[^\w\s]', keep_period, chunk)
    return text

def split_sentences(text):
    """Break ``text`` into pieces at every period; the periods are dropped."""
    period = re.compile(r"[.]")
    return period.split(text)

def lemmatization(text, lem):
    """Apply every ``(lemma, pattern)`` pair of ``lem`` to ``text`` in order.

    For each pair, all matches of the regex ``pattern`` are replaced by
    ``lemma``; the output of one substitution is the input of the next.
    """
    result = text
    for replacement, pattern in lem:
        result = re.compile(pattern).sub(replacement, result)
    return result

def trim(text):
    """Strip leading and trailing whitespace from each string of ``text``, in place.

    Returns the same list object that was passed in.
    """
    for idx, item in enumerate(text):
        text[idx] = item.strip()
    return text

In [69]:
# Books to process: file name plus whether chapter headings use Roman numerals
# (passed to split_chapters via clean_all_text).
books = [
    {'fn': 'a-study-in-scarlet.txt', 'roman_numeral': True},
    {'fn': 'the-great-boer-war.txt', 'roman_numeral': False},
    {'fn': 'the-hound-of-the-baskervilles.txt', 'roman_numeral': False},
    {'fn': 'the-lost-world.txt', 'roman_numeral': True},
    {'fn': 'the-sign-of-four.txt', 'roman_numeral': True},
]
basedir = '../books/'
stop_words = get_stop_words('stopwords.txt')
lem = get_lem('lemmatization-en.txt')

# Index into `books` of the book this run cleans.
book_i = 1
book_fn = books[book_i]['fn']
book_rom_num = books[book_i]['roman_numeral']

def clean_all_text(book_fn, base_dir, stop_words, lem, book_rom_num):
    """Run the full cleaning pipeline on one book.

    Steps, in order: load the text, lower-case it, cut away the Project
    Gutenberg wrapper, remove stop words, collapse whitespace, split into
    chapters, drop short pieces and then the first remaining piece, remove
    punctuation (periods are kept), lemmatize each chapter in a worker pool,
    and finally split every chapter into trimmed, non-empty sentences.

    Args:
        book_fn: file name forwarded to ``get_text``.
        base_dir: directory forwarded to ``get_text``.
        stop_words: stop words forwarded to ``remove_stopwords``.
        lem: ``[lemma, pattern]`` pairs forwarded to ``lemmatization``.
        book_rom_num: forwarded to ``split_chapters`` as ``roman_numeral``.

    Returns:
        A tuple ``(text, chapters, chapter_sentences)``: all sentences joined
        with ". " into one string, the list of lemmatized chapter strings, and
        a list holding one list of sentence strings per chapter.
    """
    # Whole-book cleanup before the text is cut into chapters.
    body = get_text_no_gutenberg(get_text(book_fn, base_dir).lower())
    body = remove_extra_spaces(remove_stopwords(body, stop_words))

    # Chapter split; [1:] discards the first piece that survives the length filter.
    chapters = chapter_longer_than(split_chapters(body, book_rom_num))[1:]
    chapters = [remove_extra_spaces(chapter) for chapter in chapters]
    chapters = remove_punctuation(chapters)
    chapters = [remove_extra_spaces(chapter) for chapter in chapters]

    # Lemmatize one chapter per task so the work can be spread over processes.
    with Pool(NUM_PROCESSES) as pool:
        chapters = pool.starmap(lemmatization, [(chapter, lem) for chapter in chapters])

    chapter_sentences = [
        [sentence for sentence in trim(split_sentences(chapter)) if sentence != '']
        for chapter in chapters
    ]

    joined = '. '.join(
        sentence for sentences in chapter_sentences for sentence in sentences
    )
    return remove_extra_spaces(joined), chapters, chapter_sentences

raw_text, raw_chapters, raw_chapter_sentences = clean_all_text(
    book_fn=book_fn,
    base_dir=basedir,
    stop_words=stop_words,
    lem=lem,
    book_rom_num=book_rom_num,
)

In [70]:
raw_chapter_sentences

[['boer nation',
  'community dutchman type defend year power spain time spain great power world',
  'intermix strain inflexible french huguenots give home fortune leave country time revocation edict nantes',
  'product obviously rugged virile unconquerable race see earth',
  'formidable people train seven generation constant warfare savage man ferocious beast circumstance weakling survive place acquire exceptional skill weapon horsemanship country eminently suit tactic huntsman marksman rider',
  'finally fine temper military quality dour fatalistic old testament religion ardent consume patriotism',
  'combine quality impulse individual modern boer formidable antagonist cross path imperial britain',
  'military history largely consist conflict france napoleon veteran treat roughly hard bite farmer ancient theology inconveniently modern rifle',
  'look map south africa centre british possession like stone peach lie great stretch republic mighty domain small people',
  'come teutonic fo

In [71]:
raw_text[:1200]

'boer nation. community dutchman type defend year power spain time spain great power world. intermix strain inflexible french huguenots give home fortune leave country time revocation edict nantes. product obviously rugged virile unconquerable race see earth. formidable people train seven generation constant warfare savage man ferocious beast circumstance weakling survive place acquire exceptional skill weapon horsemanship country eminently suit tactic huntsman marksman rider. finally fine temper military quality dour fatalistic old testament religion ardent consume patriotism. combine quality impulse individual modern boer formidable antagonist cross path imperial britain. military history largely consist conflict france napoleon veteran treat roughly hard bite farmer ancient theology inconveniently modern rifle. look map south africa centre british possession like stone peach lie great stretch republic mighty domain small people. come teutonic folk burrow deeply africa twice tell tal

In [31]:

l = lemmatization(raw_chapters[0], lem)

In [32]:
print(l)

. boer nation. community dutchman type defend year power spain time spain great power world. intermix strain inflexible french huguenots give home fortune leave country time revocation edict nantes. product obviously rugged virile unconquerable race see earth. formidable people train seven generation constant warfare savage man ferocious beast circumstance weakling survive place acquire exceptional skill weapon horsemanship country eminently suit tactic huntsman marksman rider. finally fine temper military quality dour fatalistic old testament religion ardent consume patriotism. combine quality impulse individual modern boer formidable antagonist cross path imperial britain. military history largely consist conflict france napoleon veteran treat roughly hard bite farmer ancient theology inconveniently modern rifle. look map south africa centre british possession like stone peach lie great stretch republic mighty domain small people. come teutonic folk burrow deeply africa twice tell ta

In [25]:
lem

[['1', '(first)'],
 ['10', '(tenth)'],
 ['100', '(hundredth)'],
 ['1000', '(thousandth)'],
 ['1000000', '(millionth)'],
 ['1000000000', '(billionth)'],
 ['11', '(eleventh)'],
 ['12', '(twelfth)'],
 ['13', '(thirteenth)'],
 ['14', '(fourteenth)'],
 ['15', '(fifteenth)'],
 ['16', '(sixteenth)'],
 ['17', '(seventeenth)'],
 ['18', '(eighteenth)'],
 ['19', '(nineteenth)'],
 ['2', '(second)'],
 ['20', '(twentieth)'],
 ['200', '(two-hundredth)'],
 ['21', '(twenty-first)'],
 ['22', '(twenty-second)'],
 ['23', '(twenty-third)'],
 ['24', '(twenty-fourth)'],
 ['25', '(twenty-fifth)'],
 ['26', '(twenty-sixth)'],
 ['27', '(twenty-seventh)'],
 ['28', '(twenty-eighth)'],
 ['29', '(twenty-ninth)'],
 ['3', '(third)'],
 ['30', '(thirtieth)'],
 ['300', '(three-hundredth)'],
 ['31', '(thirty-first)'],
 ['32', '(thirty-second)'],
 ['33', '(thirty-third)'],
 ['34', '(thirty-fourth)'],
 ['35', '(thirty-fifth)'],
 ['36', '(thirty-sixth)'],
 ['37', '(thirty-seventh)'],
 ['38', '(thirty-eighth)'],
 ['39', '(thi

In [7]:
# Preview lemmatization on the first 100 characters of the first chapter.
# The previous expression used `word`, which is not defined anywhere in the
# visible notebook, and joined `lem`, which get_lem returns as a list of
# [lemma, regex] pairs rather than strings.
print(lemmatization(raw_chapters[0][:100], lem))



In [9]:
# lemmatization() takes (text, lem); the stale third argument `word` made this
# call fail.
raw_chapters[0] = lemmatization(raw_chapters[0][:5], lem)


In [11]:
print(raw_chapters[0])



In [None]:
def porters_alg(text):
    """Print the stem captured from the start of ``text`` by a suffix regex.

    Experimental first step towards Porter stemming. Only a match anchored at
    the start of ``text`` is considered (``re.match``), so at most the first
    word is examined. The stem is the shortest run of word characters that is
    immediately followed by "ly", "es", "y", or an "s" not preceded by "s".
    When nothing matches, an empty tuple is printed.
    """
    # https://vijinimallawaarachchi.com/2017/05/09/porter-stemming-algorithm/#:~:text=The%20Porter%20Stemming%20algorithm%20(or,of%20Information%20Retrieval%20(IR).
    # https://tartarus.org/martin/PorterStemmer/
    m = re.match(r'(\w+?)(?=ly|es|(?<!s)s|y)', text)
    # re.match returns None when no suffix is found; guard before .groups().
    print(m.groups() if m is not None else ())
porters_alg('caresses days cates')

In [None]:
raw_chapters[0]

In [None]:
raw_chapters[1]

In [None]:
[chapter for chapter in raw_chapters if len(chapter) > 50]

In [None]:
from utils.regex_utils import *

In [None]:
textloc = "books/a-study-in-scarlet.txt"
# Forward slash: '\s' is an invalid escape sequence in a normal string
# literal, and '/' works as a path separator on Windows as well as POSIX.
stopwordloc = 'project1/stopwords.txt'
text = GetTextFromFile(textloc)
stopwords = GetStopWords(stopwordloc)
clean_text = GetCleanText(text, stopwords)
wordlist = GetUniqueWordList(clean_text)
chapters = GetChapterTextList(text, stopwords)
chapters_wordlist = GetChapterWordList(chapters)

In [None]:
# NOTE(review): this cell repeats the previous cell's statements.
textloc = "books/a-study-in-scarlet.txt"
# Forward slash: '\s' is an invalid escape sequence in a normal string
# literal, and '/' works as a path separator on Windows as well as POSIX.
stopwordloc = 'project1/stopwords.txt'
text = GetTextFromFile(textloc)
stopwords = GetStopWords(stopwordloc)
clean_text = GetCleanText(text, stopwords)
wordlist = GetUniqueWordList(clean_text)
chapters = GetChapterTextList(text, stopwords)
chapters_wordlist = GetChapterWordList(chapters)

In [None]:
import spacy
nlp = spacy.load("en_core_web_sm")

In [None]:
# Write spaCy's default stop words to stopwords.txt, one per line.
# spaCy exposes them as an unordered set, so they are sorted here to make the
# file identical from run to run.
with open('stopwords.txt', 'w', encoding='utf-8') as f:
    for w in sorted(nlp.Defaults.stop_words):
        f.write(w + '\n')


In [None]:
len(nlp.Defaults.stop_words)