In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import re
from lib import get_stop_words, get_text, get_lem_words

In [None]:
# project gutenberg text has a lot of extra stuff at the beginning and end
def get_text_no_gutenberg(raw_text):
    """Return the text found between the first two ``*** ... ***`` marker lines.

    The text is split on every line that starts and ends with three asterisks.
    Because the pattern has one capturing group, the split yields
    ``[before, marker_1, body, marker_2, after]``; index 2 is therefore the
    body enclosed by the first and second marker lines.

    Args:
        raw_text: Full contents of the book file as a single string.

    Returns:
        The text between the first and the second marker line.

    Raises:
        IndexError: If ``raw_text`` contains no marker line at all.
    """
    # Raw string: '\*' inside a normal string literal is an invalid escape sequence.
    return re.split(r'^\*\*\*(.*)\*\*\*$', raw_text, flags=re.MULTILINE)[2]
    
def remove_single_letter(text):
    """Replace every stand-alone single word character in ``text`` with a space.

    A "word character" is whatever ``\\w`` matches, so isolated digits and
    underscores are blanked out as well as isolated letters.
    """
    lone_char = re.compile(r'\b\w\b')
    return lone_char.sub(' ', text)

# split the raw text into chapters
def split_chapters(text, roman_numeral=True):
    """Split ``text`` at every chapter heading and return the pieces.

    A heading is the word "chapter" (any letter case), one space, and a
    chapter number.  The heading itself is not part of the returned pieces.

    Args:
        text: The book text to split.
        roman_numeral: If True the chapter number is a roman numeral
            (letters from ``IVXLCDM``); otherwise it is a run of digits.

    Returns:
        A list of strings: the text before the first heading followed by the
        text after each heading.
    """
    numeral = r'[IVXLCDM]+' if roman_numeral else r'\d+'
    # The word boundaries matter because matching is case-insensitive: without
    # them ordinary prose such as "chapter is" or "chapter did" would be taken
    # for the headings "chapter i" / "chapter di".
    return re.split(r'\bCHAPTER ' + numeral + r'\b', text, flags=re.IGNORECASE)

# sometimes the table of contents also lists the chapter headings, so splitting on them yields tiny bogus "chapters"; here we drop every chunk that is too short.
# note: this filter does not drop the leading table-of-contents/preface chunk itself; the caller removes it by slicing the result with [1:].
def chapter_longer_than(raw_chapters, n=150):
    """Return a list of the chunks in ``raw_chapters`` whose length exceeds ``n``.

    The comparison is strict, so a chunk of exactly ``n`` characters is dropped.
    """
    return [chunk for chunk in raw_chapters if len(chunk) > n]

# remove stopwords
def remove_stopwords(text, stopwords):
    """Delete every whole-word occurrence of a stop word from ``text``.

    Matching is case-sensitive and anchored on word boundaries.  Only the
    stop words themselves are removed; the surrounding whitespace stays.

    Args:
        text: The string to clean.
        stopwords: Iterable of stop-word strings.  Each entry is matched
            literally; empty entries are ignored.

    Returns:
        ``text`` with the stop words removed (unchanged if there are no
        non-empty stop words).
    """
    # Escape each entry so characters such as '.' or '+' are matched literally,
    # and skip empty entries: an empty alternative would match at every word
    # boundary before any real stop word gets a chance, so nothing is removed.
    escaped = [re.escape(w) for w in stopwords if w]
    if not escaped:
        return text
    return re.sub(r'\b(?:' + '|'.join(escaped) + r')\b', '', text)

# there's a ton of whitespace that we don't want
def remove_extra_spaces(text):
    """Collapse each run of whitespace in ``text`` into one space character.

    Newlines and tabs count as whitespace.  Leading and trailing runs are
    reduced to a single space rather than stripped.
    """
    whitespace_run = re.compile(r'\s+')
    return whitespace_run.sub(' ', text)

def get_lem(fn):
    """Read a whitespace-separated two-column lemma table from ``fn``.

    The file is read as UTF-8 (a leading byte-order mark is tolerated).  On
    each line the first token goes into ``lem`` and the second into ``word``.
    Lines with fewer than two tokens (for example blank lines) are skipped
    instead of raising ``IndexError``.

    Args:
        fn: Path of the table file.

    Returns:
        A tuple ``(text, lem, word)`` where ``text`` is the list of token
        lists for every kept line, ``lem`` holds the first token of each of
        those lines and ``word`` the second.
    """
    text = []
    with open(fn, 'r', encoding='utf-8-sig') as f:
        for line in f:
            tokens = line.split()
            if len(tokens) >= 2:
                text.append(tokens)
    lem = [row[0] for row in text]
    word = [row[1] for row in text]
    return text, lem, word

def remove_punctuation(text):
    """Blank out punctuation in every string of the list ``text``, in place.

    Every character that is neither a word character nor whitespace becomes a
    space, except the full stop, which is kept.  The list is modified in
    place and also returned.
    """
    def _substitute(match):
        # Keep full stops; everything else that matched turns into a space.
        if match.group(0) == ".":
            return "."
        return " "

    for idx, chunk in enumerate(text):
        text[idx] = re.sub(r'[^\w\s]', _substitute, chunk)
    return text

def split_sentence(text):
    """Split every string of the list ``text`` at full stops, in place.

    Each element is replaced by the list of pieces between the full stops
    (the full stops themselves are dropped).  The list is modified in place
    and also returned.
    """
    for idx, chunk in enumerate(text):
        text[idx] = re.split(r"[.]", chunk)
    return text

def lemmatization(text, lem, word):
    """Replace inflected words by their lemma in every string of ``text``.

    ``word[k]`` is an inflected form and ``lem[k]`` its lemma.  Each run of
    word characters in a string is looked up (exact, case-sensitive match)
    and swapped for its lemma when it is a known form; everything else is
    left untouched.  If a form is listed more than once, its first lemma is
    used.

    Args:
        text: List of strings; modified in place.
        lem: Sequence of lemmas, parallel to ``word``.
        word: Sequence of inflected forms, parallel to ``lem``.

    Returns:
        The same list object ``text`` with its strings lemmatized.
    """
    # A dict lookup per token replaces the former giant alternation regex,
    # whose replacement argument was the regex source text rather than a lemma.
    lookup = {}
    for form, lemma in zip(word, lem):
        lookup.setdefault(form, lemma)
    if not lookup:
        return text

    def _to_lemma(match):
        token = match.group(0)
        return lookup.get(token, token)

    for idx, sentence in enumerate(text):
        text[idx] = re.sub(r'\w+', _to_lemma, sentence)
    return text
    

In [None]:
# Books that can be processed.  'roman_numeral' is passed to split_chapters
# and tells it whether the chapter headings use roman numerals or digits.
books = [
    {
        'fn': 'a-study-in-scarlet.txt',
        'roman_numeral': True
    },
    {
        'fn': 'the-great-boer-war.txt',
        'roman_numeral': False
    },
    {
        'fn': 'the-hound-of-the-baskervilles.txt',
        'roman_numeral': False
    },
    {
        'fn': 'the-lost-world.txt',
        'roman_numeral': True
    },
    {
        'fn': 'the-sign-of-four.txt',
        'roman_numeral': True
    },
]
basedir = '../books/'
stop_words = get_stop_words('stopwords.txt')
text, lem, word = get_lem('lemmatization-en.txt')
book_i = 1  # index into `books` of the book to process
book_fn = books[book_i]['fn']
book_rom_num = books[book_i]['roman_numeral']

# get the raw text, make it all lower case and cut away the gutenberg boilerplate
raw_text = get_text(book_fn, basedir)
raw_text = raw_text.lower()
raw_text = get_text_no_gutenberg(raw_text)
# split_chapters expects exactly one space between "chapter" and its number,
# so normalise the whitespace before splitting.
raw_text = remove_extra_spaces(raw_text)

# Split into chapters BEFORE removing stop words: a stop-word list that
# contains "i" would otherwise turn the heading "chapter i" into "chapter ",
# which split_chapters can no longer recognise.
raw_chapters = split_chapters(raw_text, book_rom_num)
# Clean each chunk once (stop words, then the whitespace they leave behind).
raw_chapters = [remove_extra_spaces(remove_stopwords(c, stop_words)) for c in raw_chapters]
# Drop the too-short chunks, then the first remaining one (contents/preface).
raw_chapters = chapter_longer_than(raw_chapters)[1:]
raw_chapters = remove_punctuation(raw_chapters)
raw_chapters = split_sentence(raw_chapters)

In [None]:
# Run the lemmatization step over the sentences of the first kept chapter
# (stored back into raw_chapters[0]) and print the result.
raw_chapters[0]  = lemmatization (raw_chapters[0] , lem, word)
print (raw_chapters[0])

In [None]:
def porters_alg(text):
    """Print the stem found at the start of ``text`` by a suffix-stripping regex.

    Experimental first step towards a Porter-style stemmer.  The regex is
    matched at the beginning of ``text`` only and captures the shortest run
    of word characters that is directly followed by "ly", "es", an "s" not
    preceded by another "s", or "y".  The captured groups are printed as a
    tuple; an empty tuple is printed when nothing matches (previously this
    raised ``AttributeError``).

    Args:
        text: The string to examine.

    Returns:
        None.  The result is printed.
    """
    # https://vijinimallawaarachchi.com/2017/05/09/porter-stemming-algorithm/#:~:text=The%20Porter%20Stemming%20algorithm%20(or,of%20Information%20Retrieval%20(IR).
    # https://tartarus.org/martin/PorterStemmer/
    m = re.match(r'(\w+?)(?=ly|es|(?<!s)s|y)', text)
    if m is None:
        # re.match returns None when no stem/suffix split exists.
        print(())
        return
    print(m.groups())
# Quick manual check of the stemming regex on a few sample words.
porters_alg('caresses days cates')

In [None]:
# Display the first processed chapter.
raw_chapters[0]

In [None]:
# Display the second processed chapter.
raw_chapters[1]

In [None]:
# Show only the chapters that have more than 50 entries.
[chapter for chapter in raw_chapters if len(chapter) > 50]

In [None]:
from utils.regex_utils import *

In [None]:
# Alternative pipeline built on the helpers star-imported from utils.regex_utils.
textloc = "books/a-study-in-scarlet.txt"
# Forward slash instead of a backslash: '\s' in a normal string literal is an
# invalid escape sequence, and a backslash separator only works on Windows,
# whereas '/' is accepted on every platform (and matches `textloc` above).
stopwordloc = 'project1/stopwords.txt'
text = GetTextFromFile(textloc)
stopwords = GetStopWords(stopwordloc)
clean_text = GetCleanText(text, stopwords)
wordlist = GetUniqueWordList(clean_text)
chapters = GetChapterTextList(text, stopwords)
chapters_wordlist = GetChapterWordList(chapters)

In [None]:
# Load spaCy's "en_core_web_sm" pipeline; its default stop words are used in the cells below.
import spacy
nlp = spacy.load("en_core_web_sm")

In [None]:
# Export the pipeline's default stop words to 'stopwords.txt', one per line.
# NOTE: this overwrites 'stopwords.txt' in the working directory, the same
# file name that the pipeline cell passes to get_stop_words.
# The words are sorted so the file is identical from run to run; the
# iteration order of the stop-word set is not stable.
with open('stopwords.txt', 'w', encoding='utf-8') as f:
    f.writelines(w + '\n' for w in sorted(nlp.Defaults.stop_words))


In [None]:
# Number of stop words in the pipeline's default list.
len(nlp.Defaults.stop_words)