In [7]:
import nltk
from nltk.corpus import gutenberg

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
import pandas as pd
import re
import nltk

import nltk.tokenize as token
from nltk.tokenize import WhitespaceTokenizer, WordPunctTokenizer, TreebankWordTokenizer
from nltk.corpus import stopwords
from nltk.corpus import wordnet
import time
import string

from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer 
import numpy as np
from string import punctuation

# English stopwords, stored as a set so the `in` test in remove_stopwords
# is a hash lookup instead of a list scan.
# NOTE(review): this needs the NLTK 'stopwords' corpus to be available locally
# (typically via nltk.download('stopwords')) — confirm for a fresh environment.
stop_words = set(stopwords.words('english'))


## Preprocessing definitions

In [25]:
def convert_to_lower(text):
    """Lowercase every review, then pass the result to the next cleaning stage.

    Case folding runs first because words that differ only in case
    (for example 'The' and 'the') would otherwise be interpreted as
    different words.

    Parameters
    ----------
    text : iterable
        The reviews to clean; every element must provide ``.lower()``.

    Returns
    -------
    Whatever ``remove_punctuation`` returns for the lowercased reviews.
    """
    lowered_reviews = [review.lower() for review in text]
    # This function is the head of a chain: punctuation removal runs next.
    return remove_punctuation(lowered_reviews)

def remove_punctuation(text):
    """Blank out punctuation and digit runs in every review, then drop stopwords.

    Each regex match is replaced by a single space, so the surrounding words
    stay separated.

    Parameters
    ----------
    text : iterable of str
        The reviews to clean.

    Returns
    -------
    Whatever ``remove_stopwords`` returns for the cleaned reviews.
    """
    # Regex reference: https://www3.ntu.edu.sg/home/ehchua/programming/howto/Regexe.html
    #   \w : word character      \W : non-word character
    #   \d : digits              \D : non-digits
    # The alternatives match a character that is neither a word character nor
    # whitespace, or a run of digits (optionally with adjacent whitespace).
    unwanted = re.compile(r'[^\w\s]|^\s\d+\s|\s\d+|\d+|\s\d+$')
    blanked_reviews = [unwanted.sub(' ', review) for review in text]
    # Next stage of the chain: tokenization and stopword filtering.
    return remove_stopwords(blanked_reviews)

def remove_stopwords(text):
    """Tokenize each review and drop the tokens that are stopwords.

    Stopwords are the most common words of a language ('is', 'the', 'that',
    ...). They are removed during text clean-up, but removing them can change
    the meaning of a sentence: a negation such as the one in
    "I didn't love politics" may be dropped.

    Parameters
    ----------
    text : iterable of str
        Reviews to filter; each one is tokenized with ``token.word_tokenize``.

    Returns
    -------
    list of list
        One token list per input review, in input order, holding only the
        tokens that are not in the module-level ``stop_words`` set.

    Notes
    -----
    Tokens are compared exactly as tokenized, so the membership test is
    case-sensitive. Lowercase the input first (as ``convert_to_lower`` does)
    if capitalized stopwords should be removed as well.
    """
    return [
        [word for word in token.word_tokenize(review) if word not in stop_words]
        for review in text
    ]

def remove_URLs(text):
    """Delete web addresses from ``text``, then continue with digit removal.

    Anything that starts with ``http://``, ``https://`` or ``www.`` is removed
    up to (not including) the next whitespace character.

    Parameters
    ----------
    text : str
        The string to clean.

    Returns
    -------
    The result of ``remove_digits`` applied to the URL-free string.
    """
    url_pattern = re.compile(r"https?://\S+|www\.\S+")
    without_urls = url_pattern.sub("", text)
    # Next stage of the chain: strip digits (which in turn collapses spaces).
    return remove_digits(without_urls)

def remove_digits(text):
    """Drop every ASCII digit (0-9) from ``text``, then collapse spaces.

    Parameters
    ----------
    text : str
        The string to clean.

    Returns
    -------
    The result of ``remove_spaces`` applied to the digit-free string.
    """
    digit_pattern = re.compile(r'[0-9]')
    digit_free = digit_pattern.sub('', text)
    # Deleting digits can leave doubled spaces behind; the next stage fixes that.
    return remove_spaces(digit_free)

def remove_spaces(text):
    """Collapse each run of consecutive space characters into a single space.

    Only the space character is affected: tabs and newlines are left alone,
    and a leading or trailing run is reduced to one space rather than removed.

    Parameters
    ----------
    text : str
        The string to normalize.

    Returns
    -------
    str
        ``text`` with every run of spaces replaced by one space.
    """
    space_run = re.compile(' +')
    return space_run.sub(' ', text)


In [9]:
# List the file ids of the texts available in NLTK's Gutenberg corpus.
files = gutenberg.fileids()
print(files)

['austen-emma.txt', 'austen-persuasion.txt', 'austen-sense.txt', 'bible-kjv.txt', 'blake-poems.txt', 'bryant-stories.txt', 'burgess-busterbrown.txt', 'carroll-alice.txt', 'chesterton-ball.txt', 'chesterton-brown.txt', 'chesterton-thursday.txt', 'edgeworth-parents.txt', 'melville-moby_dick.txt', 'milton-paradise.txt', 'shakespeare-caesar.txt', 'shakespeare-hamlet.txt', 'shakespeare-macbeth.txt', 'whitman-leaves.txt']


In [27]:
# NOTE(review): despite its name, `macbeth_sentences` holds the sentences of
# 'shakespeare-caesar.txt', not 'shakespeare-macbeth.txt'. Later cells use this
# name, so either rename it everywhere or switch the file id, in one change.
macbeth_sentences = gutenberg.sents('shakespeare-caesar.txt')
# Show one sample sentence; each sentence is a list of token strings.
print(macbeth_sentences[11])

['Where', 'is', 'thy', 'Leather', 'Apron', ',', 'and', 'thy', 'Rule', '?']


In [26]:
# convert_to_lower iterates over its argument and treats every element as one
# review. A single tokenized sentence is passed here, so each token is cleaned
# on its own: the result has one inner list per token, and that list is empty
# wherever the token was punctuation or a stopword.
clean_sentences = convert_to_lower(macbeth_sentences[11])
# Not needed: convert_to_lower already chains into remove_punctuation
# (and from there into remove_stopwords).
# clean_sentences = remove_punctuation(clean_sentences)
print(clean_sentences)

[[], [], ['thy'], ['leather'], ['apron'], [], [], ['thy'], ['rule'], []]


In [29]:
# Stopword removal alone on the same tokenized sentence: nothing is lowercased
# and punctuation is not stripped, so 'Where', ',' and '?' survive the filter.
print(remove_stopwords(macbeth_sentences[11]))

[['Where'], [], ['thy'], ['Leather'], ['Apron'], [','], [], ['thy'], ['Rule'], ['?']]
