## Importing Libraries

In [2]:
import nltk
from nltk.corpus import gutenberg

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
import pandas as pd
import re
import nltk
# nltk.download('stopwords')
# nltk.download('punkt')
import nltk.tokenize as token
from nltk.tokenize import WhitespaceTokenizer, WordPunctTokenizer, TreebankWordTokenizer
from nltk.corpus import stopwords
from nltk.corpus import wordnet
import time
import string

from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer 
import numpy as np
from string import punctuation

stop_words = set(stopwords.words('english'))
# stop_words.remove("very")
# stop_words.add("th")

[nltk_data] Downloading package stopwords to /Users/harsh/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/harsh/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [3]:
# nltk.download('gutenberg')    for first time installation

In [4]:
files = gutenberg.fileids()
print(files)

['austen-emma.txt', 'austen-persuasion.txt', 'austen-sense.txt', 'bible-kjv.txt', 'blake-poems.txt', 'bryant-stories.txt', 'burgess-busterbrown.txt', 'carroll-alice.txt', 'chesterton-ball.txt', 'chesterton-brown.txt', 'chesterton-thursday.txt', 'edgeworth-parents.txt', 'melville-moby_dick.txt', 'milton-paradise.txt', 'shakespeare-caesar.txt', 'shakespeare-hamlet.txt', 'shakespeare-macbeth.txt', 'whitman-leaves.txt']


In [5]:
emma = nltk.corpus.gutenberg.words('austen-emma.txt')
len(emma)

192427

In [6]:
for fileid in gutenberg.fileids():
    print(fileid)
    num_chars = len(gutenberg.raw(fileid)) 
    num_words = len(gutenberg.words(fileid))
    num_sents = len(gutenberg.sents(fileid))
    num_vocab = len(set(w.lower() for w in gutenberg.words(fileid)))   # number of unique words
#     print('Number of characters:', num_chars)
#     print('Number of words:', num_words)
#     print('Number of sentences:', num_sents)
#     print('Number of vocab:', num_vocab)
#     print(round(num_chars/num_words), round(num_words/num_sents), round(num_words/num_vocab))

austen-emma.txt
Number of characters: 887071
Number of words: 192427
Number of sentences: 7752
Number of vocab: 7344
5 25 26
austen-persuasion.txt
Number of characters: 466292
Number of words: 98171
Number of sentences: 3747
Number of vocab: 5835
5 26 17
austen-sense.txt
Number of characters: 673022
Number of words: 141576
Number of sentences: 4999
Number of vocab: 6403
5 28 22
bible-kjv.txt
Number of characters: 4332554
Number of words: 1010654
Number of sentences: 30103
Number of vocab: 12767
4 34 79
blake-poems.txt
Number of characters: 38153
Number of words: 8354
Number of sentences: 438
Number of vocab: 1535
5 19 5
bryant-stories.txt
Number of characters: 249439
Number of words: 55563
Number of sentences: 2863
Number of vocab: 3940
4 19 14
burgess-busterbrown.txt
Number of characters: 84663
Number of words: 18963
Number of sentences: 1054
Number of vocab: 1559
4 18 12
carroll-alice.txt
Number of characters: 144395
Number of words: 34110
Number of sentences: 1703
Number of vocab: 2

In [14]:
macbeth_sentences = gutenberg.sents('shakespeare-caesar.txt')
macbeth_sentences[11]

['What', 'dost', 'thou', 'with', 'thy', 'best', 'Apparrell', 'on', '?']

## Data Cleaning

In [8]:
def convert_to_lower(text):
    # return the reviews after convering then to lowercase
    # Words with different cases are intercepted differently such as 'The' and 'the'. 
    # Hence all words should be converted into same case, preferably lower case.
    l = []
    for t in text:
        l.append(t.lower())
    return l

def remove_punctuation(text):
    # return the reviews after removing punctuations
    # Refer: https://www3.ntu.edu.sg/home/ehchua/programming/howto/Regexe.html
    l = []
    # \w : word character
    # \W : non-word character
    # \d : digits
    # \D : non-digits
    
    for t in text:
        l.append(re.sub(r'[^\w\s]|^\s\d+\s|\s\d+|\d+|\s\d+$', ' ', t)) #|^\s\d+\s|\s\d+|\d+|\s\d+$
    return l

# def remove_punctuation(text):
#     text = text.translate(str.maketrans('', '', string.punctuation))
#     return text

def remove_stopwords(text):
    # return the reviews after removing the stopwords
    # Stopwords are the most common words in a language. For example 'is', 'the', 'that' etc. are stopwords in English language. Stopwords shall be removed during text clean-up phase. However removing stop word can change the meaning of sentence. 
    # For instance 'I didn't love politics' will get converted to 'I love politics' after removing stopword.  
    l = []
    large = 0
    for t in text:
        word_tokens = token.word_tokenize(t)
        filtered_sentence = [w for w in word_tokens if not w in stop_words]
        l.append(filtered_sentence)
    return l

def remove_URLs(text):
    text  = re.sub(r"https?://\S+|www\.\S+", "", text )
    return text

def remove_digits(text):
    text= re.sub(r'[0-9]','',text)
    return text

def remove_spaces(text):
    text = re.sub(' +', ' ', text)
    return text

def perform_tokenization(text):
    # return the reviews after performing tokenization
    text = token.word_tokenize(text)
#     tk = Whitespa/ceTokenizer()
#     tk = WordPunctTokenizer()
#     tk = TreebankWordTokenizer()
#     text = tk.tokenize(text)
    
    
    return text

def perform_padding(data):
    # return the reviews after padding the reviews to maximum length
    maxlen = 30
    return pad_sequences(data, maxlen=maxlen, padding="post")

def correct_spellings(text):
    # At times textual data such as social media data is prone to spelling errors. Spelling errors 
    # should be rectified early during the clean-up phase. Fortunately we have libraries available for spelling correction.
    spell = SpellChecker()
    corrected_words = []
    misspelled_words = spell.unknown(text.split())
    for word in text.split():
        if word in misspelled_words:
            corrected_words.append(spell.correction(word))
        else:
            corrected_words.append(word)
    return " ".join(corrected_words)

def convert_emoji(text):
    text = emoji.demojize(text)
    return text

# def convert_to_antonym(sentence):
#     words = nltk.word_tokenize(sentence)
#     new_words = []
#     temp_word = ''
#     for word in words:
#         antonyms = []
#         if word == 'not':
#             temp_word = 'not_'
#         elif temp_word == 'not_':
#             for syn in wordnet.synsets(word):
#                 for s in syn.lemmas():
#                     for a in s.antonyms():
#                         antonyms.append(a.name())
#             if len(antonyms) >= 1:
#                 word = antonyms[0]
#             else:
#                 word = temp_word + word # when antonym is not found, it will
#                                     # remain not_happy
            
#             temp_word = ''
#         if word != 'not':
#             new_words.append(word)
#     return ' '.join(new_words)

# stemmer = PorterStemmer()
# lemmatizer = WordNetLemmatizer()
# # without wordnet map it takes evey word as noun
# wordnet_map = {"N":wordnet.NOUN, "V":wordnet.VERB, "J":wordnet.ADJ, "R":wordnet.ADV }

# def stem_words(text):
#     # a process of removing and replacing suffixes to get the root form of the word.
#     # Porterstemmer is rule based. (eg: dogs -> dog)
#     return " ".join([stemmer.stem(word) for word in text.split()])

# def lemma_words(text):
#     pos_tagged_text = nltk.pos_tag(text.split())
#     return " ".join([lemmatizer.lemmatize(word ,wordnet_map.get(pos[0], wordnet.NOUN)) for word, pos in pos_tagged_text])

In [16]:
clean_sentences = remove_punctuation(macbeth_sentences[11])
print(clean_sentences)

['Where', 'is', 'thy', 'Leather', 'Apron', ' ', 'and', 'thy', 'Rule', ' ']


In [20]:
class Preprocessing:
    def convert_to_lower(text):
        # return the reviews after convering then to lowercase
    # Words with different cases are intercepted differently such as 'The' and 'the'. 
    # Hence all words should be converted into same case, preferably lower case.
        l = []
        for t in text:
            l.append(t.lower())
        return l
    
    def remove_punctuation(text):
        # return the reviews after removing punctuations
        # Refer: https://www3.ntu.edu.sg/home/ehchua/programming/howto/Regexe.html
        l = []
        # \w : word character
        # \W : non-word character
        # \d : digits
        # \D : non-digits
        for t in text:
            l.append(re.sub(r'[^\w\s]|^\s\d+\s|\s\d+|\d+|\s\d+$', ' ', t)) #|^\s\d+\s|\s\d+|\d+|\s\d+$
        return l

NameError: name 'text' is not defined

In [19]:
cleaned = Preprocessing(macbeth_sentences)

TypeError: Preprocessing() takes no arguments