## Gutenberg Books Text Cleaning and Normalization
##### Author: Kevin Okiah
**4/9/2019**

In [1]:
#visualization 
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# Data Manipulation and Statistics
import pandas as pd
import numpy as np

#Directory Navigation and Saving instances
import os
from unipath import Path
wd = os.getcwd()
p = Path(wd)
path = str(p.parent)

#Text Cleaning and Analytics
import spacy
# Load spaCy's small English pipeline; `nlp` is the shared pipeline object used by every cell below.
nlp = spacy.load('en_core_web_sm')
# Raise the pipeline's maximum input length so a whole book can be passed to nlp() in one call.
# NOTE(review): very long inputs can need a lot of memory during parsing/NER — confirm this is acceptable.
nlp.max_length = 100000000
# NOTE(review): the star import hides which names come from TextCleaningToolkit and could
# shadow names defined above (e.g. `nlp`, `path`) — confirm which names are actually needed.
from TextCleaningToolkit import *

#leveraging Sarkar's codes
#from normalization import normalize_corpus 


In [2]:
# Load the raw Gutenberg books table from the project's Data folder
raw_books_csv = path + "/Data/GuternsbergBooksRaw.csv"
#raw_books_csv = path + "/Data/MovieReviewsWithSentiments.csv"  # alternative dataset
data = pd.read_csv(raw_books_csv, encoding='utf-8')

# Peek at the first two rows
data.head(2)

Unnamed: 0,BookTitle,Category,url,Body
0,"A Primary Reader: Old-time Stories, Fairy Tale...",Misc.,http://www.gutenberg.org/cache/epub/7841/pg784...,"['CONTENTS.', 'THE UGLY DUCKLING', 'THE LITTLE..."
1,The Bird-Woman of the Lewis and Clark Expedition,Misc.,http://www.gutenberg.org/cache/epub/5742/pg574...,"['CONTENTS', 'THE BIRD-WOMAN', 'WHO THE WHITE ..."


### Exploring spaCy's Text-Cleaning Capabilities

##### spaCy tokenization and lemmatization

In [4]:
doc = nlp('Kevin working for TI went to New York. It was raining the whole time')

#doc = nlp(data.Body[0]) # first Gutenberg book

# Only look at the first 50 tokens
doc = doc[0:50]

# Keep alphabetic tokens that are not stop words; both printouts below share this filter
content_tokens = [tok for tok in doc if tok.is_alpha and not tok.is_stop]

# Tokenized sentence: surface form of every kept token, '|'-separated on one line
for tok in content_tokens:
    print(tok.text, end = '|') # default end is a newline
print('\n')

# Tokenized and lemmatized sentence: root word of every kept token
for tok in content_tokens:
    print(tok.lemma_, end='|')

Kevin|working|TI|went|New|York|raining|time|

Kevin|work|TI|go|New|York|rain|time|

In [5]:
def show_lemmas(text):
    """
    Print one aligned row per token: text, part-of-speech tag,
    numeric lemma ID and lemma string.

    text = iterable of tokens exposing .text, .pos_, .lemma and .lemma_
    """
    for tok in text:
        columns = (
            format(tok.text, '12'),    # token text, padded to 12 characters
            format(tok.pos_, '6'),     # POS tag, padded to 6 characters
            format(tok.lemma, '<22'),  # lemma ID, left-aligned in 22 characters
            tok.lemma_,                # lemma string, unpadded
        )
        # print() joins the columns with single spaces
        print(*columns)

In [6]:
show_lemmas(doc)

Kevin        PROPN  16659986161459375802   Kevin
working      VERB   10038440415813069799   work
for          ADP    16037325823156266367   for
TI           PROPN  7296585798774874219    TI
went         VERB   8004577259940138793    go
to           ADP    3791531372978436496    to
New          PROPN  7503827727184870577    New
York         PROPN  7898044819112200372    York
.            PUNCT  12646065887601541794   .
It           PRON   561228191312463089     -PRON-
was          VERB   10382539506755952630   be
raining      VERB   6253719383086150949    rain
the          DET    7425985699627899538    the
whole        ADJ    16948554243429412012   whole
time         NOUN   8885804376230376864    time


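##### Stop words
Stop words are very common words (e.g. "the", "is", "at") that carry little additional information, so they are usually removed before analysis.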

In [7]:
# Spacy's stop words
print(nlp.Defaults.stop_words)

{'must', 'you', 'done', 'why', 'same', 'also', 'whenever', 'against', 'may', 'a', 'anyway', 'become', 'through', 'anyone', 'formerly', 'about', 'being', 'by', 'less', 'side', 'nobody', 'he', 'eight', 'thereby', 'few', 'anything', 'therefore', 'twenty', 'unless', 'with', 'latter', 'our', 'third', 'quite', 'but', 'i', 'once', 'myself', 'thereafter', 'various', 'meanwhile', 'very', 'next', 'moreover', 'were', 'across', 'each', 'empty', 'hers', 'such', 'yourself', 'several', 'without', 'themselves', 'amount', 'using', 'hundred', 'down', 'via', 'yet', 'three', 'whither', 'seem', 'besides', 'on', 'whether', 'until', 'hereby', 'front', 'except', 'two', 'again', 'else', 'twelve', 'every', 'is', 'within', 'rather', 'into', 'please', 'nevertheless', 'something', 'still', 'serious', 'mostly', 'somewhere', 'back', 'your', 'some', 'well', 'that', 'after', 'not', 'its', 'around', 'more', 'per', 'she', 'wherein', 'whom', "'m", 'be', 'because', 'call', 'among', 'go', 'never', 'fifteen', 'herself', 'pu

In [8]:
def add_stop_words(stoplist =['btw']):
    """
    Register extra stop words on the shared spaCy pipeline `nlp`.

    stoplist = list of stopwords to add (pass them in lowercase)

    The words are added to the pipeline's default stop-word set and the
    matching vocabulary entries are flagged, so the change persists for
    every later call that uses `nlp`.
    """
    default_stops = nlp.Defaults.stop_words
    for word in stoplist:
        # Extend the default stop-word set ...
        default_stops.add(word)
        # ... and flag the word's lexeme so token.is_stop reports it
        nlp.vocab[word].is_stop = True

#add_stop_words()  
#print(nlp.vocab['btw'].is_stop)

In [9]:
def remove_stop_words(stoplist =['btw']):
    """
    Remove words from spaCy's stop words on the shared pipeline `nlp`.

    stoplist = list of words that should no longer be treated as stop words

    A word that is not currently in the stop-word set is skipped instead of
    raising KeyError, so the function can be re-run or called on a fresh
    kernel where the word was never added.
    """
    for i in stoplist:
        # discard() is a no-op for a missing word, whereas remove() raises KeyError
        nlp.Defaults.stop_words.discard(i)
        # Clear the stop-word flag on the lexeme so token.is_stop is False again
        nlp.vocab[i].is_stop = False
#remove_stop_words() 
#print(nlp.vocab['btw'].is_stop)

##### Named Entity Recognition (NER), Noun Chunking and Spacy Visualization 

In [10]:
# Spacy can recognise Named entity (proper nouns) in a text
for entity in doc.ents:
    label = entity.label_
    # One item per line: entity text, its label, spaCy's explanation of the label,
    # then a blank gap before the next entity
    print(entity, label, str(spacy.explain(label)), '\n', sep='\n')

Kevin
PERSON
People, including fictional


TI
ORG
Companies, agencies, institutions, etc.


New York
GPE
Countries, cities, states




In [11]:
# Print every noun chunk spaCy found in `doc`, one per line
for chunk in doc.noun_chunks:
    print(chunk)

Kevin
TI
New York
It
the whole time


In [12]:
# spaCy visualization
from spacy import displacy

# Render the dependency parse of `doc` inline in the notebook;
# the 'distance' option controls the spacing between words in the diagram
displacy.render(doc, style='dep', jupyter=True, options={'distance': 70})

In [13]:
# Sample sentence parsed with the shared pipeline, used for the entity visualization
doc2 = nlp("Last Year IBM made a wooping 6 million Dollars in Laptop sales")

In [14]:
# Render `doc2` inline with its named entities highlighted
displacy.render(doc2, style='ent', jupyter=True)

### Applying spaCy's Text Tokenization and Normalization Capabilities

The Gutenberg books are parsed with the spaCy cleaning function defined below. **This can take a while, depending on the size of your documents.**

In [34]:
# Plain-string sample for the cleaner demo (rebinds doc2, which was a parsed spaCy document above)
doc2 = "Tesla is the fastest growing company , say, though not profitable"

def SpacyTextCleaner(text, stoplist = ['contents','gutenberg', 'illustration', 'illustration']):
    '''
    Clean a raw text with spaCy and return its normalized tokens.

    Steps:
       1. Registers `stoplist` as extra stop words via add_stop_words
          (this changes the shared `nlp` pipeline, so the additions persist
          after the call).
       2. Parses `text` with `nlp`, which splits it into tokens.
       3. Keeps only alphabetic tokens that are not stop words; whitespace,
          punctuation, numbers and mixed alphanumeric tokens are dropped.
       4. Replaces every kept token with its lemma, converted to lower case.

    text     = raw text (str) to clean
    stoplist = extra stop words to drop in addition to spaCy's defaults

    Returns a list of lower-case lemma strings in document order.
    '''
    add_stop_words(stoplist)
    # Build the list in a single pass; re-creating it with `list + [item]`
    # for every token is quadratic on book-length input.
    return [
        token.lemma_.lower()
        for token in nlp(text)
        if token.is_alpha and not token.is_stop
    ]



In [35]:
# Run the cleaner on the sample sentence, then show input and output side by side
clean_tokens = SpacyTextCleaner(doc2)

print("Exploring functionality of above function",
      "------------------------------------------",
      sep='\n')
print("Original Doc: ", doc2)
print('Clean Tokens: ', clean_tokens)

Exploring functionality of above function
------------------------------------------
Original Doc:  Tesla is the fastest growing company , say, though not profitable
Clean Tokens:  ['tesla', 'fast', 'grow', 'company', 'profitable']


In [18]:
# Clean every Gutenberg book body (this can take a while)
RawCorpus = data.Body

# To clean the movie reviews instead, use:
##RawCorpus = data.Review

# One list of clean tokens per document, in the same order as RawCorpus
Corpus = [SpacyTextCleaner(raw_text) for raw_text in RawCorpus]

In [19]:
# Attach the cleaned token lists to the DataFrame as a new "Corpus" column (modifies `data` in place)
data["Corpus"] = Corpus

In [20]:
data.head()

Unnamed: 0,BookTitle,Category,url,Body,Corpus
0,"A Primary Reader: Old-time Stories, Fairy Tale...",Misc.,http://www.gutenberg.org/cache/epub/7841/pg784...,"['CONTENTS.', 'THE UGLY DUCKLING', 'THE LITTLE...","[ugly, duckling, little, pine, tree, little, m..."
1,The Bird-Woman of the Lewis and Clark Expedition,Misc.,http://www.gutenberg.org/cache/epub/5742/pg574...,"['CONTENTS', 'THE BIRD-WOMAN', 'WHO THE WHITE ...","[bird, woman, white, men, sacajawea, go, west,..."
2,"Dr. Scudder's Tales for Little Readers, About ...",Misc.,http://www.gutenberg.org/cache/epub/13539/pg13...,"['CONTENTS.', 'CHAPTER I.', 'General Remarks',...","[chapter, general, remarks, chapter, ii, color..."
3,The Louisa Alcott Reader: a Supplementary Read...,Misc.,http://www.gutenberg.org/cache/epub/7425/pg742...,"['CONTENTS.', 'I. A CHRISTMAS DREAM', 'II. THE...","[christmas, dream, ii, candy, country, iii, na..."
4,"Boy Blue and his friends, School ed.",Misc.,http://www.gutenberg.org/cache/epub/16046/pg16...,"['~CONTENTS~', 'LITTLE BOY BLUE', 'SNOWBALL', ...","[little, boy, blue, snowball, fire, cracker, b..."


In [31]:
data.Corpus[0]

['ugly',
 'duckling',
 'little',
 'pine',
 'tree',
 'little',
 'match',
 'girl',
 'little',
 'red',
 'riding',
 'hood',
 'apples',
 'idun',
 'thor',
 'got',
 'hammer',
 'hammer',
 'lose',
 'found',
 'story',
 'sheep',
 'good',
 'ship',
 'argo',
 'jason',
 'harpies',
 'brass',
 'bulls',
 'jason',
 'dragon',
 'dressed',
 'thor',
 'like',
 'freyja',
 'ugly',
 'duckling',
 'break',
 'turkey',
 'warm',
 'ugly',
 'water',
 'duck',
 'nest',
 'leave',
 'duck',
 'nest',
 'sit',
 'egg',
 'warm',
 'egg',
 'break',
 'little',
 'duck',
 'come',
 'egg',
 'leave',
 'large',
 'break',
 'come',
 'big',
 'ugly',
 'duckle',
 'big',
 'duckling',
 'say',
 'old',
 'duck',
 'look',
 'like',
 'like',
 'water',
 'duck',
 'mother',
 'jump',
 'duckling',
 'splash',
 'swim',
 'big',
 'call',
 'begin',
 'little',
 'day',
 'mother',
 'duck',
 'take',
 'duckling',
 'pond',
 'duck',
 'take',
 'ducklings',
 'swim',
 'splash',
 'splash',
 'mother',
 'duck',
 'water',
 'call',
 'duckling',
 'come',
 'jump',
 'begin',
 '

In [24]:
# NOTE(review): these imports and the `path` computation repeat the setup done in the
# first cell; they only matter if this cell is run on its own.
import os
from unipath import Path
wd = os.getcwd()
p = Path(wd)
# `path` is the parent of the current working directory; the Data folder is expected there
path = str(p.parent)

# Saving Data for future analysis (index=False: the DataFrame index is not written)
# NOTE(review): the "Corpus" column holds Python lists; presumably they are written as
# their string form and need parsing when the CSV is read back — confirm downstream.
data.to_csv(path+'/Data/GuternsbergBooksClean.csv', header=True, index=False, encoding='utf-8')
#data.to_csv(path+"/Data/MovieReviewsWithSentimentsClean.csv", header=True, index=False, encoding='utf-8')

In [23]:
%load_ext version_information
%version_information pandas, numpy, requests, bs4, selenium, lxml, urllib3, pyvirtualdisplay, unipath

Software,Version
Python,3.6.8 64bit [GCC 7.3.0]
IPython,7.2.0
OS,Linux 4.15.0 46 generic x86_64 with debian buster sid
pandas,0.22.0
numpy,1.16.2
requests,2.21.0
bs4,4.7.1
selenium,3.141.0
lxml,4.3.2
urllib3,1.24.1
