<h1> This notebook shows to preprocess a text corpus using NLKT, spaCy, Stanford CoreNLP, and Spark NLP.</h1>


In [4]:
# first load corpus 
import glob
docs = []
for filename in glob.glob('./King/*.txt'): 
    with open(filename, 'r', encoding='utf-8') as f:     
        docs.append(f.read().replace('\n', ' '))

In [5]:
# preprocess with NLTK 
import nltk
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english')) 

cleanDocs = []

# the tag_switch function allows one to switch the pos tags from the format used by nltk.pos_tag 
# to one that is recognizable by WordNetLemmatizer so all word tokens can be lemmatized. 

# the code for tag_switch was taken from the nltk examples at the url below
# https://www.machinelearningplus.com/nlp/lemmatization-examples-python/
def tag_switch(word):
    
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tags = {'J': wordnet.ADJ, 'N': wordnet.NOUN,
            'V': wordnet.VERB, 'R': wordnet.ADV}
    return tags.get(tag, wordnet.NOUN)

def nltk_preprocess(corpus): 
    for doc in corpus:       
        (cleanDocs.append([lemmatizer.lemmatize(word.lower(), tag_switch(word)) for word 
                           in nltk.word_tokenize(doc) if word.isalpha() and word.lower() 
                           not in stop_words]))
    return cleanDocs

In [33]:
nltk_preprocess(docs)

[['happy',
  'join',
  'today',
  'go',
  'history',
  'great',
  'demonstration',
  'freedom',
  'history',
  'nation',
  'five',
  'score',
  'year',
  'ago',
  'great',
  'american',
  'whose',
  'symbolic',
  'shadow',
  'stand',
  'today',
  'sign',
  'emancipation',
  'proclamation',
  'momentous',
  'decree',
  'come',
  'great',
  'beacon',
  'light',
  'hope',
  'million',
  'negro',
  'slave',
  'sear',
  'flame',
  'wither',
  'injustice',
  'come',
  'joyous',
  'daybreak',
  'end',
  'long',
  'night',
  'captivity',
  'one',
  'hundred',
  'year',
  'later',
  'negro',
  'still',
  'free',
  'one',
  'hundred',
  'year',
  'later',
  'life',
  'negro',
  'still',
  'sadly',
  'cripple',
  'manacle',
  'segregation',
  'chain',
  'discrimination',
  'one',
  'hundred',
  'year',
  'later',
  'negro',
  'life',
  'lonely',
  'island',
  'poverty',
  'midst',
  'vast',
  'ocean',
  'material',
  'prosperity',
  'one',
  'hundred',
  'year',
  'later',
  'negro',
  'still',
 

In [34]:
# reload corpus
import glob
docs = []
for filename in glob.glob('./King/*.txt'):  
    with open(filename, 'r', encoding='utf-8') as f:     
        docs.append(f.read().replace('\n', ' '))

In [35]:
# next preprocess with spaCy 
import spacy
nlp = spacy.load('en')

cleanDocs = []

def spacy_preprocess(corpus): 
    for doc in range(len(corpus)):
        corpus[doc] = nlp(corpus[doc]) 
        (cleanDocs.append([word.lemma_.lower() for word in corpus[doc] 
                           if word.is_stop ==False and word.lemma_.isalpha()]))
    
    return cleanDocs

In [36]:
spacy_preprocess(docs)

[['happy',
  'join',
  'today',
  'history',
  'great',
  'demonstration',
  'freedom',
  'history',
  'nation',
  'score',
  'year',
  'ago',
  'great',
  'american',
  'symbolic',
  'shadow',
  'stand',
  'today',
  'sign',
  'emancipation',
  'proclamation',
  'momentous',
  'decree',
  'come',
  'great',
  'beacon',
  'light',
  'hope',
  'million',
  'negro',
  'slave',
  'sear',
  'flame',
  'wither',
  'injustice',
  'come',
  'joyous',
  'daybreak',
  'end',
  'long',
  'night',
  'captivity',
  'year',
  'later',
  'negro',
  'free',
  'year',
  'later',
  'life',
  'negro',
  'sadly',
  'cripple',
  'manacle',
  'segregation',
  'chain',
  'discrimination',
  'year',
  'later',
  'negro',
  'live',
  'lonely',
  'island',
  'poverty',
  'midst',
  'vast',
  'ocean',
  'material',
  'prosperity',
  'year',
  'later',
  'negro',
  'languish',
  'corner',
  'american',
  'society',
  'find',
  'exile',
  'land',
  'come',
  'today',
  'dramatize',
  'shameful',
  'condition',
  

In [37]:
# reload corpus again 
import glob
docs = []
for filename in glob.glob('./King/*.txt'):  
    with open(filename, 'r', encoding='utf-8') as f:     
        docs.append(f.read().replace('\n', ' '))

In [38]:
#now preprocess with Stanford CoreNLP 
import stanfordnlp
nlp = stanfordnlp.Pipeline(processors ="tokenize,lemma,pos") 

# stanfordnlp doesn't come with its own stop words list
# so we are going to have to import one from NLTK again. 

from nltk.corpus import stopwords
stop_words = set(stopwords.words('english')) 

cleanDocs = []

def stanford_preprocess(corpus): 
    for doc in range(len(corpus)):
        corpus[doc] = nlp(corpus[doc])
        (cleanDocs.append([words.lemma.lower() for sent in corpus[doc].sentences for words
                           in sent.words if words.lemma.lower() not in stop_words and
                           words.lemma.isalpha()]))
    return cleanDocs

Use device: cpu
---
Loading: tokenize
With settings: 
{'model_path': '/Users/ppchsdbib/stanfordnlp_resources/en_ewt_models/en_ewt_tokenizer.pt', 'lang': 'en', 'shorthand': 'en_ewt', 'mode': 'predict'}
---
Loading: lemma
With settings: 
{'model_path': '/Users/ppchsdbib/stanfordnlp_resources/en_ewt_models/en_ewt_lemmatizer.pt', 'lang': 'en', 'shorthand': 'en_ewt', 'mode': 'predict'}
Building an attentional Seq2Seq model...
Using a Bi-LSTM encoder
Using soft attention for LSTM.
Finetune all embeddings.
[Running seq2seq lemmatizer with edit classifier]
---
Loading: pos
With settings: 
{'model_path': '/Users/ppchsdbib/stanfordnlp_resources/en_ewt_models/en_ewt_tagger.pt', 'pretrain_path': '/Users/ppchsdbib/stanfordnlp_resources/en_ewt_models/en_ewt.pretrain.pt', 'lang': 'en', 'shorthand': 'en_ewt', 'mode': 'predict'}
Done loading processors!
---


In [39]:
stanford_preprocess(docs)

[['happy',
  'join',
  'today',
  'go',
  'history',
  'greatest',
  'demonstration',
  'freedom',
  'history',
  'nation',
  'five',
  'score',
  'year',
  'ago',
  'great',
  'american',
  'whose',
  'symbolic',
  'shadow',
  'stand',
  'today',
  'sign',
  'emancipation',
  'proclamation',
  'momentous',
  'decree',
  'come',
  'great',
  'beacon',
  'light',
  'hope',
  'million',
  'negro',
  'slave',
  'sear',
  'flame',
  'wither',
  'injustice',
  'come',
  'joyous',
  'daybreak',
  'end',
  'long',
  'night',
  'captivity',
  'one',
  'hundred',
  'year',
  'later',
  'negro',
  'still',
  'free',
  'one',
  'hundred',
  'year',
  'later',
  'life',
  'negro',
  'still',
  'sadly',
  'crippled',
  'manacle',
  'segregation',
  'chain',
  'discrimination',
  'one',
  'hundred',
  'year',
  'later',
  'negro',
  'life',
  'lonely',
  'island',
  'poverty',
  'midst',
  'vast',
  'ocean',
  'material',
  'prosperity',
  'one',
  'hundred',
  'year',
  'later',
  'negro',
  'still

In [6]:
# getting started wth spark and sparknlp 
import sparknlp 
from sparknlp.pretrained import PretrainedPipeline
spark = sparknlp.start()

pipeline = PretrainedPipeline('explain_document_dl', lang='en') 

# sparknlp also doesn't come with its own stop words list
# so we are going to have to import from nltk again.

from nltk.corpus import stopwords
stop_words = set(stopwords.words('english')) 

cleanDocs = []

def sparknlp_preprocess(corpus): 
     for doc in corpus:
        doc = pipeline.annotate(doc) 
        (cleanDocs.append([word.lower() for word in doc['lemma'] if word.lower() 
                           not in stop_words and word.isalpha()]))
     return cleanDocs

explain_document_dl download started this may take some time.
Approx size to download 167.3 MB
[OK!]


In [8]:
sparknlp_preprocess(docs)

[['happy',
  'join',
  'today',
  'go',
  'history',
  'great',
  'demonstration',
  'freedom',
  'history',
  'nation',
  'five',
  'score',
  'year',
  'ago',
  'great',
  'american',
  'whose',
  'symbolic',
  'shadow',
  'stand',
  'today',
  'sign',
  'emancipation',
  'proclamation',
  'momentous',
  'decree',
  'come',
  'great',
  'beacon',
  'light',
  'hope',
  'millions',
  'negro',
  'slave',
  'sear',
  'flame',
  'wither',
  'injustice',
  'come',
  'joyous',
  'daybreak',
  'end',
  'long',
  'night',
  'captivity',
  'one',
  'hundred',
  'year',
  'late',
  'negro',
  'still',
  'free',
  'one',
  'hundred',
  'year',
  'late',
  'life',
  'negro',
  'still',
  'sadly',
  'cripple',
  'manacle',
  'segregation',
  'chain',
  'discrimination',
  'one',
  'hundred',
  'year',
  'late',
  'negro',
  'life',
  'lonely',
  'island',
  'poverty',
  'midst',
  'vast',
  'ocean',
  'material',
  'prosperity',
  'one',
  'hundred',
  'year',
  'late',
  'negro',
  'still',
  'l