In [53]:
#For this challenge, you will need to choose a corpus of data from nltk or another source 
#that includes categories you can predict and create an analysis pipeline that includes 
#the following steps:

#1.Data cleaning / processing / language parsing
#2.Create features using two different NLP methods: For example, BoW vs tf-idf.
#3.Use the features to fit supervised learning models for each feature set to predict 
   #the category outcomes.
#4.Assess your models using cross-validation and determine whether one model performed 
   #better.
#5.Pick one of the models and try to increase accuracy by at least 5 percentage points.
#6.Write up your report in a Jupyter notebook. Be sure to explicitly justify the choices 
  #you make throughout, and submit it below.



In [20]:
%matplotlib inline
import numpy as np
import pandas as pd
import scipy
import sklearn
import spacy
import matplotlib.pyplot as plt
import seaborn as sns
import re
from nltk.corpus import gutenberg
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk import word_tokenize
from collections import Counter
pd.options.display.max_colwidth = 200

In [133]:
#define a processing function to clean our document data
import nltk
from nltk.corpus import gutenberg
nltk.download('gutenberg')
import re

def process_text(text):
#processing
    doc_paras=[]
    for paragraph in (text):
        para=paragraph[0]
        #removing the double-dash from all words
        para=[re.sub(r'--','',word) for word in para]
        para=[re.sub(r'_','',word) for word in para]
        para=[re.sub("[\[].*?[\]]", "", word)for word in para]
        para=[re.sub(r'[\r|\n|\r\n]+', ' ',word)for word in para]
        para=[re.sub('[^a-zA-z0-9\s]', '', word)for word in para]
       #Forming each paragraph into a string and adding it to the list of strings.
        doc_paras.append(' '.join(para))

    print(doc_paras[0:4])
    return doc_paras


[nltk_data] Downloading package gutenberg to
[nltk_data]     /Users/karenmcgee/nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!


In [134]:
#load the raw text
bryant_stories = gutenberg.paras('bryant-stories.txt')
blake = gutenberg.paras('blake-poems.txt')

In [135]:
#process the text for each corpus
bryant = process_text(bryant_stories)
blake = process_text(blake)

['[ Stories to Tell to Children by Sara Cone Bryant 1918 ]', 'TWO LITTLE RIDDLES IN RHYME', 'There  s a garden that I ken  Full of little gentlemen  Little caps of blue they wear  And green ribbons  very fair ', 'From house to house he goes  A messenger small and slight  And whether it rains or snows  He sleeps outside in the night ']
['[ Poems by William Blake 1789 ]', 'SONGS OF INNOCENCE AND OF EXPERIENCE and THE BOOK of THEL', 'SONGS OF INNOCENCE', 'INTRODUCTION']


In [136]:
# load nltk's English stopwords as variable called 'stopwords'
stopwords = nltk.corpus.stopwords.words('english')

In [137]:
#Use countvectorizer to create a bag of words for each corpus
from sklearn.feature_extraction.text import CountVectorizer

# instantiate the vectorizer object
cvec = CountVectorizer(lowercase=True, max_features=1500, min_df=5, 
                       max_df=0.7, stop_words=stopwords)
wm = cvec.fit_transform(bryant) #works
tokens = cvec.get_feature_names()
features = pd.DataFrame(data=wm.toarray(),
             columns=tokens)
features['Source'] = 'Bryant'
features.head()

Unnamed: 0,able,acorn,across,ago,aha,air,alligator,almost,along,always,...,work,world,would,years,yellow,yes,yet,young,zelia,Source
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Bryant
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Bryant
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Bryant
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Bryant
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,Bryant


In [138]:
#Use tfitd to create a bag of words for each corpus
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(max_df=0.5, # drop words that occur in more than half the paragraphs
                             min_df=2, # only use words that appear at least twice
                             stop_words=stopwords, 
                             lowercase=True, #convert everything to lower case (since Alice in Wonderland has the HABIT of CAPITALIZING WORDS for EMPHASIS)
                             use_idf=True,#we definitely want to use inverse document frequencies in our weighting
                             norm=u'l2', #Applies a correction factor so that longer paragraphs and shorter paragraphs get treated equally
                             smooth_idf=True #Adds 1 to all document frequencies, as if an extra document existed that used every word once.  Prevents divide-by-zero errors
                            )
tfidf=vectorizer.fit_transform(bryant)
terms = vectorizer.get_feature_names()
tffeatures = pd.DataFrame(data=tfidf.toarray(),
             columns=terms)
tffeatures['Source'] = 'Bryant'
tffeatures.head()

Unnamed: 0,able,acorn,across,acted,actions,afraid,afterward,afther,ago,agreed,...,year,years,yees,yellow,yes,yet,young,youth,zelia,Source
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Bryant
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Bryant
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Bryant
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Bryant
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.679218,0.0,0.0,0.0,0.0,0.0,Bryant


# Countervectorized vs TfidfVectorizer Results on Bryant stories
#The first difference I notice when comparing Countervectorized vs TfidfVectorizer, is 
#TfidfVectorizer creates twice as many features as Countervectoriezed, many of the words
#are the same but I understand now why you would run the PCA on TFidVectorizer, as it
#provides a way to reduce the number of features that will useful for a model to predict. 

In [139]:
#Use countvectorizer to create a bag of words for each corpus
# instantiate the vectorizer object
cvec = CountVectorizer(lowercase=True, max_features=1500, min_df=5, 
                       max_df=0.7, stop_words=stopwords)
wm = cvec.fit_transform(blake) #works
tokens = cvec.get_feature_names()
features2 = pd.DataFrame(data=wm.toarray(),
             columns=tokens)
features2['Source'] = 'Blake'
features2.head()

Unnamed: 0,among,angel,another,arise,away,birds,black,book,bore,boy,...,voice,weary,weep,weeping,wept,white,wild,woe,youth,Source
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Blake
1,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,Blake
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Blake
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Blake
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,Blake


In [140]:
#Use tfitd to create a bag of words for each corpus
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer2 = TfidfVectorizer(max_df=0.5, # drop words that occur in more than half the paragraphs
                             min_df=2, # only use words that appear at least twice
                             stop_words='english', 
                             lowercase=True, #convert everything to lower case (since Alice in Wonderland has the HABIT of CAPITALIZING WORDS for EMPHASIS)
                             use_idf=True,#we definitely want to use inverse document frequencies in our weighting
                             norm=u'l2', #Applies a correction factor so that longer paragraphs and shorter paragraphs get treated equally
                             smooth_idf=True #Adds 1 to all document frequencies, as if an extra document existed that used every word once.  Prevents divide-by-zero errors
                            )
tfidf2=vectorizer.fit_transform(blake)
terms2 = vectorizer.get_feature_names()
tffeatures2 = pd.DataFrame(data=tfidf.toarray(),
             columns=terms)
tffeatures2['Source'] = 'Blake'
tffeatures2.head()

Unnamed: 0,able,acorn,across,acted,actions,afraid,afterward,afther,ago,agreed,...,year,years,yees,yellow,yes,yet,young,youth,zelia,Source
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Blake
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Blake
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Blake
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Blake
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.679218,0.0,0.0,0.0,0.0,0.0,Blake


# Countervectorized vs TfidfVectorizer Results on Blake stories
#The result is the exact same as the Bryant stories. TfidVectorizer creates more features
#than Countervectorized and as a result would need to apply PCA to reduce the number
#of features.

In [141]:
#combine both list produced by Countervectorized since is contains the least amount of features
all_features = pd.concat([features, features2], ignore_index=True, sort=True)
all_features.head()
#all_features.shape

Unnamed: 0,Source,able,acorn,across,ago,aha,air,alligator,almost,along,...,work,world,would,years,yellow,yes,yet,young,youth,zelia
0,Bryant,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0
1,Bryant,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0
2,Bryant,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0
3,Bryant,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0
4,Bryant,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,,0.0


In [142]:
#first let's check to verify we don't have any missing or NAN values.
missing_val_count_by_column = (all_features.isnull().sum())
print(missing_val_count_by_column[missing_val_count_by_column > 0])

able          284
acorn         284
across        284
ago           284
aha           284
air           284
alligator     284
almost        284
along         284
always        284
among        1194
animals       284
answered      284
anything      284
arise        1194
around        284
artist        284
ask           284
asked         284
auntie        284
baby          284
back          284
bad           284
bade          284
bag           284
barn          284
beast         284
beautiful     284
became        284
bed           284
             ... 
weary        1194
weep         1194
weeping      1194
well          284
went          284
wept         1194
whale         284
wheat         284
whole         284
wicked        284
wid           284
wife          284
wild         1194
wind          284
window        284
wise          284
woe          1194
woman         284
women         284
words         284
work          284
world         284
would         284
years         284
yellow    

In [143]:
#fill na values with 0
all_features = all_features.fillna(0.0,axis=0)

In [144]:
#verify missing values have been removed
missing_val_count_by_column = (all_features.isnull().sum())
print(missing_val_count_by_column[missing_val_count_by_column > 0])

Series([], dtype: int64)


# Let's take the result of our featurs and run it through a few models

In [145]:
#Execute the RandomForestClassifier Model
#remove the warnings related to default parameters
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)


from sklearn import ensemble
from sklearn.model_selection import train_test_split

rfc = ensemble.RandomForestClassifier()
Y = all_features['Source']
X = np.array(all_features.drop(['Source'], 1))

#Let's train/test with 60% of our data
X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    Y,
                                                    test_size=0.4,
                                                    random_state=0)
train = rfc.fit(X_train, y_train)

print('Training set score:', rfc.score(X_train, y_train))
print('\nTest set score:', rfc.score(X_test, y_test))

#compare the training and testing results using cross validation
from sklearn.model_selection import cross_val_score
cv_results_train = cross_val_score(rfc, X_train, y_train, cv=5)
cv_results_test = cross_val_score(rfc, X_test, y_test, cv=5)
print('RFC Training set score:', cv_results_train)
print('RFC Testing set score:', cv_results_test)



Training set score: 0.9729119638826185

Test set score: 0.910472972972973
RFC Training set score: [0.88202247 0.90449438 0.90960452 0.88700565 0.92045455]
RFC Testing set score: [0.8907563  0.89915966 0.88135593 0.83050847 0.89830508]


In [146]:
#create a result table
data = [{'Model':'RFC', 'Training':rfc.score(X_train, y_train),
         'Testing': rfc.score(X_test, y_test),
         'CV Train': cv_results_train.mean(),
         'CV Test': cv_results_test.mean()}]
df_results = pd.DataFrame(data)
df_results

Unnamed: 0,CV Test,CV Train,Model,Testing,Training
0,0.880017,0.900716,RFC,0.910473,0.972912


In [147]:
#Execute the LogisticRegression Model
#remove the warnings related to default parameters
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

from sklearn.linear_model import LogisticRegression

lr = LogisticRegression()
train = lr.fit(X_train, y_train)
print(X_train.shape, y_train.shape)
print('Training set score:', lr.score(X_train, y_train))
print('\nTest set score:', lr.score(X_test, y_test))

#compare the training and testing results using cross validation
#remove the warnings related to default parameters
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

from sklearn.model_selection import cross_val_score
cv_results_train = cross_val_score(lr, X_train, y_train, cv=5)
cv_results_test = cross_val_score(lr, X_test, y_test, cv=5)
print('LR Training set score:', cv_results_train)
print('LR Testing set score:', cv_results_test)



(886, 485) (886,)
Training set score: 0.9446952595936795

Test set score: 0.9155405405405406
LR Training set score: [0.90449438 0.91573034 0.88700565 0.90960452 0.92045455]
LR Testing set score: [0.8907563  0.90756303 0.86440678 0.88135593 0.89830508]


In [148]:
#update results table
data = [{'Model':'LR', 'Training':lr.score(X_train, y_train),
         'Testing': lr.score(X_test, y_test),
         'CV Train': cv_results_train.mean(),
         'CV Test': cv_results_test.mean()}]
d2 = pd.DataFrame(data)
df_results = df_results.append(d2) 
df_results

Unnamed: 0,CV Test,CV Train,Model,Testing,Training
0,0.880017,0.900716,RFC,0.910473,0.972912
0,0.888477,0.907458,LR,0.915541,0.944695


In [149]:
#Execute the GradientBoostingClassifier Model
clf = ensemble.GradientBoostingClassifier()
train = clf.fit(X_train, y_train)

print('Training set score:', clf.score(X_train, y_train))
print('\nTest set score:', clf.score(X_test, y_test))

#compare the training and testing results using cross validation
from sklearn.model_selection import cross_val_score
cv_results_train = cross_val_score(clf, X_train, y_train, cv=5)
cv_results_test = cross_val_score(clf, X_test, y_test, cv=5)
print('CLF Training set score:', cv_results_train)
print('CLF Testing set score:', cv_results_test)

Training set score: 0.9288939051918735

Test set score: 0.9037162162162162
CLF Training set score: [0.85393258 0.89325843 0.88700565 0.89830508 0.89772727]
CLF Testing set score: [0.86554622 0.85714286 0.88983051 0.88135593 0.87288136]


In [150]:
#update results table
data = [{'Model':'CLF', 'Training':clf.score(X_train, y_train),
         'Testing': clf.score(X_test, y_test),
         'CV Train': cv_results_train.mean(),
         'CV Test': cv_results_test.mean()}]
d2 = pd.DataFrame(data)
df_results = df_results.append(d2) 

#rearrange the columns sort order
df_results = df_results[['Model', 'Training', 'Testing',
                         'CV Train', 'CV Test']]

df_results


Unnamed: 0,Model,Training,Testing,CV Train,CV Test
0,RFC,0.972912,0.910473,0.900716,0.880017
0,LR,0.944695,0.915541,0.907458,0.888477
0,CLF,0.928894,0.903716,0.886046,0.873351


# Improve Gradientboosting Model
#Based on the results I am going to choose Gradientboosting to see I, we can increase 
#the model's performance by 5% points. It has the lowest overal test score and has the 
#ability to improve in it's performance.

In [151]:
#we will use several values for different paramaters and determine the best parameter 
#to use by utilizing GridserchCV. 

from sklearn.model_selection import GridSearchCV
param_grid = { 
    'n_estimators': [200, 300, 500, 700],
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth'   : [5,16,2],
    'learning_rate': [0.1,0.2,0.3]
}

CV_clf = GridSearchCV(estimator=clf, param_grid=param_grid, cv= 5)
CV_clf.fit(X_train, y_train)
print(CV_clf.best_params_)


{'learning_rate': 0.2, 'max_depth': 5, 'max_features': 'log2', 'n_estimators': 300}


In [152]:
#use the parameters found from gridsearch and confirm if the results have improved.
clf = ensemble.GradientBoostingClassifier(max_depth=5,
                                          max_features='log2',
                                          n_estimators=300,
                                          learning_rate=0.2)
train = clf.fit(X_train, y_train)
#max_depth=2,max_features='log2', n_estimators=500,learning_rate=0.3 - 0.95 score

print('Training set score:', clf.score(X_train, y_train))
print('\nTest set score:', clf.score(X_test, y_test))

#Re-execute the cross validation train/test results to see if improvements have been made
from sklearn.model_selection import cross_val_score
cv_results_train = cross_val_score(clf, X_train, y_train, cv=5)
cv_results_test = cross_val_score(clf, X_test, y_test, cv=5)
print('CLF Training set score:', cv_results_train)
print('CLF Testing set score:', cv_results_test)

Training set score: 0.9729119638826185

Test set score: 0.9408783783783784
CLF Training set score: [0.8988764  0.93258427 0.93220339 0.89830508 0.93181818]
CLF Testing set score: [0.87394958 0.89915966 0.91525424 0.88983051 0.94915254]


In [153]:
data = [{'Model':'CLF_Update', 'Training':clf.score(X_train, y_train),
         'Testing': clf.score(X_test, y_test),
         'CV Train': cv_results_train.mean(),
         'CV Test': cv_results_test.mean()}]
d2 = pd.DataFrame(data)
df_results = df_results.append(d2) 

#rearrange the columns sort order
df_results = df_results[['Model', 'Training', 'Testing',
                         'CV Train', 'CV Test']]
df_results

Unnamed: 0,Model,Training,Testing,CV Train,CV Test
0,RFC,0.972912,0.910473,0.900716,0.880017
0,LR,0.944695,0.915541,0.907458,0.888477
0,CLF,0.928894,0.903716,0.886046,0.873351
0,CLF_Update,0.972912,0.940878,0.918757,0.905469


# Summary: Overall, I was able to improve my traing results by 5 points and my testing results by 4 point by just improving the the parameters of the model. My cross validation numbers for train and testing also increased by 3 points. 