In [15]:
%matplotlib inline
import numpy as np
import pandas as pd
import scipy
import sklearn
import spacy
import matplotlib.pyplot as plt
import seaborn as sns
import re
from collections import Counter
from nltk.corpus import gutenberg, stopwords
from sklearn import ensemble
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

In [2]:
# Lets look at what text I can use. 
gutenberg.fileids()

['austen-emma.txt',
 'austen-persuasion.txt',
 'austen-sense.txt',
 'bible-kjv.txt',
 'blake-poems.txt',
 'bryant-stories.txt',
 'burgess-busterbrown.txt',
 'carroll-alice.txt',
 'chesterton-ball.txt',
 'chesterton-brown.txt',
 'chesterton-thursday.txt',
 'edgeworth-parents.txt',
 'melville-moby_dick.txt',
 'milton-paradise.txt',
 'shakespeare-caesar.txt',
 'shakespeare-hamlet.txt',
 'shakespeare-macbeth.txt',
 'whitman-leaves.txt']

In [3]:
# Load documents Caesar and Paradise
caesar = gutenberg.raw('shakespeare-caesar.txt')
milton = gutenberg.raw('milton-paradise.txt')

# Data Cleaning/ Processing/Language Parsing

In [4]:
# Utility function to clean text.
def text_cleaner(text):
    
    # Visual inspection shows spaCy does not recognize the double dash '--'.
    # Better get rid of it now!
    text = re.sub(r'--',' ',text)
    
    # Get rid of headings in square brackets.
    text = re.sub("[\[].*?[\]]", "", text)
    
    # Get rid of chapter titles.
    text = re.sub(r'Chapter \d+','',text)
    text = re.sub(r'CHAPTER \d+', '', text)
    text = re.sub("\\n\\n.*?\\n\\n", '', text)
  
    # Get rid of extra whitespace.
    text = ' '.join(text.split())

    
    return text

In [5]:
# Clean both documents
caesar = text_cleaner(caesar)
milton = text_cleaner(milton)

In [6]:
# run spacy and analyze the documents
nlp = spacy.load('en')

# Clean Caesar first
caesar_doc = nlp(caesar)
milton_doc = nlp(milton)

In [8]:
# Group into sentences.
caesar_sents = [[sent, "Shakes"] for sent in caesar_doc.sents]
milton_sents = [[sent, "Milton"] for sent in milton_doc.sents]

# Combine the sentences from the two novels into one data frame.
sentences = pd.DataFrame(caesar_sents + milton_sents)
sentences.head()

Unnamed: 0,0,1
0,"(Enter, Flauius, ,, Murellus, ,, and, certaine...",Shakes
1,"(Flauius, .)",Shakes
2,"(Hence, :, home, you, idle, Creatures, ,, get,...",Shakes
3,"(Is, this, a, Holiday, ?)",Shakes
4,"(What, ,, know, you, not, (, Being, Mechanical...",Shakes


In [9]:
len(sentences)

4673

In [11]:
# Utility function to create a list of the 2000 most common words.
def bag_of_words(text):
    
    # Filter out punctuation and stop words.
    allwords = [token.lemma_
                for token in text
                if not token.is_punct
                and not token.is_stop]
    
    # Return the most common words.
    return [item[0] for item in Counter(allwords).most_common(2000)]
    

# Creates a data frame with features for each word in our common word set.
# Each value is the count of the times the word appears in each sentence.
def bow_features(sentences, common_words):
    
    # Scaffold the data frame and initialize counts to zero.
    df = pd.DataFrame(columns=common_words)
    df['text_sentence'] = sentences[0]
    df['text_source'] = sentences[1]
    df.loc[:, common_words] = 0
    
    # Process each row, counting the occurrence of words in each sentence.
    for i, sentence in enumerate(df['text_sentence']):
        
        # Convert the sentence to lemmas, then filter out punctuation,
        # stop words, and uncommon words.
        words = [token.lemma_
                 for token in sentence
                 if (
                     not token.is_punct
                     and not token.is_stop
                     and token.lemma_ in common_words
                 )]
        
        # Populate the row with word counts.
        for word in words:
            df.loc[i, word] += 1
        
        # This counter is just to make sure the kernel didn't hang.
        if i % 500 == 0:
            print("Processing row {}".format(i))
            
    return df

# Set up the bags.
caesarwords = bag_of_words(caesar_doc)
miltonwords = bag_of_words(milton_doc)

# Combine bags to create a set of unique words.
common_words = set(caesarwords + miltonwords)

# How many words we got?
len(common_words)

3410

In [12]:
# Create our data frame with features. This can take a while to run.
word_counts = bow_features(sentences, common_words)
word_counts.head()

Processing row 0
Processing row 500
Processing row 1000
Processing row 1500
Processing row 2000
Processing row 2500
Processing row 3000
Processing row 3500
Processing row 4000
Processing row 4500


Unnamed: 0,spend,veil,rag,pile,derive,should,constant,indeed,yours,consort,...,sion,backe,fault,speech,past,sky,line,continent,text_sentence,text_source
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(Enter, Flauius, ,, Murellus, ,, and, certaine...",Shakes
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(Flauius, .)",Shakes
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(Hence, :, home, you, idle, Creatures, ,, get,...",Shakes
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(Is, this, a, Holiday, ?)",Shakes
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(What, ,, know, you, not, (, Being, Mechanical...",Shakes


In [16]:
# random forest fitting 
rfc = ensemble.RandomForestClassifier()
Y = word_counts['text_source']
X = np.array(word_counts.drop(['text_sentence','text_source'], 1))

X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    Y,
                                                    test_size=0.4)
train = rfc.fit(X_train, y_train)

print('Training set score:', rfc.score(X_train, y_train))
print('\nTest set score:', rfc.score(X_test, y_test))

Training set score: 0.987513378523011

Test set score: 0.8631016042780749


In [17]:
# Overfitting has occurred but test set still did okay.
# Lets cross validate.

cross_val_score(rfc, X_train, y_train, cv=5)

array([0.85026738, 0.84313725, 0.84313725, 0.875     , 0.84464286])

In [18]:
# Gradient Boosting.
clf = ensemble.GradientBoostingClassifier()

# Fit the model
fit_clf = clf.fit(X_train, y_train)

print('Training set score:', clf.score(X_train, y_train))
print('\nTest set score:', clf.score(X_test, y_test))

Training set score: 0.833749554049233

Test set score: 0.8197860962566845


In [19]:
# Scores are more consistent. Overfitting hasn't seem to have occurred.
cross_val_score(clf, X_train, y_train, cv=5)

array([0.80926916, 0.80035651, 0.79322638, 0.8125    , 0.80714286])

In [20]:
# Logistic Regression Model
lr = LogisticRegression()
train = lr.fit(X_train, y_train)
print(X_train.shape, y_train.shape)
print('Training set score:', lr.score(X_train, y_train))
print('\nTest set score:', lr.score(X_test, y_test))

(2803, 3410) (2803,)
Training set score: 0.9750267570460222

Test set score: 0.9160427807486631


In [21]:
# Looks like overfitting has occurred. I will attempt to fix that later. 
cross_val_score(lr, X_train, y_train, cv=5)

array([0.89661319, 0.89839572, 0.91622103, 0.92142857, 0.91964286])

# Improving the Gradient Boosting Model.

In [26]:
from sklearn.model_selection import GridSearchCV
# Use GS-CV in order to find the optimal parameters.
clf_parameters = {
             'n_estimators':[100,200,500,1000],
              'max_depth':[2,4,6,8],
              'max_features':[2,4,6,8]
}

clf_grid = GridSearchCV(clf, clf_parameters, cv=5, verbose=1, n_jobs=-1)

#Fit the logistic regression 
clf_grid.fit(X_train, y_train)

#return best parameters and best score
print('Best parameters:')
print(clf_grid.best_params_)
print('Best Score:')
print(clf_grid.best_score_)

Fitting 5 folds for each of 64 candidates, totalling 320 fits


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   20.6s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:  2.0min
[Parallel(n_jobs=-1)]: Done 320 out of 320 | elapsed:  4.3min finished


Best parameters:
{'max_depth': 6, 'max_features': 4, 'n_estimators': 1000}
Best Score:
0.9332857652515162


In [28]:
# Presto Mago! Improved the score by increasing iterations and features.
clf = ensemble.GradientBoostingClassifier(n_estimators=1000,
                                         max_depth=6, max_features=4)

cross_val_score(clf, X_train, y_train, cv=5)

array([0.91800357, 0.93582888, 0.94652406, 0.93214286, 0.93571429])

In [29]:
# Scores are consistent but lets try it on a test set.
cross_val_score(clf, X_test, y_test, cv=5)

array([0.90933333, 0.91176471, 0.90374332, 0.93582888, 0.94101877])

I was able to increase the performance of the gradient boosting model after using GridSearch CV to find the optimal parameters. The test set also showed similar results without any of the individual samples overfitting. It performed better by having more iterations and having more depth and features in order to explain more variance. 

# Tf-idf

In [30]:
# Create the tf-idf function
def document_freq(data, sentences, common_words, doc_names, doc_words):
    
    # initialize df
    df = pd.DataFrame(columns = common_words)
    df.iloc[:, 0] = [0, 0, 0, 0, 0]
    df.loc[:, common_words] = 0
    df.rename(index={0:'df', 1:'cf', 2:'idf', 3:'Shakes', 4:'Milton'}, inplace=True)
    
    for word in common_words:
        # find document frequency & collection frequency
        df.loc['df', word] = data[data[word] > 0][word].count()
        df.loc['cf', word] = data.loc[:, word].sum()
        
        # find idf
        df.loc['idf', word] = np.log2(len(sentences)/df.loc['df', word])
        
    # assign the idf value to the documents
    for word in df.columns:
        for i in range(len(doc_names)):
            if word in doc_words[i]:
                df.loc[doc_names[i], word] = df.loc['idf', word]
        
    return df

In [32]:
# Create arrays to identify and hold my words.
doc_names = ['Shakes', 'Milton']
doc_words = [caesarwords, miltonwords]
tf_idf = document_freq(word_counts, sentences, common_words, doc_names, doc_words)
tf_idf

Unnamed: 0,spend,veil,rag,pile,derive,should,constant,indeed,yours,consort,...,howting,fix,sion,backe,fault,speech,past,sky,line,continent
df,11.0,7.0,8.0,6.0,5.0,15.0,9.0,4.0,2.0,9.0,...,1.0,26.0,6.0,10.0,15.0,16.0,26.0,23.0,5.0,5.0
cf,11.0,7.0,8.0,6.0,5.0,15.0,9.0,4.0,2.0,9.0,...,1.0,28.0,6.0,10.0,15.0,16.0,26.0,24.0,5.0,5.0
idf,8.730702,9.382778,9.190133,9.605171,9.868205,8.283243,9.020208,10.190133,11.190133,9.020208,...,12.190133,7.489694,9.605171,8.868205,8.283243,8.190133,7.489694,7.666571,9.868205,9.868205
Shakes,0.0,0.0,0.0,0.0,0.0,8.283243,9.020208,10.190133,11.190133,0.0,...,12.190133,0.0,0.0,8.868205,8.283243,8.190133,7.489694,0.0,0.0,0.0
Milton,8.730702,9.382778,9.190133,9.605171,9.868205,8.283243,9.020208,0.0,0.0,9.020208,...,0.0,7.489694,9.605171,0.0,8.283243,8.190133,7.489694,7.666571,9.868205,9.868205


In [34]:
# Let's make it so that the rows become the columns. 
tf_idf = tf_idf.T
tf_idf.head()

Unnamed: 0,df,cf,idf,Shakes,Milton
spend,11.0,11.0,8.730702,0.0,8.730702
veil,7.0,7.0,9.382778,0.0,9.382778
rag,8.0,8.0,9.190133,0.0,9.190133
pile,6.0,6.0,9.605171,0.0,9.605171
derive,5.0,5.0,9.868205,0.0,9.868205


In [40]:
# Set up a threshold to count and see which word belongs where.
threshold = 5
tf_idf['Shakes_threshold'] = 0
tf_idf['Milton_threshold'] = 0

tf_idf['Shakes_threshold'] = np.where(tf_idf['Shakes'] > threshold, 1, 0)
tf_idf['Milton_threshold'] = np.where(tf_idf['Milton'] > threshold, 1, 0)

tf_idf.head()

Unnamed: 0,df,cf,idf,Shakes,Milton,Shakes_threshold,Milton_threshold
spend,11.0,11.0,8.730702,0.0,8.730702,0,1
veil,7.0,7.0,9.382778,0.0,9.382778,0,1
rag,8.0,8.0,9.190133,0.0,9.190133,0,1
pile,6.0,6.0,9.605171,0.0,9.605171,0,1
derive,5.0,5.0,9.868205,0.0,9.868205,0,1


In [49]:
# Set up a way to determine which word goes into which group.
# default with both
tf_idf['source'] = 'both'

# Create a method
def determine_who(df):
    # Create a loop that iterates through each row and determines where it goes.
    for i in range(len(df)):
        # make a counter
        flag = 0
        source = 'Both'
        
        if (df.iloc[i, 5] == 1):
            flag = 1
            source = 'Shakes'
           
        if (df.iloc[i, 6] == 1):
            if (flag == 1):
                continue
            flag = 1
            source = 'Milton'
            
        df.iloc[i, 7] = source
        
    return df
            

In [50]:
# Was testing the output and accidentally created another row. 
tf_idf_test = determine_who(tf_idf)

tf_idf_test.head(30)

Unnamed: 0,df,cf,idf,Shakes,Milton,Shakes_threshold,Milton_threshold,source,Source
spend,11.0,11.0,8.730702,0.0,8.730702,0,1,Milton,Both
veil,7.0,7.0,9.382778,0.0,9.382778,0,1,Milton,Both
rag,8.0,8.0,9.190133,0.0,9.190133,0,1,Milton,Both
pile,6.0,6.0,9.605171,0.0,9.605171,0,1,Milton,Both
derive,5.0,5.0,9.868205,0.0,9.868205,0,1,Milton,Both
should,15.0,15.0,8.283243,8.283243,8.283243,1,1,both,Both
constant,9.0,9.0,9.020208,9.020208,9.020208,1,1,both,Both
indeed,4.0,4.0,10.190133,10.190133,0.0,1,0,Shakes,Both
yours,2.0,2.0,11.190133,11.190133,0.0,1,0,Shakes,Both
consort,9.0,9.0,9.020208,0.0,9.020208,0,1,Milton,Both


In [53]:
# Finally time to test the models! 
# Drop everything except for the tf-idf values
rfc = ensemble.RandomForestClassifier()
Y2 = tf_idf['source']
X2 = tf_idf.drop(['source', 'Source', 'Shakes_threshold',
                'Milton_threshold' ,'Shakes', 'Milton'], axis=1)

X2_train, X2_test, y2_train, y2_test = train_test_split(X2, 
                                                    Y2,
                                                    test_size=0.3)
train = rfc.fit(X2_train, y2_train)

print('Training set score:', rfc.score(X2_train, y2_train))
print('\nTest set score:', rfc.score(X2_test, y2_test))

Training set score: 0.7930456640134059

Test set score: 0.7370478983382209


In [54]:
# Cross validate the rfc model. 
cross_val_score(rfc, X2_train, y2_train, cv=5)

array([0.74686192, 0.76150628, 0.72803347, 0.74213836, 0.74579832])

In [55]:
# logistic regression fitting
lr = LogisticRegression()
train = lr.fit(X2_train, y2_train)
print(X2_train.shape, y2_train.shape)
print('Training set score:', lr.score(X2_train, y2_train))
print('\nTest set score:', lr.score(X2_test, y2_test))

(2387, 3) (2387,)
Training set score: 0.7452869710934227

Test set score: 0.7165200391006843


In [56]:
cross_val_score(lr, X2_train, y2_train, cv=5)

array([0.73012552, 0.74476987, 0.75941423, 0.74004193, 0.75210084])

I was originally going to improve the accuracy of this model, but given how there aren't many rows or many columns, neither regularization methods will improve the accuracy of the model. 

In [59]:
clf2 = ensemble.GradientBoostingClassifier()

# Gradient Boosting Model.
train = clf2.fit(X2_train, y2_train)

print('Training set score:', clf2.score(X2_train, y2_train))
print('\nTest set score:', clf2.score(X2_test, y2_test))

Training set score: 0.7855048177628823

Test set score: 0.739980449657869


In [60]:
cross_val_score(clf2, X2_train, y2_train, cv=5)

array([0.75941423, 0.77824268, 0.74267782, 0.75262055, 0.75630252])

In [65]:
# Lets improve the GB model since it will be faster than the random forest. 
# Use GS-CV in order to find the optimal parameters.
clf_parameters = {
             'n_estimators':[100,200,500,1000],
              'max_depth':[2,4,6,8],
             'max_features':['auto']
}

clf_grid = GridSearchCV(clf, clf_parameters, cv=5, verbose=1, n_jobs=-1)

#Fit the logistic regression 
clf_grid.fit(X2_train, y2_train)

#return best parameters and best score
print('Best parameters:')
print(clf_grid.best_params_)
print('Best Score:')
print(clf_grid.best_score_)

Fitting 5 folds for each of 16 candidates, totalling 80 fits


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    9.7s
[Parallel(n_jobs=-1)]: Done  80 out of  80 | elapsed:   40.5s finished


Best parameters:
{'max_depth': 2, 'max_features': 'auto', 'n_estimators': 100}
Best Score:
0.7612065354000838


# Write Up

So overall, my BoW models did better than my tf-idf model. Originally, I thought that BoW did better than my tf-idf model due to the fact that the BoW model had more data, but it only has 500 more rows which leads me to believe that that's only a minor reason as to why the BoW model did better. 

My hypothesis on why my tf-idf model failed to perform is that not only was there less data for the tf model to use, but I also removed the classifiers for it so I made it even harder for my model to correctly predict it. However, they were still able to get it about 75% right which isn't too bad. Maybe it would have done better if I had performed a LSA on the tf-idf model.  