In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import scipy
import sklearn
import spacy
import matplotlib.pyplot as plt
import seaborn as sns
import re
from nltk.corpus import gutenberg, stopwords
from collections import Counter
import nltk
from sklearn.model_selection import train_test_split

#import warnings filter - ignore all future warnings
from warnings import simplefilter
simplefilter(action='ignore', category=FutureWarning)

In [2]:
#listing books to choose from
gutenberg.fileids()

['austen-emma.txt',
 'austen-persuasion.txt',
 'austen-sense.txt',
 'bible-kjv.txt',
 'blake-poems.txt',
 'bryant-stories.txt',
 'burgess-busterbrown.txt',
 'carroll-alice.txt',
 'chesterton-ball.txt',
 'chesterton-brown.txt',
 'chesterton-thursday.txt',
 'edgeworth-parents.txt',
 'melville-moby_dick.txt',
 'milton-paradise.txt',
 'shakespeare-caesar.txt',
 'shakespeare-hamlet.txt',
 'shakespeare-macbeth.txt',
 'whitman-leaves.txt']

# Data cleaning, Processing & Language Parsing

In [3]:
# Load the data.
brown = gutenberg.raw('chesterton-brown.txt')
parents = gutenberg.raw('edgeworth-parents.txt')

In [4]:
#check
#brown
#parents

In [5]:
# Function for text cleaning.
def text_cleaner(text):
    # Visual inspection identifies a form of punctuation spaCy does not
    # recognize: the double dash '--'.  Better get rid of it now!
    text = re.sub(r'--',' ',text)
    text = re.sub("[\[].*?[\]]", "", text)
    text = re.sub('\'','', text)
    text = re.sub('`','', text)
    text = ' '.join(text.split())
    return text

In [6]:
print(len(brown))
print(len(parents))

406629
935158


In [7]:
#cutting down to about first 20k words & cleaning 
brown = text_cleaner(brown[:20000])
parents = text_cleaner(parents[:20000])

In [8]:
#check
#brown
#parents

#print(len(brown))
#print(len(parents))

print(len(brown)/(len(brown)+len(parents)))
print(len(parents)/(len(brown)+len(parents)))

0.501536334306345
0.49846366569365497


In [9]:
# Parse the cleaned novels.
nlp = spacy.load('en')
brown_doc = nlp(brown)
parents_doc = nlp(parents)

In [10]:
# Group into sentences.
brown_sents = [[sent, "Chesterton"] for sent in brown_doc.sents]
parents_sents = [[sent, "Edgeworth"] for sent in parents_doc.sents]

# Combine the sentences from the two novels into one data frame.
sentences = pd.DataFrame(brown_sents + parents_sents)

In [11]:
#check
#sentences

# Feature Creation - Bag of Words

In [12]:
# Utility function to create a list of the 2000 most common words.
def bag_of_words(text):
    
    # Filter out punctuation and stop words.
    allwords = [token.lemma_
                for token in text
                if not token.is_punct
                and not token.is_stop]
    
    # Return the most common words.
    return [item[0] for item in Counter(allwords).most_common(2000)]
    

# Creates a data frame with features for each word in our common word set.
# Each value is the count of the times the word appears in each sentence.
def bow_features(sentences, common_words):
    
    # Scaffold the data frame and initialize counts to zero.
    df = pd.DataFrame(columns=common_words)
    df['text_sentence'] = sentences[0]
    df['text_source'] = sentences[1]
    df.loc[:, common_words] = 0
    
    # Process each row, counting the occurrence of words in each sentence.
    for i, sentence in enumerate(df['text_sentence']):
        
        # Convert the sentence to lemmas, then filter out punctuation,
        # stop words, and uncommon words.
        words = [token.lemma_
                 for token in sentence
                 if (
                     not token.is_punct
                     and not token.is_stop
                     and token.lemma_ in common_words
                 )]
        
        # Populate the row with word counts.
        for word in words:
            df.loc[i, word] += 1
        
        # This counter is just to make sure the kernel didn't hang.
        if i % 50 == 0:
            print("Processing row {}".format(i))
            
    return df

In [13]:
# Set up the bags.
brownwords = bag_of_words(brown_doc)
parentswords = bag_of_words(parents_doc)

# Combine bags to create a set of unique words.
common_words = set(brownwords + parentswords)

In [14]:
# Create our data frame with features. This can take a while to run.
word_counts = bow_features(sentences, common_words)
word_counts.head()

Processing row 0
Processing row 50
Processing row 100
Processing row 150
Processing row 200
Processing row 250
Processing row 300


Unnamed: 0,Marys,ballad,pasture,town,regard,activity,dim,charge,Anne,hinder,...,produce,resolve,migration,detective,good,people,straggli,fond,text_sentence,text_source
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,(I.),Chesterton
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(The, Absence, of, Mr, Glass)",Chesterton
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(THE, consulting, -, rooms, of, Dr, Orion, Hoo...",Chesterton
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(In, such, a, place, the, sea, had, something,...",Chesterton
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(It, must, not, be, supposed, that, Dr, Hoods,...",Chesterton


In [15]:
word_counts = word_counts.drop(columns=['text_sentence'])

# Feature Creation - TF-IDF

In [16]:
#reading in the data, this time in the form of paragraphs
brown=gutenberg.sents('chesterton-brown.txt')
parents=gutenberg.sents('edgeworth-parents.txt')

#processing
brown_sentences=[]
for sentence in brown:
    sent = sentence[0]
    sent = [re.sub(r'--',' ',word) for word in sent]
    sent = [re.sub("[\[].*?[\]]", "", word) for word in sent]
    sent = [re.sub('\'','', word) for word in sent]
    sent = [re.sub('"','', word) for word in sent]
    sent = [re.sub('`','', word) for word in sent]
    #Forming each paragraph into a string and adding it to the list of strings.
    brown_sentences.append(''.join(sent))

#processing
parents_sentences=[]
for sentence in parents:
    sent=sentence[0]
    sent = [re.sub(r'--',' ',word) for word in sent]
    sent = [re.sub("[\[].*?[\]]", "", word) for word in sent]
    sent = [re.sub('\'','', word) for word in sent]
    sent = [re.sub('"','', word) for word in sent]
    sent = [re.sub('`','', word) for word in sent]
    #Forming each paragraph into a string and adding it to the list of strings.
    parents_sentences.append(''.join(sent))


In [17]:
#check
#print(brown_sentences[0:5])
#print(parents_sentences[0:5])

In [18]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(max_df=0.5, 
                             min_df=2, 
                             stop_words='english', 
                             lowercase=True, 
                             use_idf=True,
                             norm=u'l2', 
                             smooth_idf=True 
                            )

In [19]:
brown_tfidf = vectorizer.fit_transform(brown_sentences)

brown_feature_names = vectorizer.get_feature_names()

dense = brown_tfidf.todense()
denselist = dense.tolist()

df_brown = pd.DataFrame(denselist, columns=brown_feature_names)

In [20]:
parents_tfidf = vectorizer.fit_transform(parents_sentences)

parents_feature_names = vectorizer.get_feature_names()

dense = parents_tfidf.todense()
denselist = dense.tolist()

df_parents = pd.DataFrame(denselist, columns=parents_feature_names)

In [21]:
print(df_brown.shape)
print(df_parents.shape)

(3806, 68)
(10230, 236)


In [22]:
#Adding author column 
df_brown['text_source'] = "Chesterton"
df_parents['text_source'] = "Edgeworth"

In [23]:
df_tfidf = pd.concat([df_brown, df_parents], sort=False)

In [24]:
df_tfidf.shape

(14036, 280)

In [25]:
df_tfidf = df_tfidf.fillna(0)

In [26]:
#check

#df_tfidf['text_source']
#df_tfidf

# Fitting Supervised Learning Models

In [27]:
x_bow = word_counts.drop(columns=['text_source'])
y_bow = word_counts['text_source']

x_tfidf = df_tfidf.drop(columns=['text_source'])
y_tfidf = df_tfidf['text_source']

X_bow_train, X_bow_test, y_bow_train, y_bow_test = train_test_split(x_bow, y_bow, test_size=0.2, random_state=0)
X_tfidf_train, X_tfidf_test, y_tfidf_train, y_tfidf_test = train_test_split(x_tfidf, y_tfidf, test_size=0.2, random_state=0)

# Random Forest

In [28]:
from sklearn import ensemble

rfc = ensemble.RandomForestClassifier(random_state=1)

#bow
bow_train = rfc.fit(X_bow_train, y_bow_train)

print('Training set score:', rfc.score(X_bow_train, y_bow_train))
print('\nTest set score:', rfc.score(X_bow_test, y_bow_test))

Training set score: 0.9555555555555556

Test set score: 0.8088235294117647


In [29]:
#tfidf
tfidf_train = rfc.fit(X_tfidf_train, y_tfidf_train)

print('Training set score:', rfc.score(X_tfidf_train, y_tfidf_train))
print('\nTest set score:', rfc.score(X_tfidf_test, y_tfidf_test))

Training set score: 0.7418952618453866

Test set score: 0.7581908831908832


# Logistic Regression

In [30]:
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression() 

#bow
bow_train = lr.fit(X_bow_train, y_bow_train)

print('Training set score:', lr.score(X_bow_train, y_bow_train))
print('\nTest set score:', lr.score(X_bow_test, y_bow_test))

Training set score: 0.9481481481481482

Test set score: 0.8529411764705882


In [31]:
#tfidf
tfidf_train = lr.fit(X_tfidf_train, y_tfidf_train)

print('Training set score:', lr.score(X_tfidf_train, y_tfidf_train))
print('\nTest set score:', lr.score(X_tfidf_test, y_tfidf_test))

Training set score: 0.7410046312789454

Test set score: 0.7549857549857549


# Gradient Boosting

In [32]:
clf = ensemble.GradientBoostingClassifier()

#bow
bow_train = clf.fit(X_bow_train, y_bow_train)

print('Training set score:', clf.score(X_bow_train, y_bow_train))
print('\nTest set score:', clf.score(X_bow_test, y_bow_test))

Training set score: 0.9518518518518518

Test set score: 0.8235294117647058


In [33]:
#tfidf
tfidf_train = clf.fit(X_tfidf_train, y_tfidf_train)

print('Training set score:', clf.score(X_tfidf_train, y_tfidf_train))
print('\nTest set score:', clf.score(X_tfidf_test, y_tfidf_test))

Training set score: 0.7353045956537229

Test set score: 0.7535612535612536


# Evaluation of Models

In [34]:
from sklearn.model_selection import cross_val_score
import statistics

cvs_rfc_bow = cross_val_score(rfc, x_bow, y_bow, cv=10)
cvs_rfc_bow = ["%.2f"%i for i in cvs_rfc_bow]
cvs_rfc_bow = list(map(float, cvs_rfc_bow))

print('Random Forest Classifier - BOW')
print('Cross Validation Score: {}'.format(cvs_rfc_bow))
print('Average: {0:.2f}'.format(statistics.mean(cvs_rfc_bow)))

Random Forest Classifier - BOW
Cross Validation Score: [0.77, 0.71, 0.88, 0.76, 0.79, 0.94, 0.74, 0.82, 0.76, 0.91]
Average: 0.81


In [35]:
cvs_rfc_tfidf = cross_val_score(rfc, x_tfidf, y_tfidf, cv=10)
cvs_rfc_tfidf = ["%.2f"%i for i in cvs_rfc_tfidf]
cvs_rfc_tfidf = list(map(float, cvs_rfc_tfidf))

print('Random Forest Classifier - TFIDF')
print('Cross Validation Score: {}'.format(cvs_rfc_tfidf))
print('Average: {0:.2f}'.format(statistics.mean(cvs_rfc_tfidf)))

Random Forest Classifier - TFIDF
Cross Validation Score: [0.74, 0.74, 0.75, 0.74, 0.73, 0.74, 0.74, 0.74, 0.74, 0.74]
Average: 0.74


In [36]:
cvs_lr_bow = cross_val_score(lr, x_bow, y_bow, cv=10)
cvs_lr_bow = ["%.2f"%i for i in cvs_lr_bow]
cvs_lr_bow = list(map(float, cvs_lr_bow))

print('Logistic Regression - BOW')
print('Cross Validation Score: {}'.format(cvs_lr_bow))
print('Average: {0:.2f}'.format(statistics.mean(cvs_lr_bow)))

Logistic Regression - BOW
Cross Validation Score: [0.8, 0.76, 0.85, 0.76, 0.82, 0.94, 0.68, 0.85, 0.88, 0.94]
Average: 0.83


In [37]:
cvs_lr_tfidf = cross_val_score(lr, x_tfidf, y_tfidf, cv=10)
cvs_lr_tfidf = ["%.2f"%i for i in cvs_lr_tfidf]
cvs_lr_tfidf = list(map(float, cvs_lr_tfidf))

print('Logistic Regression - TFIDF')
print('Cross Validation Score: {}'.format(cvs_lr_tfidf))
print('Average: {0:.2f}'.format(statistics.mean(cvs_lr_tfidf)))

Logistic Regression - TFIDF
Cross Validation Score: [0.74, 0.74, 0.75, 0.74, 0.73, 0.74, 0.74, 0.74, 0.73, 0.74]
Average: 0.74


In [38]:
cvs_clf_bow = cross_val_score(clf, x_bow, y_bow, cv=10)
cvs_clf_bow = ["%.2f"%i for i in cvs_clf_bow]
cvs_clf_bow = list(map(float, cvs_clf_bow))

print('Gradient Boosting - BOW')
print('Cross Validation Score: {}'.format(cvs_clf_bow))
print('Average: {0:.2f}'.format(statistics.mean(cvs_clf_bow)))

Gradient Boosting - BOW
Cross Validation Score: [0.71, 0.71, 0.82, 0.71, 0.82, 0.88, 0.68, 0.82, 0.91, 0.85]
Average: 0.79


In [39]:
cvs_clf_tfidf = cross_val_score(clf, x_tfidf, y_tfidf, cv=10)
cvs_clf_tfidf = ["%.2f"%i for i in cvs_clf_tfidf]
cvs_clf_tfidf = list(map(float, cvs_clf_tfidf))

print('Gradient Boosting - TFIDF')
print('Cross Validation Score: {}'.format(cvs_clf_tfidf))
print('Average: {0:.2f}'.format(statistics.mean(cvs_clf_tfidf)))

Gradient Boosting - TFIDF
Cross Validation Score: [0.74, 0.73, 0.74, 0.74, 0.73, 0.74, 0.74, 0.74, 0.73, 0.74]
Average: 0.74


# Improving a Model

In [45]:
# Perform Grid-Search
from sklearn.model_selection import GridSearchCV

param_grid={
    'n_estimators': (10, 50, 100, 1000),
    'max_depth': range(2,7),
    'min_samples_split': range(2,5)
        }

gsc = GridSearchCV(
    estimator=rfc,
    param_grid = param_grid,
    cv=10, 
    verbose=0, 
    n_jobs=-1)

In [46]:
grid_result = gsc.fit(x_bow, y_bow)



In [47]:
best_params = grid_result.best_params_
print(best_params)

{'max_depth': 6, 'min_samples_split': 2, 'n_estimators': 1000}


In [57]:
# Model using grid search best paramters
rfc_bp = ensemble.RandomForestClassifier(
    n_estimators=best_params["n_estimators"],
    max_depth=best_params["max_depth"],
    min_samples_split=best_params['min_samples_split']
    ,random_state=1
)

#bow
bow_train = rfc_bp.fit(X_bow_train, y_bow_train)

print('Training set score:', rfc_bp.score(X_bow_train, y_bow_train))
print('\nTest set score:', rfc_bp.score(X_bow_test, y_bow_test))

Training set score: 0.8333333333333334

Test set score: 0.8529411764705882
