In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import scipy
import sklearn
import spacy
import matplotlib.pyplot as plt
import seaborn as sns
import re
from nltk.corpus import gutenberg, stopwords

                the kernel may be left running.  Please let us know
                about your system (bitness, Python, etc.) at
                ipython-dev@scipy.org
  ipython-dev@scipy.org""")


In [2]:
print(gutenberg.fileids())

['austen-emma.txt', 'austen-persuasion.txt', 'austen-sense.txt', 'bible-kjv.txt', 'blake-poems.txt', 'bryant-stories.txt', 'burgess-busterbrown.txt', 'carroll-alice.txt', 'chesterton-ball.txt', 'chesterton-brown.txt', 'chesterton-thursday.txt', 'edgeworth-parents.txt', 'melville-moby_dick.txt', 'milton-paradise.txt', 'shakespeare-caesar.txt', 'shakespeare-hamlet.txt', 'shakespeare-macbeth.txt', 'whitman-leaves.txt']


### Cleaning

In [3]:
def text_cleaner(text):
    text = re.sub(r'--',' ',text)
    text = re.sub("[\[].*?[\]]", "", text)
    text = ' '.join(text.split())
    return text
    
# Load and clean the data.
caesar = gutenberg.raw('shakespeare-caesar.txt')
hamlet = gutenberg.raw('shakespeare-hamlet.txt')
macbeth = gutenberg.raw('shakespeare-macbeth.txt')

# The Chapter indicator is idiosyncratic
caesar = re.sub(r'Chapter \d+', '', caesar)
hamlet = re.sub(r'CHAPTER .*', '', hamlet)
macbeth = re.sub(r'CHAPTER .*', '', macbeth)
    
caesar = text_cleaner(caesar)
hamlet = text_cleaner(hamlet)
macbeth = text_cleaner(macbeth)

In [4]:
# pass a different version of spacy to only split sentences and not words
from spacy.pipeline import SentenceSegmenter

nlp = spacy.blank('en')
nlp.pipeline.append(('Segmenter', SentenceSegmenter(nlp.vocab)))

caesar_doc = nlp(caesar)
hamlet_doc = nlp(hamlet)
macbeth_doc = nlp(macbeth)

In [23]:
caesar_sents = [[sent.text, "Caesar"] for sent in caesar_doc.sents]
hamlet_sents = [[sent.text, "Hamlet"] for sent in hamlet_doc.sents]
macbeth_sents = [[sent.text, "Macbeth"] for sent in macbeth_doc.sents]

# Combine the sentences from the two novels into one data frame.
sentences = pd.DataFrame(caesar_sents + hamlet_sents + macbeth_sents, columns=['sentence','novel'])
sentences.head()

Unnamed: 0,sentence,novel
0,Actus Primus.,Caesar
1,Scoena Prima.,Caesar
2,"Enter Flauius, Murellus, and certaine Commoner...",Caesar
3,Flauius.,Caesar
4,"Hence: home you idle Creatures, get you home: ...",Caesar


### Count Vectorizer

In [28]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer()
X = vectorizer.fit_transform(sentences.sentence)
Y = sentences['novel']

In [29]:
# Random forest classifier model
from sklearn import ensemble
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.4, random_state=0)

rfc = ensemble.RandomForestClassifier()
train = rfc.fit(X_train, y_train)

print('\nTest set score:', rfc.score(X_test, y_test))

  from numpy.core.umath_tests import inner1d



Test set score: 0.7259570494864612


In [34]:
from sklearn.model_selection import cross_val_score
from sklearn.metrics import f1_score

cross_val_score(rfc, X_test, y_test, cv=5)

array([0.67906977, 0.6682243 , 0.72429907, 0.69392523, 0.68691589])

In [35]:
y_pred = rfc.predict(X_test)
f1_score(y_test,y_pred,average='weighted')

0.7227734876361174

In [36]:
# Logistic Regression model
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression()
train = lr.fit(X_train, y_train)
print('Training set score:', lr.score(X_train, y_train))
print('\nTest set score:', lr.score(X_test, y_test))

Training set score: 0.9470899470899471

Test set score: 0.7721755368814193


In [37]:
cross_val_score(rfc, X_test, y_test, cv=5)

array([0.66744186, 0.70560748, 0.71028037, 0.67757009, 0.68691589])

In [38]:
y_pred = rfc.predict(X_test)
f1_score(y_test,y_pred,average='weighted')

0.7227734876361174

### TF-IDF

In [39]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(sentences.sentence)
Y = sentences['novel']

In [40]:
# Random forest classifier model
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.4, random_state=0)

rfc = ensemble.RandomForestClassifier()
train = rfc.fit(X_train, y_train)

print('\nTest set score:', rfc.score(X_test, y_test))


Test set score: 0.7119514472455649


In [41]:
cross_val_score(rfc, X_test, y_test, cv=5)

array([0.65348837, 0.6682243 , 0.68224299, 0.67523364, 0.64953271])

In [42]:
y_pred = rfc.predict(X_test)
f1_score(y_test,y_pred,average='weighted')

0.7089465929735536

In [43]:
# Logistic Regression
lr = LogisticRegression()
train = lr.fit(X_train, y_train)
print('Training set score:', lr.score(X_train, y_train))
print('\nTest set score:', lr.score(X_test, y_test))

Training set score: 0.8991596638655462

Test set score: 0.7563025210084033


In [44]:
cross_val_score(rfc, X_test, y_test, cv=5)

array([0.66744186, 0.66121495, 0.70327103, 0.68691589, 0.67990654])

In [45]:
y_pred = rfc.predict(X_test)
f1_score(y_test,y_pred,average='weighted')

0.7089465929735536

Logistic regression working far better than random forest for both the vectorizer and tf_idf vectorizer based on accuracy scores. However, no differece in the f-scores between the two different models, which is surprising. 