In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import scipy
import sklearn
import spacy
import matplotlib.pyplot as plt
import seaborn as sns
import re
from nltk.corpus import gutenberg, stopwords
from collections import Counter
import nltk

[nltk_data] Downloading package gutenberg to
[nltk_data]     /Users/jdobrow/nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!


[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')
[38;5;2m✔ Linking successful[0m
/anaconda3/lib/python3.7/site-packages/en_core_web_sm -->
/anaconda3/lib/python3.7/site-packages/spacy/data/en
You can now load the model via spacy.load('en')


In [2]:
def text_cleaner(text):
    text = re.sub(r'--',' ',text)
    text = re.sub("[\[].*?[\]]", "", text)
    text = ' '.join(text.split())
    return text
    
persuasion = gutenberg.raw('austen-persuasion.txt')
alice = gutenberg.raw('carroll-alice.txt')

persuasion = re.sub(r'Chapter \d+', '', persuasion)
alice = re.sub(r'CHAPTER .*', '', alice)
    
alice = text_cleaner(alice[:int(len(alice)/4)])
persuasion = text_cleaner(persuasion[:int(len(persuasion)/4)])

In [5]:
nlp = spacy.load('en')
alice_doc = nlp(alice)
persuasion_doc = nlp(persuasion)

alice_sents = [[sent, "Carroll"] for sent in alice_doc.sents]
persuasion_sents = [[sent, "Austen"] for sent in persuasion_doc.sents]

sentences = pd.DataFrame(alice_sents + persuasion_sents)
sentences.head()

Unnamed: 0,0,1
0,"(Alice, was, beginning, to, get, very, tired, ...",Carroll
1,"(So, she, was, considering, in, her, own, mind...",Carroll
2,"(There, was, nothing, so, VERY, remarkable, in...",Carroll
3,"(Oh, dear, !)",Carroll
4,"(I, shall, be, late, !, ')",Carroll


In [6]:
def bag_of_words(text):

    allwords = [token.lemma_
                for token in text
                if not token.is_punct
                and not token.is_stop]
    
    return [item[0] for item in Counter(allwords).most_common(2500)]
    
def bow_features(sentences, common_words):
    
    df = pd.DataFrame(columns=common_words)
    df['text_sentence'] = sentences[0]
    df['text_source'] = sentences[1]
    df.loc[:, common_words] = 0
    
    for i, sentence in enumerate(df['text_sentence']):
        
        words = [token.lemma_
                 for token in sentence
                 if (
                     not token.is_punct
                     and not token.is_stop
                     and token.lemma_ in common_words
                 )]
        
        for word in words:
            df.loc[i, word] += 1
        
        if i % 100 == 0:
            print("Processing row {}".format(i))
            
    return df

alicewords = bag_of_words(alice_doc)
persuasionwords = bag_of_words(persuasion_doc)

common_words = set(alicewords + persuasionwords)

In [7]:
word_counts = bow_features(sentences, common_words)
word_counts.head()

Processing row 0
Processing row 100
Processing row 200
Processing row 300
Processing row 400
Processing row 500
Processing row 600
Processing row 700
Processing row 800
Processing row 900
Processing row 1000
Processing row 1100


Unnamed: 0,heart,parting,stoop,going,aunt,push,unluckily,Latitude,impatient,mild,...,delay,musical,unhappy,constantly,drawing,relieve,pleasing,fetch,text_sentence,text_source
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(Alice, was, beginning, to, get, very, tired, ...",Carroll
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(So, she, was, considering, in, her, own, mind...",Carroll
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(There, was, nothing, so, VERY, remarkable, in...",Carroll
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(Oh, dear, !)",Carroll
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(I, shall, be, late, !, ')",Carroll


In [8]:
from sklearn import ensemble
from sklearn.model_selection import train_test_split

rfc = ensemble.RandomForestClassifier()
Y = word_counts['text_source']
X = np.array(word_counts.drop(['text_sentence','text_source'], 1))

X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    Y,
                                                    test_size=0.4,
                                                    random_state=0)
train = rfc.fit(X_train, y_train)

print('Training set score:', rfc.score(X_train, y_train))
print('\nTest set score:', rfc.score(X_test, y_test))

Training set score: 0.9887640449438202

Test set score: 0.8989473684210526




In [9]:
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression(penalty='l2')
train = lr.fit(X_train, y_train)
print(X_train.shape, y_train.shape)
print('Training set score:', lr.score(X_train, y_train))
print('\nTest set score:', lr.score(X_test, y_test))

(712, 2641) (712,)
Training set score: 0.9705056179775281

Test set score: 0.9284210526315789




In [10]:
clf = ensemble.GradientBoostingClassifier(n_estimators=500)
train = clf.fit(X_train, y_train)

print('Training set score:', clf.score(X_train, y_train))
print('\nTest set score:', clf.score(X_test, y_test))

Training set score: 0.9887640449438202

Test set score: 0.92


In [11]:
## CHALLENGE 0: Improve test accuracy to over 90% with cross validation

In [13]:
from sklearn.model_selection import cross_val_score
clf2 = ensemble.GradientBoostingClassifier(n_estimators=500)
cross_val_score(clf2, X, Y, cv=6)

array([0.86432161, 0.86363636, 0.88888889, 0.92424242, 0.93908629,
       0.9035533 ])

In [21]:
from sklearn.svm import SVC
classif = SVC(C=100, gamma=.001)
classif.fit(X_train, y_train)

print('Training set score:', classif.score(X_train, y_train))
print('\nTest set score:', classif.score(X_test, y_test))

Training set score: 0.9662921348314607

Test set score: 0.9221052631578948


In [26]:
classif2 = SVC(C=1000, gamma=0.0001)
cv = 6
results = cross_val_score(classif2, X, Y, cv=cv)
print(results)
print(sum(results)/cv)

[0.89447236 0.88383838 0.90909091 0.9040404  0.92893401 0.9035533 ]
0.9039882280705688


In [27]:
## Compare to another work

In [70]:
alice = gutenberg.raw('carroll-alice.txt')
bible = gutenberg.raw('bible-kjv.txt')

alice = re.sub(r'CHAPTER .*', '', alice)
bible = re.sub('\d', '', bible)
bible = re.sub('\s\:\s', ' ', bible)

# Keeping them about the same length
alice = text_cleaner(alice[:int(len(alice)/7)])
bible = text_cleaner(bible[:int(len(bible)/200)])

In [73]:
nlp = spacy.load('en')
alice_doc = nlp(alice)
bible_doc = nlp(bible)

alice_sents = [[sent, "Carroll"] for sent in alice_doc.sents]
bible_sents = [[sent, "Bible"] for sent in bible_doc.sents]

sentences = pd.DataFrame(alice_sents + bible_sents)
sentences.head()

Unnamed: 0,0,1
0,"(Alice, was, beginning, to, get, very, tired, ...",Carroll
1,"(So, she, was, considering, in, her, own, mind...",Carroll
2,"(There, was, nothing, so, VERY, remarkable, in...",Carroll
3,"(Oh, dear, !)",Carroll
4,"(I, shall, be, late, !, ')",Carroll


In [74]:
alicewords = bag_of_words(alice_doc)
biblewords = bag_of_words(bible_doc)

common_words = set(alicewords + biblewords)

In [75]:
word_counts = bow_features(sentences, common_words)
word_counts.head()

Processing row 0
Processing row 100
Processing row 200
Processing row 300


Unnamed: 0,Seth,wander,drown,worth,heart,eastward,crocodile,findeth,eighty,rat,...,eye,tire,half,hoarse,name,compasseth,moment,William,text_sentence,text_source
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(Alice, was, beginning, to, get, very, tired, ...",Carroll
1,0,0,0,1,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,"(So, she, was, considering, in, her, own, mind...",Carroll
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(There, was, nothing, so, VERY, remarkable, in...",Carroll
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(Oh, dear, !)",Carroll
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(I, shall, be, late, !, ')",Carroll


In [76]:
rfc = ensemble.RandomForestClassifier()
Y = word_counts['text_source']
X = np.array(word_counts.drop(['text_sentence','text_source'], 1))

X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    Y,
                                                    test_size=0.4,
                                                    random_state=0)
train = rfc.fit(X_train, y_train)

print('Training set score:', rfc.score(X_train, y_train))
print('\nTest set score:', rfc.score(X_test, y_test))

Training set score: 0.9955555555555555

Test set score: 0.9337748344370861




In [77]:
classif = SVC(C=100, gamma=.001)
classif.fit(X_train, y_train)

print('Training set score:', classif.score(X_train, y_train))
print('\nTest set score:', classif.score(X_test, y_test))

Training set score: 0.9955555555555555

Test set score: 0.9337748344370861


In [78]:
classif2 = SVC(C=100, gamma=0.001)
cv = 6
results = cross_val_score(classif2, X, Y, cv=cv)
print(results)
print(sum(results)/cv)

[0.96875    0.859375   0.93548387 0.90322581 0.90322581 0.9516129 ]
0.9202788978494624
