In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import scipy
import sklearn
import spacy
import matplotlib.pyplot as plt
import seaborn as sns
import re
import nltk
from nltk.corpus import gutenberg, stopwords
from collections import Counter

from sklearn.model_selection import cross_val_score, train_test_split


# Supervised NLP

Supervised Natural Language Processing (NLP) invovles traing a model to label text.  The approach below involves cleaning the text, identifying the most commonly used words in each of the two texts in question and using the set of those two lists as features for the model.  Each text snippet is then tokenized and processed into a vector of 0s and 1s based on whether the snippet contains the feature-words.  Finally various classifiers are trained and tested.  

This is the simplest case however. Could we get better performance by expanding our feature set?

Additionally, one of the issues implicit in training a model like this is the extent to which it generalizes.  In the below investigation, I will start by training a model to discern between Lewis Carroll's _Alice in Wonderland_ and Jane Austen's _Persuasion_.  However, it begs the question(s), does _Persuasion_ generalize to Austen in general? Or _Alice_ to Lewis Carroll? Would a model trained on _Alice_ and _Persuasion_ still be able to classify Austen and Carroll if the Austen sample was from _Emma_? Let's probe.

In [3]:
# Utility function for standard text cleaning.
def text_cleaner(text):
    # Visual inspection identifies a form of punctuation spaCy does not
    # recognize: the double dash '--'.  Better get rid of it now!
    text = re.sub(r'--',' ',text)
    text = re.sub("[\[].*?[\]]", "", text)
    text = ' '.join(text.split())
    return text
    
# Load and clean the data.
persuasion = gutenberg.raw('austen-persuasion.txt')
alice = gutenberg.raw('carroll-alice.txt')

# The Chapter indicator is idiosyncratic
persuasion_base = re.sub(r'Chapter \d+', '', persuasion)
alice_base = re.sub(r'CHAPTER .*', '', alice)

In [4]:
alice = text_cleaner(alice_base)
persuasion = text_cleaner(persuasion_base)
# question: the text cleaner appears to clean punctuation but the below text still includes it...

In [5]:
# Parse the cleaned novels. This can take a bit.
nlp = spacy.load('en')
alice_doc = nlp(alice)
persuasion_doc = nlp(persuasion)

In [6]:
# Group into sentences.
alice_sents = [[sent, "Carroll"] for sent in alice_doc.sents]
persuasion_sents = [[sent, "Austen"] for sent in persuasion_doc.sents]

# Combine the sentences from the two novels into one data frame.
sentences = pd.DataFrame(alice_sents + persuasion_sents)
sentences.head()

Unnamed: 0,0,1
0,"(Alice, was, beginning, to, get, very, tired, ...",Carroll
1,"(So, she, was, considering, in, her, own, mind...",Carroll
2,"(There, was, nothing, so, VERY, remarkable, in...",Carroll
3,"(Oh, dear, !)",Carroll
4,"(I, shall, be, late, !, ')",Carroll


I have adapted the code introduced in the project base text so that there is preallocation of dataframe space and there is optional inclusion of sentence stats (discussed below).

In [None]:
from collections import Counter
# Utility function to create a list of the 2000 most common words.
def bag_of_words(text):
    
    # Filter out punctuation and stop words.
    allwords = [token.lemma_
                for token in text
                if not token.is_punct
                and not token.is_stop]
    
    # Return the most common words.
    return [item[0] for item in Counter(allwords).most_common(2000)]

In [8]:
# Creates a data frame with features for each word in our common word set.
# Each value is the count of the times the word appears in each sentence.

# POS dict
pos_d = {'VERB':'verb_ct', 'NOUN':'noun_ct', 'ADV':'adv_ct', 'ADP':'adp_ct', 
         'PROPN':'propn_ct', 'ADJ':'adj_ct', 'DET':'det_ct', 'PUNCT':'punct_ct'}

def bow_features(quotes, common_words, **kwargs):
    print(len(quotes))
    
    # sentence stats
    sent_stats = ['comma_ct', 'word_ct', 'adv_ct', 'adp_ct', 'propn_ct', 'adj_ct', 'punct_ct'] #'verb_ct', 'noun_ct','det_ct',
    if 'sent_stats' in kwargs:
        df = pd.DataFrame(columns=list(common_words) + sent_stats)
        cols = list(common_words)+sent_stats
    else:
        df = pd.DataFrame(columns=common_words)
        cols = list(common_words)
    
    for col in df.columns:
        df[col] = np.zeros(len(quotes[0]))
    df['text_sentence'] = quotes[0] 
    df['text_source'] = quotes[1]
    
    # Process each row, counting the occurrence of words in each sentence.
    for i, quote in enumerate(df['text_sentence']):
        # Convert the sentence to lemmas, then filter out punctuation,
        # stop words, and uncommon words.
        words = [token.lemma_
                 for token in quote
                 if (
                     not token.is_punct
                     and not token.is_stop
                     and token.lemma_ in common_words
                 )]
        
        # Populate the row with word counts.
        for word in words:
            try:
                df.loc[i, word] += 1
            except:
                print(word)
        
        # add sentence features
        if 'sent_stats' in kwargs:
            commas = 0
            for token in quote:
                if token.orth_ == ',':
                    commas += 1
            df.loc[i, 'comma_ct'] = commas
                    
            c = Counter([token.pos_ for token in quote])
            for key in pos_d.keys():
                if key in c.keys():
                    df.loc[i, pos_d[key]] = c[key]
                else:
                    df.loc[i, pos_d[key]] = c[key]
            
            df.loc[i, 'word_ct'] = len([token for token in quote if (not token.is_punct)])

        # This counter is just to make sure the kernel didn't hang.
        if i % 1000 == 0:
            print("Processing row {}".format(i))
            
    return df

In [148]:
# Set up the bags.
alicewords = bag_of_words(alice_doc)
persuasionwords = bag_of_words(persuasion_doc)

# Combine bags to create a set of unique words.
common_words = set(alicewords + persuasionwords)

Up to this point there has only been one text prep pipeline. Now it will fork into two. 

# Challenge 0:  
### Add features to improve model performance  

`word_counts` is composed entirely of words. However for `word_counts_ss` I added features that pertain to the entire sentence (rather than the culled token list) such as counts for various parts of speech, comma count, and word count.

In [10]:
word_counts = bow_features(sentences, common_words)


5318
made it to the loop
Processing row 0
Processing row 1000
Processing row 2000
Processing row 3000
Processing row 4000
Processing row 5000


In [9]:
# this dataframe includes additional grammatical and symantic details as features 
word_counts_ss = bow_features(sentences, common_words, sent_stats= True)

5318
made it to the loop
Processing row 0
Processing row 1000
Processing row 2000
Processing row 3000
Processing row 4000
Processing row 5000


## Splitting out train/test splits

In [11]:
def make_train_test(word_count_df):
    Y = word_count_df['text_source']
    X = word_count_df.iloc[:, ~word_count_df.columns.isin(['text_sentence','text_source'])]

    X_train, X_test, y_train, y_test = train_test_split(X, 
                                                        Y,
                                                        test_size=0.3,
                                                        random_state=0)
    return X_train, X_test, y_train, y_test

In [103]:
X_train_ss, X_test_ss, y_train_ss, y_test_ss = make_train_test(word_counts_ss)

In [86]:
X_train, X_test, y_train, y_test = make_train_test(word_counts)

## BoW with Logistic Regression

In [84]:
from sklearn.linear_model import LogisticRegression

def log_reg(_X_train, _X_test, _y_train, _y_test, **kwargs):
    if 'params' in kwargs:
        params = kwargs['params']
        _lr = LogisticRegression(**params)
    else:
        _lr = LogisticRegression(solver = 'lbfgs')
        
    _lr.fit(_X_train, _y_train)
    print(_X_train.shape, _y_train.shape)
    print('Training set score:', _lr.score(_X_train, _y_train))
    print('\nTest set score:', _lr.score(_X_test, _y_test))
    _y_pred = _lr.predict(_X_test)
    print(pd.crosstab(_y_test, _y_pred))
    return _lr

In [98]:
#As a reference, I re-ran the model with only word features.
lr = log_reg(X_train, X_test, y_train, y_test)

cross_val_score(lr, X_train, y_train, cv = 6)

(3722, 3062) (3722,)
Training set score: 0.9575497044599678

Test set score: 0.9179197994987469
col_0        Austen  Carroll
text_source                 
Austen         1080       27
Carroll         104      385


array([0.92109501, 0.88888889, 0.91304348, 0.93558776, 0.90145396,
       0.90468498])

In [235]:
# I ran a parameter search to see if I could improve the model by configuration before adding features

from sklearn.model_selection import GridSearchCV
parameters = {
            'penalty':['l2'],
            'C':[1,10,100],
            'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag'],
        }
lr = LogisticRegression()
GS = GridSearchCV(lr, parameters,cv=6,verbose=10)
GS.fit(X_train,y_train)

new_params = GS.best_params_

Fitting 6 folds for each of 12 candidates, totalling 72 fits
[CV] C=1, penalty=l2, solver=newton-cg ...............................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  C=1, penalty=l2, solver=newton-cg, score=0.9210950080515298, total=   1.4s
[CV] C=1, penalty=l2, solver=newton-cg ...............................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.5s remaining:    0.0s


[CV]  C=1, penalty=l2, solver=newton-cg, score=0.8888888888888888, total=   1.3s
[CV] C=1, penalty=l2, solver=newton-cg ...............................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    2.8s remaining:    0.0s


[CV]  C=1, penalty=l2, solver=newton-cg, score=0.9130434782608695, total=   1.2s
[CV] C=1, penalty=l2, solver=newton-cg ...............................


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    4.0s remaining:    0.0s


[CV]  C=1, penalty=l2, solver=newton-cg, score=0.9355877616747182, total=   1.4s
[CV] C=1, penalty=l2, solver=newton-cg ...............................


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    5.5s remaining:    0.0s


[CV]  C=1, penalty=l2, solver=newton-cg, score=0.901453957996769, total=   1.3s
[CV] C=1, penalty=l2, solver=newton-cg ...............................


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    6.9s remaining:    0.0s


[CV]  C=1, penalty=l2, solver=newton-cg, score=0.9046849757673667, total=   1.3s
[CV] C=1, penalty=l2, solver=lbfgs ...................................


[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:    8.2s remaining:    0.0s


[CV]  C=1, penalty=l2, solver=lbfgs, score=0.9210950080515298, total=   0.5s
[CV] C=1, penalty=l2, solver=lbfgs ...................................


[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:    8.8s remaining:    0.0s


[CV]  C=1, penalty=l2, solver=lbfgs, score=0.8888888888888888, total=   0.7s
[CV] C=1, penalty=l2, solver=lbfgs ...................................


[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:    9.5s remaining:    0.0s


[CV]  C=1, penalty=l2, solver=lbfgs, score=0.9130434782608695, total=   0.7s
[CV] C=1, penalty=l2, solver=lbfgs ...................................


[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:   10.2s remaining:    0.0s


[CV]  C=1, penalty=l2, solver=lbfgs, score=0.9355877616747182, total=   0.6s
[CV] C=1, penalty=l2, solver=lbfgs ...................................
[CV]  C=1, penalty=l2, solver=lbfgs, score=0.901453957996769, total=   0.6s
[CV] C=1, penalty=l2, solver=lbfgs ...................................
[CV]  C=1, penalty=l2, solver=lbfgs, score=0.9046849757673667, total=   0.5s
[CV] C=1, penalty=l2, solver=liblinear ...............................
[CV]  C=1, penalty=l2, solver=liblinear, score=0.9210950080515298, total=   0.2s
[CV] C=1, penalty=l2, solver=liblinear ...............................
[CV]  C=1, penalty=l2, solver=liblinear, score=0.8888888888888888, total=   0.2s
[CV] C=1, penalty=l2, solver=liblinear ...............................
[CV]  C=1, penalty=l2, solver=liblinear, score=0.9130434782608695, total=   0.2s
[CV] C=1, penalty=l2, solver=liblinear ...............................
[CV]  C=1, penalty=l2, solver=liblinear, score=0.9355877616747182, total=   0.2s
[CV] C=1, penalty=l2



[CV]  C=1, penalty=l2, solver=sag, score=0.9210950080515298, total=   9.3s
[CV] C=1, penalty=l2, solver=sag .....................................




[CV]  C=1, penalty=l2, solver=sag, score=0.8888888888888888, total=   8.3s
[CV] C=1, penalty=l2, solver=sag .....................................




[CV]  C=1, penalty=l2, solver=sag, score=0.9130434782608695, total=   8.3s
[CV] C=1, penalty=l2, solver=sag .....................................




[CV]  C=1, penalty=l2, solver=sag, score=0.9355877616747182, total=   8.8s
[CV] C=1, penalty=l2, solver=sag .....................................




[CV]  C=1, penalty=l2, solver=sag, score=0.901453957996769, total=  11.1s
[CV] C=1, penalty=l2, solver=sag .....................................




[CV]  C=1, penalty=l2, solver=sag, score=0.9046849757673667, total=   9.0s
[CV] C=10, penalty=l2, solver=newton-cg ..............................
[CV]  C=10, penalty=l2, solver=newton-cg, score=0.9082125603864735, total=   2.0s
[CV] C=10, penalty=l2, solver=newton-cg ..............................
[CV]  C=10, penalty=l2, solver=newton-cg, score=0.8969404186795491, total=   1.7s
[CV] C=10, penalty=l2, solver=newton-cg ..............................
[CV]  C=10, penalty=l2, solver=newton-cg, score=0.9114331723027376, total=   1.8s
[CV] C=10, penalty=l2, solver=newton-cg ..............................
[CV]  C=10, penalty=l2, solver=newton-cg, score=0.9388083735909822, total=   2.0s
[CV] C=10, penalty=l2, solver=newton-cg ..............................
[CV]  C=10, penalty=l2, solver=newton-cg, score=0.901453957996769, total=   1.6s
[CV] C=10, penalty=l2, solver=newton-cg ..............................
[CV]  C=10, penalty=l2, solver=newton-cg, score=0.9063004846526656, total=   1.7s
[CV] C=1



[CV]  C=10, penalty=l2, solver=lbfgs, score=0.8969404186795491, total=   0.9s
[CV] C=10, penalty=l2, solver=lbfgs ..................................




[CV]  C=10, penalty=l2, solver=lbfgs, score=0.9114331723027376, total=   0.9s
[CV] C=10, penalty=l2, solver=lbfgs ..................................




[CV]  C=10, penalty=l2, solver=lbfgs, score=0.9388083735909822, total=   1.0s
[CV] C=10, penalty=l2, solver=lbfgs ..................................
[CV]  C=10, penalty=l2, solver=lbfgs, score=0.901453957996769, total=   1.1s
[CV] C=10, penalty=l2, solver=lbfgs ..................................
[CV]  C=10, penalty=l2, solver=lbfgs, score=0.9063004846526656, total=   1.0s
[CV] C=10, penalty=l2, solver=liblinear ..............................
[CV]  C=10, penalty=l2, solver=liblinear, score=0.9082125603864735, total=   0.2s
[CV] C=10, penalty=l2, solver=liblinear ..............................
[CV]  C=10, penalty=l2, solver=liblinear, score=0.8969404186795491, total=   0.2s
[CV] C=10, penalty=l2, solver=liblinear ..............................
[CV]  C=10, penalty=l2, solver=liblinear, score=0.9114331723027376, total=   0.2s
[CV] C=10, penalty=l2, solver=liblinear ..............................
[CV]  C=10, penalty=l2, solver=liblinear, score=0.9388083735909822, total=   0.1s
[CV] C=10, pe



[CV]  C=10, penalty=l2, solver=sag, score=0.9130434782608695, total=  10.0s
[CV] C=10, penalty=l2, solver=sag ....................................




[CV]  C=10, penalty=l2, solver=sag, score=0.9033816425120773, total=   8.3s
[CV] C=10, penalty=l2, solver=sag ....................................




[CV]  C=10, penalty=l2, solver=sag, score=0.9130434782608695, total=   8.9s
[CV] C=10, penalty=l2, solver=sag ....................................




[CV]  C=10, penalty=l2, solver=sag, score=0.9452495974235104, total=   8.6s
[CV] C=10, penalty=l2, solver=sag ....................................




[CV]  C=10, penalty=l2, solver=sag, score=0.9046849757673667, total=   9.1s
[CV] C=10, penalty=l2, solver=sag ....................................




[CV]  C=10, penalty=l2, solver=sag, score=0.901453957996769, total=   9.1s
[CV] C=100, penalty=l2, solver=newton-cg .............................
[CV]  C=100, penalty=l2, solver=newton-cg, score=0.9001610305958132, total=   2.4s
[CV] C=100, penalty=l2, solver=newton-cg .............................
[CV]  C=100, penalty=l2, solver=newton-cg, score=0.8727858293075684, total=   2.2s
[CV] C=100, penalty=l2, solver=newton-cg .............................
[CV]  C=100, penalty=l2, solver=newton-cg, score=0.895330112721417, total=   2.3s
[CV] C=100, penalty=l2, solver=newton-cg .............................
[CV]  C=100, penalty=l2, solver=newton-cg, score=0.9259259259259259, total=   2.3s
[CV] C=100, penalty=l2, solver=newton-cg .............................
[CV]  C=100, penalty=l2, solver=newton-cg, score=0.8723747980613893, total=   2.0s
[CV] C=100, penalty=l2, solver=newton-cg .............................
[CV]  C=100, penalty=l2, solver=newton-cg, score=0.8966074313408724, total=   2.3s
[C



[CV]  C=100, penalty=l2, solver=lbfgs, score=0.9001610305958132, total=   1.1s
[CV] C=100, penalty=l2, solver=lbfgs .................................




[CV]  C=100, penalty=l2, solver=lbfgs, score=0.8743961352657005, total=   0.9s
[CV] C=100, penalty=l2, solver=lbfgs .................................




[CV]  C=100, penalty=l2, solver=lbfgs, score=0.893719806763285, total=   1.0s
[CV] C=100, penalty=l2, solver=lbfgs .................................




[CV]  C=100, penalty=l2, solver=lbfgs, score=0.927536231884058, total=   1.0s
[CV] C=100, penalty=l2, solver=lbfgs .................................




[CV]  C=100, penalty=l2, solver=lbfgs, score=0.8723747980613893, total=   1.0s
[CV] C=100, penalty=l2, solver=lbfgs .................................




[CV]  C=100, penalty=l2, solver=lbfgs, score=0.8966074313408724, total=   1.1s
[CV] C=100, penalty=l2, solver=liblinear .............................
[CV]  C=100, penalty=l2, solver=liblinear, score=0.9001610305958132, total=   0.2s
[CV] C=100, penalty=l2, solver=liblinear .............................
[CV]  C=100, penalty=l2, solver=liblinear, score=0.8727858293075684, total=   0.2s
[CV] C=100, penalty=l2, solver=liblinear .............................
[CV]  C=100, penalty=l2, solver=liblinear, score=0.895330112721417, total=   0.2s
[CV] C=100, penalty=l2, solver=liblinear .............................
[CV]  C=100, penalty=l2, solver=liblinear, score=0.9259259259259259, total=   0.2s
[CV] C=100, penalty=l2, solver=liblinear .............................
[CV]  C=100, penalty=l2, solver=liblinear, score=0.8723747980613893, total=   0.2s
[CV] C=100, penalty=l2, solver=liblinear .............................
[CV]  C=100, penalty=l2, solver=liblinear, score=0.8966074313408724, total=   0.2



[CV]  C=100, penalty=l2, solver=sag, score=0.9146537842190016, total=   8.3s
[CV] C=100, penalty=l2, solver=sag ...................................




[CV]  C=100, penalty=l2, solver=sag, score=0.9066022544283414, total=  10.4s
[CV] C=100, penalty=l2, solver=sag ...................................




[CV]  C=100, penalty=l2, solver=sag, score=0.9130434782608695, total=   8.3s
[CV] C=100, penalty=l2, solver=sag ...................................




[CV]  C=100, penalty=l2, solver=sag, score=0.9468599033816425, total=   9.2s
[CV] C=100, penalty=l2, solver=sag ...................................




[CV]  C=100, penalty=l2, solver=sag, score=0.8998384491114702, total=   8.7s
[CV] C=100, penalty=l2, solver=sag ...................................


[Parallel(n_jobs=1)]: Done  72 out of  72 | elapsed:  3.6min finished


[CV]  C=100, penalty=l2, solver=sag, score=0.8998384491114702, total=   9.7s




In [236]:
print(GS.best_score_)
print(GS.best_params_)

0.9134873723804406
{'C': 10, 'penalty': 'l2', 'solver': 'sag'}


In [106]:
# Interestingly, on another run, this combination fared better, which suggests a small difference between configurations.
params_d = {'C':10, 'penalty':'l2', 'solver':'liblinear'}

In [101]:
# retrained model with updated parameters
lr = log_reg(X_train, X_test, y_train, y_test, params = params_d)

(3722, 3062) (3722,)
Training set score: 0.9572810317033853

Test set score: 0.918546365914787
col_0        Austen  Carroll
text_source                 
Austen         1081       26
Carroll         104      385




In [107]:
# trained the model with the additional features
lr_ss = log_reg(X_train_ss, X_test_ss, y_train_ss, y_test_ss, params = params_d)

(3722, 3072) (3722,)
Training set score: 0.9841483073616335

Test set score: 0.918546365914787
col_0        Austen  Carroll
text_source                 
Austen         1065       42
Carroll          88      401


# Challenge 1:
Now I will look into those questions about model generalizability.  

1. ) Is the model good at discerning _Alice_?  Which is to say, can the existing model discern between _Alice_ and something the model hasn't seen before (NOT _Alice_)?  
2. ) Is the model good at discerning _Persuasion_? Similar to (1), can the existing model discern between _Persuasion_ and something the model hasn't seen before (NOT _Persuasion_)?  
3. ) Is the model good at discerning Austen in general?  If the model is fed a mix of _Persuasion_ and _Emma_ and _Alice_, will it be able to correctly classify both _Persuasion_ and _Emma_ as Austen?

### A few functions for making various test sets

In [135]:
# make author set
def make_author_test(_X_test, _y_test, target_author):
    _X_test['target'] = _y_test
    df_test_author = _X_test[_X_test.target == target_author]

    X_test_author = df_test_author.iloc[:, ~df_test_author.columns.isin(['target'])] #X_test[y_test[y_test == 'Carroll'].index]
    y_test_author = df_test_author['target']
    return X_test_author, y_test_author

In [136]:
# make NOT author set
def make_nonauthor_test(df_nonauthor_bow, nonauthor, sample_size):
    df_nonauthor_bow['text_source'] = [nonauthor for ik in range(len(df_nonauthor_bow))]

    subsample_nonauthor = df_nonauthor_bow.sample(n=sample_size)
    X_test_nonauthor = subsample_nonauthor.iloc[:, ~subsample_nonauthor.columns.isin(['text_sentence','text_source']) ]
    y_test_nonauthor = subsample_nonauthor['text_source']
    return X_test_nonauthor, y_test_nonauthor

In [137]:
# concatenate test sets 
def make_full_sets(X_test_author, y_test_author, X_test_nonauthor, y_test_nonauthor):
#     X_test_author, y_test_author = test_author[0], test_author[1]
#     X_test_nonauthor, y_test_nonauthor = test_nonauthor[0], test_nonauthor[1]
    X_test_AnonA = np.concatenate([X_test_author, X_test_nonauthor], axis=0)
    y_test_AnonA = np.concatenate([y_test_author, y_test_nonauthor], axis=0)
    return X_test_AnonA, y_test_AnonA

### Prep _Paradise Lost_ by John Milton as an alternative text text

In [22]:
# another work:
paradise_base = gutenberg.raw('milton-paradise.txt')
paradise_base = re.sub(r'VOLUME \w+', '', paradise_base)
paradise_base = re.sub(r'CHAPTER \w+', '', paradise_base)
paradise = text_cleaner(paradise_base)
print(paradise[:100])

Book I Of Man's first disobedience, and the fruit Of that forbidden tree whose mortal taste Brought 


In [23]:
# Parse our cleaned data.
paradise_doc = nlp(paradise)

In [24]:
# Group into sentences.
paradise_sents = [[sent, "Not"] for sent in paradise_doc.sents]

In [25]:
# Nnormalize to the same length or Alice
paradise_sents = paradise_sents[0:len(alice_sents)]

In [26]:
# Bag of words for Paradise Lost
paradise_sentences = pd.DataFrame(paradise_sents)
paradise_bow = bow_features(paradise_sentences, common_words)

print('done')

1669
made it to the loop
Processing row 0
Processing row 1000
done


In [27]:
# Bage of words for Paradise Lost with sentence features
paradise_bow_ss = bow_features(paradise_sentences, common_words, sent_stats = True)

print('done')

1669
made it to the loop
Processing row 0
Processing row 1000
done


### Alice v. any other work  

In this case, the `Austen` label will be interpretted as **not** `Carroll`, so the test data will be prepped with `Austen` labels, added to the `Carroll` test set and scored.


In [108]:
X_test_Carroll, y_test_Carroll = make_author_test(X_test, y_test, 'Carroll')
X_test_nonCarroll, y_test_nonCarroll = make_nonauthor_test(paradise_bow, 'Austen', len(y_test_Carroll))

X_test_CnonC, y_test_CnonC = make_full_sets(X_test_Carroll, y_test_Carroll, X_test_nonCarroll, y_test_nonCarroll)

In [109]:
# run existing model over new test set
print('\nTest set score:', lr.score(X_test_CnonC, y_test_CnonC))
lr_CnC_predicted = lr.predict(X_test_CnonC)
preds = np.where(lr_CnC_predicted=='Austen', 'Not', lr_CnC_predicted) 
# y_test_relabeled = np.where(y_test_CnonC=='Austen', 'Not', y_test_CnonC) 
pd.crosstab(y_test_CnonC, preds)


Test set score: 0.7975460122699386


col_0,Carroll,Not
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
Austen,94,395
Carroll,385,104


#### Compare to version with gramatical features

In [110]:
X_test_Carroll_ss, y_test_Carroll_ss = make_author_test(X_test_ss,y_test_ss, 'Carroll')
X_test_nonCarroll_ss, y_test_nonCarroll_ss = make_nonauthor_test(paradise_bow_ss, 'Austen', len(y_test_Carroll))

X_test_CnonC_ss, y_test_CnonC_ss = make_full_sets(X_test_Carroll_ss, y_test_Carroll_ss, X_test_nonCarroll_ss, y_test_nonCarroll_ss)

In [131]:
# run existing model over new test set
print('\nTest set score:', lr_ss.score(X_test_CnonC_ss, y_test_CnonC_ss))
lr_CnC_predicted_ss = lr_ss.predict(X_test_CnonC_ss)
# preds_ss = np.where(lr_CnC_predicted_ss=='Austen', 'Not', lr_CnC_predicted_ss) 
y_test_relabeled = np.where(y_test_CnonC=='Austen', 'Not', y_test_CnonC) 
pd.crosstab(y_test_relabeled, lr_CnC_predicted_ss)


Test set score: 0.843558282208589


col_0,Austen,Carroll
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
Carroll,88,401
Not,424,65


The above results suggest that _Alice_ seperates well from _Paradise Lost_. If this were not the case, one would see  more "Not" examples classified as "Carroll". 

### Persuasion v. any other work

In [138]:
# filter out the Persuasion test rows
X_test_pers, y_test_pers = make_author_test(X_test, y_test, 'Austen')

# prep the non Persuasion (Carroll) test rows
X_test_nonPers, y_test_nonPers = make_nonauthor_test(paradise_bow, 'Carroll', len(y_test_pers))

# concatenate into one set
X_test_PnonP, y_test_PnonP = make_full_sets(X_test_pers, y_test_pers, X_test_nonPers, y_test_nonPers)

In [139]:
# run existing model over new test set
print('\nTest set score:', lr.score(X_test_PnonP, y_test_PnonP))
lr_PnP_predicted = lr.predict(X_test_PnonP)
pd.crosstab(y_test_PnonP, lr_PnP_predicted)


Test set score: 0.5844625112917796


col_0,Austen,Carroll
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
Austen,1081,26
Carroll,894,213


#### Compare to version with grammatical features

In [140]:
# filter out the Persuasion test rows
X_test_pers_ss, y_test_pers_ss = make_author_test(X_test_ss, y_test_ss, 'Austen')

# prep the non Persuasion (Carroll) test rows
X_test_nonPers_ss, y_test_nonPers_ss = make_nonauthor_test(paradise_bow_ss, 'Carroll', len(y_test_pers_ss))

# concatenate into one set
X_test_PnonP_ss, y_test_PnonP_ss = make_full_sets(X_test_pers_ss, y_test_pers_ss, X_test_nonPers_ss, y_test_nonPers_ss)

In [141]:
# run existing model over new test set
print('\nTest set score:', lr_ss.score(X_test_PnonP_ss, y_test_PnonP_ss))
lr_PnP_predicted_ss = lr_ss.predict(X_test_PnonP_ss)
y_test_relabeled_ss = np.where(y_test_PnonP_ss=='Carroll', 'Not', y_test_PnonP_ss) 


pd.crosstab(y_test_relabeled_ss, lr_PnP_predicted_ss)


Test set score: 0.53613369467028


col_0,Austen,Carroll
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
Austen,1065,42
Not,985,122


In this case, it seems that the model thinks everything is _Persuasion_.  Adding the grammatical features back in did not improve performance.  

### Austen v. any other work

In [45]:
# another work:
emma_base = gutenberg.raw('austen-emma.txt')
emma_base = re.sub(r'VOLUME \w+', '', emma_base)
emma_base = re.sub(r'CHAPTER \w+', '', emma_base)
emma = text_cleaner(emma_base)
print(emma[:100])

Emma Woodhouse, handsome, clever, and rich, with a comfortable home and happy disposition, seemed to


In [46]:
# Parse our cleaned data.
emma_doc = nlp(emma)

In [47]:
# Group into sentences.
emma_sents = [[sent, "Austen"] for sent in emma_doc.sents]


In [48]:
# Emma is quite long, let's cut it down to the same length as Alice.
emma_sents = emma_sents[0:len(alice_sents)]


In [50]:
# Build a new Bag of Words data frame for Emma word counts.
emma_sentences = pd.DataFrame(emma_sents)
emma_bow = bow_features(emma_sentences, common_words)

print('done')

1669
made it to the loop
Processing row 0
Processing row 1000
done


In [152]:
# build smaller subsamples of Austen works so that the test set will be half Austen, half not Austen (in this case, Milton)
subsample_emma = emma_bow.sample(n=int(np.floor(len(y_test_nonPers)/2)))
X_test_emma = subsample_emma.iloc[:, ~subsample_emma.columns.isin(['text_sentence','text_source']) ]
y_test_emma = subsample_emma['text_source']

test_Pers = X_test_pers
test_Pers['target'] = y_test_pers
subsample_Pers = test_Pers.sample(n=len(y_test_nonPers)-len(y_test_emma))
X_test_Pers = subsample_Pers.iloc[:, ~subsample_Pers.columns.isin(['target'])] #X_test[y_test[y_test == 'Carroll'].index]
y_test_Pers = subsample_Pers['target']

In [120]:
X_test_Carroll.shape, X_test_emma.shape, X_test_Pers.shape, X_test.shape, subsample_Pers.shape, len(y_test_pers)-len(y_test_emma)

((489, 3062), (244, 3062), (245, 3062), (1596, 3063), (245, 3063), 863)

In [153]:
# concatenate test sets to full Carroll-nonCarroll test set
X_test_AunonC = np.concatenate([X_test_nonPers, X_test_emma, X_test_Pers], axis=0)
y_test_AunonC = np.concatenate([y_test_nonPers, y_test_emma, y_test_Pers], axis=0)

In [154]:
# run existing model over new test set
print('\nTest set score:', lr.score(X_test_AunonC, y_test_AunonC))
lr_AunonC_predicted = lr.predict(X_test_AunonC)
pd.crosstab(y_test_AunonC, lr_AunonC_predicted)


Test set score: 0.573170731707317


col_0,Austen,Carroll
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
Austen,1056,51
Carroll,894,213


### With Gramatical features

In [52]:
emma_bow_ss = bow_features(emma_sentences, common_words, sent_stats = True)
print('done')


1669
made it to the loop
Processing row 0
Processing row 1000
done


In [149]:
# build smaller subsamples of Austen works so that the test set will be half Austen, half not Austen (in this case, Milton)
subsample_emma_ss = emma_bow_ss.sample(n=int(np.floor(len(y_test_nonPers_ss)/2)))
X_test_emma_ss = subsample_emma_ss.iloc[:, ~subsample_emma_ss.columns.isin(['text_sentence','text_source']) ]
y_test_emma_ss = subsample_emma_ss['text_source']

test_Pers_ss = X_test_pers_ss
test_Pers_ss['target'] = y_test_pers_ss
subsample_Pers_ss = test_Pers_ss.sample(n=len(y_test_nonPers_ss)-len(y_test_emma_ss))
X_test_Pers_ss = subsample_Pers_ss.iloc[:, ~subsample_Pers_ss.columns.isin(['target'])] #X_test[y_test[y_test == 'Carroll'].index]
y_test_Pers_ss = subsample_Pers_ss['target']

In [150]:
# concatenate test sets to full Austen-nonAusten test set
X_test_AunonC_ss = np.concatenate([X_test_nonPers_ss, X_test_emma_ss, X_test_Pers_ss], axis=0)
y_test_AunonC_ss = np.concatenate([y_test_nonPers_ss, y_test_emma_ss, y_test_Pers_ss], axis=0)

In [151]:
# run existing model over new test set
print('\nTest set score:', lr_ss.score(X_test_AunonC_ss, y_test_AunonC_ss))
lr_AunonC_predicted_ss = lr_ss.predict(X_test_AunonC_ss)
pd.crosstab(y_test_AunonC_ss, lr_AunonC_predicted_ss)



Test set score: 0.5221318879855466


col_0,Austen,Carroll
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
Austen,1034,73
Carroll,985,122


The models did a fair job labelling Austen works correctly, but Carroll's _Alice_ did not turn out to be a catch-all example of another work.  The penalty on accuracy was in Milton sentences being incorrectly clasified as Austen lines rather than being labelled as "Carroll" (or "NOT Austen", as the case may be).  