In [1]:
%matplotlib inline
import numpy as np, pandas as pd, scipy, sklearn, spacy, matplotlib.pyplot as plt, seaborn as sns, re
from nltk.corpus import gutenberg, stopwords
from collections import Counter
import nltk

In [2]:
# Supervised requires pre-labeled data, predict whether a sentence is from Alice or Persuasion
# Works with any model which allows categorical outcomes
# Feature generation with BoW_ (Bag of words)
# For each sentence, count number of times a word appears, counts are features

# Text cleaning function
def text_cleaner(text):
    # Double dash '--' needs to be removed explicity
    text = re.sub(r'--', ' ', text)
    text = re.sub("[\[].*?[\]]", "", text)
    text = ' '.join(text.split())
    return text

# Load and clean data
persuasion = gutenberg.raw('austen-persuasion.txt')
alice = gutenberg.raw('carroll-alice.txt')

# Chapter indicator is idiosyncratic
persuasion = re.sub(r'Chapter \d+', '', persuasion)
alice = re.sub(r'CHAPTER .*', '', alice)

# Use only 1/10 of the text to reduce processing time
alice = text_cleaner(alice[:int(len(alice)/10)])
persuasion = text_cleaner(persuasion[:int(len(persuasion)/10)])

# Parsing the cleaned novels
nlp = spacy.load('en')
alice_doc = nlp(alice)
persuasion_doc = nlp(persuasion)

alice_sents = [[sent, "Carroll"] for sent in alice_doc.sents]
persuasion_sents = [[sent, "Austen"] for sent in persuasion_doc.sents]

# Combine to dataframe
sentences = pd.DataFrame(alice_sents + persuasion_sents)
sentences.head()

Unnamed: 0,0,1
0,"(Alice, was, beginning, to, get, very, tired, ...",Carroll
1,"(So, she, was, considering, in, her, own, mind...",Carroll
2,"(There, was, nothing, so, VERY, remarkable, in...",Carroll
3,"(Oh, dear, !)",Carroll
4,"(Oh, dear, !)",Carroll


In [18]:
# Utility function to define the 2000 most common words
def bag_of_words(text):
    
    # Filter punctionation and stopwords
    allwords = [token.lemma_
                for token in text
                if not token.is_punct
                and not token.is_stop]
    
    # Return 2000 most common words
    return [item[0] for item in Counter(allwords).most_common(2000)]

# Create DF with features for each word in our common word set
# Values are the count of the times the word appears in each sentence
def bow_features(sentences, common_words):
    
    # Scaffold the DF and initialize counts to 0
    df = pd.DataFrame(columns=common_words)
    df['text_sentence'] = sentences[0]
    df['text_sources'] = sentences[1]
    df.loc[:, common_words] = 0
    
    # Process each row, counting the occurance of words in each sentence
    for i, sentence in enumerate(df['text_sentence']):
        
        # Convert to lemmas, filter out punctuation, stopwords and uncommon words 
        words = [token.lemma_
                 for token in sentence
                 if (not token.is_punct
                     and not token.is_stop
                     and token.lemma_ in common_words)]
        
        # Populate rows with word counts
        for word in words:
            df.loc[i, word] += 1
        
        # Checks for hung kernel
        if i % 50 == 0:
            print("Processing row {}".format(i))
        
    return df

# Set up bags
alicewords = bag_of_words(alice_doc)
persuasionwords = bag_of_words(persuasion_doc)

# Combine two bags to create a set of unique words
common_words = set(alicewords + persuasionwords)

# Create dataset with features. Computationally intensive
word_counts = bow_features(sentences, common_words)

# Minimal feature engineering due to time constraints
def sentence_length(sent):
    return len(sent)
word_counts['sentence_length'] = word_counts['text_sentence'].apply(sentence_length)
word_counts.head()

Processing row 0
Processing row 50
Processing row 100
Processing row 150
Processing row 200
Processing row 250
Processing row 300
Processing row 350
Processing row 400
Processing row 450


Unnamed: 0,stay,nod,waistcoat,preserver,blood,Musgrove,neighbourhood,outward,neat,poison,...,issue,employ,key,rise,prescribe,severe,read,text_sentence,text_sources,sentence_length
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,"(Alice, was, beginning, to, get, very, tired, ...",Carroll,67
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,"(So, she, was, considering, in, her, own, mind...",Carroll,63
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,"(There, was, nothing, so, VERY, remarkable, in...",Carroll,30
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,"(Oh, dear, !)",Carroll,3
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,"(Oh, dear, !)",Carroll,3


In [19]:
# Using bag of words dataset with random forest model
from sklearn import ensemble
from sklearn.model_selection import train_test_split

rfc = ensemble.RandomForestClassifier()
Y = word_counts['text_sources']
X = np.array(word_counts.drop(['text_sentence', 'text_sources'], 1))

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.4,
                                                   random_state=0)

train = rfc.fit(X_train, Y_train)
print("Training Set Score:", rfc.score(X_train, Y_train))
print("Test Set Score:", rfc.score(X_test, Y_test))

# Overfitting is a known issue in bag-of-words, also random forests. 
# Disparity between train/test set expected

Training Set Score: 0.985239852398524
Test Set Score: 0.8453038674033149




In [20]:
# Bag of Words with logistic regression
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression()
train = lr.fit(X_train, Y_train)

print(X_train.shape, Y_train.shape)
print("Training Set Score:", lr.score(X_train, Y_train))
print("Test Set Score:", lr.score(X_test, Y_test))

(271, 1613) (271,)
Training Set Score: 0.9520295202952029
Test Set Score: 0.8342541436464088




In [21]:
# Bag of Words with gradient boosting
clf = ensemble.GradientBoostingClassifier()
train = clf.fit(X_train, Y_train)
print("Training Set Score:", clf.score(X_train, Y_train))
print("Test Set Score:", clf.score(X_test, Y_test))

Training Set Score: 0.985239852398524
Test Set Score: 0.850828729281768


For increasing performance, I created a feature for sentence length and attempted SVM to see if it was better suited. I understand the necessity of thorough feature engineering, but chose to move on quickly due to time constraints. 

Including sentence length seemed to reduce the performance of logistic regression while increasing performance of gradient boosting and random forest. SVM performance performed slightly worse than logistic regression prior to inclusion of this new feature, but was the best performer after inclusion. 

In [23]:
# Support Vector Machine
from sklearn.svm import SVC
svm = SVC(kernel='linear')
train = svm.fit(X_train, Y_train)
print("Training Set Score:", svm.score(X_train, Y_train))
print("Test Set Score:", svm.score(X_test, Y_test))

Training Set Score: 0.988929889298893
Test Set Score: 0.8839779005524862


In [24]:
# Challenge #1: I chose to compare Moby Dick (published in 1851) with Alice in Wonderland (published 1865)
moby = gutenberg.raw('melville-moby_dick.txt')
moby = text_cleaner(moby[:int(len(moby)/80)])

moby_doc = nlp(moby)
moby_sents = [[sent, "Melville"] for sent in moby_doc.sents]
sentences = pd.DataFrame(alice_sents + moby_sents)

mobywords = bag_of_words(moby_doc)
common_words = set(alicewords + mobywords)
word_counts = bow_features(sentences, common_words)
word_counts.head()

Processing row 0
Processing row 50
Processing row 100
Processing row 150
Processing row 200
Processing row 250
Processing row 300
Processing row 350
Processing row 400
Processing row 450


Unnamed: 0,label,school,pierce,important,WALW,mariner,waistcoat,bit,fancy,lowly,...,vent,stream,LATIN,Europe,later,POCKET,Fife,read,text_sentence,text_sources
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,"(Alice, was, beginning, to, get, very, tired, ...",Carroll
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(So, she, was, considering, in, her, own, mind...",Carroll
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(There, was, nothing, so, VERY, remarkable, in...",Carroll
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(Oh, dear, !)",Carroll
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(Oh, dear, !)",Carroll


In [25]:
# Model Moby versus Alice with logistic regression
lr.fit(X_train, Y_train)
print("Training Set Score:", lr.score(X_train, Y_train))
print("Test Set Score:", lr.score(X_test, Y_test))
lr_preds = lr.predict(X_test)
pd.crosstab(Y_test, lr_preds)

Training Set Score: 0.9520295202952029
Test Set Score: 0.8342541436464088




col_0,Carroll,Melville
text_sources,Unnamed: 1_level_1,Unnamed: 2_level_1
Carroll,25,28
Melville,2,126
