In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import scipy
import sklearn
import spacy
import matplotlib.pyplot as plt
import seaborn as sns
import re
from nltk.corpus import gutenberg, stopwords
from collections import Counter

In [2]:
print(gutenberg.fileids())

['austen-emma.txt', 'austen-persuasion.txt', 'austen-sense.txt', 'bible-kjv.txt', 'blake-poems.txt', 'bryant-stories.txt', 'burgess-busterbrown.txt', 'carroll-alice.txt', 'chesterton-ball.txt', 'chesterton-brown.txt', 'chesterton-thursday.txt', 'edgeworth-parents.txt', 'melville-moby_dick.txt', 'milton-paradise.txt', 'shakespeare-caesar.txt', 'shakespeare-hamlet.txt', 'shakespeare-macbeth.txt', 'whitman-leaves.txt']


In [12]:
caesar = gutenberg.raw('shakespeare-caesar.txt')
moby = gutenberg.raw('melville-moby_dick.txt')

In [14]:
caesar = re.sub(r'Actus .*', '', caesar)

#Remove newlines and other white spaces by splitting and rejoining
caesar = ' '.join(caesar.split())
moby = ' '.join(moby.split())

In [16]:
print(caesar[0:400])
print(moby[0:400])

[The Tragedie of Julius Caesar by William Shakespeare 1599] Enter Flauius, Murellus, and certaine Commoners ouer the Stage. Flauius. Hence: home you idle Creatures, get you home: Is this a Holiday? What, know you not (Being Mechanicall) you ought not walke Vpon a labouring day, without the signe Of your Profession? Speake, what Trade art thou? Car. Why Sir, a Carpenter Mur. Where is thy Leather Ap
[Moby Dick by Herman Melville 1851] ETYMOLOGY. (Supplied by a Late Consumptive Usher to a Grammar School) The pale Usher--threadbare in coat, heart, body, and brain; I see him now. He was ever dusting his old lexicons and grammars, with a queer handkerchief, mockingly embellished with all the gay flags of all the known nations of the world. He loved to dust his old grammars; it somehow mildly remi


In [17]:
#Parse the cleaned novels.
nlp = spacy.load('en')
moby_doc = nlp(moby)
caesar_doc = nlp(caesar)

In [18]:
def text_cleaner(text):
    text = re.sub(r'--', '', text)
    text = re.sub('[\[]:.*?[\]]()', "", text)
    text = ' '.join(text.split())
    return text

moby = text_cleaner(moby)
caesar = text_cleaner(caesar)

In [19]:
#Group into sentences
moby_sents = [[sent, "Merville"] for sent in moby_doc.sents]
caesar_sents = [[sent, 'Shakespeare'] for sent in caesar_doc.sents]

# Cut Caesar down to the same length as Moby.
caesar_sents = caesar_sents[0:len(moby_sents)]

#Combine the sentences from the two novels into one df.
sentences = pd.DataFrame(moby_sents + caesar_sents)
sentences.head()

Unnamed: 0,0,1
0,"([, Moby, Dick, by, Herman, Melville, 1851, ],...",Merville
1,"((, Supplied, by, a, Late, Consumptive)",Merville
2,"(Usher, to, a, Grammar, School, ))",Merville
3,"(The, pale, Usher, --, threadbare, in, coat, ,...",Merville
4,"(He, was, ever, dusting, his, old, lexicons, a...",Merville


In [20]:
#Code that counts length row by row, i.e. sentence by sentence
sentences['sentence_length']=sentences[0].str.len()
print(sentences.head())

                                                   0         1  \
0  ([, Moby, Dick, by, Herman, Melville, 1851, ],...  Merville   
1            ((, Supplied, by, a, Late, Consumptive)  Merville   
2                 (Usher, to, a, Grammar, School, ))  Merville   
3  (The, pale, Usher, --, threadbare, in, coat, ,...  Merville   
4  (He, was, ever, dusting, his, old, lexicons, a...  Merville   

   sentence_length  
0               10  
1                6  
2                6  
3               20  
4               31  


In [21]:
# Utility function to create a list of the 1000 most common words.
def bag_of_words(text):
    
    # Filter out punctuation and stop words.
    allwords = [token.lemma_
                for token in text
                if not token.is_punct
                and not token.is_stop]
    
    # Return the most common words.
    return [item[0] for item in Counter(allwords).most_common(1000)]

In [32]:
#Create a df with features for each word in our common word set.
#Each value is the count of the times the word appears.
#BOW is bag of words

def bow_features(sentences, common_words):
    
    # Scaffold the data frame and initialize counts to zero.
    df = pd.DataFrame(columns=common_words)
    df['text_sentence'] = sentences[0]
    df.loc[:, 'punctuation_length'] = 0
    #df.loc[:, 'unique_words'] = 0
    df['text_source'] = sentences[1]
    df.loc[:, common_words] = 0
    #loc you use the column name, iloc you use index (column number)
    #: means all rows. columns within common_words would be 0
    
    # Process each row, counting the occurrence of words in each sentence.
    for i, sentence in enumerate(df['text_sentence']):
        
        # Convert the sentence to lemmas, then filter out punctuation,
        # stop words, and uncommon words.
        words = [token.lemma_
                 for token in sentence
                 if (
                     not token.is_punct
                     and not token.is_stop
                     and token.lemma_ in common_words
                 )]
        
        # Populate the row with word counts.
        for word in words:
            df.loc[i, word] += 1
        
        puncts = [token for token in sentence if (token.is_punct)]
        df.loc[i,'punctuation_length'] += len(puncts)
        
        #example_words = [token for token in example_sentence if not token.is_punct]
        #unique_words = set([token.text for token in example_words])
        #df.loc[i, 'unique_words'] += len(unique_words)
        
        # This counter is just to make sure the kernel didn't hang.
        if i % 500 == 0:
            print("Processing row {}".format(i))
    return df

In [33]:
#Set up the bags.
mobywords = bag_of_words(moby_doc)
caesarwords = bag_of_words(caesar_doc)

#Combine bags to create a set of unique words.
#Set takes out duplicates
common_words = set(mobywords + caesarwords)

In [34]:
#Creating the df with features.
word_counts = bow_features(sentences,common_words)
word_counts.head()

Processing row 0
Processing row 500
Processing row 1000
Processing row 1500
Processing row 2000
Processing row 2500
Processing row 3000
Processing row 3500
Processing row 4000
Processing row 4500
Processing row 5000
Processing row 5500
Processing row 6000
Processing row 6500
Processing row 7000
Processing row 7500
Processing row 8000
Processing row 8500
Processing row 9000
Processing row 9500
Processing row 10000
Processing row 10500
Processing row 11000
Processing row 11500
Processing row 12000


Unnamed: 0,blow,bit,ride,swift,be,flee,hammock,cymber,wake,what,...,year,answer'd,lucil,finde,passe,swell,sirra,text_sentence,punctuation_length,text_source
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,"([, Moby, Dick, by, Herman, Melville, 1851, ],...",3,Merville
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,"((, Supplied, by, a, Late, Consumptive)",1,Merville
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,"(Usher, to, a, Grammar, School, ))",1,Merville
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,"(The, pale, Usher, --, threadbare, in, coat, ,...",6,Merville
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,"(He, was, ever, dusting, his, old, lexicons, a...",3,Merville


In [35]:
#Concat the number of words in that sentence into the df.
word_counts = pd.concat([word_counts, sentences['sentence_length']], axis=1)

In [38]:
#Include column with number of words in the previous sentence.
#Create new column. Set to NULL
word_counts['previous_length'] = word_counts['sentence_length']
word_counts['previous_length'] = None

#df.shape returns two numbers, for example (10,10). The first number is the row count
for i in range(1,word_counts.shape[0]):
    word_counts.loc[i,'previous_length'] = word_counts.loc[i-1,'sentence_length']

In [40]:
#Set first sentence of Merville to NULL
first_sentence = min(word_counts.index[word_counts['text_source'] == 'Merville'].tolist())
word_counts.loc[first_sentence, 'previous_length'] = None

In [41]:
#Include column with number of words in the next sentence.
#Create new column. Set to NULL
word_counts['next_length'] = word_counts['sentence_length']
word_counts['next_length'] = None

for i in range(0,word_counts.shape[-1]):
    word_counts.loc[i,'next_length'] = word_counts.loc[i+1,'sentence_length']

In [43]:
#Set last sentence of Carroll to NULL
last_sentence = max(word_counts.index[word_counts['text_source'] == 'Shakespeare'].tolist())
word_counts.loc[last_sentence, 'next_length'] = None

In [45]:
word_counts['previous_length'] = word_counts['previous_length'].fillna(word_counts['previous_length'].mean())
word_counts['next_length'] = word_counts['next_length'].fillna(word_counts['next_length'].mean())

In [46]:
print(word_counts.head())

   blow  bit  ride  swift  be  flee  hammock  cymber  wake  what     ...       \
0     0    0     0      0   0     0        0       0     0     0     ...        
1     0    0     0      0   0     0        0       0     0     0     ...        
2     0    0     0      0   0     0        0       0     0     0     ...        
3     0    0     0      0   0     0        0       0     0     0     ...        
4     0    0     0      0   0     0        0       0     0     0     ...        

   finde  passe  swell  sirra  \
0      0      0      0      0   
1      0      0      0      0   
2      0      0      0      0   
3      0      0      0      0   
4      0      0      0      0   

                                       text_sentence  punctuation_length  \
0  ([, Moby, Dick, by, Herman, Melville, 1851, ],...                   3   
1            ((, Supplied, by, a, Late, Consumptive)                   1   
2                 (Usher, to, a, Grammar, School, ))                   1   
3  (The, p

In [47]:
#Trying random forest
from sklearn import ensemble
from sklearn.model_selection import train_test_split

rfc = ensemble.RandomForestClassifier()
Y = word_counts['text_source']
X = np.array(word_counts.drop(['text_sentence','text_source'],1))

X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.4, random_state=0)
train = rfc.fit(X_train, y_train)

print('Training set score', rfc.score(X_train, y_train))
print('\nTest set score', rfc.score(X_test, y_test))

Training set score 0.99186440678

Test set score 0.930662871086


In [48]:
from sklearn.model_selection import cross_val_score
rfc_score = cross_val_score(rfc, X, Y, cv=5)
print(rfc_score)
print ('\nMean cross validation score is: ' + str(np.mean(rfc_score)))

[ 0.91459943  0.90890606  0.9402196   0.92717657  0.9182262 ]

Mean cross validation score is: 0.921825571595


In [49]:
clf = ensemble.GradientBoostingClassifier()
train = clf.fit(X_train, y_train)
print('Training set score', clf.score(X_train, y_train))
print('\nTest set score', clf.score(X_test, y_test))

Training set score 0.915661016949

Test set score 0.909109394063


In [50]:
from sklearn.model_selection import cross_val_score
gb_score = cross_val_score(clf, X, Y, cv=5)
print(gb_score)
print ('\nMean cross validation score is: ' + str(np.mean(gb_score)))

[ 0.88735258  0.89914599  0.91581944  0.90195281  0.89747762]

Mean cross validation score is: 0.90034968934


In [None]:
#Model did not work as well for Moby Dick against Julius Caesar. Random Forest shows signs of overfitting. 
#Potentially more features could be added, such as number of unique words in each sentence. 