In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import scipy
import sklearn
import spacy
import matplotlib.pyplot as plt
import seaborn as sns
import re
from nltk.corpus import gutenberg, stopwords
from collections import Counter

In [2]:
print(gutenberg.fileids())

['austen-emma.txt', 'austen-persuasion.txt', 'austen-sense.txt', 'bible-kjv.txt', 'blake-poems.txt', 'bryant-stories.txt', 'burgess-busterbrown.txt', 'carroll-alice.txt', 'chesterton-ball.txt', 'chesterton-brown.txt', 'chesterton-thursday.txt', 'edgeworth-parents.txt', 'melville-moby_dick.txt', 'milton-paradise.txt', 'shakespeare-caesar.txt', 'shakespeare-hamlet.txt', 'shakespeare-macbeth.txt', 'whitman-leaves.txt']


In [3]:
#Predict whether a sentence comes from Alice in Wonderland or Caesar.
#Removing the title. Match all text between square brackets and replace with empty string.

#Load and clean the data.
caesar = gutenberg.raw('shakespeare-caesar.txt')
alice = gutenberg.raw('carroll-alice.txt')

In [4]:
caesar = re.sub(r'Actus .*', '', caesar)
alice = re.sub(r'CHAPTER .*', '', alice)

#Remove newlines and other white spaces by splitting and rejoining
caesar = ' '.join(caesar.split())
alice = ' '.join(alice.split())

In [5]:
print(alice[0:400])
print(caesar[0:400])

[Alice's Adventures in Wonderland by Lewis Carroll 1865] Alice was beginning to get very tired of sitting by her sister on the bank, and of having nothing to do: once or twice she had peeped into the book her sister was reading, but it had no pictures or conversations in it, 'and what is the use of a book,' thought Alice 'without pictures or conversation?' So she was considering in her own mind (a
[The Tragedie of Julius Caesar by William Shakespeare 1599] Enter Flauius, Murellus, and certaine Commoners ouer the Stage. Flauius. Hence: home you idle Creatures, get you home: Is this a Holiday? What, know you not (Being Mechanicall) you ought not walke Vpon a labouring day, without the signe Of your Profession? Speake, what Trade art thou? Car. Why Sir, a Carpenter Mur. Where is thy Leather Ap


In [7]:
#Parse the cleaned novels.
nlp = spacy.load('en')
alice_doc = nlp(alice)
caesar_doc = nlp(caesar)

In [8]:
def text_cleaner(text):
    text = re.sub(r'--', '', text)
    text = re.sub('[\[]:.*?[\]]()', "", text)
    text = ' '.join(text.split())
    return text

alice = text_cleaner(alice)
caesar = text_cleaner(caesar)

In [9]:
#Group into sentences
alice_sents = [[sent, "Carroll"] for sent in alice_doc.sents]
caesar_sents = [[sent, 'Shakespeare'] for sent in caesar_doc.sents]

# Cut Caesar down to the same length as Alice.
caesar_sents = caesar_sents[0:len(alice_sents)]

#Combine the sentences from the two novels into one df.
sentences = pd.DataFrame(alice_sents + caesar_sents)
sentences.head()

Unnamed: 0,0,1
0,"([, Alice, 's, Adventures, in, Wonderland, by,...",Carroll
1,"(So, she, was, considering, in, her, own, mind...",Carroll
2,"(There, was, nothing, so, VERY, remarkable, in...",Carroll
3,"(Oh, dear, !)",Carroll
4,"(I, shall, be, late, !, ')",Carroll


In [10]:
#previous line that counts for whole column
#sentences['sentence_length']= len(sentences)

#correct code line that counts length row by row, i.e. sentence by sentence
sentences['sentence_length']=sentences[0].str.len()
print(sentences.head())

                                                   0        1  sentence_length
0  ([, Alice, 's, Adventures, in, Wonderland, by,...  Carroll               78
1  (So, she, was, considering, in, her, own, mind...  Carroll               63
2  (There, was, nothing, so, VERY, remarkable, in...  Carroll               33
3                                      (Oh, dear, !)  Carroll                3
4                         (I, shall, be, late, !, ')  Carroll                6


In [11]:
# Utility function to create a list of the 1000 most common words.
def bag_of_words(text):
    
    # Filter out punctuation and stop words.
    allwords = [token.lemma_
                for token in text
                if not token.is_punct
                and not token.is_stop]
    
    # Return the most common words.
    return [item[0] for item in Counter(allwords).most_common(1000)]

In [12]:
#Create a df with features for each word in our common word set.
#Each value is the count of the times the word appears.
#BOW is bag of words

def bow_features(sentences, common_words):
    
    # Scaffold the data frame and initialize counts to zero.
    df = pd.DataFrame(columns=common_words)
    df['text_sentence'] = sentences[0]
    df.loc[:, 'punctuation_length'] = 0
    df['text_source'] = sentences[1]
    df.loc[:, common_words] = 0
    #loc you use the column name, iloc you use index (column number)
    #: means all rows. columns within common_words would be 0
    
    # Process each row, counting the occurrence of words in each sentence.
    for i, sentence in enumerate(df['text_sentence']):
        
        # Convert the sentence to lemmas, then filter out punctuation,
        # stop words, and uncommon words.
        words = [token.lemma_
                 for token in sentence
                 if (
                     not token.is_punct
                     and not token.is_stop
                     and token.lemma_ in common_words
                 )]
        
        # Populate the row with word counts.
        for word in words:
            df.loc[i, word] += 1
        
        puncts = [token for token in sentence if (token.is_punct)]
        df.loc[i,'punctuation_length'] += len(puncts)
        
        example
        
        # This counter is just to make sure the kernel didn't hang.
        if i % 500 == 0:
            print("Processing row {}".format(i))
    return df

In [None]:
example_words = [token for token in example_sentence if not token.is_punct]
unique_words = set([token.text for token in example_words])

#Identify words according to which POS they belong to. 
def distinct_words_of_pos(text, pos):
    sent_word_tokens = [nltk.word_tokenize(s) for s in nltk.sent_tokenize(text)]
    all_pos = nltk.pos_tag_sents(sent_word_tokens, tagset="universal")
 
    uniques = { x[0].lower() for el in all_pos for x in el if x[1]==pos }
    return sorted(uniques)

In [13]:
#Set up the bags.
alicewords = bag_of_words(alice_doc)
caesarwords = bag_of_words(caesar_doc)

#Combine bags to create a set of unique words.
#Set takes out duplicates
common_words = set(alicewords + caesarwords)

In [14]:
#Creating the df with features.
word_counts = bow_features(sentences,common_words)
word_counts.head()

Processing row 0
Processing row 500
Processing row 1000
Processing row 1500
Processing row 2000
Processing row 2500
Processing row 3000


Unnamed: 0,sorrowful,cynna,destruction,ambition,dardanius,knowne,spurne,plebeians,sicknesse,preuent,...,slay,barren,plenty,gate,rabbit,edge,examine,text_sentence,punctuation_length,text_source
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,"([, Alice, 's, Adventures, in, Wonderland, by,...",12,Carroll
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,"(So, she, was, considering, in, her, own, mind...",7,Carroll
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,"(There, was, nothing, so, VERY, remarkable, in...",4,Carroll
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,"(Oh, dear, !)",1,Carroll
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,"(I, shall, be, late, !, ')",2,Carroll


In [15]:
#Concat the number of words in that sentence into the df.
word_counts = pd.concat([word_counts, sentences['sentence_length']], axis=1)

In [16]:
#Include column with number of words in the previous sentence.
#Create new column. Set to NULL
word_counts['previous_length'] = word_counts['sentence_length']
word_counts['previous_length'] = None

#df.shape returns two numbers, for example (10,10). The first number is the row count
for i in range(1,word_counts.shape[0]):
    word_counts.loc[i,'previous_length'] = word_counts.loc[i-1,'sentence_length']

In [17]:
#Set first sentence of Shakespeare to NULL
first_sentence = min(word_counts.index[word_counts['text_source'] == 'Shakespeare'].tolist())
word_counts.loc[first_sentence, 'previous_length'] = None

In [18]:
#Include column with number of words in the next sentence.
#Create new column. Set to NULL
word_counts['next_length'] = word_counts['sentence_length']
word_counts['next_length'] = None

for i in range(0,word_counts.shape[-1]):
    word_counts.loc[i,'next_length'] = word_counts.loc[i+1,'sentence_length']

In [19]:
#Set last sentence of Carroll to NULL
last_sentence = max(word_counts.index[word_counts['text_source'] == 'Carroll'].tolist())
word_counts.loc[last_sentence, 'next_length'] = None

In [20]:
word_counts['previous_length'] = word_counts['previous_length'].fillna(word_counts['previous_length'].mean())
word_counts['next_length'] = word_counts['next_length'].fillna(word_counts['next_length'].mean())

In [21]:
print(word_counts.head())

   sorrowful  cynna  destruction  ambition  dardanius  knowne  spurne  \
0          0      0            0         0          0       0       0   
1          0      0            0         0          0       0       0   
2          0      0            0         0          0       0       0   
3          0      0            0         0          0       0       0   
4          0      0            0         0          0       0       0   

   plebeians  sicknesse  preuent     ...       gate  rabbit  edge  examine  \
0          0          0        0     ...          0       0     0        0   
1          0          0        0     ...          0       1     0        0   
2          0          0        0     ...          0       1     0        0   
3          0          0        0     ...          0       0     0        0   
4          0          0        0     ...          0       0     0        0   

                                       text_sentence  punctuation_length  \
0  ([, Alice, 's

example_words = [token for token in example_sentence if not token.is_punct]
unique_words = set([token.text for token in example_words])

In [22]:
#Trying random forest
from sklearn import ensemble
from sklearn.model_selection import train_test_split

rfc = ensemble.RandomForestClassifier()
Y = word_counts['text_source']
X = np.array(word_counts.drop(['text_sentence','text_source'],1))

X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.4, random_state=0)
train = rfc.fit(X_train, y_train)

print('Training set score', rfc.score(X_train, y_train))
print('\nTest set score', rfc.score(X_test, y_test))

Training set score 0.998509687034

Test set score 0.970215934475


In [27]:
from sklearn.model_selection import cross_val_score
rfc_score = cross_val_score(rfc, X, Y, cv=5)
print(rfc_score)
print ('\nMean cross validation score is: ' + str(np.mean(rfc_score)))

[ 0.90922619  0.99702381  0.98511905  0.99104478  0.98059701]

Mean cross validation score is: 0.972602167733


In [23]:
clf = ensemble.GradientBoostingClassifier()
train = clf.fit(X_train, y_train)
print('Training set score', clf.score(X_train, y_train))
print('\nTest set score', clf.score(X_test, y_test))

Training set score 0.99652260308

Test set score 0.98808637379


In [25]:
from sklearn.model_selection import cross_val_score
gb_score = cross_val_score(clf, X, Y, cv=5)
print(gb_score)
print ('\nMean cross validation score is: ' + str(np.mean(gb_score)))

[ 0.9077381   0.99702381  1.          1.          0.99701493]

Mean cross validation score is: 0.980355366027
