In [25]:
%matplotlib inline
import numpy as np
import pandas as pd
import scipy
import sklearn
import spacy
import matplotlib.pyplot as plt
import seaborn as sns
import re
from collections import Counter
from nltk.corpus import gutenberg, stopwords
from sklearn.model_selection import train_test_split
from sklearn import ensemble
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

# Background 

First, pick a set of texts. This can be either a series of novels, chapters, or articles. Anything you'd like. It just has to have multiple entries of varying characteristics. At least 100 should be good. There should also be at least 10 different authors, but try to keep the texts related (either all on the same topic of from the same branch of literature - something to make classification a bit more difficult than obviously different subjects).

In [2]:
# Lets look at what text I can use. 
gutenberg.fileids()

['austen-emma.txt',
 'austen-persuasion.txt',
 'austen-sense.txt',
 'bible-kjv.txt',
 'blake-poems.txt',
 'bryant-stories.txt',
 'burgess-busterbrown.txt',
 'carroll-alice.txt',
 'chesterton-ball.txt',
 'chesterton-brown.txt',
 'chesterton-thursday.txt',
 'edgeworth-parents.txt',
 'melville-moby_dick.txt',
 'milton-paradise.txt',
 'shakespeare-caesar.txt',
 'shakespeare-hamlet.txt',
 'shakespeare-macbeth.txt',
 'whitman-leaves.txt']

In [3]:
# Pick 10 different books with 10 different authors.
austen = gutenberg.raw('austen-emma.txt')
bible = gutenberg.raw('bible-kjv.txt')
blake = gutenberg.raw('blake-poems.txt')
bryant = gutenberg.raw('bryant-stories.txt')
buster = gutenberg.raw('burgess-busterbrown.txt')
chesterton = gutenberg.raw('chesterton-thursday.txt')
edgeworth = gutenberg.raw('edgeworth-parents.txt')
milton = gutenberg.raw('milton-paradise.txt')
shakes = gutenberg.raw('shakespeare-caesar.txt')
whitman = gutenberg.raw('whitman-leaves.txt')

# Data Cleaning 

In [4]:
# Utility function to clean text.
def text_cleaner(text):
    
    # Visual inspection shows spaCy does not recognize the double dash '--'.
    # Better get rid of it now!
    text = re.sub(r'--',' ',text)
    
    # Get rid of headings in square brackets.
    text = re.sub("[\[].*?[\]]", "", text)
    
    # Get rid of chapter titles.
    text = re.sub(r'Chapter \d+','',text)
    text = re.sub(r'CHAPTER \d+', '', text)
    text = re.sub("\\n\\n.*?\\n\\n", '', text)
  
    # Get rid of extra whitespace.
    text = ' '.join(text.split())

    
    return text

In [5]:
# Clean documents
austen = text_cleaner(austen)
# Bible was over the limit for nlp so I had to limit it.
bible = text_cleaner(bible)[:99990]
blake = text_cleaner(blake)
bryant = text_cleaner(bryant)
buster = text_cleaner(buster)
chesterton = text_cleaner(chesterton)
edgeworth = text_cleaner(edgeworth)
milton = text_cleaner(milton)
shakes = text_cleaner(shakes)
whitman = text_cleaner(whitman)

In [6]:
# run spaCy and analyze the documents
nlp = spacy.load('en')


austen_doc = nlp(austen)
bible_doc = nlp(bible)
blake_doc = nlp(blake)
bryant_doc = nlp(bryant)
buster_doc = nlp(buster)
chesterton_doc = nlp(chesterton)
edgeworth_doc = nlp(edgeworth)
milton_doc = nlp(milton)
shakes_doc = nlp(shakes)
whitman_doc = nlp(whitman)

In [9]:
# Group into sentences
austen_sents = [[sent, 'austen'] for sent in austen_doc.sents]
bible_sents = [[sent, 'bible'] for sent in bible_doc.sents]
bryant_sents = [[sent, 'bryant'] for sent in bryant_doc.sents]
buster_sents = [[sent, 'buster'] for sent in buster_doc.sents]
chesterton_sents = [[sent, 'chesterton'] for sent in chesterton_doc.sents]
edgeworth_sents = [[sent, 'edgeworth'] for sent in edgeworth_doc.sents]
milton_sents = [[sent, 'milton'] for sent in milton_doc.sents]
shakes_sents = [[sent, 'shakes'] for sent in shakes_doc.sents]
whitman_sents = [[sent, 'whitman'] for sent in whitman_doc.sents]

In [10]:
# Combine the sentences from the 10 novels into one data frame.
sentences = pd.DataFrame(austen_sents + bible_sents + bryant_sents +
                        buster_sents + chesterton_sents + edgeworth_sents +
                        milton_sents + shakes_sents + whitman_sents)

In [12]:
# Add a title to the columns so we know what we are looking at.
sentences.columns = ['text', 'author']

sentences.head()

Unnamed: 0,text,author
0,"(CHAPTER, I, Emma, Woodhouse, ,, handsome, ,, ...",austen
1,"(She, was, the, youngest, of, the, two, daught...",austen
2,"(Her, mother, had, died, too, long, ago, for, ...",austen
3,"(Sixteen, years, had, Miss, Taylor, been, in, ...",austen
4,"(Between, _, them)",austen


In [13]:
# Look at the size of the data. 
sentences.shape

(37215, 2)

In [28]:
# Create a training and testing data set. 
X = sentences['text']
y = sentences['author']

# Create training model now.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

# Feature Generation

## Tf-idf Vectorization