In [69]:
import numpy as np
import pandas as pd
import scipy
import sklearn
import spacy
from collections import Counter

import re

import itertools
flatten = itertools.chain.from_iterable

from nltk.corpus import gutenberg, stopwords

from sklearn.model_selection import cross_val_score

from sklearn.feature_extraction.text import TfidfVectorizer

In [70]:
pd.options.display.max_rows = 4000

## For this challenge, you will need to choose a corpus of data from nltk or another source that includes categories you can predict and create an analysis pipeline that includes the following steps:

1. Data cleaning / processing / language parsing
2. Create features using two different NLP methods: For example, BoW vs tf-idf.
3. Use the features to fit supervised learning models for each feature set to predict the category outcomes.
4. Assess your models using cross-validation and determine whether one model performed better.
5. Pick one of the models and try to increase accuracy by at least 5 percentage points.

In [71]:
print(gutenberg.fileids())

['austen-emma.txt', 'austen-persuasion.txt', 'austen-sense.txt', 'bible-kjv.txt', 'blake-poems.txt', 'bryant-stories.txt', 'burgess-busterbrown.txt', 'carroll-alice.txt', 'chesterton-ball.txt', 'chesterton-brown.txt', 'chesterton-thursday.txt', 'edgeworth-parents.txt', 'melville-moby_dick.txt', 'milton-paradise.txt', 'shakespeare-caesar.txt', 'shakespeare-hamlet.txt', 'shakespeare-macbeth.txt', 'whitman-leaves.txt']


How well do these parsers work on Early Modern English verse?

In [72]:
paradise = gutenberg.raw('milton-paradise.txt')

In [73]:
paradise



### Data cleaning/processing/language parsing

_Paradise Lost_ is divided into around 10-12 "books", [depending on the edition](https://en.wikipedia.org/wiki/Paradise_Lost). These books are not very big, but I'm hoping I can classify based on book, and find some interesting comparisons between the texts of each book. Maybe some of them lean harder on particular characters, or actions. Let's see how that looks.

In [74]:
paradise_split = paradise.split('\nBook ')

In [75]:
len(paradise_split)

13

In [76]:
# Utility function for standard text cleaning.
def text_cleaner(text):
# Remove double dashes
    text = re.sub(r'--',' ',text)
# Strip lines encased in brackets
    text = re.sub("[\[].*?[\]]", "", text)
# Strip the roman numerals that used to come after "Book ", before splitting.
# They will be found at the beginning of a line, and have at least one trailing space and a newline.
    text = re.sub(r'^([I]*[X]*[V]*[I]*)+\s+\n', '', text,flags=re.M)
# Random asterisks and dollar signs appear throughout
    text = re.sub('[\*\$]*', '', text)
# Remove lines that are nothing but spaces and digits
    text = re.sub(r'^\s*[0-9]+\s*$', '', text,flags=re.M)
# Strip EOF characters
    text = re.sub(r"\x1a","",text)

    text = " ".join(text.split())
    return text

In [77]:
paradise_cleaned = [text_cleaner(clean_me) for clean_me in paradise_split]

# Drop the leading empty line created when we dropped the title
paradise_cleaned = paradise_cleaned[1:]

In [78]:
nlp = spacy.load('en')

In [79]:
# One of the dangers of Early Modern English--all of the th's, thees, and thous :)
nlp.Defaults.stop_words |= {"th","th'","thou","thee","thy","thyself","till","hast","hath","ere","o"}

In [80]:
print(nlp.Defaults.stop_words)

{'were', 'besides', 'in', 'whereafter', 'take', 'more', 'someone', 'can', 'where', 'as', 'make', 'go', 'give', 'ca', 'often', 'ourselves', 'himself', 'herein', 'somewhere', 'either', 'her', 'very', 'latter', 'a', 'between', 'rather', 'nine', 'show', 'first', 'nevertheless', 'n’t', 'hereafter', 'yourselves', 'across', 'among', 'seemed', 'used', 'could', 'beforehand', 'thereafter', 'when', 'whether', 'back', 'none', 'would', 'top', 'thru', 'has', 'part', 'whenever', 'ten', "'m", 'alone', 'his', 'latterly', 'once', '‘ll', "'d", 'still', 'may', 'perhaps', 'how', 'through', 'thyself', 'it', 'per', 'everyone', 'former', 'all', 'itself', 'hence', 'however', 'together', 'full', '‘m', 'own', 'being', 'so', 'been', '’m', 'much', 'anyone', 'becoming', 'them', 'several', 'yourself', 'regarding', 'that', 'least', 'name', 'sometimes', 'one', 'had', 'yours', 'are', 'why', 'empty', 'along', 'otherwise', 'up', 'anyway', '’ve', 'he', 'since', 'ours', 'eleven', 'some', 'herself', 'an', 'nor', 'toward', '

In [81]:
paradise_doc = [nlp(parse_me) for parse_me in paradise_cleaned]

In [142]:
len(paradise_doc)

12

In [141]:
len(list(paradise_doc[0].sents))


186

In [82]:
# Utility function to create a list of the 2000 most common words.
def bag_of_words(text):
    
    # Filter out punctuation and stop words.
    allwords = [token.lemma_
                for token in text
                if not token.is_punct
                and not token.is_stop]
    
    # Return the most common words.
    return [item[0] for item in Counter(allwords).most_common(200)]

paradise_doc is a list of lists, so flatten that to a 1-dimensional list before we feed it to the bag_of_words function.

In [83]:
common_words = bag_of_words(list(flatten(paradise_doc)))

In [84]:
books = ['Book I','Book II','Book III','Book IV','Book V','Book VI',
         'Book VII','Book VIII','Book IX','Book X','Book XI','Book XII']

In [85]:
# I tried to build this with a comprehension, and it broke my brain.
# So this is probably less efficient, but it's readable, and sane.
# document and books need to be lists of the same length.
def build_dataframe(document,books):
    document_df = pd.DataFrame()
    for idx,book in enumerate(document):
        book_df = pd.DataFrame([[sent, books[idx]] for sent in book.sents])
        document_df = document_df.append(book_df,ignore_index=True)
    return document_df

In [86]:
paradise_df = build_dataframe(paradise_doc,books)

In [87]:
paradise_df

Unnamed: 0,0,1
0,"(Of, Man, 's, first, disobedience, ,, and, the...",Book I
1,"(That, shepherd, who, first, taught, the, chos...",Book I
2,"(And, chiefly, thou, ,, O, Spirit, ,, that, do...",Book I
3,"(,, what, is, low, raise, and, support, ;, Tha...",Book I
4,"(Say, first, for, Heaven, hides, nothing, from...",Book I
5,"(Who, first, seduced, them, to, that, foul, re...",Book I
6,"(Th, ', infernal, Serpent, ;, he, it, was, who...",Book I
7,"(Him, the, Almighty, Power, Hurled, headlong, ...",Book I
8,"(In, adamantine, chains, and, penal, fire, ,, ...",Book I
9,"(Nine, times, the, space, that, measures, day,...",Book I


In [88]:
# Creates a data frame with features for each word in our common word set.
# Each value is the count of the times the word appears in each sentence.
def bow_features(sentences, common_words):
    
# Scaffold the data frame and initialize counts to zero.
    df = pd.DataFrame(columns=common_words)
    df['text_sentence'] = sentences[0]
    df['text_source'] = sentences[1]
    df.loc[:, common_words] = 0

# Process each row, counting the occurrence of words in each sentence.
    for i, sentence in enumerate(df['text_sentence']):
        
# Convert the sentence to lemmas, then filter out punctuation,
# stop words, and uncommon words.
        words = [token.lemma_
                 for token in sentence
                 if (
                     not token.is_punct and
                     not token.is_stop
                     and token.lemma_ in common_words
                 )]
# Capture the amount of punctuation (check the length of this list)
        punctuation = [token.lemma_
                       for token in sentence
                       if token.is_punct]

# Populate values for individual word count
        word_count = Counter(words)
        for word in word_count:
            df.loc[i,word] = word_count[word]

# Add sentence-level word count, punctuation
        df.loc[i,'sentence_length'] = len(sentence)
        df.loc[i,'sentence_punctuation'] = len(punctuation)

# This counter is just to make sure the kernel didn't hang.
        if i % 50 == 0:
            print("Processing row {}".format(i))
            
    return df

In [89]:
word_counts = bow_features(paradise_df, common_words)

Processing row 0
Processing row 50
Processing row 100
Processing row 150
Processing row 200
Processing row 250
Processing row 300
Processing row 350
Processing row 400
Processing row 450
Processing row 500
Processing row 550
Processing row 600
Processing row 650
Processing row 700
Processing row 750
Processing row 800
Processing row 850
Processing row 900
Processing row 950
Processing row 1000
Processing row 1050
Processing row 1100
Processing row 1150
Processing row 1200
Processing row 1250
Processing row 1300
Processing row 1350
Processing row 1400
Processing row 1450
Processing row 1500
Processing row 1550
Processing row 1600
Processing row 1650
Processing row 1700
Processing row 1750
Processing row 1800
Processing row 1850
Processing row 1900
Processing row 1950
Processing row 2000
Processing row 2050
Processing row 2100
Processing row 2150
Processing row 2200
Processing row 2250
Processing row 2300
Processing row 2350
Processing row 2400
Processing row 2450
Processing row 2500
Pro

In [90]:
word_counts.head()

Unnamed: 0,Heaven,shall,God,high,know,stand,day,find,great,good,...,rage,fierce,length,drive,walk,virtue,text_sentence,text_source,sentence_length,sentence_punctuation
0,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,"(Of, Man, 's, first, disobedience, ,, and, the...",Book I,65.0,11.0
1,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,"(That, shepherd, who, first, taught, the, chos...",Book I,78.0,8.0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,"(And, chiefly, thou, ,, O, Spirit, ,, that, do...",Book I,62.0,12.0
3,0,0,1,0,0,0,0,0,1,0,...,0,0,0,0,0,0,"(,, what, is, low, raise, and, support, ;, Tha...",Book I,33.0,6.0
4,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,"(Say, first, for, Heaven, hides, nothing, from...",Book I,57.0,7.0


## Supervised learning models 
## (using bag of words)

They all perform pretty poorly against this material, probably because there are 12 possible categories, for text that is a single overall work. I sort of expected this wouldn't work so well...

In [107]:
from sklearn import ensemble
from sklearn.model_selection import train_test_split


Y = word_counts['text_source']
X = np.array(word_counts.drop(['text_sentence','text_source'], 1))

In [108]:
rfc = ensemble.RandomForestClassifier(n_estimators=100)

rfc_scores = cross_val_score(rfc, X, Y, cv=10)
print("Cross-validation scores: ",rfc_scores)

Cross-validation scores:  [0.14181818 0.18315018 0.21611722 0.16605166 0.23333333 0.23507463
 0.19924812 0.19172932 0.18796992 0.14448669]


In [109]:
from sklearn.linear_model import LogisticRegression

# As I understand it, 'liblinear' is good for smaller datasets, and it can handle l1 penalty
lr = LogisticRegression(penalty='l1',multi_class='auto',solver='liblinear')

lr_scores = cross_val_score(lr, X, Y, cv=10)
print("Cross-validation scores: ",lr_scores)

Cross-validation scores:  [0.17454545 0.18315018 0.23443223 0.16605166 0.24074074 0.22761194
 0.2518797  0.21428571 0.19924812 0.17870722]


In [110]:
clf = ensemble.GradientBoostingClassifier()

clf_scores = cross_val_score(clf, X, Y, cv=10)
print("Cross_validation scores: ",clf_scores)

Cross_validation scores:  [0.16363636 0.14652015 0.19413919 0.16605166 0.23703704 0.21268657
 0.2406015  0.22180451 0.13909774 0.1634981 ]


## (using tf-idf)

In [95]:
len(paradise_cleaned)

12

In [164]:
# document and books need to be lists of the same length.
# Returns two lists: 1) the document, split into sentences,
#                    2) and the "target" for each sentence.
def build_lol(document,books):
    document_list = []
    book_list = []
    for idx,book in enumerate(document):
        sentences = list(book.sents)
        book_list = book_list + list(np.repeat([books[idx]],len(sentences)))
        document_list = document_list + sentences
    return document_list, book_list

In [173]:
# Start with a model similar to the one we built in the exercises
vectorizer = TfidfVectorizer(max_df=0.5, # drop words that occur in more than half the paragraphs
                             min_df=2, # only use words that appear at least twice
                             analyzer='word',
                             stop_words=nlp.Defaults.stop_words, # we have our own list of stopwords now 
                             lowercase=True, #convert everything to lower case
                             use_idf=True,
                             norm='l2', #Applies a correction factor so that longer paragraphs and shorter paragraphs get treated equally
                             smooth_idf=True #Adds 1 to all document frequencies, as if an extra document existed that used every word once.  Prevents divide-by-zero errors
                            )

In [159]:
X_lol, Y_lol = build_lol(paradise_doc,books)

In [174]:
#Applying the vectorizer
paradise_tfidf = vectorizer.fit_transform(paradise_cleaned)
print("Number of features: %d" % paradise_tfidf.get_shape()[1])

Number of features: 3564


In [184]:
paradise_tfidf.shape

(12, 3564)

In [179]:

# get the first vector out (for the first document)
first_vector_tfidfvectorizer=paradise_tfidf[0]
 
# place tf-idf values in a pandas data frame
df = pd.DataFrame(first_vector_tfidfvectorizer.T.todense(), index=vectorizer.get_feature_names(), columns=["tfidf"])
df.sort_values(by=["tfidf"],ascending=False)
 

Unnamed: 0,tfidf
temple,0.142267
height,0.124856
sion,0.113074
names,0.111342
egypt,0.101619
armed,0.092785
fiery,0.092785
huge,0.092785
durst,0.092785
anon,0.089654


In [102]:
X_train_tfidf_csr

<7x3564 sparse matrix of type '<class 'numpy.float64'>'
	with 7050 stored elements in Compressed Sparse Row format>

In [98]:
len(tfidf_bypara)

7

In [113]:
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer

#Our SVD data reducer.  We are going to reduce the feature space from 1379 to 130.
svd = TruncatedSVD(130)
#lsa = make_pipeline(svd, Normalizer(copy=False))

I'm interested to see whether applying SVD improves cross-validation scores for the same models I tried earlier.

In [116]:
paradise_tfidf

<12x3564 sparse matrix of type '<class 'numpy.float64'>'
	with 11112 stored elements in Compressed Sparse Row format>

In [None]:
from sklearn.linear_model import LogisticRegression

# As I understand it, 'liblinear' is good for smaller datasets, and it can handle l1 penalty
lr = LogisticRegression(penalty='l1',multi_class='auto',solver='liblinear')

lr_scores = cross_val_score(lr, X, Y, cv=10)
print("Cross-validation scores: ",lr_scores)

In [None]:
clf = ensemble.GradientBoostingClassifier()

clf_scores = cross_val_score(clf, X, Y, cv=10)
print("Cross_validation scores: ",clf_scores)

In [122]:
X_train_tfidf.tocsr().shape

(7, 3564)

In [97]:
#splitting into training and test sets
X_train_tfidf, X_test_tfidf= train_test_split(paradise_tfidf, test_size=0.4, random_state=0)


#Reshapes the vectorizer output into something people can read
X_train_tfidf_csr = X_train_tfidf.tocsr()

#number of paragraphs
n = X_train_tfidf_csr.shape[0]
#A list of dictionaries, one per paragraph
tfidf_bypara = [{} for _ in range(0,n)]
#List of features
terms = vectorizer.get_feature_names()
#for each paragraph, lists the feature words and their tf-idf scores
for i, j in zip(*X_train_tfidf_csr.nonzero()):
    tfidf_bypara[i][terms[j]] = X_train_tfidf_csr[i, j]

In [99]:
# Run SVD on the training data, then project the training data.
X_train_lsa = lsa.fit_transform(X_train_tfidf)

variance_explained=svd.explained_variance_ratio_
total_variance = variance_explained.sum()
print("Percent variance captured by all components:",total_variance*100)

#Looking at what sorts of paragraphs our solution considers similar, for the first five identified topics
paras_by_component=pd.DataFrame(X_train_lsa,index=X_train) #<--it hates this
for i in range(5):
    print('Component {}:'.format(i))
    print(paras_by_component.loc[:,i].sort_values(ascending=False)[0:10])

Percent variance captured by all components: 100.00000000000013


ValueError: Shape of passed values is (7, 7), indices imply (1614, 7)