# Thinkful - 4.4.5 - Challenge - Build Own NLP Model

For this challenge, you will need to choose a corpus of data from nltk or another source that includes categories you can predict and create an analysis pipeline that includes the following steps:

1. Data cleaning / processing / language parsing
2. Create features using two different NLP methods: For example, BoW vs tf-idf.
3. Use the features to fit supervised learning models for each feature set to predict the category outcomes.
4. Assess your models using cross-validation and determine whether one model performed better.
5. Pick one of the models and try to increase accuracy by at least 5 percentage points.

In [40]:
%matplotlib inline
import numpy as np
import pandas as pd
import scipy
import sklearn
import spacy
import matplotlib.pyplot as plt
import seaborn as sns
import re
import nltk
from nltk.corpus import gutenberg
from nltk.corpus import cmudict
from sklearn.metrics import accuracy_score
from collections import Counter
from sklearn import ensemble
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression

In [3]:
# Utility function for standard text cleaning.
def text_cleaner(text):
    # Visual inspection identifies a form of punctuation spaCy does not
    # recognize: the double dash '--'.  Better get rid of it now!
    text = re.sub(r'--',' ',text)
    text = re.sub("[\[].*?[\]]", "", text)
    text = ' '.join(text.split())
    return text
    
# Load and clean the data.
filename1 = 'Data/Trump.txt'
file1 = open(filename1,'rt')
trump = file1.read()
file1.close()

filename2 = 'Data/Clinton.txt'
file2 = open(filename2,'rt')
clinton = file2.read()
file2.close()

# The Chapter indicator is idiosyncratic
trump = re.sub(r'Chapter \d+', '', trump)
clinton = re.sub(r'CHAPTER .*', '', clinton)
    
clinton = text_cleaner(clinton)
trump = text_cleaner(trump)

# Parse the cleaned novels. This can take a bit.
nlp = spacy.load('en')
clinton_doc = nlp(clinton)
trump_doc = nlp(trump)

# Group into sentences.
clinton_sents = [[sent, "Clinton"] for sent in clinton_doc.sents]
trump_sents = [[sent, "Trump"] for sent in trump_doc.sents]

# Combine the sentences from the two novels into one data frame.
sentences = pd.DataFrame(clinton_sents + trump_sents)
sentences.columns = ['Sentence','Speaker']
sentences.head()

Unnamed: 0,Sentence,Speaker
0,"(Thank, you, .)",Clinton
1,"(Thank, you, so, much, .)",Clinton
2,"(Thank, you, .)",Clinton
3,"(Thank, you, all, so, much, .)",Clinton
4,"(Thank, you, !)",Clinton


In [4]:
sentences.shape

(25106, 2)

# Data Exploration

In [5]:
# Utility function to calculate how frequently lemas appear in the text.
def lemma_frequencies(text, include_stop=True):
    
    # Build a list of lemas.
    # Strip out punctuation and, optionally, stop words.
    lemmas = []
    for token in text:
        if not token.is_punct and (not token.is_stop or include_stop):
            lemmas.append(token.lemma_)
            
    # Build and return a Counter object containing word counts.
    return Counter(lemmas)

# Instantiate our list of most common lemmas.
clinton_lemma_freq = lemma_frequencies(clinton_doc, include_stop=False).most_common(150)
trump_lemma_freq = lemma_frequencies(trump_doc, include_stop=False).most_common(150)
#print('\nClinton Most Frequent:', clinton_lemma_freq)
#print('\nTrump Most Frequent:', trump_lemma_freq)

# Again, identify the lemmas common to one text but not the other.
clinton_lemma_common = [pair[0] for pair in clinton_lemma_freq]
trump_lemma_common = [pair[0] for pair in trump_lemma_freq]
print('\nUnique to Clinton:', set(clinton_lemma_common) - set(trump_lemma_common))
print('\nUnique to Trump:', set(trump_lemma_common) - set(clinton_lemma_common))


Unique to Clinton: {'respect', 'fair', 'meet', 'kind', 'senator', 'small', 'hope', 'clean', 'young', 'health', 'student', 'today', 'raise', 'child', 'donald', 'issue', 'opponent', 'serve', 'election', 'school', 'americans', 'man', 'strong', 'future', 'chance', 'debt', 'community', 'national', 'try', 'live', 'plan', 'home', 'worker', 'opportunity', 'create', 'life', 'education', 'high', 'ahead', 'service', 'sure', 'able', 'republicans', 'support', 'real', 'economy', 'help', 'value', 'nuclear', 'feel', 'to', 'campaign', "'s", 'stand', 'college', 'family', 'kid'}

Unique to Trump: {'mexico', 'amazing', 'immigration', 'smart', 'guy', 'watch', 'iran', 'dollar', 'border', 'use', 'ago', 'remember', 'hillary', 'week', 'probably', 'problem', 'trade', 'trillion', 'company', 'in', 'little', 'military', 'incredible', 'okay', 'what', 'clinton', 'spend', 'billion', 'understand', 'how', 'second', 'politician', 'anybody', 'iowa', 'do', 'no', 'poll', 'kill', 'thousand', '’', 'folk', 'number', 'somebod

In [6]:
clinton_total_words = [token for token in clinton_doc if not token.is_punct]
trump_total_words = [token for token in trump_doc if not token.is_punct]

clinton_unique_words = set([token.text for token in clinton_total_words])
trump_unique_words = set([token.text for token in trump_total_words])

print(len(clinton_total_words))
print(len(clinton_unique_words))
print(len(trump_total_words))
print(len(trump_unique_words))

clinton_wps = len(clinton_doc)/len(clinton_sents)
trump_wps = len(trump_doc)/len(trump_sents)
wps_comp = clinton_wps/trump_wps

clinton_vocab = len(clinton_unique_words)/len(clinton_total_words)
trump_vocab = len(trump_unique_words)/len(trump_total_words)
vocab_comp = clinton_vocab/trump_vocab

print(("Clinton's sentences were {}% longer than Trump's while campaigning.").format("%0.1f" % ((wps_comp-1)*100)))
print(("Clinton's vocabulary was {}% larger than Trump's while campaigning.").format("%0.1f" % ((vocab_comp-1)*100)))

wordsummary = np.array([['Number of Words',len(clinton_doc),len(trump_doc)],
                        ['Number of Sentences',len(clinton_sents),len(trump_sents)],
                        ['Words Per Sentence',"%0.1f" % clinton_wps,"%0.1f" % trump_wps],
                        ['Vocab Variability',"%0.5f" % clinton_vocab,"%0.5f" % trump_vocab]])

df_wordsummary = pd.DataFrame(wordsummary)
df_wordsummary.columns = ['Parameter','Clinton','Trump']
df_wordsummary.head()

117523
6655
168100
6538
Clinton's sentences were 60.1% longer than Trump's while campaigning.
Clinton's vocabulary was 45.6% larger than Trump's while campaigning.


Unnamed: 0,Parameter,Clinton,Trump
0,Number of Words,132128.0,196418.0
1,Number of Sentences,7428.0,17678.0
2,Words Per Sentence,17.8,11.1
3,Vocab Variability,0.05663,0.03889


**Parts of Speech Bags Generator**

In [7]:
# Utility function to create a list of the potential parts of speech
def parts_of_speech(text):
    
    # Filter out punctuation and stop words.
    parts_speech = [token.pos_ for token in text]
    
    # Return all words.
    #return [item[0] for item in Counter(parts_speech)]
    return parts_speech

# Set up the bags.
clintonpos = parts_of_speech(clinton_doc)
trumppos = parts_of_speech(trump_doc)

# Combine bags to create a set of unique words.
pos_words = list(set(clintonpos + trumppos))

print(pos_words)

['CCONJ', 'PART', 'ADP', 'NUM', 'X', 'PRON', 'DET', 'VERB', 'ADV', 'INTJ', 'NOUN', 'SYM', 'PUNCT', 'PROPN', 'ADJ']


**Parts of Speech Counter**

In [8]:
# Creates a data frame with features for each part of speech corresponding to each word in our part of speech word set.
# Each value is the count of the times the part of speech appears in each sentence.
def pos_features(sentences, pos_words):
    
    # Scaffold the data frame and initialize counts to zero.
    df1 = pd.DataFrame(columns=pos_words)
    df1['Sentence'] = sentences['Sentence']
    df1['Speaker'] = sentences['Speaker']
    df1.loc[:, pos_words] = 0
    
    # Process each row, counting the occurrence of words in each sentence.
    for i, sentence in enumerate(df1['Sentence']):
        
        # Convert the sentence to lemmas, then filter out punctuation,
        # stop words, and uncommon words.
        pos = [token.pos_ for token in sentence]
        
        # Populate the row with word counts.
        for pos1 in pos:
            df1.loc[i, pos1] += 1
        
        # This counter is just to make sure the kernel didn't hang.
        if i % 5000 == 0:
            print("Processing row {}".format(i))
    
    return df1
        
# Create our data frame with features. This can take a while to run.
pos_counts = pos_features(sentences, pos_words)
pos_counts.head()

Processing row 0
Processing row 5000
Processing row 10000
Processing row 15000
Processing row 20000
Processing row 25000


Unnamed: 0,CCONJ,PART,ADP,NUM,X,PRON,DET,VERB,ADV,INTJ,NOUN,SYM,PUNCT,PROPN,ADJ,Sentence,Speaker
0,0,0,0,0,0,1,0,1,0,0,0,0,1,0,0,"(Thank, you, .)",Clinton
1,0,0,0,0,0,1,0,1,2,0,0,0,1,0,0,"(Thank, you, so, much, .)",Clinton
2,0,0,0,0,0,1,0,1,0,0,0,0,1,0,0,"(Thank, you, .)",Clinton
3,0,0,0,0,0,1,1,1,2,0,0,0,1,0,0,"(Thank, you, all, so, much, .)",Clinton
4,0,0,0,0,0,1,0,1,0,0,0,0,1,0,0,"(Thank, you, !)",Clinton


**Common Words Bags Generator**

In [9]:
# Utility function to create a list of the 2000 most common words.
def bag_of_words(text):
    
    # Filter out punctuation and stop words.
    allwords = [token.lemma_
                for token in text
                if not token.is_punct
                and not token.is_stop]
    
    # Return the most common words.
    return [item[0] for item in Counter(allwords).most_common(2000)]

# Set up the bags.
clintonwords = bag_of_words(clinton_doc)
trumpwords = bag_of_words(trump_doc)

# Combine bags to create a set of unique words.
common_words = set(clintonwords + trumpwords)

**Word Counter: Word Frequency**

In [10]:
# Creates a data frame with features for each word in our common word set.
# Each value is the count of the times the word appears in each sentence.
def bow_features(sentences, common_words):
    
    # Scaffold the data frame and initialize counts to zero.
    df2 = pd.DataFrame(columns=common_words)
    df2['Sentence'] = sentences['Sentence']
    df2['Speaker'] = sentences['Speaker']
    df2.loc[:, common_words] = 0
    
    # Process each row, counting the occurrence of words in each sentence.
    for i, sentence in enumerate(df2['Sentence']):
        
        # Convert the sentence to lemmas, then filter out punctuation,
        # stop words, and uncommon words.
        words = [token.lemma_
                 for token in sentence
                 if (not token.is_punct
                     and not token.is_stop
                     and token.lemma_ in common_words)]
        
        # Populate the row with word counts.
        for word in words:
            df2.loc[i, word] += 1
        
        # This counter is just to make sure the kernel didn't hang.
        if i % 5000 == 0:
            print("Processing row {}".format(i))
              
    return df2

# Create our data frame with features. This can take a while to run.
word_counts = bow_features(sentences, common_words)
word_counts.head()

Processing row 0
Processing row 5000
Processing row 10000
Processing row 15000
Processing row 20000
Processing row 25000


Unnamed: 0,stay,mosque,ride,thank,suffer,depression,leave,goodness,vast,march,...,during,fairness,moines,payment,cherish,certainly,everything,caucus,Sentence,Speaker
0,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(Thank, you, .)",Clinton
1,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(Thank, you, so, much, .)",Clinton
2,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(Thank, you, .)",Clinton
3,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(Thank, you, all, so, much, .)",Clinton
4,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(Thank, you, !)",Clinton


**Word Counter: Total and Unique Word Counter**

In [11]:
df3 = sentences.drop(['Speaker'],axis=1)
total_count = []
unique_count = []

# Process each row, counting the occurrence of words in each sentence.
for i, sentence in enumerate(df3['Sentence']):
    
    #print(sentence)
    total_word_count = [token for token in sentence if not token.is_punct]
    unique_word_count = set([token.text for token in total_word_count])

    total_count.append(len(total_word_count))
    unique_count.append(len(unique_word_count))
        
df3['Total Word Count'] = pd.Series(total_count, index=df3.index)
df3['Unique Word Count'] = pd.Series(unique_count, index=df3.index)
df3.head(10)

Unnamed: 0,Sentence,Total Word Count,Unique Word Count
0,"(Thank, you, .)",2,2
1,"(Thank, you, so, much, .)",4,4
2,"(Thank, you, .)",2,2
3,"(Thank, you, all, so, much, .)",5,5
4,"(Thank, you, !)",2,2
5,"(Thank, you, !)",2,2
6,"(Thank, you, all, very, ,, very, much, !)",6,5
7,"(Thank, you, for, that, amazing, welcome, !)",6,6
8,"(Thank, you, all, for, the, great, convention,...",11,11
9,"(And, Chelsea, ,, thank, you, .)",4,4


# Model 1: Bag of Words

In [15]:
Y = word_counts['Speaker']
X = word_counts.drop(['Sentence','Speaker'], 1)
X_train, X_test, y_train, y_test = train_test_split(X, Y,test_size=0.4,random_state=0)

**Bag of Words with Random Forest Classifier**

In [16]:
rfc = ensemble.RandomForestClassifier()

train = rfc.fit(X_train, y_train)

rfc_train_1 = rfc.score(X_train, y_train)
rfc_test_1 = rfc.score(X_test, y_test)

print('Training set score:', rfc_train_1)
print('\nTest set score:', rfc_test_1)

Training set score: 0.975303724358

Test set score: 0.815493378473


**Bag of Words with Logistic Regression**

In [17]:
lr = LogisticRegression()
train = lr.fit(X_train, y_train)
print(X_train.shape, y_train.shape)

lr_train_1 = lr.score(X_train, y_train)
lr_test_1 = lr.score(X_test, y_test)

print('Training set score:', lr_train_1)
print('\nTest set score:', lr_test_1)

(15063, 2877) (15063,)
Training set score: 0.8962358096

Test set score: 0.850542666534


**Bag of Words with Gradient Boosting**

In [18]:
clf = ensemble.GradientBoostingClassifier()
train = clf.fit(X_train, y_train)

clf_train_1 = clf.score(X_train, y_train)
clf_test_1 = clf.score(X_test, y_test)

print('Training set score:', clf_train_1)
print('\nTest set score:', clf_test_1)

Training set score: 0.79187412866

Test set score: 0.776859504132


**Bag of Words with Support Vector Classifier**

In [19]:
svm = SVC(kernel = 'linear')
svm.fit(X_train, y_train)

svm_train_1 = svm.score(X_train, y_train)
svm_test_1 = svm.score(X_test, y_test)

print('Training set score:', svm_train_1)
print('\nTest set score:', svm_test_1)

Training set score: 0.9097789285

Test set score: 0.850542666534


# Model 2: Parts of Speech

In [29]:
Y1 = pos_counts['Speaker']
X1 = pos_counts.drop(['Sentence','Speaker'], 1)
X_train, X_test, y_train, y_test = train_test_split(X1, Y1,test_size=0.4,random_state=0)

**Random Forest Classifier**

In [22]:
rfc = ensemble.RandomForestClassifier()

train = rfc.fit(X_train, y_train)

rfc_train_2 = rfc.score(X_train, y_train)
rfc_test_2 = rfc.score(X_test, y_test)

print('Training set score:', rfc_train_2)
print('\nTest set score:', rfc_test_2)

Training set score: 0.945296421696

Test set score: 0.715821965548


**Logistic Regression**

In [23]:
lr = LogisticRegression()
train = lr.fit(X_train, y_train)
print(X_train.shape, y_train.shape)

lr_train_2 = lr.score(X_train, y_train)
lr_test_2 = lr.score(X_test, y_test)

print('Training set score:', lr_train_2)
print('\nTest set score:', lr_test_2)

(15063, 15) (15063,)
Training set score: 0.743344619266

Test set score: 0.736433336652


**Gradient Boosting**

In [30]:
clf = ensemble.GradientBoostingClassifier()
train = clf.fit(X_train, y_train)

clf_train_2 = clf.score(X_train, y_train)
clf_test_2 = clf.score(X_test, y_test)

print('Training set score:', clf_train_2)
print('\nTest set score:', clf_test_2)

Training set score: 0.759543251676

Test set score: 0.742407647117


**Support Vector Machines**

In [31]:
svm = SVC(kernel = 'linear')
svm.fit(X_train, y_train)

svm_train_2 = svm.score(X_train, y_train)
svm_test_2 = svm.score(X_test, y_test)

print('Training set score:', svm_train_2)
print('\nTest set score:', svm_test_2)

Training set score: 0.736108344951

Test set score: 0.731454744598


# Improved Models: BoW, PoS and Unique Word Counts

In [38]:
pos_counts2 = pos_counts.drop(['Sentence','Speaker'], axis=1)
df4 = df3.drop(['Sentence'], axis=1)
df = pd.concat([pos_counts2, word_counts,df4], axis=1)
Y2 = df['Speaker']
X2 = df.drop(['Sentence','Speaker'], 1)
X_train, X_test, y_train, y_test = train_test_split(X2, Y2,test_size=0.4,random_state=0)

**Random Forest Classifier**

In [26]:
rfc = ensemble.RandomForestClassifier()

train = rfc.fit(X_train, y_train)

rfc_train_3 = rfc.score(X_train, y_train)
rfc_test_3 = rfc.score(X_test, y_test)

print('Training set score:', rfc_train_3)
print('\nTest set score:', rfc_test_3)

Training set score: 0.986456881099

Test set score: 0.805237478841


**Logistic Regression**

In [27]:
lr = LogisticRegression()
train = lr.fit(X_train, y_train)
print(X_train.shape, y_train.shape)

lr_train_3 = lr.score(X_train, y_train)
lr_test_3 = lr.score(X_test, y_test)

print('Training set score:', lr_train_3)
print('\nTest set score:', lr_test_3)

(15063, 2894) (15063,)
Training set score: 0.902476266348

Test set score: 0.860101563278


**Gradient Boosting**

In [34]:
clf = ensemble.GradientBoostingClassifier()
train = clf.fit(X_train, y_train)

clf_train_3 = clf.score(X_train, y_train)
clf_test_3 = clf.score(X_test, y_test)

print('Training set score:', clf_train_3)
print('\nTest set score:', clf_test_3)

Training set score: 0.814645157007

Test set score: 0.796176441302


**Support Vector Machines**

In [13]:
svm = SVC(kernel = 'linear')
svm.fit(X_train, y_train)

svm_train_3 = svm.score(X_train, y_train)
svm_test_3 = svm.score(X_test, y_test)

print('Training set score:', svm_train_3)
print('\nTest set score:', svm_test_3)

Training set score: 0.915886609573

Test set score: 0.856915264363


**Summary**

In [35]:
summary = np.array([['Random Forest Classifier',"%0.3f" % rfc_test_1,
                      "%0.3f" % rfc_test_2,"%0.3f" % rfc_test_3],
                  ['Logistic Regression',"%0.3f" % lr_test_1,
                      "%0.3f" % lr_test_2,"%0.3f" % lr_test_3],
                  ['Gradient Boosting Regression',"%0.3f" % clf_test_1,
                      "%0.3f" % clf_test_2,"%0.3f" % clf_test_3],
                  ['Support Vector Machines',"%0.3f" % svm_test_1,
                      "%0.3f" % svm_test_2,"%0.3f" % svm_test_3]])

df_summary = pd.DataFrame(summary)
df_summary.columns = ['Model','BoW','PoS','Improved']

df_summary.head()

Unnamed: 0,Model,BoW,PoS,Improved
0,Random Forest Classifier,0.815,0.716,0.805
1,Logistic Regression,0.851,0.736,0.86
2,Gradient Boosting Regression,0.777,0.742,0.796
3,Support Vector Machines,0.851,0.731,0.857


In [36]:
params = {'loss':['deviance','exponential'],
              'max_depth':[2,5,8],
              'max_features':['log2','sqrt','auto'],
              'n_estimators':[5,10,100]}

gbc = ensemble.GradientBoostingClassifier(params)
clf = GridSearchCV(gbc, params, cv=5, n_jobs=-1,verbose=1)
clf.fit(X2,Y2)

print(clf.best_params_)

Fitting 5 folds for each of 54 candidates, totalling 270 fits


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  4.6min
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed: 83.6min
[Parallel(n_jobs=-1)]: Done 270 out of 270 | elapsed: 164.4min finished


{'loss': 'deviance', 'max_depth': 8, 'n_estimators': 100, 'max_features': 'auto'}


In [44]:
params = {
    "loss":"deviance",
    "max_depth":8,
    "max_features":"auto",
    "n_estimators":100
    }
clf = ensemble.GradientBoostingClassifier(**params)
clf.fit(X_test,y_test)

#Inspect results
print('\nR-squared:')
print(clf.score(X_test, y_test))
score_w = cross_val_score(clf, X_test, y_test, cv=5)
print(score_w)
print("Weighted Accuracy: %0.2f (+/- %0.2f)" % (score_w.mean(), score_w.std() * 2))


R-squared:
0.890072687444
[ 0.81393035  0.81781981  0.80926295  0.81175299  0.82270916]
Weighted Accuracy: 0.82 (+/- 0.01)
