In [148]:
import os
import re
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
from pandas import DataFrame, Series
import time
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.lda import LDA
from sklearn.linear_model import LogisticRegression

# All paths are resolved relative to the directory the kernel was started in.
ROOT_DIR = os.getcwd()
# NOTE(review): DATA_DIR is not referenced by any other cell visible here.
DATA_DIR = os.path.join(ROOT_DIR, 'Raw_debates')
# Directory that get_raw_text() reads the per-candidate '<NAME>.txt' files from.
RESULT_DIR = os.path.join(ROOT_DIR, 'candidate_lines')

# Speakers to classify. cand_list is the alphabetically sorted form; a
# candidate's position in cand_list is used as its integer class label.
candidates = {'CLINTON', 'SANDERS', 'TRUMP', 'RUBIO', 'CRUZ'}
cand_list = sorted(candidates)

# NLTK's English stop words, held in a set for fast membership tests.
stops = set(stopwords.words("english"))

%matplotlib inline

In [149]:
# parameters

# Upper bound on the vocabulary size (passed as max_features to the vectorizer).
numWords = 1000
# Paragraphs must be longer than this many characters to be kept.
minLength = 200

In [150]:
# Hand-written normalisation table: maps an inflected or closely related word
# (key) to the single canonical form (value) it should be counted as.
# Keys are matched against lower-cased, letters-only tokens.
# No value is itself a key, so one lookup per token is sufficient.
word_dic = {'action': 'act',
               'agreement': 'agree',
               'americans': 'american',
               'asked': 'ask',
               'asking': 'ask',
               'going': 'go',
               'states': 'state',
               'working': 'work',
               'millions': 'million',
               'bringing': 'bring',
               'businesses': 'business',
               'candidates': 'candidate',
               'children': 'child',
               'comes': 'come',
               'coming': 'come',
               'companies': 'company',
               'countries': 'country',
               'deals': 'deal',
               'economic': 'economy',
               'families': 'family',
               'fighting': 'fight',
               'gets': 'get',
               'getting': 'get',
               'goes': 'go',
               'got': 'get',
               'groups': 'group',
               'guns': 'gun',
               'happened': 'happen',
               'happening': 'happen',
               'helped': 'help',
               'issues': 'issue',
               'knows': 'know',
               'laws': 'law',
               'lives': 'live',
               'living': 'live',
               'making': 'make',
               'needs': 'need',
               'passed': 'pass',
               'problems': 'problem',
               'putting': 'put',
               'really': 'real',
               'republicans': 'republican',
               'running': 'run',
               'saying': 'say',
               'said': 'say',
               'seeing': 'see',
               'seen': 'see',
               'started': 'start',
               'supported': 'support',
               'taking': 'take',
               'talked': 'talk',
               'talking': 'talk',
               'terrorists': 'terrorist',
               'terrorism': 'terrorist',
               'things': 'thing',
               'trying': 'try',
               'used': 'use',
               'using': 'use',
               'voted': 'vote',
               'wages': 'wage',
               'wanted': 'want',
               'wants': 'want',
               'building': 'build',
               'called': 'call',
               'came': 'come',
               'communities': 'community',
               'costs': 'cost',
               # Was 'deffence': 'deffend' -- misspelled on both sides, so it
               # could never match transcript text. Both spellings are covered.
               'defense': 'defend',
               'defence': 'defend',
               'difference': 'different',
               'drugs': 'drug',
               'gave': 'give',
               'given': 'give',
               'gone': 'go',
               'higher': 'high',
               'highest': 'high',
               'interests': 'interest',
               'jobs': 'job',
               'longer': 'long',
               'looked': 'look',
               'looking': 'look',
               'lost': 'lose',
               'made': 'make',
               'means': 'mean',
               'paying': 'pay',
               'planned': 'plan',
               'programs': 'program',
               'raising': 'raise',
               'reasons': 'reason',
               'ringing': 'ring',
               'says': 'say',
               'saw': 'see',
               'wealthy': 'wealth',
               'worked': 'work',
               'years': 'year'}

In [151]:
def get_raw_text(cand_name):
    """Return the raw contents of the candidate's transcript file.

    The file is expected at RESULT_DIR/<cand_name>.txt and is read in binary
    mode, so line endings are returned exactly as stored on disk.
    """
    file_path = os.path.join(RESULT_DIR, '{0}.txt'.format(cand_name))
    with open(file_path, 'rb') as handle:
        contents = handle.read()
    return contents

def raw_to_blocks(cand_name, raw_text, delimiter='\r\n\r\n', min_len=minLength):
    """Split a candidate's raw transcript into sufficiently long paragraphs.

    Every occurrence of '<cand_name>:' is removed first, then the text is
    split on `delimiter`. Only paragraphs strictly longer than `min_len`
    characters are returned, in their original order.
    """
    speaker_tag = cand_name + ':'
    kept = []
    for block in raw_text.replace(speaker_tag, '').split(delimiter):
        if len(block) > min_len:
            kept.append(block)
    return kept

# Replace words that are virtually the same with a single canonical word.
def hard_code_process(text, mapping=None):
    """Replace each space-delimited token of `text` by its canonical form.

    Parameters
    ----------
    text : str
        Words separated by single spaces.
    mapping : dict, optional
        Token -> replacement table. Defaults to the module-level `word_dic`.

    Returns
    -------
    str
        The text with one leading and one trailing space added (as before)
        and every token found in `mapping` replaced.

    The lookup is done token by token. The previous implementation used
    str.replace with space-padded keys, which skipped every second occurrence
    of an immediately repeated word (e.g. "going going"), because the space
    shared by the two occurrences was consumed by the first match.
    """
    if mapping is None:
        mapping = word_dic

    padded = ' {0} '.format(text)
    # Splitting on a literal ' ' (not on arbitrary whitespace) and re-joining
    # with ' ' leaves the spacing of the padded text untouched.
    return ' '.join(mapping.get(token, token) for token in padded.split(' '))

def process_paragraph(paragraph):
    """Clean one paragraph into a space-separated string of content words.

    Steps: replace every non-letter character by a space, lower-case the
    text, drop the words contained in `stops`, then pass the remaining words
    (joined by single spaces) through hard_code_process.
    """
    lowered = re.sub("[^a-zA-Z]", " ", paragraph).lower()

    kept = []
    for word in lowered.split():
        if word not in stops:
            kept.append(word)

    return hard_code_process(' '.join(kept))

def get_processed_lines(cand_name, delimiter='\r\n\r\n', min_len=minLength):
    """Load a candidate's transcript and return its cleaned paragraphs.

    Parameters
    ----------
    cand_name : str
        Candidate name; selects the file read by get_raw_text.
    delimiter : str
        Paragraph separator forwarded to raw_to_blocks.
    min_len : int
        Minimum paragraph length forwarded to raw_to_blocks.

    Returns
    -------
    list of str
        One processed string per retained paragraph. A real list is returned
        (not a lazy `map` object) so callers can take len() and index it on
        Python 3 as well as Python 2.
    """
    raw_text = get_raw_text(cand_name)
    blocks = raw_to_blocks(cand_name, raw_text,
                           delimiter=delimiter, min_len=min_len)
    return [process_paragraph(block) for block in blocks]

def get_vectorizer(cand_lines, max_features=1000):
    """Fit a bag-of-words CountVectorizer on the lines of all candidates.

    Parameters
    ----------
    cand_lines : dict
        Maps candidate name -> iterable of processed paragraph strings.
    max_features : int
        Upper bound on the vocabulary size kept by the vectorizer.

    Returns
    -------
    CountVectorizer
        The fitted vectorizer.
    """
    vectorizer = CountVectorizer(analyzer="word",
                                 tokenizer=None,
                                 preprocessor=None,
                                 stop_words=None,
                                 max_features=max_features)

    # Only the lines matter for fitting; the candidate names are not needed.
    # values() exists on both Python 2 and 3 (iteritems() is Python 2 only).
    concatenated = []
    for lines in cand_lines.values():
        concatenated.extend(lines)

    vectorizer.fit(concatenated)
    return vectorizer

In [152]:
# Examples
# Fit a vectorizer on every candidate's lines (no train/test split here) and
# look at the resulting vocabulary and Clinton's count matrix.
cand_lines = {c: get_processed_lines(c) for c in candidates}
vectorizer = get_vectorizer(cand_lines, numWords)

# vocab[j] is the word counted in column j of any transformed matrix.
vocab = vectorizer.get_feature_names()
# transform() output is converted to a dense array via toarray().
clinton_train = vectorizer.transform(cand_lines['CLINTON']).toarray()

In [153]:
# Split training data and test data
def test_train_split(cand_lines, test_ratio=0.25):
    cands = cand_lines.keys()
    test_data = dict.fromkeys(cands)
    train_data = dict.fromkeys(cands)
    for c in cands:
        lines = cand_lines[c]
        l = len(lines)
        test_len = int(l * test_ratio)
        test_indices = set(np.random.choice(l, test_len, replace=False))
        train_indices = set(range(l)) - test_indices
        test_data[c] = list(np.take(lines, list(test_indices)))
        train_data[c] = list(np.take(lines, list(train_indices)))
    return test_data, train_data

def normalize(data):
    """Scale every row of each candidate's matrix so that it sums to 1.

    `data` maps a key to a 2-D array; a new dict with float arrays is
    returned. Rows whose sum is 0 are left as all zeros.
    """
    normalized = {}
    for cand, counts in data.items():
        as_float = counts.astype(float)
        row_totals = as_float.sum(axis=1)
        # Avoid dividing by zero: a zero row divided by 1 stays a zero row.
        row_totals[row_totals == 0] = 1
        normalized[cand] = as_float / row_totals[:, None]
    return normalized

def normalizeBinary(data):
    """Turn each candidate's matrix into a 0/1 presence indicator.

    `data` maps a key to an array; the returned dict holds float arrays of
    the same shape with 1.0 wherever the input is non-zero and 0.0 elsewhere.
    """
    binary = {}
    for cand in data:
        binary[cand] = (data[cand] != 0).astype(float)
    return binary

In [161]:
# Process the data

def _stack_by_candidate(data_by_cand):
    """Stack the per-candidate matrices into one feature matrix plus labels.

    Returns (x, y): x is the row-wise concatenation of all matrices (None if
    the dict is empty) and y is a list with one integer label per row, the
    label being the candidate's index in cand_list.
    """
    features, labels = [], []
    for cand in data_by_cand:
        features.append(data_by_cand[cand])
        labels += [cand_list.index(cand)] * len(data_by_cand[cand])
    stacked = np.concatenate(features) if features else None
    return stacked, labels


cand_lines = {c: get_processed_lines(c) for c in candidates}
test_data, train_data = test_train_split(cand_lines, 0.01)

# The vocabulary is learned from the training lines only.
vectorizer = get_vectorizer(train_data, numWords)
vocab = vectorizer.get_feature_names()

# Replace the text lines by dense word-count matrices.
for c in train_data:
    train_data[c] = vectorizer.transform(train_data[c]).toarray()
    test_data[c] = vectorizer.transform(test_data[c]).toarray()

# Raw (unnormalized) word counts
train_x_reg, train_y_reg = _stack_by_candidate(train_data)
test_x_reg, test_y_reg = _stack_by_candidate(test_data)

# Proportional data: every row scaled to sum to 1
train_data = normalize(train_data)
test_data = normalize(test_data)

train_x, train_y = _stack_by_candidate(train_data)
test_x, test_y = _stack_by_candidate(test_data)

# Binary data: 1 where a word occurs in the paragraph, 0 otherwise
train_dataBin = normalizeBinary(train_data)
test_dataBin = normalizeBinary(test_data)

train_xBin, train_yBin = _stack_by_candidate(train_dataBin)
test_xBin, test_yBin = _stack_by_candidate(test_dataBin)
    


In [162]:
set(train_y)

{0, 1, 2, 3, 4}

In [11]:
# For every vocabulary word, the number of paragraphs (train + test) that
# contain it: the binary matrices hold 1 per paragraph/word occurrence, so a
# column sum is a paragraph count. One vectorised column sum replaces the
# Python-level loop (which also relied on the Python-2-only xrange).
counts = (train_xBin.sum(axis=0) + test_xBin.sum(axis=0)).tolist()

print(sorted(counts))

[5.0, 5.0, 5.0, 5.0, 5.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 8.0, 8.0, 8.0, 8.0, 8.0, 8.0, 8.0, 8.0, 8.0, 8.0, 8.0, 8.0, 8.0, 8.0, 8.0, 8.0, 8.0, 8.0, 8.0, 8.0, 8.0, 8.0, 8.0, 8.0, 8.0, 8.0, 9.0, 9.0, 9.0, 9.0, 9.0, 9.0, 9.0, 9.0, 9.0, 9.0, 9.0, 9.0, 9.0, 9.0, 9.0, 9.0, 9.0, 9.0, 9.0, 9.0, 9.0, 9.0, 9.0, 9.0, 9.0, 9.0, 9.0, 9.0, 9.0, 9.0, 9.0, 9.0, 9.0, 9.0, 9.0, 9.0, 9.0, 9.0, 9.0, 9.0, 9.0, 9.0, 9.0, 9.0, 9.0, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0, 11.0, 11.0, 11.0, 11.0, 11.0, 11.0, 11.0, 11.0, 11.0, 11.0, 11.0, 11.0, 11.0, 11.0, 11.0, 11.0, 11.0, 11.0, 11.0, 11.0, 11.0, 11.0, 11.0, 11.0, 11.0, 1

In [12]:
# Grid search over the number of trees and the maximum depth of a random
# forest, on the proportional features and on the binary features.
_nTree = []
_depth = []
_train = []
_test = []
_timeTaken = []

_trainBin = []
_testBin = []
_timeTakenBin = []

tryNTree = [100,500,2000]
tryDepth = [50,100,500,1000]


def _fit_timed_forest(n_trees, max_depth, x, y):
    """Fit a RandomForestClassifier; print and return the fit time in seconds.

    Returns (fitted_forest, elapsed_seconds).
    """
    start = time.time()
    forest = RandomForestClassifier(n_estimators=n_trees, max_depth=max_depth)
    forest.fit(x, y)
    elapsed = time.time() - start
    print(elapsed)
    return forest, elapsed


for nTree in tryNTree:
    for depth in tryDepth:

        print('nTree: {0}, depth: {1}'.format(nTree,depth))

        rf_prop, secs_prop = _fit_timed_forest(nTree, depth, train_x, train_y)
        # tempRF is only rebound once a binary-feature forest is fully
        # fitted, so after an interrupt it never refers to an unfitted model.
        tempRF, secs_bin = _fit_timed_forest(nTree, depth, train_xBin, train_yBin)

        train_score = rf_prop.score(train_x, train_y)
        test_score = rf_prop.score(test_x, test_y)
        train_score_bin = tempRF.score(train_xBin, train_yBin)
        test_score_bin = tempRF.score(test_xBin, test_yBin)

        # Record the whole grid point in one go, after all the slow work is
        # done, so the result lists stay in step if the search is interrupted.
        _nTree.append(nTree)
        _depth.append(depth)
        _timeTaken.append(secs_prop)
        _train.append(train_score)
        _test.append(test_score)
        _timeTakenBin.append(secs_bin)
        _trainBin.append(train_score_bin)
        _testBin.append(test_score_bin)
        

nTree: 100, depth: 50
1.25675296783
1.13967013359
nTree: 100, depth: 100
1.53021001816
1.44566106796
nTree: 100, depth: 500
1.72514295578
1.22407102585
nTree: 100, depth: 1000
1.28824400902
1.20889997482
nTree: 500, depth: 50


KeyboardInterrupt: 

In [13]:
DataFrame({'nTree': _nTree, 'depth': _depth, 'train_score': _train, 'test_score': _test,'train_scoreBin': _trainBin, 'test_scoreBin': _testBin, 'time taken': _timeTaken})

Unnamed: 0,depth,nTree,test_score,test_scoreBin,time taken,train_score,train_scoreBin
0,50,100,0.687379,0.708738,1.256753,1,1
1,100,100,0.664078,0.708738,1.53021,1,1
2,500,100,0.646602,0.68932,1.725143,1,1
3,1000,100,0.666019,0.700971,1.288244,1,1


In [15]:
confusion_matrix(test_y,tempRF.predict(test_xBin))

array([[70,  5,  8, 15, 11],
       [ 4, 65, 11,  2, 16],
       [ 7, 12, 77,  3,  9],
       [ 8,  1, 13, 58, 18],
       [ 4,  5,  3,  4, 86]])

In [21]:
# result = tempRF.predict_proba(np.eye(1500))
np.unique(test_y, return_counts = True)


(array([0, 1, 2, 3, 4]), array([109,  98, 108,  98, 102]))

In [18]:
DataFrame(result)

Unnamed: 0,0,1,2,3,4
0,0.0185,0.2895,0.5735,0.0210,0.0975
1,0.0410,0.2975,0.5405,0.0205,0.1005
2,0.0275,0.2590,0.5730,0.0345,0.1060
3,0.0180,0.2835,0.5525,0.0195,0.1265
4,0.0190,0.2980,0.5650,0.0205,0.0975
5,0.0535,0.2805,0.5315,0.0230,0.1115
6,0.0190,0.2940,0.5575,0.0205,0.1090
7,0.0240,0.2775,0.5465,0.0575,0.0945
8,0.0190,0.2875,0.5565,0.0395,0.0975
9,0.0190,0.2945,0.5675,0.0215,0.0975


In [22]:
# NOTE(review): no visible cell assigns `result` (the only candidate
# assignment is commented out), so this cell raises NameError on a fresh
# kernel. Confirm what `result` is meant to hold.
result = Series(result)
# The value of describe() is discarded here because it is not the last
# expression of the cell.
result.describe()

# Count how often each class label 0..4 occurs in `result`.
# print(...) with a single pre-formatted string gives the same "i count"
# output on Python 2 and Python 3.
for i in range(5):
    print('{0} {1}'.format(i, (result==i).sum()))
    

NameError: name 'result' is not defined

In [None]:
plt.imshow(confusion_matrix(test_y,tempRF.predict(test_xBin)))

In [45]:
cand_list

['CLINTON', 'CRUZ', 'RUBIO', 'SANDERS', 'TRUMP']

In [15]:
# Grid search over the SVC parameters C and gamma; every combination is
# fitted on the proportional features and scored on train and test data.
grid_values = [0.01, 0.1, 1, 10, 100]

_C, _gamma = [], []
_train, _test = [], []

for C in grid_values:
    for gamma in grid_values:
        clf = svm.SVC(C=C, gamma=gamma)
        clf.fit(train_x, train_y)

        _C += [C]
        _gamma += [gamma]
        _train += [clf.score(train_x, train_y)]
        _test += [clf.score(test_x, test_y)]

In [16]:
DataFrame({'C': _C, 'gamma': _gamma, 'train_score': _train, 'test_score': _test})

Unnamed: 0,C,gamma,test_score,train_score
0,0.01,0.01,0.21165,0.21056
1,0.01,0.1,0.21165,0.21056
2,0.01,1.0,0.21165,0.21056
3,0.01,10.0,0.21165,0.21056
4,0.01,100.0,0.21165,0.21056
5,0.1,0.01,0.21165,0.21056
6,0.1,0.1,0.21165,0.21056
7,0.1,1.0,0.21165,0.21056
8,0.1,10.0,0.32233,0.364456
9,0.1,100.0,0.21165,0.21056


In [39]:
# Final SVM on the proportional features. probability=True is set so that
# predict_proba can be called below.
svm_clf = svm.SVC(C=10, gamma=10, probability=True)
svm_clf.fit(train_x, train_y)
train_score_SVM = svm_clf.score(train_x, train_y)
test_score_SVM = svm_clf.score(test_x, test_y)
# Per-class probabilities for every test row.
probs_SVM = svm_clf.predict_proba(test_x)

In [163]:
# Multinomial naive Bayes fitted on the binary (word present / absent) features.
clfNB = MultinomialNB()
clfNB.fit(train_xBin, train_yBin)
train_score_NB_bin = clfNB.score(train_xBin, train_yBin)
test_score_NB_bin = clfNB.score(test_xBin,test_yBin)
# Per-class probabilities for every test row.
probs_NB_bin = clfNB.predict_proba(test_xBin)


In [41]:
# Logistic regression (default settings) fitted on the *_reg feature arrays.
clfLR = LogisticRegression()
clfLR.fit(train_x_reg,train_y_reg)
train_score_LR = clfLR.score(train_x_reg,train_y_reg)
test_score_LR = clfLR.score(test_x_reg,test_y_reg)
# Per-class probabilities for every test row.
probs_LR = clfLR.predict_proba(test_x_reg)


In [42]:
# Final random forest (2000 trees, depth limit 250) on the binary features.
# This rebinds tempRF, which earlier cells also assign.
tempRF = RandomForestClassifier(n_estimators=2000, max_depth=250)
tempRF.fit(train_xBin, train_yBin)
train_score_RF = tempRF.score(train_xBin, train_yBin)
test_score_RF = tempRF.score(test_xBin, test_yBin)
# Per-class probabilities for every test row.
probs_RF = tempRF.predict_proba(test_xBin)

In [43]:
confusion_matrix(test_y, svm_clf.predict(test_x))

array([[84,  1, 10,  8,  6],
       [ 5, 70,  9,  5,  9],
       [ 7, 12, 80,  3,  6],
       [16,  1,  7, 65,  9],
       [ 4,  7,  4,  2, 85]])

In [44]:
confusion_matrix(test_yBin,clfNB.predict(test_xBin))

array([[90,  1,  6, 10,  2],
       [ 3, 70, 11,  6,  8],
       [10, 10, 73,  6,  9],
       [19,  1,  4, 71,  3],
       [ 2,  4,  1,  2, 93]])

In [45]:
confusion_matrix(test_y_reg, clfLR.predict(test_x_reg))

array([[85,  1,  7,  9,  7],
       [ 2, 73, 14,  5,  4],
       [ 7, 16, 74,  5,  6],
       [14,  1,  7, 74,  2],
       [ 4,  6,  4,  5, 83]])

In [46]:
confusion_matrix(test_yBin,tempRF.predict(test_xBin))

array([[74,  1, 16,  7, 11],
       [ 5, 68, 10,  6,  9],
       [ 4,  8, 74,  7, 15],
       [17,  3, 12, 58,  8],
       [ 3,  4,  3,  2, 90]])

In [47]:
test_score_SVM

0.74563106796116507

In [48]:
test_score_NB_bin

0.77087378640776694

In [49]:
test_score_LR

0.75533980582524274

In [50]:
test_score_RF

0.70679611650485441

In [51]:
ensemble_probs = probs_SVM + probs_LR + probs_NB_bin + probs_RF

In [52]:
ensemble_predicted = ensemble_probs.argmax(1)

In [53]:
sum(ensemble_predicted == test_y) / float(len(test_y))

0.78640776699029125

In [54]:
confusion_matrix(test_y, ensemble_predicted)

array([[88,  0, 10,  7,  4],
       [ 3, 76, 11,  3,  5],
       [ 7, 11, 77,  5,  8],
       [17,  1,  4, 72,  4],
       [ 3,  3,  3,  1, 92]])

In [164]:
# feature_log_prob_ stores log-probabilities, one row per class and one column
# per vocabulary word; exponentiate to get the probabilities themselves.
feature_probs = np.exp(clfNB.feature_log_prob_)

# One row per candidate; the column count is inferred (-1) instead of being
# hard-coded to numWords, because max_features is only an upper bound on the
# fitted vocabulary size.
test = np.reshape(feature_probs, (len(cand_list), -1))
test

array([[  4.99300979e-04,   1.99720391e-03,   9.98601957e-05, ...,
          2.59636509e-03,   1.99720391e-04,   2.99580587e-04],
       [  6.35889610e-04,   3.81533766e-04,   5.08711688e-04, ...,
          3.81533766e-04,   1.27177922e-04,   1.27177922e-04],
       [  5.98157674e-04,   2.63189377e-03,   3.58894605e-04, ...,
          4.78526139e-04,   1.19631535e-04,   3.58894605e-04],
       [  4.26030461e-04,   1.38459900e-03,   1.06507615e-04, ...,
          2.23665992e-03,   7.45553307e-04,   5.32538076e-04],
       [  2.64865581e-04,   1.72162627e-03,   5.29731161e-04, ...,
          7.94596742e-04,   2.64865581e-04,   2.64865581e-04]])

In [165]:
np.shape(feature_probs)

(5, 1000)

In [166]:
# Column indices of the 10 largest probabilities in row 1 (argpartition puts
# them in the last 10 positions, in no particular order).
ind = np.argpartition(test[1,:],-10)[-10:]
# best_words holds vocabulary indices, so it must be an integer array: a
# float array cannot be used to index `vocab` afterwards.
best_words = np.zeros((5,10), dtype=int)
best_probs = np.zeros((5,10))
test[1,ind]

array([ 0.00724914,  0.00737632,  0.0075035 ,  0.00763068,  0.00788503,
        0.01042859,  0.00915681,  0.00826656,  0.00966552,  0.01068295])

In [167]:
# For each of the 5 classes, store the column indices of its 10 most probable
# words together with their probabilities.
for i in range(5):
    ind = np.argpartition(test[i], -10)[-10:]
    best_words[i], best_probs[i] = ind, feature_probs[i, ind]

In [168]:
# Share of each word's probability mass held by each class: every column of
# feature_probs is divided by its own sum, so each column of new_mat sums to 1.
# Broadcasting replaces the element-wise double loop, which recomputed every
# column sum once per class and hard-coded the matrix shape.
new_mat = feature_probs / feature_probs.sum(axis=0)

In [169]:
# Cast to int before indexing: array indices must be integers, and best_words
# may hold its indices as floats.
np.array(vocab)[int(best_words[0,0])]

  if __name__ == '__main__':


u'president'

In [145]:
numWords

1000

In [236]:
best_words2 = np.zeros((5,50))
best_probs2 = np.zeros((5,50))

In [237]:
# For each class, the 50 words with the largest share in new_mat, ordered by
# ascending share.
for i in range(0,5):
    ind = np.argpartition(new_mat[i,:],-50)[-50:]
    # Sort the selected columns once and use that single ordering for both
    # arrays, so best_probs2[i, k] is the share of word best_words2[i, k].
    # Previously only the words were sorted and the two arrays disagreed.
    ind = ind[np.argsort(new_mat[i,ind])]
    best_words2[i,:] = ind
    best_probs2[i,:] = new_mat[i,ind]

In [238]:
best_words2 = np.array(best_words2, dtype=int)

In [217]:
sort = np.argsort(best_probs2)

In [220]:
bestwords2[sort]

array([ 0.48692551,  0.49165605,  0.49672604,  0.49802756,  0.49983493,
        0.5010798 ,  0.50410489,  0.50705204,  0.507431  ,  0.50776566,
        0.50918804,  0.5096656 ,  0.50987117,  0.51323796,  0.51476922,
        0.52090624,  0.52128589,  0.52926985,  0.53367306,  0.53617697,
        0.53918911,  0.53977914,  0.54072398,  0.54836952,  0.55239963,
        0.55536931,  0.55716319,  0.56023796,  0.56238546,  0.56602366,
        0.56756767,  0.57593938,  0.60944718,  0.61553915,  0.61721699,
        0.61770576,  0.62466382,  0.62628515,  0.62665614,  0.64952272,
        0.65331049,  0.65421051,  0.66730946,  0.69391551,  0.69530092,
        0.7060179 ,  0.71746336,  0.73879546,  0.77024708,  0.84166363])

In [239]:
np.array(vocab)[best_words2[3]]

array([u'steagall', u'loophole', u'tomorrow', u'institutions', u'public',
       u'veterans', u'youth', u'virtually', u'poverty', u'super', u'glass',
       u'dictator', u'billionaire', u'african', u'huge', u'income',
       u'terms', u'opposition', u'involved', u'international', u'medicare',
       u'jail', u'corporations', u'street', u'pharmaceutical', u'hour',
       u'rigged', u'marijuana', u'committee', u'lower', u'earth',
       u'guarantee', u'unemployment', u'contributions', u'wealth',
       u'corrupt', u'view', u'class', u'colleges', u'large', u'revolution',
       u'billionaires', u'decent', u'universities', u'medical', u'major',
       u'vermont', u'secretary', u'finance', u'handful'], 
      dtype='<U14')

In [240]:
np.array(vocab)[best_words2[0]]

array([u'proposed', u'kind', u'tried', u'arabs', u'met', u'forward',
       u'universal', u'racism', u'drumpf', u'toward', u'tough', u'hear',
       u'fund', u'move', u'difficult', u'hard', u'sure', u'comprehensive',
       u'investment', u'appropriate', u'prevent', u'enforcement',
       u'prescription', u'specific', u'chance', u'figure', u'kinds',
       u'try', u'improve', u'ahead', u'brothers', u'results', u'malley',
       u'certainly', u'fighters', u'equal', u'others', u'message',
       u'opportunity', u'russians', u'agenda', u'questions', u'possible',
       u'ways', u'plans', u'incomes', u'affordable', u'senator',
       u'particularly', u'sanders'], 
      dtype='<U14')

In [241]:
np.array(vocab)[best_words2[1]]

array([u'liberal', u'payroll', u'ethanol', u'winning', u'corruption',
       u'principles', u'defending', u'judgment', u'focused', u'harry',
       u'amendment', u'secure', u'arms', u'dad', u'words', u'maria',
       u'ought', u'focus', u'marco', u'playing', u'marriage', u'everyone',
       u'texas', u'supreme', u'carter', u'indeed', u'reid',
       u'overwhelming', u'donald', u'subsidies', u'washington', u'simple',
       u'nominate', u'commander', u'liberty', u'chief', u'whatsoever',
       u'rubio', u'court', u'irs', u'org', u'voters', u'growth', u'fed',
       u'islamic', u'flat', u'note', u'schumer', u'amnesty', u'religious'], 
      dtype='<U14')

In [242]:
np.array(vocab)[best_words2[2]]

array([u'attack', u'less', u'tonight', u'control', u'elect', u'dream',
       u'cannot', u'jihadists', u'regulatory', u'starting', u'already',
       u'cheering', u'greatest', u'trust', u'choose', u'society', u'ever',
       u'changed', u'pro', u'force', u'threat', u'parents', u'immigration',
       u'human', u'service', u'especially', u'code', u'serious',
       u'regulations', u'repeal', u'space', u'future', u'hire', u'fully',
       u'growing', u'criminals', u'simply', u'sunni', u'someone', u'shia',
       u'paycheck', u'prove', u'jihadist', u'replace', u'access',
       u'allowed', u'st', u'operating', u'century', u'enterprise'], 
      dtype='<U14')

In [243]:
np.array(vocab)[best_words2[4]]

array([u'ted', u'company', u'stronger', u'china', u'thousands', u'great',
       u'self', u'read', u'trillions', u'poll', u'everybody', u'polls',
       u'economically', u'pouring', u'oh', u'smart', u'concerned', u'love',
       u'biggest', u'domain', u'horrible', u'trillion', u'roads', u'vets',
       u'thinking', u'advantage', u'anywhere', u'oil', u'losing', u'along',
       u'worst', u'heads', u'trade', u'watched', u'probably', u'sitting',
       u'built', u'politicians', u'japan', u'ok', u'mexico', u'mess',
       u'anymore', u'total', u'nice', u'jeb', u'frankly', u'nobody',
       u'totally', u'tremendous'], 
      dtype='<U14')

In [179]:
np.max(probs_NB_bin,axis = 1)

array([ 0.99803362,  0.64781966,  0.99895486,  0.96523869,  0.99708544,
        0.99999875,  0.97733364,  0.99523622,  0.99777496,  0.71699963,
        0.75322482,  0.80756365,  0.46526746,  0.8803524 ,  0.40627449,
        0.84776015,  0.85707391,  0.95802608])

In [180]:
probs_NB_bin

array([[  8.36456548e-04,   1.20830300e-05,   1.43171563e-05,
          9.98033621e-01,   1.10352200e-03],
       [  2.89692639e-01,   1.60740976e-02,   2.38422714e-02,
          6.47819661e-01,   2.25713306e-02],
       [  1.38129251e-05,   1.63104690e-07,   1.95438979e-07,
          9.98954859e-01,   1.03096973e-03],
       [  6.75705105e-03,   9.65238695e-01,   2.76383480e-02,
          2.57010640e-05,   3.40205153e-04],
       [  2.54323148e-06,   9.97085442e-01,   7.30673839e-05,
          6.95808828e-04,   2.14313832e-03],
       [  3.58376671e-09,   9.99998753e-01,   1.09766134e-06,
          1.26521048e-07,   1.90870045e-08],
       [  9.77333635e-01,   3.73490012e-04,   9.62204572e-03,
          5.76210496e-04,   1.20946187e-02],
       [  9.95236220e-01,   5.67282367e-05,   2.42386191e-04,
          4.16278564e-03,   3.01879942e-04],
       [  9.97774957e-01,   1.76616775e-06,   2.26308871e-04,
          1.99674318e-03,   2.24481822e-07],
       [  7.16999633e-01,   3.1770588

In [245]:
np.max(probs_NB_bin,axis = 1)

array([ 0.99803362,  0.64781966,  0.99895486,  0.96523869,  0.99708544,
        0.99999875,  0.97733364,  0.99523622,  0.99777496,  0.71699963,
        0.75322482,  0.80756365,  0.46526746,  0.8803524 ,  0.40627449,
        0.84776015,  0.85707391,  0.95802608])