In [2]:
import numpy as np, pickle
from IPython.display import Audio
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import featurizer, data_formatter, models
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import precision_recall_fscore_support

In [3]:
with open('/data/jrgillick/speeches.pkl') as f:
    speeches = pickle.load(f)

In [None]:
import time
t0 = time.time()
with open('/data/jrgillick/individual_feature_lists.pkl') as f:
    individual_feature_lists = pickle.load(f)
print "load in " + str(time.time() - t0)

In [5]:
speaker_names = ['donald_trump',
 'hilary_clinton',
 'bernie_sanders',
 'ted_cruz',
 'marco_rubio',
 'john_kasich',
 'barack_obama',
 'bill_clinton',
 'joe_biden',
 'mike_pence',
 'carly_fiorina',
 'jeb_bush',
 'rand_paul',
 'gary_johnson',
 'chris_christie',
 'rick_santorum']

In [None]:
len(speeches)

In [237]:
#for s in speeches:
#    phrase_list = s.alignment.get_phrase_text()
#    with open('/data/corpora/cspan/phrase_lists/' + s.file_path.split('/')[1] + '.txt','wb') as f:
#        for p in phrase_list:
#            f.write(p + '\n')

In [301]:
speeches = [speech.Speech(s.file_path.split('/')[1]) for s in tqdm(speeches)]

100%|██████████| 310/310 [30:49<00:00,  5.54s/it]


In [302]:
for s in speeches:
    s.phrase_audio_features = s.get_phrase_audio_features()
    normalize_phrase_audio_features(s)

In [None]:
def normalize_data(X_train, X_test):
    new_X_train = np.copy(X_train)
    new_X_test = np.copy(X_test)
    for i in xrange(new_X_train.shape[1]):
        if set(new_X_train[:,i]) != set([0,1]) and set(new_X_train[:,i]) != set([0]) and set(new_X_train[:,i]) != set([1]):
            #print 'norming'
            #print set(new_X_train[:,i])
            mean = np.mean(new_X_train[:,i])
            new_X_train[:,i] -= mean
            new_X_test[:,i] -=mean
            std = np.std(X_train[:,i])
            if std > 1e-2:
                new_X_train[:,i] /= std
                new_X_test[:,i] /= std
    return new_X_train, new_X_test

In [972]:
individual_feature_lists = []
for i in range(len(train_speeches)):
    h = {'id': train_speeches[i].file_path,
         'bigrams': train_bigram_feats[i],
         'rst': train_rst_feats[i],
         'audio': train_audio_feats[i],
         'euphony': train_euphony_feats[i],
         'liu': train_liu_feats[i],
         'cosine': np.array(train_cosine_feats[i]).reshape(-1,1),
         'phone': train_phone_feats[i],
         'substring': np.array(train_substring_feats[i]).reshape(-1,1),
         'word_overlap': np.array(train_word_overlap_feats[i]).reshape(-1,1),
         'labels': train_labels[i]}
    individual_feature_lists.append(h)
    
for i in range(len(test_speeches)):
    h = {'id': test_speeches[i].file_path,
         'bigrams': test_bigram_feats[i],
         'rst': test_rst_feats[i],
         'audio': test_audio_feats[i],
         'euphony': test_euphony_feats[i],
         'liu': test_liu_feats[i],
         'cosine': np.array(test_cosine_feats[i]).reshape(-1,1),
         'phone': test_phone_feats[i],
         'substring': np.array(test_substring_feats[i]).reshape(-1,1),
         'word_overlap': np.array(test_word_overlap_feats[i]).reshape(-1,1),
         'labels': test_labels[i]}
    individual_feature_lists.append(h)

In [None]:
def get_train_and_test_sets(speaker_name):
    train_set = []
    test_set = []
    for i in range(len(individual_feature_lists)):
        if speaker_name in individual_feature_lists[i]['id']:
            test_set.append(individual_feature_lists[i])
        else:
            train_set.append(individual_feature_lists[i])
    return train_set, test_set

In [36]:
def get_features(data_set, feature_names):
    all_feats = []
    for i in range(len(data_set)):
        feats = [data_set[i][name] for name in feature_names]
        all_feats.append(np.hstack(feats))
    return all_feats

def get_labels(data_set):
    all_labels = []
    for i in range(len(data_set)):
        all_labels.append(data_set[i]['labels'])
    return all_labels

In [133]:
feature_set = ['bigrams','audio','liu','euphony','phone','cosine','substring','word_overlap','rst']
phrase_count = 1

train_set = individual_feature_lists[0:248]
test_set = individual_feature_lists[248:]

labels_train = get_labels(train_set)
data_train = get_features(train_set,feature_set)

labels_test = get_labels(test_set)
data_test = get_features(test_set,feature_set)

current_train_formatted_feats, current_train_formatted_labs = models.format_balanced_multiple_phrase_input(data_train,labels_train,phrase_count=phrase_count)
current_test_formatted_feats, current_test_formatted_labs = models.format_balanced_multiple_phrase_input(data_test,labels_test,phrase_count=phrase_count)

X_train = np.array(current_train_formatted_feats).astype('float64')
y_train = np.array(current_train_formatted_labs)
        
print X_train.shape

X_test = np.array(current_test_formatted_feats).astype('float64')
y_test = np.array(current_test_formatted_labs)

#X_train, X_test = normalize_data(X_train,X_test)
#model = models.train_cv_logistic_regression_faster(X_train,y_train)

(28832, 26859)


In [134]:
X_train, jeb_train = normalize_data(X_train, np.array(jeb_feats))

In [104]:
coefs = model.best_estimator_.coef_[0]
#coefs

In [106]:
import jeb_speech, jeb_featurizer

In [107]:
jeb = jeb_speech.Speech('jeb')
f = jeb_featurizer.Featurizer(jeb)

In [128]:
#feature_set = ['bigrams','audio','liu','euphony','phone','cosine','substring','word_overlap','rst']
jeb_feats =np.hstack([np.array(f.get_bigram_features()),
           np.array(f.get_audio_features()),
           np.array(f.get_liu_features()),
           np.array(f.get_euphony_features()),
           np.array(f.get_phone_features()),
           np.array(f.get_vector_cosine_distances()).reshape(-1,1),
           np.array(f.get_common_substring_features()).reshape(-1,1),
           np.array(f.get_n_common_words()).reshape(-1,1),
           np.array(f.get_rst_features()),
          ])



In [153]:
jeb_order_0 = np.argsort(jeb_train[0] * coefs)
jeb_order_1 = np.argsort(jeb_train[1] * coefs)
jeb_order_2 = np.argsort(jeb_train[2] * coefs)

In [155]:
jeb_train.shape

(3, 26859)

In [156]:
coefs

array([ 0.09509445,  0.04586393,  0.0455949 , ..., -0.05811358,
        0.00184465, -0.1136884 ])

In [162]:
h = {'coefs':coefs,'feature_names':all_feature_names,'jeb_features':jeb_train}
jeb_train

array([[ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  1.,  1.,  1.]])

In [165]:
with open('/data/corpora/cspan/jeb_coefs.pkl','wb') as f:
    pickle.dump(h,f)

In [166]:
with open('/data/corpora/cspan/jeb_coefs.pkl','rb') as f:
    h = pickle.load(f)

In [168]:
h['jeb_features'].shape

(3, 26859)

In [157]:
jeb_0_activations = zip(np.array(all_feature_names)[jeb_order_0], jeb_train[0]*coefs)
jeb_1_activations = zip(np.array(all_feature_names)[jeb_order_1], jeb_train[1]*coefs[jeb_order_1])
jeb_2_activations = zip(np.array(all_feature_names)[jeb_order_2], jeb_train[2]*coefs[jeb_order_2])

In [160]:
jeb_1_activations[-50:]

[(u'be_a', 0.0),
 (u'LIWC_friend', 0.0),
 (u'min_phone_length', -0.11841505438139058),
 (u'LIWC_percept', 0.0),
 (u'the_business', 0.0),
 (u'LIWC_we', 0.0),
 (u'mean_pitch', -0.0),
 (u'back_in', 0.0),
 (u'internal_silence', 0.1178684567214011),
 (u'LIWC_power', 0.0),
 (u'will_be', 0.058892930007002173),
 (u'LIWC_i', 0.0),
 (u'of_this', 0.0),
 (u'country_to', 0.0),
 (u'rhyme', 0.0),
 (u'LIWC_auxverb', 0.0),
 (u'LIWC_focuspresent', 0.0),
 (u'a_commander', 0.0),
 (u'LIWC_motion', 0.077162140595016776),
 (u'in_the', 0.0),
 (u'LIWC_negate', 0.0),
 (u'rst_cat_N-purpose', 0.090784796912178556),
 (u'LIWC_achiev', 0.0),
 (u"i_won't", 0.0),
 (u'LIWC_female', 0.0),
 (u'the_military', 0.0),
 (u'get_back', 0.0),
 (u'LIWC_affiliation', 0.0),
 (u'out_there', 0.0),
 (u'a_lot', 0.11195111771020727),
 (u'will_have', 0.0),
 (u'to_act', 0.0),
 (u'LIWC_sexual', 0.0),
 (u'alliteration', 0.0),
 (u'who_will', 0.0),
 (u'LIWC_leisure', 0.0),
 (u'i_think', 0.0),
 (u'LIWC_reward', 0.0),
 (u'next_president', 0.0),

In [149]:
jeb_fails_features[-50:]

[(u'be_a', 0.031838287462172883),
 (u'LIWC_friend', 0.033482519190737206),
 (u'min_phone_length', -0.11841505438139058),
 (u'LIWC_percept', 0.037158682089851985),
 (u'the_business', 0.047011592795904479),
 (u'LIWC_we', 0.047574463120682461),
 (u'mean_pitch', -0.081377825397440734),
 (u'back_in', 0.049425862091193665),
 (u'internal_silence', 0.1178684567214011),
 (u'LIWC_power', 0.05678676794291547),
 (u'will_be', 0.058892930007002173),
 (u'LIWC_i', 0.061607090604034916),
 (u'of_this', 0.066118407633243184),
 (u'country_to', 0.067722884021763075),
 (u'rhyme', 0.019610660860075655),
 (u'LIWC_auxverb', 0.072545241142137784),
 (u'LIWC_focuspresent', 0.075311245702706392),
 (u'a_commander', 0.076039724090637098),
 (u'LIWC_motion', 0.077162140595016776),
 (u'in_the', 0.07748078562768182),
 (u'LIWC_negate', 0.087615537447223946),
 (u'rst_cat_N-purpose', 0.090784796912178556),
 (u'LIWC_achiev', 0.092107404498263407),
 (u"i_won't", 0.094331942203364064),
 (u'LIWC_female', 0.097161055641983796),

In [138]:
zip(jeb.alignment.get_phrase_text(), model.predict_proba(jeb_train)[:,1])

[(u"So here's my pledge to you", 0.24821266037726158),
 (u"will be a commander in chief who will have the back of the military I won't trash talk I won't be a divider in chief or an agitator in chief I won't be out there blowharding talking a big game without backing it up I think the next President needs to be a lot quieter but send a signal that we're prepared to act in the national security interests of this country to get back in the business of creating a more peaceful world",
  0.9450858130262213),
 (u'Please clap', 0.39974463627379864)]

In [73]:
feat = featurizer.Featurizer(s)
#feature_set = ['audio','liu','euphony','phone','cosine','substring','word_overlap','rst']

In [74]:
def make_delta(feature_names):
    return ["delta_"+f for f in feature_names]

In [78]:
all_feature_names = feat.get_audio_feature_names() + make_delta(feat.get_audio_feature_names()) + feat.get_liu_feature_names() + make_delta(feat.get_liu_feature_names()) + feat.get_euphony_feature_names() + make_delta(feat.get_euphony_feature_names()) + feat.get_phone_feature_names() + make_delta(feat.get_phone_feature_names())+feat.get_cosine_feature_names() + make_delta(feat.get_cosine_feature_names())+feat.get_substring_feature_names() + make_delta(feat.get_substring_feature_names())+ feat.get_word_overlap_feature_names() + make_delta(feat.get_word_overlap_feature_names())+feat.get_rst_feature_names() + make_delta(feat.get_rst_feature_names())

In [92]:
all_feature_names = all_bigram_names + feat.get_audio_feature_names()+ feat.get_liu_feature_names() + feat.get_euphony_feature_names() + feat.get_phone_feature_names() + feat.get_cosine_feature_names() +feat.get_substring_feature_names() + feat.get_word_overlap_feature_names() +feat.get_rst_feature_names() 

In [80]:
with open('/data/corpora/cspan/top_bigrams.pkl','rb') as f:
    top_bigrams = pickle.load(f)

In [83]:
all_bigram_names = [n[0] for n in top_bigrams]

In [85]:
all_bigram_names.sort()

In [87]:
all_bigram_names[0:10]

[u'000_50',
 u'000_70',
 u'000_a',
 u'000_e',
 u'000_factories',
 u'000_in',
 u'000_jobs',
 u'000_or',
 u'000_people',
 u'000_soldiers']

In [95]:
coef_list = zip(np.array(all_feature_names)[np.argsort(coefs)],coefs[np.argsort(coefs)])

In [102]:
with open('/data/jrgillick/coef_list.txt','wb') as f:
    for c in coef_list:
        f.write(str(c[0]) + ' ' + str(c[1]) + '\n')

In [103]:
speechjeb 

<module 'speech' from 'speech.pyc'>

In [99]:
str(coef_list[0][0]) + ' ' + str(coef_list[0][1])

'talk_about -0.566926635654'

In [51]:
s = speech.Speech('bernie_sanders_10')

In [30]:
def run_logistic_cross_validation(feature_set, phrase_count):
    all_true = []; all_pred = []
    C = 0.1
    for speaker in speaker_names:
        print speaker + ": "
        train_set, test_set = get_train_and_test_sets(speaker)

        labels_train = get_labels(train_set)
        data_train = get_features(train_set,feature_set)

        labels_test = get_labels(test_set)
        data_test = get_features(test_set,feature_set)

        current_train_formatted_feats, current_train_formatted_labs = models.format_balanced_multiple_phrase_input(data_train,labels_train,phrase_count=phrase_count)
        current_test_formatted_feats, current_test_formatted_labs = models.format_balanced_multiple_phrase_input(data_test,labels_test,phrase_count=phrase_count)

        X_train = np.array(current_train_formatted_feats).astype('float64')
        y_train = np.array(current_train_formatted_labs)
        
        print X_train.shape

        X_test = np.array(current_test_formatted_feats).astype('float64')
        y_test = np.array(current_test_formatted_labs)

        X_train, X_test = normalize_data(X_train,X_test)
        if C is None:
            model = models.train_cv_logistic_regression_faster(X_train,y_train)
            C = model.best_params_['C']
        else:
            model = models.train_logistic_regression(X_train,y_train,C)
        y_true, y_pred = models.evaluate_model(model, X_test, y_test)
        
        all_true += list(y_true)
        all_pred += list(y_pred)
        print
    return all_true, all_pred

In [32]:
def run_logistic_single_speaker(feature_set, phrase_count):
    all_true = []; all_pred = []
    C = None
    for speaker in speaker_names:
        print speaker + ": "
        other, speaker_data = get_train_and_test_sets(speaker)
        speaker_data = np.array(speaker_data)
        #speaker_labels = get_labels(speaker_data)
        
        if len(speaker_data) == 1:
            continue
        
        if len(speaker_data) > 10:
            kf = KFold(n_splits=10)
        else:
            kf = KFold(n_splits=len(speaker_data))
            
        for train_index, test_index in kf.split(speaker_data):
            train_set = speaker_data[train_index]
            test_set = speaker_data[test_index]
            #train_set, test_set = train_test_split(speaker_data, test_size=0.2, random_state=43)

            labels_train = get_labels(train_set)
            data_train = get_features(train_set,feature_set)

            labels_test = get_labels(test_set)
            data_test = get_features(test_set,feature_set)

            current_train_formatted_feats, current_train_formatted_labs = models.format_balanced_multiple_phrase_input(data_train,labels_train,phrase_count=phrase_count)
            current_test_formatted_feats, current_test_formatted_labs = models.format_balanced_multiple_phrase_input(data_test,labels_test,phrase_count=phrase_count)

            X_train = np.array(current_train_formatted_feats).astype('float64')
            y_train = np.array(current_train_formatted_labs)

            X_test = np.array(current_test_formatted_feats).astype('float64')
            y_test = np.array(current_test_formatted_labs)

            #print X_train.shape
            #print X_test.shape

            if len(y_train) > 0 and len(y_test) > 0:

                X_train, X_test = normalize_data(X_train,X_test)
                if C is None and len(y_train) > 500:
                    model = models.train_cv_logistic_regression_faster(X_train,y_train)
                    C = model.best_params_['C']
                else:
                    model = models.train_logistic_regression(X_train,y_train)
                y_true, y_pred = models.evaluate_model(model, X_test, y_test)
                
                all_true += list(y_true)
                all_pred += list(y_pred)
                
                #print len(all_true)
            print
    #print len(all_true)
    #print len(all_pred)
    return all_true, all_pred

In [24]:
def run_logistic_single_speaker_n_gram_and_deltas(feature_set, phrase_count):
    feature_set.remove('bigrams')
    all_true = []; all_pred = []
    C = None
    for speaker in speaker_names:
        print speaker + ": "
        other, speaker_data = get_train_and_test_sets(speaker)
        speaker_data = np.array(speaker_data)
        #speaker_labels = get_labels(speaker_data)
        
        if len(speaker_data) == 1:
            continue
        
        if len(speaker_data) > 10:
            kf = KFold(n_splits=10)
        else:
            kf = KFold(n_splits=len(speaker_data))
            
        for train_index, test_index in kf.split(speaker_data):
            train_set = speaker_data[train_index]
            test_set = speaker_data[test_index]
            #train_set, test_set = train_test_split(speaker_data, test_size=0.2, random_state=43)

            labels_train = get_labels(train_set)
            data_train = get_features(train_set,feature_set)
            
            n_grams_train = get_features(train_set,['bigrams'])

            labels_test = get_labels(test_set)
            data_test = get_features(test_set,feature_set)
            n_grams_test = get_features(test_set,['bigrams'])

            current_train_formatted_feats, current_train_formatted_labs = models.format_balanced_multiple_phrase_input_with_deltas(data_train,labels_train,phrase_count=phrase_count)
            current_test_formatted_feats, current_test_formatted_labs = models.format_balanced_multiple_phrase_input_with_deltas(data_test,labels_test,phrase_count=phrase_count)

            train_ngram_formatted_feats, current_train_formatted_labs = models.format_balanced_multiple_phrase_input(n_grams_train,labels_train,phrase_count=1)
            test_ngram_formatted_feats, current_test_formatted_labs = models.format_balanced_multiple_phrase_input(n_grams_test,labels_test,phrase_count=1)

            
            X_train = np.array(current_train_formatted_feats).astype('float64')
            y_train = np.array(current_train_formatted_labs)
            
            X_train_2 = np.array(train_ngram_formatted_feats).astype('float64')

            X_test = np.array(current_test_formatted_feats).astype('float64')
            y_test = np.array(current_test_formatted_labs)
            
            X_test_2 = np.array(test_ngram_formatted_feats).astype('float64')

            print X_train.shape
            print X_train_2.shape
            print X_test.shape
            print X_test_2.shape
            
            X_train = np.hstack([X_train,X_train_2])
            X_test = np.hstack([X_test,X_test_2])

            if len(y_train) > 0 and len(y_test) > 0:

                X_train, X_test = normalize_data(X_train,X_test)
                if C is None and len(y_train) > 500:
                    model = models.train_cv_logistic_regression_faster(X_train,y_train)
                    C = model.best_params_['C']
                else:
                    model = models.train_logistic_regression(X_train,y_train)
                y_true, y_pred = models.evaluate_model(model, X_test, y_test)
                
                all_true += list(y_true)
                all_pred += list(y_pred)
                
                #print len(all_true)
            print
    #print len(all_true)
    #print len(all_pred)
    return all_true, all_pred

In [34]:
#all_true, all_pred = run_logistic_single_speaker_n_gram_and_deltas(['bigrams','audio','liu','euphony','phone','cosine','substring','word_overlap','rst'],phrase_count=1)

In [307]:
all_true,all_pred = run_logistic_cross_validation(['bigrams'],phrase_count=1)

donald_trump: 
(22678, 26704)
Accuracy: 0.575 +/- 0.009 (7387/12846) | Precision: 0.626 | Recall: 0.373 | F1: 0.467

hilary_clinton: 
(28264, 26704)
Accuracy: 0.612 +/- 0.011 (4440/7260) | Precision: 0.645 | Recall: 0.497 | F1: 0.561

bernie_sanders: 
(29006, 26704)
Accuracy: 0.628 +/- 0.012 (4092/6518) | Precision: 0.677 | Recall: 0.489 | F1: 0.568

ted_cruz: 
(33442, 26704)
Accuracy: 0.626 +/- 0.021 (1304/2082) | Precision: 0.684 | Recall: 0.469 | F1: 0.556

marco_rubio: 
(33938, 26704)
Accuracy: 0.627 +/- 0.024 (994/1586) | Precision: 0.658 | Recall: 0.528 | F1: 0.586

john_kasich: 
(34886, 26704)
Accuracy: 0.630 +/- 0.037 (402/638) | Precision: 0.706 | Recall: 0.445 | F1: 0.546

barack_obama: 
(33684, 26704)
Accuracy: 0.559 +/- 0.023 (1028/1840) | Precision: 0.624 | Recall: 0.296 | F1: 0.401

bill_clinton: 
(34940, 26704)
Accuracy: 0.596 +/- 0.040 (348/584) | Precision: 0.665 | Recall: 0.387 | F1: 0.489

joe_biden: 
(34984, 26704)
Accuracy: 0.615 +/- 0.041 (332/540) | Precision: 0.

In [309]:
total_correct_n_gram = 7387 + 4440 + 4092 + 1304 + 994 + 402 + 1028 + 348 + 332 + 314 + 163 + 174+58+48+19
over_n_gram = 12846 + 7260 + 6518 + 2082 + 1586 + 638 + 1840 + 584 + 540 + 492 + 258 + 382 + 268 + 112 + 84 + 34
acc_here = float(total_correct_n_gram) / over_n_gram

In [310]:
acc_here

0.5940490935705438

In [305]:
all_true,all_pred = run_logistic_single_speaker(['bigrams','audio','liu','euphony','phone','cosine','substring','word_overlap','rst'],phrase_count=1)

donald_trump: 
Accuracy: 0.650 +/- 0.022 (1210/1862) | Precision: 0.674 | Recall: 0.581 | F1: 0.624

Accuracy: 0.664 +/- 0.028 (752/1132) | Precision: 0.689 | Recall: 0.599 | F1: 0.641

Accuracy: 0.674 +/- 0.027 (781/1158) | Precision: 0.705 | Recall: 0.599 | F1: 0.648

Accuracy: 0.664 +/- 0.027 (756/1138) | Precision: 0.679 | Recall: 0.622 | F1: 0.650

Accuracy: 0.676 +/- 0.022 (1143/1692) | Precision: 0.684 | Recall: 0.651 | F1: 0.667

Accuracy: 0.640 +/- 0.023 (1108/1730) | Precision: 0.664 | Recall: 0.568 | F1: 0.612

Accuracy: 0.657 +/- 0.027 (796/1212) | Precision: 0.666 | Recall: 0.630 | F1: 0.647

Accuracy: 0.688 +/- 0.033 (523/760) | Precision: 0.698 | Recall: 0.663 | F1: 0.680

Accuracy: 0.642 +/- 0.026 (860/1340) | Precision: 0.649 | Recall: 0.618 | F1: 0.633

Accuracy: 0.676 +/- 0.032 (556/822) | Precision: 0.693 | Recall: 0.633 | F1: 0.662

hilary_clinton: 
Accuracy: 0.689 +/- 0.033 (528/766) | Precision: 0.677 | Recall: 0.723 | F1: 0.699

Accuracy: 0.700 +/- 0.035 (466/66

In [None]:
trump_correct = 1210+752+781+756+1143+1108+796+523+860+556
trump_total = 1862+1132+1158+1138+1692+1730+1212+760+1340+822

In [33]:
all_true,all_pred = run_logistic_cross_validation(['audio','liu','euphony','phone','cosine','substring','word_overlap','rst'],phrase_count=1)

donald_trump: 
(22678, 155)
Accuracy: 0.588 +/- 0.009 (7559/12846) | Precision: 0.600 | Recall: 0.531 | F1: 0.563

hilary_clinton: 
(28264, 155)
Accuracy: 0.658 +/- 0.011 (4774/7260) | Precision: 0.653 | Recall: 0.674 | F1: 0.663

bernie_sanders: 
(29006, 155)
Accuracy: 0.616 +/- 0.012 (4017/6518) | Precision: 0.637 | Recall: 0.542 | F1: 0.586

ted_cruz: 
(33442, 155)
Accuracy: 0.632 +/- 0.021 (1315/2082) | Precision: 0.659 | Recall: 0.545 | F1: 0.597

marco_rubio: 
(33938, 155)
Accuracy: 0.637 +/- 0.024 (1011/1586) | Precision: 0.639 | Recall: 0.631 | F1: 0.635

john_kasich: 
(34886, 155)
Accuracy: 0.672 +/- 0.036 (429/638) | Precision: 0.688 | Recall: 0.630 | F1: 0.658

barack_obama: 
(33684, 155)
Accuracy: 0.637 +/- 0.022 (1173/1840) | Precision: 0.655 | Recall: 0.582 | F1: 0.616

bill_clinton: 
(34940, 155)
Accuracy: 0.625 +/- 0.039 (365/584) | Precision: 0.629 | Recall: 0.610 | F1: 0.619

joe_biden: 
(34984, 155)
Accuracy: 0.617 +/- 0.041 (333/540) | Precision: 0.618 | Recall: 0.6

In [308]:
#total_correct = np.sum(np.array(all_true) == np.array(all_pred))
acc=float(total_correct) / len(all_true)
std=math.sqrt( (acc * (1-acc)) / len(all_true) )
precision, recall, f1, support = [l[1] for l in precision_recall_fscore_support(all_true, all_pred)]
print "Accuracy: %.3f +/- %.3f (%s/%s) | Precision: %.3f | Recall: %.3f | F1: %.3f" % (acc, 1.96*std, total_correct, len(all_true), precision, recall, f1)

Accuracy: 0.668 +/- 0.005 (23740/35524) | Precision: 0.650 | Recall: 0.435 | F1: 0.521


In [29]:
total_correct = 1171 + 748 + 767 + 756 + 1135 + 1101 + 794 + 522 + 868 + 571+ 528 + 465 + 381 + 506 + 563 + 539 + 741 + 497 + 470 + 421 + 584 + 341 + 294 + 615 + 321 + 619 + 249 + 475 + 571+ 586 + 178 + 118 + 195+135 +81 + 126 + 154 + 127 + 174 + 95+130 + 115+ 84 + 93 + 174 + 39 + 118 + 118 + 51 + 69 + 31 + 49 + 58 + 51 + 59 + 78 + 33 + 11 + 15 + 35 + 98 + 245 + 81 + 161 + 197 + 77 + 110 + 129 + 21 + 57 + 30 + 28 + 13 + 116 + 40 + 11 + 91 + 29 + 32 + 115+ 60 + 25 + 84 + 50 + 71 + 52+ 5+57 + 65 + 24 + 25 + 34 + 40 + 9 + 124 + 44 + 15 + 34 + 4 + 31 + 33 + 1 + 75 + 16 + 8 + 37 + 14 + 14 + 15
total_predicted = 1862 + 1132 + 1158 + 1138 + 1692 + 1730 + 1212 + 760 + 1340 + 822 + 766+ 666+536 + 744+780 +556 + 1122 + 760 + 694 + 636 + 824 + 466 + 406 + 884 + 466 + 882 + 348 + 626 + 794 + 822 + 260+ 182+316+208 + 114 + 180+228+188+276+130 + 236 + 196 + 136 + 142 + 274 + 54 + 188 + 166 + 72 + 122 + 40 + 82 + 84 + 74 + 88 + 108 + 52 + 26 + 20 + 64 + 152 + 392 + 116 + 222 + 328 + 138 + 170 + 196 + 34+92 + 56 + 48 + 22 + 190 + 62 + 18 + 140 + 48 + 54 + 212 + 92 + 60 + 122+80 + 126 + 90 + 8 + 84 + 104 + 40 + 58 + 62 + 78 + 20 + 200 + 76 + 32 + 64 + 10 + 62 + 62+2+142+26+16+70+20+34+30
float(total_correct)/total_predicted

0.6689208227669766

In [13]:
import math

In [699]:
train_rst_feats2 = data_formatter.get_rst_data(train_speeches)

100%|██████████| 248/248 [00:01<00:00, 148.27it/s]


In [700]:
test_rst_feats2 = data_formatter.get_rst_data(test_speeches)

100%|██████████| 62/62 [00:00<00:00, 128.87it/s]


In [336]:
featurizer.Featurizer(s).get_rst_feature_names()

['rst_cat_N-antithesis',
 'rst_cat_N-attribution',
 'rst_cat_N-circumstance',
 'rst_cat_N-comment',
 'rst_cat_N-comparison',
 'rst_cat_N-concession',
 'rst_cat_N-condition',
 'rst_cat_N-consequence',
 'rst_cat_N-contrast',
 'rst_cat_N-definition',
 'rst_cat_N-disjunction',
 'rst_cat_N-elaboration',
 'rst_cat_N-evidence',
 'rst_cat_N-example',
 'rst_cat_N-explanation',
 'rst_cat_N-hypothetical',
 'rst_cat_N-list',
 'rst_cat_N-manner',
 'rst_cat_N-means',
 'rst_cat_N-purpose',
 'rst_cat_N-question',
 'rst_cat_N-reason',
 'rst_cat_N-restatement',
 'rst_cat_N-result',
 'rst_cat_N-same_unit',
 'rst_cat_N-sequence',
 'rst_cat_N-temporal',
 'rst_cat_N-textualorganization',
 'rst_cat_N-topic',
 'rst_cat_S-antithesis',
 'rst_cat_S-attribution',
 'rst_cat_S-circumstance',
 'rst_cat_S-comment',
 'rst_cat_S-comparison',
 'rst_cat_S-concession',
 'rst_cat_S-condition',
 'rst_cat_S-consequence',
 'rst_cat_S-definition',
 'rst_cat_S-elaboration',
 'rst_cat_S-example',
 'rst_cat_S-explanation',
 'rst_