In [23]:
import numpy as np, pickle
from IPython.display import Audio
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import featurizer, data_formatter, models
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
with open('/data/jrgillick/speeches.pkl') as f:
    speeches = pickle.load(f)

In [3]:
for s in tqdm(speeches):
    s.phrase_audio_features = s.get_phrase_audio_features()

100%|██████████| 311/311 [00:15<00:00, 20.17it/s]


In [4]:
audio_feats = data_formatter.get_audio_data(speeches)

100%|██████████| 311/311 [00:10<00:00, 30.99it/s]


In [5]:
labels = data_formatter.get_binary_labels(speeches)

100%|██████████| 311/311 [00:58<00:00, 12.07it/s]


In [6]:
euphony_feats = data_formatter.get_euphony_data(speeches)

100%|██████████| 311/311 [01:30<00:00,  3.44it/s]


In [7]:
liu_feats = data_formatter.get_liu_data(speeches)

100%|██████████| 311/311 [15:20<00:00,  2.05s/it]


In [8]:
vector_feats = data_formatter.get_skip_thought_data(speeches)

100%|██████████| 311/311 [13:20<00:00,  2.27s/it]


In [26]:
cosine_similarity(vector_feats[0][1], vector_feats[0][2])



array([[ 0.82335079]], dtype=float32)

In [13]:
all_feats = [np.hstack([audio_feats[i],euphony_feats[i],liu_feats[i],vector_feats[i]]) for i in range(len(audio_feats))]

In [14]:
['vec_' + str(i) for i in range(4800)]

['vec_0',
 'vec_1',
 'vec_2',
 'vec_3',
 'vec_4',
 'vec_5',
 'vec_6',
 'vec_7',
 'vec_8',
 'vec_9',
 'vec_10',
 'vec_11',
 'vec_12',
 'vec_13',
 'vec_14',
 'vec_15',
 'vec_16',
 'vec_17',
 'vec_18',
 'vec_19',
 'vec_20',
 'vec_21',
 'vec_22',
 'vec_23',
 'vec_24',
 'vec_25',
 'vec_26',
 'vec_27',
 'vec_28',
 'vec_29',
 'vec_30',
 'vec_31',
 'vec_32',
 'vec_33',
 'vec_34',
 'vec_35',
 'vec_36',
 'vec_37',
 'vec_38',
 'vec_39',
 'vec_40',
 'vec_41',
 'vec_42',
 'vec_43',
 'vec_44',
 'vec_45',
 'vec_46',
 'vec_47',
 'vec_48',
 'vec_49',
 'vec_50',
 'vec_51',
 'vec_52',
 'vec_53',
 'vec_54',
 'vec_55',
 'vec_56',
 'vec_57',
 'vec_58',
 'vec_59',
 'vec_60',
 'vec_61',
 'vec_62',
 'vec_63',
 'vec_64',
 'vec_65',
 'vec_66',
 'vec_67',
 'vec_68',
 'vec_69',
 'vec_70',
 'vec_71',
 'vec_72',
 'vec_73',
 'vec_74',
 'vec_75',
 'vec_76',
 'vec_77',
 'vec_78',
 'vec_79',
 'vec_80',
 'vec_81',
 'vec_82',
 'vec_83',
 'vec_84',
 'vec_85',
 'vec_86',
 'vec_87',
 'vec_88',
 'vec_89',
 'vec_90',
 'vec_91'

In [15]:
feature_names = np.array(list(featurizer.Featurizer(s).get_combined_feature_names()) + ['vec_' + str(i) for i in range(4800)])

In [16]:
len(feature_names)

4890

In [17]:
all_feature_names = []
for prefix in ['t','t-1','t-2','delta_1','delta_2']:
    for f in feature_names:
        all_feature_names.append(prefix+'_'+f)

In [28]:
len(all_feature_names)

24450

In [109]:
len(formatted_feats[0])

450

In [107]:
reload(data_formatter)
reload(featurizer)
reload(models)

<module 'models' from 'models.py'>

In [11]:
len(liu_feats)

311

In [12]:
np.hstack(labels).shape

(80934,)

In [72]:
np.array(all_feats).shape

(311,)

In [75]:
lstm_feats, lstm_labs = models.format_lstm_input(all_feats,labels,input_size=91,lstm_length=3)


  0%|          | 0/243 [00:00<?, ?it/s][A
[A

ValueError: cannot reshape array of size 14670 into shape (1,3,91)

In [79]:
reload(models)
reload(data_formatter)
reload(featurizer)

<module 'featurizer' from 'featurizer.pyc'>

In [188]:
formatted_feats, formatted_labs = models.format_multiple_phrase_input_with_deltas(vector_feats,labels,phrase_count=5)

100%|██████████| 243/243 [00:00<00:00, 742.37it/s]
100%|██████████| 297/297 [00:00<00:00, 841.26it/s]
100%|██████████| 139/139 [00:00<00:00, 1043.75it/s]
100%|██████████| 442/442 [00:00<00:00, 1145.25it/s]
100%|██████████| 35/35 [00:00<00:00, 1074.28it/s]
100%|██████████| 171/171 [00:00<00:00, 858.03it/s] 
100%|██████████| 206/206 [00:00<00:00, 814.71it/s]
100%|██████████| 197/197 [00:00<00:00, 823.40it/s]
100%|██████████| 393/393 [00:00<00:00, 967.81it/s]
100%|██████████| 186/186 [00:00<00:00, 1070.21it/s]
100%|██████████| 392/392 [00:00<00:00, 1078.10it/s]
100%|██████████| 44/44 [00:00<00:00, 1069.42it/s]
100%|██████████| 368/368 [00:00<00:00, 1088.78it/s]
100%|██████████| 459/459 [00:00<00:00, 1098.34it/s]
100%|██████████| 457/457 [00:00<00:00, 1100.22it/s]
100%|██████████| 205/205 [00:00<00:00, 1090.55it/s]
100%|██████████| 108/108 [00:00<00:00, 1094.75it/s]
100%|██████████| 451/451 [00:00<00:00, 1127.51it/s]
100%|██████████| 386/386 [00:00<00:00, 1153.95it/s]
100%|██████████| 537/

In [189]:
X, y = np.array(formatted_feats), np.array(formatted_labs)

In [27]:
X.shape

NameError: name 'X' is not defined

In [169]:
reload(models)

<module 'models' from 'models.py'>

In [132]:
#X = np.hstack([np.vstack(audio_feats), np.vstack(euphony_feats), np.vstack(liu_feats)])
#X = np.vstack(audio_feats)
#y = np.hstack(labels)

In [191]:
X,y = data_formatter.balance(X,y)

In [192]:
X_train, X_test = train_test_split(X,test_size=0.2, random_state=43)
y_train, y_test = train_test_split(y,test_size=0.2, random_state=43)
X_train = np.array(X_train)
X_test = np.array(X_test)

In [156]:
with open('/data/jrgillick/x_train_deltas.pkl','wb') as f:
    pickle.dump(X_train,f)
    
with open('/data/jrgillick/x_test_deltas.pkl','wb') as f:
    pickle.dump(X_test,f)

with open('/data/jrgillick/y_train_deltas.pkl','wb') as f:
    pickle.dump(y_train,f)
    
with open('/data/jrgillick/y_test_deltas.pkl','wb') as f:
    pickle.dump(y_test,f)


In [None]:
model = models.train_cv_logistic_regression(X_train,y_train)

In [128]:
with open('/data/jrgillick/log_reg_model.pkl','wb') as f:
    pickle.dump(model, f)

In [129]:
# 3 phrases with deltas
models.evaluate_model(model, X_test, y_test)

Accuracy: 0.639 +/- 0.011 (4881/7633) | Precision: 0.649 | Recall: 0.616 | F1: 0.632


(0.6394602384383598,
 0.64909390444810544,
 0.6159458051068265,
 0.6320855614973262)

In [33]:
s = speeches[61]
s.file_path

'donald_trump/donald_trump_114'

In [168]:
phrases = s.alignment.get_phrase_text()
s_labels = featurizer.Featurizer(s).get_binary_labels()
times = s.alignment.get_phrase_times()
#feature_list = featurizer.Featurizer(s).get_all_features()

In [163]:
s_feats, s_labs = models.format_multiple_phrase_input_with_deltas([feature_list],[s_labels],phrase_count=3)

100%|██████████| 668/668 [00:00<00:00, 14922.06it/s]


In [None]:
i = 11
print("Prediction: " + str(model.predict_proba(s_feats[i])[0][1]))
print("True label: %d" % s_labs[i])
phrases[i-2:i+1]

In [152]:
coefs = model.best_estimator_.coef_[0]

In [153]:
s.load_librosa()
s.load_librosa_crowd()
s.load_librosa_voice()

In [174]:
Audio(s.y[int(times[i-2][0]*s.sr):int((times[i][1]+2)*s.sr)],rate=s.sr)

In [69]:
reload(featurizer)
reload(data_formatter)

<module 'data_formatter' from 'data_formatter.py'>

In [62]:
from scipy.spatial.distance import cosine as cosine_distance

In [67]:
vectors = featurizer.Featurizer(s).get_skip_thought_features()

In [71]:
cosine_feats = data_formatter.get_vector_cosine_distance_data(speeches)

 11%|█         | 34/311 [01:09<11:22,  2.46s/it]

KeyboardInterrupt: 

In [65]:
1-cosine_distance([1,2],[1,2])

1.0

In [58]:
cosine_similarity(vectors[0].reshape(-1,1),vectors[0].reshape(-1,1))

array([[ 1.,  1.,  1., ...,  1.,  1.,  1.],
       [ 1.,  1.,  1., ...,  1.,  1.,  1.],
       [ 1.,  1.,  1., ...,  1.,  1.,  1.],
       ..., 
       [ 1.,  1.,  1., ...,  1.,  1.,  1.],
       [ 1.,  1.,  1., ...,  1.,  1.,  1.],
       [ 1.,  1.,  1., ...,  1.,  1.,  1.]], dtype=float32)

In [43]:
featurizer.Featurizer(s).get_vector_cosine_sims()

[1.0,
 1.0,
 1.0,
 -1.0,
 -1.0,
 1.0,
 1.0,
 1.0,
 -1.0,
 -1.0,
 1.0,
 1.0,
 -1.0,
 -1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 -1.0,
 -1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 -1.0,
 1.0,
 -1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 -1.0,
 -1.0,
 1.0,
 -1.0,
 -1.0,
 1.0,
 -1.0,
 -1.0,
 1.0,
 1.0,
 -1.0,
 1.0,
 1.0,
 -1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 -1.0,
 -1.0,
 -1.0,
 1.0,
 -1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 -1.0,
 -1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 -1.0,
 -1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 -1.0,
 -1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0]

In [143]:
zip(np.array(all_feature_names)[np.argsort(coefs)],coefs[np.argsort(coefs)])

[('delta_1_min_energy', -5.3331412176435844),
 ('t_range_energy', -3.7987877529767142),
 ('delta_1_mean_energy', -3.2876116221160498),
 ('delta_1_max_energy', -2.6744090819347108),
 ('delta_2_mean_energy', -2.591703157930854),
 ('delta_2_min_energy', -1.5493668703809391),
 ('t-1_range_energy', -1.1400554998324293),
 ('t-2_max_energy', -1.067698892577204),
 ('t-2_min_energy', -0.84401951551697285),
 ('delta_2_max_energy', -0.63299062651404325),
 ('delta_1_applause_feature', -0.45032232456843896),
 ('t-1_max_energy', -0.43470826606265861),
 ('delta_1_homogeneity', -0.40243126603563639),
 ('t-2_mean_energy', -0.34277295451187861),
 ('t_LIWC_shehe', -0.24083157646188816),
 ('t_LIWC_function', -0.23940568192229625),
 ('t_LIWC_death', -0.22860996088981517),
 ('t-2_range_energy', -0.22368004587993517),
 ('t_LIWC_focuspast', -0.21149386259085526),
 ('t-1_LIWC_shehe', -0.20404594155490416),
 ('delta_1_LIWC_filler', -0.19736223456553884),
 ('t-2_LIWC_shehe', -0.19305332987880999),
 ('t_LIWC_see'

In [148]:
x=[('t_thank_you_feature', 0.3774252899630714),
('t-1_homogeneity', 0.38359345291463331),
('t_internal_silence', 0.51532797033195787),
('t-1_min_energy', 0.7053473548642436),
('t_homogeneity', 0.78602471894641057),
 ('t-1_applause_feature', 0.83723373483706343),
 ('delta_2_std_energy', 0.87771915384597043),
 ('delta_2_range_energy', 0.91637545394360398),
 ('t-2_applause_feature', 1.0031853932975721),
 ('t_applause_feature', 1.2875560594055036),
 ('t_max_energy', 2.2397008158732721),
 ('t-1_mean_energy', 2.2489302034200751),
 ('delta_1_std_energy', 2.3550216311442367),
 ('delta_1_range_energy', 2.6587322531427282),
 ('t-1_std_energy', 2.7222193509839596),
 ('t-2_std_energy', 3.599938504830182),
 ('t_mean_energy', 5.5365418255368963),
 ('t_min_energy', 6.0384885725075756)]

In [151]:
x

[('t_min_energy', 6.038488572507576),
 ('t_mean_energy', 5.536541825536896),
 ('t-2_std_energy', 3.599938504830182),
 ('t-1_std_energy', 2.7222193509839596),
 ('delta_1_range_energy', 2.658732253142728),
 ('delta_1_std_energy', 2.3550216311442367),
 ('t-1_mean_energy', 2.248930203420075),
 ('t_max_energy', 2.239700815873272),
 ('t_applause_feature', 1.2875560594055036),
 ('t-2_applause_feature', 1.003185393297572),
 ('delta_2_range_energy', 0.916375453943604),
 ('delta_2_std_energy', 0.8777191538459704),
 ('t-1_applause_feature', 0.8372337348370634),
 ('t_homogeneity', 0.7860247189464106),
 ('t-1_min_energy', 0.7053473548642436),
 ('t_internal_silence', 0.5153279703319579),
 ('t-1_homogeneity', 0.3835934529146333),
 ('t_thank_you_feature', 0.3774252899630714)]

In [137]:
np.argsort(coefs)

array([275,   7, 273, 271, 363, 365,  97, 181, 185, 361, 358,  91, 282,
       183,  77,  44,  32, 187,  41, 167, 309, 257,  75, 270, 134, 122,
        46,  84, 131, 359, 136,  82, 165,  30,  37, 399, 172, 127, 221,
       373, 224, 120, 296,  81, 360,  51, 262,  50,  60,  17, 193, 310,
       247,  47, 421,  80, 141, 449, 236, 125, 264, 158, 210, 174, 171,
        69, 423, 342, 319,  29, 225, 206, 355, 371, 308, 146, 230, 140,
        35, 107, 215, 372, 135, 427, 231, 294, 324, 430, 213, 298, 376,
        62, 139, 159, 255, 197,  56, 266, 313, 198, 176, 124, 386, 253,
        68, 436, 431, 425, 383, 119, 400, 346, 217, 398, 315, 414, 233,
       207,  86, 445, 248, 335, 433, 413, 108, 415, 212, 338, 202, 123,
       289,  53,  78, 387, 133, 379, 226, 137,  10, 157,  34, 214, 384,
       426, 261, 209, 393, 353, 250, 284, 116, 229, 150, 288,  31,  25,
        45,  33, 112, 375, 331, 104, 100, 447, 249, 304, 337, 432, 328,
       326, 388, 343, 306, 190, 305,  22, 285, 349,  52, 378, 41

In [127]:
# 3 phrases with no deltas
models.evaluate_model(model, X_test, y_test)

Accuracy: 0.647 +/- 0.011 (4941/7633) | Precision: 0.650 | Recall: 0.631 | F1: 0.640


(0.6473208437049652,
 0.65021691973969631,
 0.63105263157894742,
 0.64049145299145305)

In [84]:
s = speeches[81]

In [85]:
s.file_path

'donald_trump/donald_trump_33'

In [86]:
text = s.alignment.get_phrase_text()
times = s.alignment.get_phrase_times()

In [87]:
end_labels = [s.alignment.applause_follows(t[1]) for t in times]

In [88]:
zip(text,end_labels)

[(u'YOU EVERYBODY THANK YOU', True),
 (u'ARE GOING TO MAKE WE ARE', True),
 (u'AM THRILLED TO BE HERE TONIGHT BEAUTIFUL CITY MANNHEIM', True),
 (u'WILL TELL YOU WHAT WE WIN THE GREAT STATE OF PENNSYLVANIA I WENT TO SCHOOL IN PENNSYLVANIA',
  True),
 (u'ARE GOING TO WIN PENNSYLVANIA WE ARE GOING TO WIN BACK THE WHITE HOUSE AND WE ARE GOING TO BE SO HAPPY WE ARE GOING TO BE SO HAPPY AND WE GOING TO AGAIN BE PROUD OF OUR COUNTRY WE WILL BE PROUD WE GOING TO TAKE ON THE CORRUPT MEDIA THE POWERFUL LOBBYISTS AND THE SPECIAL INTERESTS THAT HAVE STOLEN YOUR JOBS HERE FACTORIES IN YOUR FUTURE THAT',
  False),
 (u'WHAT HAS HAPPENED WE ARE GOING TO STOP HILLARY CLINTON FROM CONTINUING TO RATE THE INDUSTRY FROM YOUR STATE FOR HER PROFIT',
  True),
 (u'CLINTON HAS COLLECTED MILLIONS OF DOLLARS FROM THE SAME GLOBAL CORPORATIONS SHIPPING YOUR JOBS YOUR DREAMS TO OTHER COUNTRIES YOU KNOW IT AND EVERYBODY ELSE KNOWS IT',
  False),
 (u'IS WHY CLINTON', False),
 (u'100 APPROVE TRANSPACIFIC', False),
 (u'

In [83]:
np.sum(end_labels)

7

In [147]:
x

NameError: name 'x' is not defined