In [1]:
import vggish_slim
import vggish_params
import vggish_input_modified
import vggish_postprocess
import numpy as np
from scipy.io import wavfile
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsClassifier
import scipy

  from ._conv import register_converters as _register_converters


In [2]:

def CreateVGGishNetwork(hop_size=0.96):   # Hop size is in seconds.
  """Define VGGish model, load the checkpoint, and return a dictionary that points
  to the different tensors defined by the model.
  """
  vggish_slim.define_vggish_slim()
  checkpoint_path = 'vggish_model.ckpt'
  vggish_params.EXAMPLE_HOP_SECONDS = hop_size
  
  vggish_slim.load_vggish_slim_checkpoint(sess, checkpoint_path)

  features_tensor = sess.graph.get_tensor_by_name(
      vggish_params.INPUT_TENSOR_NAME)
  embedding_tensor = sess.graph.get_tensor_by_name(
      vggish_params.OUTPUT_TENSOR_NAME)

  layers = {'conv1': 'vggish/conv1/Relu',
            'pool1': 'vggish/pool1/MaxPool',
            'conv2': 'vggish/conv2/Relu',
            'pool2': 'vggish/pool2/MaxPool',
            'conv3': 'vggish/conv3/conv3_2/Relu',
            'pool3': 'vggish/pool3/MaxPool',
            'conv4': 'vggish/conv4/conv4_2/Relu',
            'pool4': 'vggish/pool4/MaxPool',
            'fc1': 'vggish/fc1/fc1_2/Relu',
            'fc2': 'vggish/fc2/Relu',
            'embedding': 'vggish/embedding',
            'features': 'vggish/input_features',
         }
  g = tf.get_default_graph()
  for k in layers:
    layers[k] = g.get_tensor_by_name( layers[k] + ':0')
    
  return {'features': features_tensor,
          'embedding': embedding_tensor,
          'layers': layers,
         }

In [3]:
def ProcessWithVGGish(vgg, x, sr,hop_length):
  '''Run the VGGish model, starting with a sound (x) at sample rate
  (sr). Return a whitened version of the embeddings. Sound must be scaled to be
  floats between -1 and +1.'''

  # Produce a batch of log mel spectrogram examples.
  input_batch = vggish_input_modified.waveform_to_examples(wav_data, sr,hop_length)
  # print('Log Mel Spectrogram example: ', input_batch[0])

  [embedding_batch] = sess.run([vgg['embedding']],
                               feed_dict={vgg['features']: input_batch})
  return embedding_batch
  # Postprocess the results to produce whitened quantized embeddings.
  pca_params_path = 'vggish_pca_params.npz'

  pproc = vggish_postprocess.Postprocessor(pca_params_path)
  postprocessed_batch = pproc.postprocess(embedding_batch)
  # print('Postprocessed VGGish embedding: ', postprocessed_batch[0])
  return postprocessed_batch[0]


In [4]:
import tensorflow as tf
tf.reset_default_graph()
sess = tf.Session()

vgg = CreateVGGishNetwork()

INFO:tensorflow:Restoring parameters from vggish_model.ckpt


In [5]:

sr, wav_data = wavfile.read("1.wav")
assert wav_data.dtype == np.int16, 'Bad sample type: %r' % wav_data.dtype
samples = wav_data / 32768.0  # Convert to [-1.0, +1.0]

In [6]:
values = ProcessWithVGGish(vgg,samples,sr,1)

In [7]:
df = pd.read_csv("../data/created_csv/bi.csv")

In [8]:
df.head()

Unnamed: 0,start_timestamp,end_timestamp,Gesture,Unnamed: 3
0,0,1,beats,
1,1,2,,
2,2,3,,
3,3,4,beats,
4,4,5,,


In [9]:
gesture_detected = df["Gesture"].apply(lambda x: int(isinstance(x,str)))

In [10]:
df = df.assign(gesture_detected=gesture_detected)

In [36]:
df_final = df[["end_timestamp","gesture_detected"]]

In [40]:
df_final.head()

Unnamed: 0,end_timestamp,gesture_detected,feature0,feature1,feature2,feature3,feature4,feature5,feature6,feature7,...,feature118,feature119,feature120,feature121,feature122,feature123,feature124,feature125,feature126,feature127
0,1,1,0.0,0.0,0.473912,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.05154,0.195061,0.0,0.0,0.057337
1,2,0,0.0,0.090621,0.609878,0.0,0.0,0.0,0.618379,0.633474,...,0.0,0.0,0.164849,0.0,0.0,0.0,0.080482,0.0,0.118717,0.133484
2,3,0,0.0,0.0,0.525054,0.0,0.0,0.0,0.230554,0.016512,...,0.0,0.0,0.0,0.0,0.0,0.0,0.258882,0.0,0.195438,0.0
3,4,1,0.0,0.0,0.634765,0.0,0.0,0.0,0.299442,0.270102,...,0.0,0.0,0.145458,0.0,0.0,0.0,0.159751,0.0,0.273586,0.0
4,5,0,0.0,0.009879,0.727963,0.0,0.0,0.0,0.590635,0.11212,...,0.0,0.0,0.176967,0.0,0.0,0.0,0.37884,0.0,0.67748,0.0


In [37]:
features = []
for number in range(128):
    features.append("feature"+str(number))

In [38]:
df_final = pd.concat([df_final,pd.DataFrame(values,columns=features)],axis=1)

In [39]:
X= df_final[features]
y = df["gesture_detected"]

In [116]:
train_X,test_X,train_y,test_y=train_test_split(X,y)

In [117]:
from sklearn.svm import SVC
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

## try logistic regression

In [118]:
logreg = LogisticRegression()
rfe = RFE(logreg, 20)
rfe = rfe.fit(X, y)
print(rfe.support_)
print(rfe.ranking_)

[False False False False False False False  True False False False False
 False False False  True False False False  True False False False False
 False  True False  True False False False False False  True False False
 False False False False False False  True False  True False False  True
 False  True False False False False False False False False False False
 False False False False False False False False False False  True False
  True False False False False  True False False False  True False False
 False  True  True False False  True False False False  True  True  True
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False]
[109  59  37  76  73  71  20   1  70  82  23  28  89  35  50   1   3  42
  96   1 100  38 103  44  10   1  24   1  26  63  41  15   2   1  14  33
  22  90  92  13  66  77   1  80   1   7  19   1   8   1  34  79  67  91
 

In [119]:
features = np.array(features)

In [120]:
logistic_features = features[rfe.support_]

## try with all features

In [121]:
logreg = LogisticRegression()
logreg.fit(train_X, train_y)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [122]:
pred_y = logreg.predict(test_X)

In [123]:
print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(logreg.score(test_X, test_y)))

Accuracy of logistic regression classifier on test set: 0.45


In [124]:
from sklearn.metrics import confusion_matrix
confusion_mat = confusion_matrix(test_y, pred_y)
print(confusion_mat)

[[21 15]
 [26 13]]


## try with selected features

In [125]:
logreg = LogisticRegression()
logreg.fit(train_X[logistic_features], train_y)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [126]:
pred_y = logreg.predict(test_X[logistic_features])

In [127]:
print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(logreg.score(test_X[logistic_features], test_y)))

Accuracy of logistic regression classifier on test set: 0.53


In [128]:
confusion_mat = confusion_matrix(test_y, pred_y)
print(confusion_mat)

[[28  8]
 [27 12]]


## try svc with linear kernel

In [129]:
svmLinear = SVC(kernel="linear", C=0.15)
svmLinear.fit(train_X,train_y)

SVC(C=0.15, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [130]:
pred_y = svmLinear.predict(test_X)

In [131]:
print('Accuracy of Linear SVM classifier on test set: {:.2f}'.format(svmLinear.score(test_X, test_y)))

Accuracy of Linear SVM classifier on test set: 0.48


In [132]:
confusion_mat = confusion_matrix(test_y, pred_y)
print(confusion_mat)

[[36  0]
 [39  0]]


## try with logit features

In [133]:
svmLinear = SVC(kernel="linear", C=200)
svmLinear.fit(train_X[logistic_features], train_y)

SVC(C=200, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [134]:
pred_y = svmLinear.predict(test_X[logistic_features])

In [135]:
print('Accuracy of linear svm classifier on test set: {:.2f}'.format(svmLinear.score(test_X[logistic_features], test_y)))

Accuracy of linear svm classifier on test set: 0.63


In [136]:
confusion_mat = confusion_matrix(test_y, pred_y)
print(confusion_mat)

[[26 10]
 [18 21]]


In [137]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedShuffleSplit
C_range = np.linspace(180,300, 100)
param_grid = dict(C=C_range)
cv = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=42)
grid = GridSearchCV(svmLinear, param_grid=param_grid, cv=cv)

In [None]:
grid.fit(train_X[logistic_features], train_y)

In [None]:
grid.best_estimator_

In [None]:
pred_y = grid.predict(test_X[logistic_features])

In [None]:
print('Accuracy of linear SVM classifier on test set: {:.2f}'.format(grid.score(test_X[logistic_features], test_y)))

In [41]:
confusion_mat = confusion_matrix(test_y, pred_y)
print(confusion_mat)

[[28 10]
 [23 14]]


In [42]:
test_y.index

Int64Index([132, 277, 293, 197, 181, 141,  50, 128, 184, 232, 116, 162,  66,
            135, 290, 151, 236, 186,  65,  44,  71, 218, 122,  96, 297, 113,
             53,  88, 110, 137,  91, 237,  33, 245,  36,  85,  46,  20,  87,
             47,  28, 274, 129, 210, 269,  97, 159, 200,   7, 291, 225,  12,
            133, 105, 213,   2, 194, 208,  57, 215,  95, 168, 189, 145, 247,
             93,  86, 167, 150, 242, 121, 204, 165,  83, 198],
           dtype='int64')

In [43]:
pred_y ==1 and test_y == 1 and pred_y==test_y

ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()

In [44]:
correct_gestures_index = np.logical_and(pred_y ==1 , test_y == 1)

In [45]:
incorrect_gestures_index = np.logical_and(pred_y ==0 , test_y == 1)

In [46]:
correct_gestures = test_y.index[correct_gestures_index]

In [47]:
incorrect_gestures = test_y.index[incorrect_gestures_index]

In [55]:
len(incorrect_prediction_indexes)

23

## try SVM with rbf kernel

In [92]:
svmrbf = SVC(gamma=2, C=1)

perform grid search 

In [93]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedShuffleSplit
C_range = np.linspace(20,22, 20)
gamma_range = np.linspace(0.2, 0.3, 20)
param_grid = dict(gamma=gamma_range, C=C_range)
cv = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=42)
grid = GridSearchCV(svmrbf, param_grid=param_grid, cv=cv)

In [94]:
grid.fit(train_X, train_y)

GridSearchCV(cv=StratifiedShuffleSplit(n_splits=5, random_state=42, test_size=0.2,
            train_size=None),
       error_score='raise',
       estimator=SVC(C=1, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=2, kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'gamma': array([0.2    , 0.20526, 0.21053, 0.21579, 0.22105, 0.22632, 0.23158,
       0.23684, 0.24211, 0.24737, 0.25263, 0.25789, 0.26316, 0.26842,
       0.27368, 0.27895, 0.28421, 0.28947, 0.29474, 0.3    ]), 'C': array([20.     , 20.10526, 20.21053, 20.31579, 20.42105, 20.52632,
       20.63158, 20.73684, 20.84211, 20.94737, 21.05263, 21.15789,
       21.26316, 21.36842, 21.47368, 21.57895, 21.68421, 21.78947,
       21.89474, 22.     ])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [95]:
grid.best_estimator_

SVC(C=20.210526315789473, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=0.27368421052631575,
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)

In [96]:
pred_y = grid.predict(test_X)

In [97]:
print('Accuracy of rbf SVM classifier on test set: {:.2f}'.format(grid.score(test_X, test_y)))

Accuracy of rbf SVM classifier on test set: 0.48


In [98]:
confusion_mat = confusion_matrix(test_y, pred_y)
print(confusion_mat)

[[24 18]
 [21 12]]


## try with logit features

In [62]:
svmrbf = SVC()

In [63]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedShuffleSplit
C_range = np.linspace(5, 15, 20)
gamma_range = np.linspace(0, 1, 20)
param_grid = dict(gamma=gamma_range, C=C_range)
cv = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=42)
grid = GridSearchCV(svmrbf, param_grid=param_grid, cv=cv)

In [64]:
grid.fit(train_X[logistic_features], train_y)

GridSearchCV(cv=StratifiedShuffleSplit(n_splits=5, random_state=42, test_size=0.2,
            train_size=None),
       error_score='raise',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'gamma': array([0.     , 0.05263, 0.10526, 0.15789, 0.21053, 0.26316, 0.31579,
       0.36842, 0.42105, 0.47368, 0.52632, 0.57895, 0.63158, 0.68421,
       0.73684, 0.78947, 0.84211, 0.89474, 0.94737, 1.     ]), 'C': array([ 5.     ,  5.52632,  6.05263,  6.57895,  7.10526,  7.63158,
        8.15789,  8.68421,  9.21053,  9.73684, 10.26316, 10.78947,
       11.31579, 11.84211, 12.36842, 12.89474, 13.42105, 13.94737,
       14.47368, 15.     ])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [65]:
grid.best_estimator_

SVC(C=6.052631578947368, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=0.15789473684210525,
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)

In [66]:
pred_y = grid.predict(test_X[logistic_features])

In [67]:
print('Accuracy of linear svm classifier on test set: {:.2f}'.format(grid.score(test_X[logistic_features], test_y)))

Accuracy of linear svm classifier on test set: 0.49


In [68]:
confusion_mat = confusion_matrix(test_y, pred_y)
print(confusion_mat)

[[26  8]
 [30 11]]


## trying out PCA

In [134]:
pca = PCA()

In [135]:
pca.fit(train_X)

PCA(copy=True, iterated_power='auto', n_components=None, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)

In [145]:
(pca.explained_variance_ratio_ > 0.007).sum()
# if variance explained is more than 1/128 then the vector is considered

17

In [75]:
pca = PCA(n_components=17)

In [76]:
pca.fit(train_X)

PCA(copy=True, iterated_power='auto', n_components=17, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)

In [77]:
pca.explained_variance_ratio_

array([0.41423574, 0.10448867, 0.08048943, 0.06946487, 0.0380066 ,
       0.0356939 , 0.03052056, 0.02845258, 0.02527583, 0.01904555,
       0.01570564, 0.01519603, 0.01320736, 0.01149082, 0.00933643,
       0.00809164, 0.00750064])

In [78]:
train_t_X = pca.transform(train_X)

In [79]:
test_t_X = pca.transform(test_X)

In [80]:
train_t_X.shape

(224, 17)

In [81]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedShuffleSplit
svmLinear = SVC(kernel="linear", C=200)
C_range = np.linspace(850,1250, 10)
param_grid = dict(C=C_range)
cv = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=42)
grid = GridSearchCV(svmLinear, param_grid=param_grid, cv=cv)

In [82]:
grid.fit(train_t_X, train_y)

GridSearchCV(cv=StratifiedShuffleSplit(n_splits=5, random_state=42, test_size=0.2,
            train_size=None),
       error_score='raise',
       estimator=SVC(C=200, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'C': array([ 850.     ,  894.44444,  938.88889,  983.33333, 1027.77778,
       1072.22222, 1116.66667, 1161.11111, 1205.55556, 1250.     ])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [83]:
grid.best_estimator_

SVC(C=850.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [84]:
pred_y = grid.predict(test_t_X)

In [283]:
print('Accuracy of linear svm classifier on test set: {:.2f}'.format(grid.score(test_t_X, test_y)))

Accuracy of linear svm classifier on test set: 0.56


In [284]:
confusion_mat = confusion_matrix(test_y, pred_y)
print(confusion_mat)

[[29 13]
 [20 13]]


# trying KNN with PCA

In [261]:
neigh = KNeighborsClassifier(n_neighbors=3)

In [299]:
K_range = np.linspace(50,100, 100,dtype=int)
param_grid = dict(n_neighbors=K_range)
cv = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=42)
grid = GridSearchCV(neigh, param_grid=param_grid, cv=cv)

In [300]:
grid.fit(train_t_X,train_y)

GridSearchCV(cv=StratifiedShuffleSplit(n_splits=5, random_state=42, test_size=0.2,
            train_size=None),
       error_score='raise',
       estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=3, p=2,
           weights='uniform'),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'n_neighbors': array([ 50,  50, ...,  99, 100])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [301]:
grid.best_estimator_

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=51, p=2,
           weights='uniform')

In [302]:
pred_y = grid.predict(test_t_X)

In [303]:
print('Accuracy of linear svm classifier on test set: {:.2f}'.format(grid.score(test_t_X, test_y)))

Accuracy of linear svm classifier on test set: 0.49


In [304]:
confusion_mat = confusion_matrix(test_y, pred_y)
print(confusion_mat)

[[35  7]
 [31  2]]


## explore the features with incorrect prediction

In [85]:
gesture_vectors = test_X[correct_gestures_index][logistic_features]

In [86]:
gesture_vectors

Unnamed: 0,feature7,feature15,feature19,feature25,feature27,feature33,feature42,feature44,feature47,feature49,feature70,feature72,feature77,feature81,feature85,feature86,feature89,feature93,feature94,feature95
293,0.0,1.050539,0.593949,0.698747,0.125549,0.09524,0.60625,0.525725,0.0,0.0,0.0,0.152932,0.0,1.399698,0.414531,0.833366,0.083583,0.778674,0.0,0.340296
66,0.088447,1.019851,1.299303,0.849852,0.0,0.0,0.62407,0.159913,0.361939,0.0,0.094094,0.037443,0.0,0.303977,0.03087,1.44556,0.0,0.460027,0.0,0.354137
186,0.161037,0.59067,0.560151,0.815514,0.0,0.098457,0.450015,0.399671,0.099739,0.211942,0.0,0.203626,0.239165,1.082809,0.232795,0.871531,0.02911,0.687545,0.0,0.679346
65,0.222193,0.794831,0.657372,0.70464,0.139071,0.0,0.487143,0.465329,0.0,0.0,0.0,0.0,0.0,0.817026,0.381625,0.975616,0.0,0.593835,0.0,0.317326
44,0.317994,0.946493,0.92856,0.599522,0.216953,0.0,0.629331,0.452732,0.058591,0.0,0.0,0.0,0.0,0.35502,0.153901,1.151834,0.0,0.361952,0.0,0.450965
110,0.0,0.751726,0.405692,0.771498,0.182711,0.041515,0.468365,0.426574,0.0,0.092029,0.0,0.400876,0.094903,1.515668,0.567489,0.872784,0.071744,0.949196,0.0,0.632253
85,0.0,0.992556,0.451385,0.752795,0.0,0.10412,0.655352,0.187845,0.16159,0.010937,0.0,0.475213,0.284642,1.560792,0.271585,1.171083,0.007592,0.799521,0.0,0.426501
47,0.160532,0.903563,0.87595,0.632075,0.157957,0.0,0.669741,0.265637,0.412459,0.0,0.0,0.24102,0.0,0.893415,0.34909,0.908026,0.0,0.164854,0.0,0.678671
28,0.0,1.429519,1.345505,0.989929,0.0,0.0,0.937417,0.0,0.0,0.0,0.362795,0.0,0.0,0.269136,0.252607,1.684342,0.0,0.304357,0.0,0.281744
200,0.323878,0.841928,0.649186,0.460776,0.0,0.116305,0.536175,0.471294,0.0,0.064429,0.0,0.0,0.122496,1.144354,0.246423,1.079741,0.168732,1.030521,0.020452,0.523444


In [127]:
scipy.spatial.distance.cosine(gesture_vectors.loc[200,:],gesture_vectors.loc[247,:])

0.06590479612350464

In [128]:
average_detected_gesture = gesture_vectors.sum()

In [129]:
scipy.spatial.distance.cosine(gesture_vectors.loc[200,:],average_detected_gesture)

0.041121482849121094

In [260]:
df_8m = pd.DataFrame(columns=["video_id","start_second"]+list(features))

In [261]:
import glob, os
for file in glob.glob("youtube8m/*.wav"):
    sr, wav_data = wavfile.read(file)
    assert wav_data.dtype == np.int16, 'Bad sample type: %r' % wav_data.dtype
    samples = wav_data / 32768.0  # Convert to [-1.0, +1.0]
    values = ProcessWithVGGish(vgg,samples,sr,1)
    
    temp_df = pd.DataFrame(values,columns=features)
    temp_df['video_id'] = file[10:-4]
    temp_df = temp_df.reset_index().rename(columns={'index':'start_second'})
    df_8m = pd.concat([df_8m,temp_df[df_8m.columns]],axis=0).reset_index(drop=True)

In [262]:
df_8m.tail()

Unnamed: 0,video_id,start_second,feature0,feature1,feature2,feature3,feature4,feature5,feature6,feature7,...,feature118,feature119,feature120,feature121,feature122,feature123,feature124,feature125,feature126,feature127
620,zdtVT2xwrHU,5,0.0,0.0,0.210122,0.0,0.0,0.0,1.160942,0.0,...,0.0,0.0,0.100832,0.0,0.0,0.0,0.436314,0.0,0.492902,0.0
621,zdtVT2xwrHU,6,0.0,0.00643,0.304539,0.0,0.0,0.0,0.971064,0.070915,...,0.0,0.0,0.069267,0.0,0.0,0.0,0.31213,0.0,0.387916,0.0
622,zdtVT2xwrHU,7,0.0,0.0,0.145601,0.0,0.0,0.0,0.886971,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.18653,0.0,0.45535,0.0
623,zdtVT2xwrHU,8,0.0,0.0,0.241,0.0,0.0,0.0,0.974537,0.141948,...,0.0,0.0,0.098884,0.0,0.0,0.0,0.081664,0.0,0.454245,0.0
624,zdtVT2xwrHU,9,0.0,0.0,0.117311,0.0,0.0,0.0,1.037534,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.090246,0.0,0.382102,0.0


In [263]:
df_8m.shape

(625, 130)

In [264]:
df_8m_log = df_8m[logistic_features]

In [265]:
yt8m_rows_selected = []
yt8m_rows_selected_cosine = []

In [266]:
df_8m_log.loc[1,:]

feature7     0.000000
feature15    1.133957
feature19    0.374565
feature25    0.502693
feature27    0.292413
feature33    0.000000
feature42    0.673598
feature44    0.622310
feature47    0.000000
feature49    0.000000
feature70    0.000000
feature72    0.000000
feature77    0.000000
feature81    1.029591
feature85    0.198672
feature86    0.652134
feature89    0.240396
feature93    0.881775
feature94    0.054436
feature95    0.531172
Name: 1, dtype: float32

In [267]:
for i in df_8m_log.index:
    cosin_dist = scipy.spatial.distance.cosine(df_8m_log.loc[i,:],average_detected_gesture)
    if cosin_dist >= 0.1:
        yt8m_rows_selected.append(i)
        yt8m_rows_selected_cosine.append(cosin_dist)

In [268]:
yt8m_rows_selected = np.array(yt8m_rows_selected)

In [269]:
yt8m_rows_selected_sorted = yt8m_rows_selected[np.argsort(yt8m_rows_selected_cosine)]

In [271]:
df_8m_top = df_8m.loc[yt8m_rows_selected_sorted,:]

In [287]:
df_8m_top.shape

(148, 130)

In [295]:
rank = 1
for i in df_8m_top.index:
    vidId = df_8m_top.loc[i,"video_id"]
    print(vidId)
    start_second = df_8m_top.loc[i,"start_second"]
    a = os.system("ffmpeg -ss {} -t {} -i {} {}".format(start_second,1,"youtube8m/"+vidId+".wav","yt8m_similar/"+"{0:03}".format(rank)+vidId+str(start_second)+".wav"))
    print("{0:03}".format(rank)+vidId+str(start_second)+".wav")
    rank += 1

j2JoL0Bmx4M
001j2JoL0Bmx4M3.wav
yVwt49os4F0
002yVwt49os4F06.wav
mEslZvugQCc
003mEslZvugQCc9.wav
j2JoL0Bmx4M
004j2JoL0Bmx4M7.wav
GnmgSQq6E54
005GnmgSQq6E547.wav
72RI4phr0F8
00672RI4phr0F83.wav
sKglR9n1aFg
007sKglR9n1aFg3.wav
JbPNmYKs7P0
008JbPNmYKs7P02.wav
A7wdgVaqhPI
009A7wdgVaqhPI5.wav
9wkgTdE7nz8
0109wkgTdE7nz85.wav
wZopmfXTtxw
011wZopmfXTtxw8.wav
_bAVmK7n0fs
012_bAVmK7n0fs3.wav
Sv9Pm4AburQ
013Sv9Pm4AburQ4.wav
mh6JxDJ0Bno
014mh6JxDJ0Bno3.wav
7oHaG3Ez5-4
0157oHaG3Ez5-48.wav
jMDHWwO-1fY
016jMDHWwO-1fY6.wav
X2IUVELRuzs
017X2IUVELRuzs3.wav
72RI4phr0F8
01872RI4phr0F87.wav
5kQF4r03yRI
0195kQF4r03yRI1.wav
M8BaWFyXNqA
020M8BaWFyXNqA8.wav
wZopmfXTtxw
021wZopmfXTtxw5.wav
_3ZOigx8e3c
022_3ZOigx8e3c1.wav
_bAVmK7n0fs
023_bAVmK7n0fs1.wav
9wkgTdE7nz8
0249wkgTdE7nz80.wav
A7wdgVaqhPI
025A7wdgVaqhPI0.wav
X2IUVELRuzs
026X2IUVELRuzs5.wav
QNRche7AZkM
027QNRche7AZkM9.wav
72RI4phr0F8
02872RI4phr0F80.wav
mZ_z2lorJeY
029mZ_z2lorJeY3.wav
OVhPqvG0ezA
030OVhPqvG0ezA3.wav
wZopmfXTtxw
031wZopmfXTtxw7.wav
j2JoL0Bm

In [286]:
rank

149

## find least similar rows

In [346]:
yt8m_rows_selected = []
yt8m_rows_selected_cosine = []

In [347]:
for i in df_8m_log.index:
    cosin_dist = scipy.spatial.distance.cosine(df_8m_log.loc[i,:],average_detected_gesture)
    if cosin_dist <= 0.04:
        yt8m_rows_selected.append(i)
        yt8m_rows_selected_cosine.append(cosin_dist)

In [348]:
yt8m_rows_selected = np.array(yt8m_rows_selected)

In [356]:
np.flip(np.argsort(yt8m_rows_selected_cosine),axis=0)

array([59, 13, 87, 66, 38, 50, 65,  3, 32, 27, 18, 34, 16,  9, 61, 12, 26,
       35, 15, 55, 20, 31, 45, 70, 58, 17, 11, 23,  5, 49, 60, 25,  8, 86,
       47, 77, 46,  1, 29, 68,  7, 53, 69, 43, 85, 21, 33, 41, 52, 74, 78,
       36, 39, 67, 28, 56, 62, 73, 10, 30, 48,  6, 64, 19, 84, 40, 37, 57,
       76, 82, 75, 72, 81, 24, 80, 22, 71,  0, 44, 63, 79,  4, 14, 42, 54,
       83, 51,  2])

In [357]:
yt8m_rows_selected_sorted = yt8m_rows_selected[np.flip(np.argsort(yt8m_rows_selected_cosine),axis=0)]

In [361]:
df_8m_top = df_8m.loc[yt8m_rows_selected_sorted,:]

In [362]:
df_8m_top.shape

(88, 130)

In [363]:
rank = 1
for i in df_8m_top.index:
    vidId = df_8m_top.loc[i,"video_id"]
    print(vidId)
    start_second = df_8m_top.loc[i,"start_second"]
    a = os.system("ffmpeg -ss {} -t {} -i {} {}".format(start_second,1,"youtube8m/"+vidId+".wav","yt8m_least_similar/"+"{0:03}".format(rank)+vidId+str(start_second)+".wav"))
    print("{0:03}".format(rank)+vidId+str(start_second)+".wav")
    rank += 1

OYkZxbD0vr8
001OYkZxbD0vr81.wav
MY0PsDE3xHs
002MY0PsDE3xHs2.wav
YuvmP7Lsr2k
003YuvmP7Lsr2k7.wav
sKglR9n1aFg
004sKglR9n1aFg0.wav
hYk2rw9wcXc
005hYk2rw9wcXc5.wav
7oHaG3Ez5-4
0067oHaG3Ez5-41.wav
W4acKG-KSTE
007W4acKG-KSTE8.wav
Fihamhv6TJo
008Fihamhv6TJo4.wav
PNCwMnNW-jw
009PNCwMnNW-jw9.wav
EMLpB_ZP0c0
010EMLpB_ZP0c06.wav
JLKsLx_SGOU
011JLKsLx_SGOU0.wav
MW0ZTvRCS1o
012MW0ZTvRCS1o5.wav
X2IUVELRuzs
013X2IUVELRuzs9.wav
1OFDyTzUj24
0141OFDyTzUj247.wav
H85puQUqrQY
015H85puQUqrQY5.wav
T-8xETNgtvE
016T-8xETNgtvE8.wav
EMLpB_ZP0c0
017EMLpB_ZP0c05.wav
MW0ZTvRCS1o
018MW0ZTvRCS1o6.wav
X2IUVELRuzs
019X2IUVELRuzs7.wav
tPE0GmzqkRY
020tPE0GmzqkRY7.wav
JLKsLx_SGOU
021JLKsLx_SGOU3.wav
PNCwMnNW-jw
022PNCwMnNW-jw4.wav
BuIMVqz85uk
023BuIMVqz85uk7.wav
guRyU4B5LlA
024guRyU4B5LlA5.wav
XIW8Ai-YMdA
025XIW8Ai-YMdA8.wav
_bAVmK7n0fs
026_bAVmK7n0fs9.wav
T-8xETNgtvE
027T-8xETNgtvE7.wav
JLKsLx_SGOU
028JLKsLx_SGOU7.wav
GnmgSQq6E54
029GnmgSQq6E548.wav
M8BaWFyXNqA
030M8BaWFyXNqA9.wav
OYkZxbD0vr8
031OYkZxbD0vr85.wav
EMLpB_ZP

## try to find how scores to on detected and non detected gestures

In [155]:
#frame numbers of questions
framestamps = np.array([1410 ,2160 ,3180 ,3630 ,5730 ,6720 ])

In [156]:
timestamps = framestamps/30

In [157]:
timestamps

array([ 47.,  72., 106., 121., 191., 224.])

In [158]:
test = df_final.iloc[timestamps,:]

In [159]:
train_indexes = ~df_final.index.isin(timestamps)

In [160]:
train = df_final[train_indexes]

In [161]:
train_X,train_y = train[features],train["gesture_detected"]

In [162]:
test_X,test_y = test[features],test["gesture_detected"]

In [163]:
test_X

Unnamed: 0,feature0,feature1,feature2,feature3,feature4,feature5,feature6,feature7,feature8,feature9,...,feature118,feature119,feature120,feature121,feature122,feature123,feature124,feature125,feature126,feature127
47,0.0,0.023563,0.940871,0.0,0.0,0.0,0.232757,0.160532,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.148817,0.0,0.370461,0.062253
72,0.0,0.0,0.785413,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0692,0.117243,0.0,0.0,0.0
106,0.0,0.104077,0.249532,0.0,0.0,0.0,0.272323,0.018525,0.0,0.0,...,0.0,0.0,0.013792,0.0,0.0,0.0,0.255551,0.0,0.382913,0.01302
121,0.0,0.0,0.212845,0.0,0.0,0.0,0.340554,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.17695,0.0,0.370093,0.0
191,0.0,0.0,0.532508,0.0,0.0,0.0,0.056741,0.013445,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.010354,0.0,0.202845,0.17839
224,0.0,0.0,0.6854,0.0,0.0,0.0,0.278676,0.124498,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.138487,0.0,0.221932,0.0


In [164]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedShuffleSplit
C_range = np.linspace(180,300, 100)
param_grid = dict(C=C_range)
cv = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=42)
grid = GridSearchCV(svmLinear, param_grid=param_grid, cv=cv)

In [165]:
grid.fit(train_X[logistic_features], train_y)

GridSearchCV(cv=StratifiedShuffleSplit(n_splits=5, random_state=42, test_size=0.2,
            train_size=None),
       error_score='raise',
       estimator=SVC(C=200, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'C': array([180.     , 181.21212, ..., 298.78788, 300.     ])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [166]:
pred_y = grid.predict(test_X[logistic_features])

In [167]:
print('Accuracy of linear SVM classifier on test set: {:.2f}'.format(grid.score(test_X[logistic_features], test_y)))

Accuracy of linear SVM classifier on test set: 0.50


In [170]:
pred_y==1

array([ True,  True, False, False,  True, False])

## we are able to predict gesture on question 1,2,5
compare the scores for these questions

In [200]:
df_scores = pd.read_excel("scores_of_test.xlsx")

In [201]:
df_bicycle_scores = df_scores.iloc[:,-7:-1] #select the scores for the bicycle questions from the data frame

In [204]:
df_bicycle_scores.describe()

Unnamed: 0,Q1.2,Q2.2,Q3.2,Q4.2,Q5.2,Q6.2
count,61.0,61.0,61.0,61.0,61.0,61.0
mean,0.770492,0.639344,0.590164,0.704918,0.819672,0.377049
std,0.424006,0.484176,0.495885,0.459865,0.387651,0.488669
min,0.0,0.0,0.0,0.0,0.0,0.0
25%,1.0,0.0,0.0,0.0,1.0,0.0
50%,1.0,1.0,1.0,1.0,1.0,0.0
75%,1.0,1.0,1.0,1.0,1.0,1.0
max,1.0,1.0,1.0,1.0,1.0,1.0


In [220]:
df_pred_corr_ques= df_bicycle_scores.loc[:,pred_y==1]

In [221]:
df_pred_incorr_ques= df_bicycle_scores.loc[:,pred_y!=1]

In [222]:
from scipy import stats

In [230]:
for i in range(3):
    for j in range(3):
        print(i,j)
        tstat = stats.ttest_rel(df_pred_corr_ques.iloc[:,i],df_pred_incorr_ques.iloc[:,j])
        print(f"for {df_pred_corr_ques.columns[i]} and {df_pred_incorr_ques.columns[j]} is {tstat}")

0 0
for Q1.2 and Q3.2 is Ttest_relResult(statistic=2.3797114365109153, pvalue=0.020522194601492103)
0 1
for Q1.2 and Q4.2 is Ttest_relResult(statistic=0.8142379415222338, pvalue=0.41872858301815674)
0 2
for Q1.2 and Q6.2 is Ttest_relResult(statistic=4.618802153517007, pvalue=2.0904438134357708e-05)
1 0
for Q2.2 and Q3.2 is Ttest_relResult(statistic=0.5039822601366593, pvalue=0.6161194771733105)
1 1
for Q2.2 and Q4.2 is Ttest_relResult(statistic=-0.8508712596230341, pvalue=0.3982238760222032)
1 2
for Q2.2 and Q6.2 is Ttest_relResult(statistic=2.8138098324506364, pvalue=0.0066103755070775466)
2 0
for Q5.2 and Q3.2 is Ttest_relResult(statistic=3.0453978310767242, pvalue=0.003449172529857805)
2 1
for Q5.2 and Q4.2 is Ttest_relResult(statistic=1.5447859516333118, pvalue=0.12765651816462362)
2 2
for Q5.2 and Q6.2 is Ttest_relResult(statistic=6.135307372712406, pvalue=7.293398040368594e-08)
