In [1]:
import numpy as np
import pickle
import os
from sklearn.utils import shuffle

In [2]:
import tensorflow as tf
import keras
from keras.models import Sequential
from keras.layers import Dense, Activation, Convolution2D, MaxPooling2D, Flatten, Dropout
import keras.optimizers
from keras.models import load_model
import keras.regularizers
from keras.regularizers import l2, l1

Using TensorFlow backend.


In [3]:
def load_hash(filename):
    with open(filename, 'rb') as f:
        return pickle.load(f)

In [44]:
TRAIN_DIR = "/data/jrgillick/laughter/stored_features1/train_set/"
VAL_DIR = "/data/jrgillick/laughter/stored_features1/val_set/"
TEST_DIR = "/data/jrgillick/laughter/stored_features1/test_set/"

In [None]:
window_size = 37

In [50]:
def get_laughter_and_speech_clips(directory):
    laughter_files = [directory + f for f in os.listdir(directory) if 'laughter' in f]
    speech_files = [directory + f for f in os.listdir(directory) if not 'laughter' in f]

    laughter_data = [load_hash(f) for f in laughter_files]

    laughter_clips = []
    for f in laughter_data:
        for clip in f:
            laughter_clips.append(clip)
        
    speech_clips = [load_hash(f) for f in speech_files]
    
    return (laughter_clips, speech_clips)

In [58]:
def format_laughter_inputs(clip):
    mfcc_feat = clip['mfcc']
    delta_feat = clip['delta']
    labels = clip['labels']
    laughter_frame_indices = np.nonzero(labels)[0]
    X = None
    for index in laughter_frame_indices:
        #print index
        features = np.append(mfcc_feat[index-window_size:index+window_size],delta_feat[index-window_size:index+window_size])
        if X is None:
            X = features
        else:
            X = np.vstack([X,features])
    return (X,np.ones(len(laughter_frame_indices)))

def format_speech_inputs(clip):
    mfcc_feat = clip['mfcc']
    delta_feat = clip['delta']
    labels = clip['labels']
    speech_frame_indices = np.array(list(xrange(len(labels))))[window_size:-window_size]
    X = []
    for index in speech_frame_indices:
        #print index
        features = np.append(mfcc_feat[index-window_size:index+window_size],delta_feat[index-window_size:index+window_size])
        X.append(features)
    return (np.array(X),np.zeros(len(speech_frame_indices)))

def format_laughter_clips(laughter_clips):
    formatted_laughter_clips = []
    for index, clip in enumerate(laughter_clips):
        if index % 500 == 0: print "formatting %d out of %d" % (index, len(laughter_clips))
        formatted_laughter_clips.append(format_laughter_inputs(clip))
    return formatted_laughter_clips
    
def format_speech_clips(speech_clips):
    formatted_speech_clips = []
    for index, clip in enumerate(speech_clips):
        if index % 500 == 0: print "formatting %d out of %d" % (index, len(speech_clips))
        formatted_speech_clips.append(format_speech_inputs(clip))
    return formatted_speech_clips

In [75]:
def format_data_and_labels(formatted_laughter_clips, formatted_speech_clips):
    train_data = []; train_labels = []
    for j in xrange(len(formatted_laughter_clips)):
        #print "Processing %d of %d" % (j,len(formatted_laughter_clips))
        clip, label = formatted_laughter_clips[j]
        if not clip is None and not label is None:
            for i in xrange(len(clip)):
                train_data.append(clip[i])
                train_labels.append(label[i])

    for j in xrange(len(formatted_speech_clips)):
        #print "Processing %d of %d" % (j,len(formatted_speech_clips))
        clip, label = formatted_speech_clips[j]
        if not clip is None and not label is None:
            for i in xrange(len(clip)):
                train_data.append(clip[i])
                train_labels.append(label[i])
                
    return (train_data, train_labels)

In [76]:
def get_data_and_labels_from_dir(directory):
    laughter_clips, speech_clips = get_laughter_and_speech_clips(directory)
    formatted_laughter_clips = format_laughter_clips(laughter_clips)
    formatted_speech_clips = format_speech_clips(speech_clips)
    train_data, train_labels = format_data_and_labels(formatted_laughter_clips, formatted_speech_clips)
    return (train_data, train_labels)

In [77]:
def divide_data_and_labels_into_parts(train_data,train_labels,part_size=5):
    train_data_parts = []
    train_label_parts = []
    i = 0
    while i < len(train_data) - part_size:
        train_data_parts.append(train_data[i:i+part_size])
        train_label_parts.append(train_labels[i:i+part_size])
        i += part_size
    return (train_data_parts, train_label_parts)

In [78]:
def get_data_subset(train_data_parts, train_label_parts, start, end):
    X = np.vstack(train_data_parts[start:end])
    y = np.hstack(train_label_parts[start:end])
    return X, y

In [100]:
laughter_clips, speech_clips = get_laughter_and_speech_clips(TRAIN_DIR)
# Remove some clips that were failing
del laughter_clips[677]
del laughter_clips[6079]
del laughter_clips[7235]
formatted_laughter_clips = format_laughter_clips(laughter_clips)
formatted_speech_clips = format_speech_clips(speech_clips)
train_data, train_labels = format_data_and_labels(formatted_laughter_clips, formatted_speech_clips)
train_data_parts, train_label_parts = divide_data_and_labels_into_parts(train_data,train_labels,part_size=1)

In [107]:
val_laughter_clips, val_speech_clips = get_laughter_and_speech_clips(VAL_DIR)
val_formatted_laughter_clips = format_laughter_clips(val_laughter_clips)
val_formatted_speech_clips = format_speech_clips(val_speech_clips)
val_data, val_labels = format_data_and_labels(val_formatted_laughter_clips, val_formatted_speech_clips)
val_data_parts, val_label_parts = divide_data_and_labels_into_parts(val_data,val_labels,part_size=1)

In [143]:
def initialize_model():
    model = Sequential()
    model.add(Dense(500, use_bias=True,input_dim=1924))
    model.add(keras.layers.BatchNormalization())
    model.add(Dropout(0.5))
    model.add(Activation("relu"))
    model.add(Dense(1))
    model.add(Activation('sigmoid'))
    optimizer = keras.optimizers.Adam()
    model.compile(optimizer=optimizer,loss='binary_crossentropy',metrics=['accuracy'])
    return model    

In [144]:
model = initialize_model()

In [145]:
def train_on_parts(train_data_parts, train_label_parts):
    train_data_parts, train_label_parts = shuffle(train_data_parts, train_label_parts, random_state=0)
    i = 0
    accs = []
    while i < len(train_data_parts):
        #print i
        X_subset, y_subset = get_data_subset(train_data_parts, train_label_parts, i, i+2000)
        model.fit(X_subset,y_subset,shuffle=True,batch_size = 500, epochs=1,verbose=False)
        acc = model.evaluate(X_subset, y_subset,verbose=False)[1]
        accs.append(acc)
        #print np.mean(accs)
        i += 2000
    print "Training accuracy %f" % (np.mean(accs))

In [146]:
def evaluate_on_parts(data_parts, label_parts):
    #train_data_parts, train_label_parts = shuffle(train_data_parts, train_label_parts, random_state=0)
    i = 0
    accs = []
    while i < len(data_parts):
        #if i % 10000 == 0: print i
        X_subset, y_subset = get_data_subset(data_parts, label_parts, i, i+100)
        #model.fit(X_subset,y_subset,shuffle=True,batch_size = 2000, epochs=1,verbose=False)
        acc = model.evaluate(X_subset, y_subset,verbose=False)[1]
        accs.append(acc)
        i += 100
    print "Validation accuracy %f " % (np.mean(accs))

In [147]:
for epoch in xrange(25):
    print "Epoch %d" % (epoch)
    train_on_parts(train_data_parts, train_label_parts)
    evaluate_on_parts(val_data_parts, val_label_parts)

Epoch 0
Training accuracy 0.819953
Validation accuracy 0.810259 
Epoch 1
Training accuracy 0.853212
Validation accuracy 0.821743 
Epoch 2
Training accuracy 0.866210
Validation accuracy 0.827857 
Epoch 3
Training accuracy 0.874704
Validation accuracy 0.832835 
Epoch 4
Training accuracy 0.881175
Validation accuracy 0.835605 
Epoch 5
Training accuracy 0.886035
Validation accuracy 0.835831 
Epoch 6


KeyboardInterrupt: 

In [27]:
#train and test on a little bit of data for exploration

clip = laughter_data[560][0]
speech_clip = speech_data[1]
clip2 = laughter_data[610][0]
speech_clip2 = speech_data[61]
X, y = format_laughter_inputs(clip)
X1, y1 = format_speech_inputs(speech_clip)
X2, y2 = format_laughter_inputs(clip2)
X3, y3 = format_speech_inputs(speech_clip2)

X_train = np.vstack([X,X1])
y_train = np.append(y,y1)

X_test = np.vstack([X2,X3])
y_test = np.append(y2,y3)

model = initialize_model()

model.fit(X_train,y_train,shuffle=True,batch_size = 32, epochs=1)

model.evaluate(X_test,y_test)