In [1]:
# read audio wav from dataset TESS and RAVDESS
import numpy as np
import librosa
from scipy.io import wavfile
import os, time, csv, datetime

# part = 'test'
parameters = [7, -1, 1024, 512, 80, 300, 8000, 50]
[emo_read_num, file_read_num, win_size, hop_size, min_freq, max_fund_freq, max_freq, mfcc_size] = [int(x) for x in parameters]

TESS_trim = 0.62
RAVDESS_trim = 0.26
magic = 43195

time_very_start = time.time()
print('Start')

features = []
labels = []
nowdate = datetime.datetime.now()
savename = 'feat_'  + str(win_size) + 'win_' + \
           '[' + str(nowdate.month).zfill(2) + str(nowdate.day).zfill(2) + '-' + \
           str(nowdate.hour).zfill(2) + str(nowdate.minute).zfill(2) + '].csv'

parent_dir = os.path.dirname(os.getcwd())

for dataset in ['TESS', 'RAVDESS']:
    dataset_dir = os.path.join(parent_dir, 'project', dataset)
    for emotion in range(7):
        if emotion >= emo_read_num:
            break
        time_start = time.time()
        print('Reading emotion #' + str(emotion) + ' in ' + dataset + '...')
        emotion_dir = os.path.join(dataset_dir, str(emotion))
        file_count = 0
        file_list = os.listdir(emotion_dir)
        for file in file_list:
            if file_read_num != -1 and file_count >= file_read_num:
                break
            if (not file.endswith('.wav')) or file[0] == '.':
                continue
            fs, x = wavfile.read(os.path.join(emotion_dir, file))
            
            if len(x) >= magic:
                if dataset == 'TESS':
                    x = x[int((len(x) - magic) / 2):][:magic]
                else:
                    x = x[-magic:]
            else: # add zeros at end
                x = np.concatenate((x, [0] * (magic - len(x))))

            x = x / 32768 # convert 16-bit PCM to [-1, 1]

            s = librosa.feature.melspectrogram(y=x, sr=fs, n_fft=win_size, hop_length=hop_size)
            mfcc = librosa.feature.mfcc(S=librosa.power_to_db(s), sr=fs, n_mfcc=mfcc_size)

            rms = librosa.feature.rmse(y=x, frame_length=win_size, hop_length=hop_size)
            zcr = librosa.feature.zero_crossing_rate(y=x, frame_length=win_size, hop_length=hop_size)
            centroid = librosa.feature.spectral_centroid(y=x, sr=fs, n_fft=win_size, hop_length=hop_size)
            
            # pitch
            min_lag = int(fs / max_fund_freq)
            max_lag = int(fs / min_freq)
            L = range(min_lag, max_lag + 1)
            spec = librosa.core.stft(x, n_fft=win_size, hop_length=hop_size, win_length=win_size)
            dividend = np.transpose([np.real(np.fft.ifft(row)) for row in (np.absolute(spec) ** 2).transpose()])
            divisor = np.transpose([win_size - lag + 1 for lag in L])
            acf = dividend[L] / divisor[:, None]
            i_max = np.argmax(acf, axis=0)
            pitch = fs / (i_max - 1 + min_lag)
            
            if len(set([len(mfcc[0]), len(rms[0]), len(zcr[0]), len(centroid[0]), len(pitch)])) != 1:
                print('  Error: File ' + file + ' has different numbers of windows among different features!')
                continue

            if dataset == 'TESS':
                gender = np.vstack(([0] * len(rms[0]), [1] * len(rms[0])))
            else:
                if int(file[19]) % 2 == 0:
                    gender = np.vstack(([0] * len(rms[0]), [1] * len(rms[0]))) # female
                else:
                    gender = np.vstack(([1] * len(rms[0]), [0] * len(rms[0]))) # male

            # vertically concatenate features of all windows
            concat = np.vstack((mfcc, rms, zcr, centroid, pitch, gender))
            features.append(concat)
            labels.append(emotion)
            file_count += 1
        print('    ' + str(file_count) + ' files feature extracted. (' + str(int(time.time() - time_start)) + ' s)')

print('Finished. (' + str(int(time.time() - time_very_start)) + ' s in total)')


Start
Reading emotion #0 in TESS...
    196 files feature extracted. (2 s)
Reading emotion #1 in TESS...
    196 files feature extracted. (2 s)
Reading emotion #2 in TESS...
    196 files feature extracted. (2 s)
Reading emotion #3 in TESS...
    196 files feature extracted. (2 s)
Reading emotion #4 in TESS...
    196 files feature extracted. (2 s)
Reading emotion #5 in TESS...
    195 files feature extracted. (2 s)
Reading emotion #6 in TESS...
    195 files feature extracted. (2 s)
Reading emotion #0 in RAVDESS...
    192 files feature extracted. (2 s)
Reading emotion #1 in RAVDESS...
    96 files feature extracted. (1 s)
Reading emotion #2 in RAVDESS...
    192 files feature extracted. (2 s)
Reading emotion #3 in RAVDESS...
    192 files feature extracted. (2 s)
Reading emotion #4 in RAVDESS...
    192 files feature extracted. (2 s)
Reading emotion #5 in RAVDESS...
    192 files feature extracted. (2 s)
Reading emotion #6 in RAVDESS...
    192 files feature extracted. (2 s)
Finished

In [2]:
fea = np.array(features)
lab = np.array(labels)
print(fea.shape)
print(lab.shape)
rowdim = fea.shape[0]
ydim = fea.shape[1]
xdim = fea.shape[2]
# print(ydim)

(2618, 56, 85)
(2618,)


In [3]:
# import pandas as pd
# import numpy as np

# # read data from xls file 
# df = pd.read_csv('features.csv', header = None, na_values = '?', index_col = None)

# # filenames = ['features2.csv', 'features3.csv', 'features4.csv', 'features5.csv', 'features6.csv','test1.csv','test2.csv']
# # for filename in filenames:
# #     df1 = pd.read_csv(filename, header = None, na_values = '0', index_col = None)
# #     df = pd.concat([df,df1], ignore_index = True)
# data_column = np.shape(df)[1]
# data_row = np.shape(df)[0]
# print(np.shape(df))
# df.head(5)


In [4]:
# # get the value 
# y_raw = np.array(df[data_column-2])

# # list of 1D-features in the order of MFCC, Energy, Pitch
# X_raw = np.array(pd.DataFrame(df, columns = df.columns[0:(data_column-2)]))

# print(np.shape(X_raw))
# print(np.shape(y_raw))
# # print(X_raw)
# # print(y_raw)


In [5]:
X_raw = fea.reshape(rowdim, ydim*xdim)
y_raw = lab
print(X_raw.shape)
print(y_raw.shape)

(2618, 4760)
(2618,)


In [6]:
# import for one hot coding 
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from collections import Counter

# y_reshape = y_raw.reshape(-1, 1)
# encoder = preprocessing.OneHotEncoder(sparse=False)
# yhot = encoder.fit_transform(y_reshape)
# print(yhot)
# print(yhot.shape)

In [7]:
# preprocessing the data
# normalize

X = preprocessing.scale(X_raw)
# print(X.shape)
# scaler = preprocessing.standardScaler().fit(X_raw)
# X = scaler.transform(X_re)

In [8]:
# from sklearn.model_selection import StratifiedShuffleSplit
test_rate = 0.2
Xtr, Xts, ytr, yts = train_test_split(X, y_raw, test_size=test_rate, random_state=0)
print(Xtr.shape)
print(ytr.shape)
print(Xts.shape)
print(yts.shape)

cnt = Counter(ytr)
print("Number of files in each category in TRAIN set:")
for k in sorted(cnt.keys()):
    print(cnt[k])

(2094, 4760)
(2094,)
(524, 4760)
(524,)
Number of files in each category in TRAIN set:
318
240
308
305
310
300
313


In [9]:
## use SVM model to predict class

In [10]:
# from sklearn import svm
# svc = svm.SVC(probability=False,  kernel="rbf", C=2.8, gamma=.0073,verbose=10)
# svc.fit(Xtr,ytr)

In [11]:
# yhat_ts = svc.predict(Xts)
# acc = np.mean(yhat_ts == yts)
# print('Accuaracy = {0:f}'.format(acc))

In [12]:
import keras
from keras import applications
from keras.preprocessing.image import ImageDataGenerator
from keras import optimizers
from keras.models import Sequential
from keras.layers import Dropout, Flatten, Dense, Input, Convolution2D, MaxPooling2D, Activation, concatenate
from keras.layers.normalization import BatchNormalization
from sklearn.metrics import accuracy_score


Using TensorFlow backend.


In [13]:
import keras.backend as K
K.clear_session()

In [14]:
# try to use pre-trained deep learning network vgg16
Xtr = Xtr.reshape(Xtr.shape[0], ydim, xdim, 1)
Xts = Xts.reshape(Xts.shape[0], ydim, xdim, 1)
print(Xtr.shape)
print(Xts.shape)
Xtr_1 = []
Xts_1 = []
Xtr_1 = Xtr.repeat(3, axis=3)
Xts_1 = Xts.repeat(3, axis=3)
print(Xtr_1.shape)
print(Xts_1.shape)

pre_trained = 'vgg16'

# Load appropriate packages
from keras.applications.vgg16 import VGG16
from keras.applications.vgg16 import decode_predictions, preprocess_input    

input_shape = (ydim,xdim,3)
base_model = applications.VGG16(weights='imagenet', include_top = False, input_shape = input_shape)

(2094, 56, 85, 1)
(524, 56, 85, 1)
(2094, 56, 85, 3)
(524, 56, 85, 3)


In [21]:
model = Sequential()

for layers in base_model.layers:
    model.add(layers)
    
for layers in model.layers:
    layers.trainable = False
    
model.add(Flatten())
model.add(Dense(256,activation = 'relu'))
model.add(Dropout(0.5))
model.add(Dense(1, activation = 'sigmoid'))
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 56, 85, 3)         0         
_________________________________________________________________
block1_conv1 (Conv2D)        (None, 56, 85, 64)        1792      
_________________________________________________________________
block1_conv2 (Conv2D)        (None, 56, 85, 64)        36928     
_________________________________________________________________
block1_pool (MaxPooling2D)   (None, 28, 42, 64)        0         
_________________________________________________________________
block2_conv1 (Conv2D)        (None, 28, 42, 128)       73856     
_________________________________________________________________
block2_conv2 (Conv2D)        (None, 28, 42, 128)       147584    
_________________________________________________________________
block2_pool (MaxPooling2D)   (None, 14, 21, 128)       0         
__________

In [22]:
opt = optimizers.Adam(lr=0.001) # beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.0)
model.compile(optimizer=opt,
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

In [17]:
nepochs = 5  # Number of epochs

# Call the fit function
model.fit(Xtr_1, ytr, batch_size=32, epochs=nepochs)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x13ff69c88>

In [20]:
yhat = model.predict(Xts_1)
print(yhat.shape)
yhat

(524, 7)


array([[  7.49151368e-05,   2.20860784e-05,   1.44108359e-04, ...,
          5.94521291e-04,   9.10398722e-01,   1.87610777e-03],
       [  2.74991756e-03,   1.79289109e-05,   1.23766847e-01, ...,
          1.10413166e-04,   1.32999371e-03,   2.14829750e-04],
       [  1.45582721e-01,   2.16202796e-01,   3.19356292e-01, ...,
          1.83869466e-01,   2.15335384e-01,   1.06152520e-01],
       ..., 
       [  3.95036710e-04,   2.76911378e-05,   1.15247816e-03, ...,
          1.28045169e-04,   6.84309661e-01,   1.11410598e-04],
       [  3.19653191e-02,   7.05338374e-04,   1.18935563e-01, ...,
          7.97546748e-03,   2.35727489e-01,   7.66533753e-03],
       [  1.14437297e-03,   3.69272282e-04,   5.22594929e-01, ...,
          2.61763460e-04,   7.00118719e-04,   4.49161511e-04]], dtype=float32)

In [19]:
accuracy_score(yts, yhat)

ValueError: Can't handle mix of multiclass and continuous-multioutput

In [None]:
## another method use self defined CNN model

In [None]:
#kares package use Tensorflow as backend
# Xtr = Xtr.reshape(Xtr.shape[0], ydim, xdim, 1)
# Xts = Xts.reshape(Xts.shape[0], ydim, xdim, 1)
test_rate = 0.2
Xtr, Xts, ytr, yts = train_test_split(X, y_raw, test_size=test_rate, random_state=0)
# print(Xtr.shape)
# print(ytr)
# print(Xts.shape)
# print(yts.shape)

#change the features to improve accurancy
rdim = Xtr.shape[0]
sdim = Xts.shape[0]
Xtr = Xtr.reshape(rdim, ydim, xdim)
# Xtr = np.delete(Xtr, [19,20], 1)
Xts = Xts.reshape(sdim, ydim, xdim)
# Xts = np.delete(Xts, [19,20], 1)
print(Xtr.shape)
# print(type(Xtr))
ydim = ydim - 0

Xtr = Xtr.reshape(rdim, ydim, xdim, 1)
Xts = Xts.reshape(sdim, ydim, xdim, 1)
ytr_reshape = ytr.reshape(-1, 1)
encoder = preprocessing.OneHotEncoder(sparse=False)
ytr_hot = encoder.fit_transform(ytr_reshape)

# cnt = Counter(Xtr)
# print("Number of files in each category in TRAIN set:")
# for k in sorted(cnt.keys()):
#     print(cnt[k])
# print(Xtr.shape)
# print(Xts.shape)
in_shape = (ydim,xdim,1)

In [None]:
np.random.seed(0)
# in_shape = (ydim,xdim,3)

model = Sequential()

conv_filters = 16   # number of convolution filters (= CNN depth)
#conv_filters = 32   # number of convolution filters (= CNN depth)

# normalize for each batch 
# model.add(BatchNormalization(input_shape=in_shape))

# Layer 1
model.add(Convolution2D(conv_filters, (3,3), input_shape=in_shape))
model.add(MaxPooling2D(pool_size=(2,2))) 
# model.add(Dropout(0.1)) 

# Layer 2
model.add(Convolution2D(conv_filters, (3,3)))
model.add(MaxPooling2D(pool_size=(2,2)))
# model.add(Dropout(0.1))

# Layer 3
# model.add(Convolution2D(conv_filters, (3, 3)))
# model.add(MaxPooling2D(pool_size=(1, 1)))
# model.add(Dropout(0.1))

model.add(Activation('relu'))

# After Convolution, we have a 16*x*y matrix output
# In order to feed this to a Full(Dense) layer, we need to flatten all data
# Note: Keras does automatic shape inference, i.e. it knows how many (flat) input units the next layer will need,
# so no parameter is needed for the Flatten() layer.
model.add(Flatten()) 

# Full layer
model.add(Dense(200, activation='sigmoid')) 
#model.add(Dense(256, activation='sigmoid')) 
model.add(Dropout(0.25))

# Output layer
# For binary/2-class problems use ONE sigmoid unit, for multi-class/multi-label problems use n output units 
# activation should be 'softmax' for multi-class / single-label output, 'sigmoid' for binary or multi-label tasks
model.add(Dense(7,activation='softmax'))
# model.add(Dense(1,activation='softmax'))

In [None]:
# from keras.models import Model
# # CNN layers
# # specify desired number of filters
# n_filters = 16
# input = Input(in_shape)

# # The functional API allows to specify the predecessor in (brackets) after the new Layer function call
# conv_layer1 = Convolution2D(n_filters, (10,4), activation='relu')(input)  # a vertical filter
# conv_layer2 = Convolution2D(n_filters, (4,10), activation='relu')(input)  # a horizontal filter

# # Pooling layers - equal sized
# #maxpool1 = MaxPooling2D(pool_size=(2,2))(conv_layer1)
# #maxpool2 = MaxPooling2D(pool_size=(2,2))(conv_layer2)

# # ALTERNATIVE: Pooling layers - complementary to vertical/horizontal filter
# #maxpool1 = MaxPooling2D(pool_size=(1,2))(conv_layer1)
# #maxpool2 = MaxPooling2D(pool_size=(2,1))(conv_layer2)

# # LARGER Pooling layers - complementary to vertical/horizontal filter
# maxpool1 = MaxPooling2D(pool_size=(1,5))(conv_layer1)
# maxpool2 = MaxPooling2D(pool_size=(5,1))(conv_layer2) # used 4,1 first

# # Dropout for both layers
# maxpool1 = Dropout(0.25)(maxpool1)
# maxpool2 = Dropout(0.25)(maxpool2)

# # we have to flatten the Pooling output in order to be concatenated
# poolflat1 = Flatten()(maxpool1)
# poolflat2 = Flatten()(maxpool2)

# # Merge the 2 parallel pipelines
# merged = concatenate([poolflat1, poolflat2],1)

# full = Dense(256, activation='sigmoid')(merged)
# output_layer = Dense(7, activation='softmax')(full)

# # finally create the model
# model = Model(input=input, output=output_layer)

In [None]:
model.summary()

In [None]:
# Define a loss function 
loss = 'categorical_crossentropy' 
# loss = 'sparse_categorical_crossentropy' 
# learn_rate = 0.5
# # Optimizer = Stochastic Gradient Descent
# optimizer = optimizers.SGD(lr=learn_rate)
opt = optimizers.Adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.0)
model.compile(optimizer=opt,
              loss=loss,
              metrics=['accuracy'])

# Compiling the model
# model.compile(loss=loss, optimizer=optimizer, metrics=['accuracy'])

In [None]:
history = None

In [None]:
# TRAINING the model

# YOU MAY RUN THIS CELL MULTIPLE TIMES TO CONTINUE TO TRAIN THE MODEL FURTHER

# for how many epochs (iterations) to train
epochs = 10

# for training we need the "1 hot encoded" numeric classes of the ground truth
# History = model.fit(train_set, train_classes_1hot, batch_size=32, nb_epoch=epochs)
validation_percent = 0.1
History = model.fit(Xtr, ytr_hot, validation_split=validation_percent, batch_size=32, epochs=epochs)

# we keep the history of accuracies on training set
# we append this to previous history in case we execute this cell multiple times
if history is None:
    history = History.history
else:
    for key in History.history.keys():
        history[key].extend(History.history[key])

In [None]:
test_pred = model.predict(Xts)
test_pred = np.argmax(test_pred, axis=1)
test_pred

In [None]:
test_pred.shape

In [None]:
accuracy_score(yts, test_pred)

In [None]:
# print(xtr.shape)
# nin = xtr.shape[1]  # dimension of input data
# nh = 100     # number of hidden units
# nout = 8    # number of outputs = 10 since there are 10 classes
# model = Sequential()
# model.add(Dense(nh, input_shape=(nin,), activation='sigmoid', name='hidden'))
# model.add(Dense(nout, activation='softmax', name='output'))
# model.summary()

In [None]:
# class LossHistory(keras.callbacks.Callback):
#     def on_train_begin(self, logs={}):
#         # TODO:  Create two empty lists, self.loss and self.val_acc
#         self.loss = []
#         self.val_acc = []
 
#     def on_batch_end(self, batch, logs={}):
#         # TODO:  This is called at the end of each batch.  
#         # Add the loss in logs.get('loss') to the loss list
#         loss = logs.get('loss')
#         self.loss.append(loss)
        
#     def on_epoch_end(self, epoch, logs):
#         # TODO:  This is called at the end of each epoch.  
#         # Add the test accuracy in logs.get('loss') to the val_acc list
#         acc = logs.get('val_acc')
#         self.val_acc.append(acc)

# # Create an instance of the history callback
# history_cb = LossHistory()

In [None]:
# from keras import optimizers
# opt = optimizers.Adam(lr=1e-5) # beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.0)
# model.compile(optimizer=opt,
#               loss='sparse_categorical_crossentropy',
#               metrics=['accuracy'])

In [None]:
# batch_size = 50
# model.fit(xtr, ytr, epochs=10, batch_size=batch_size, validation_data=(xts,yts), callbacks=[history_cb])