In [1]:
import pandas as pd
import numpy as np
import librosa
import librosa.display
import pysptk
from scipy.io import wavfile
import parselmouth
from parselmouth.praat import call
import wave
import statistics
import keras
#import pytorch

In [2]:
def hm_jit_shim(wav_file_path, f0min=75, f0max=300, unit=1):
    """Measure harmonicity, jitter and shimmer of one audio file with Praat.

    Parameters
    ----------
    wav_file_path : path of the audio file handed to ``parselmouth.Sound``.
    f0min, f0max : pitch floor / ceiling passed to Praat; ``f0min`` is also
        used as the minimum pitch of the harmonicity analysis.
    unit : unused; kept only so existing callers that pass it keep working.

    Returns
    -------
    list of 12 values, in this order:
        [hnr,
         local jitter, local absolute jitter, rap jitter, ppq5 jitter, ddp jitter,
         local shimmer, local dB shimmer, apq3 shimmer, apq5 shimmer,
         apq11 shimmer, dda shimmer]
    """
    sound = parselmouth.Sound(wav_file_path)

    # Harmonics-to-noise ratio: mean of the cross-correlation harmonicity track.
    harmonicity = call(sound, "To Harmonicity (cc)", 0.01, f0min, 0.1, 1.0)
    hnr = call(harmonicity, "Get mean", 0, 0)

    # Glottal pulse train shared by every jitter / shimmer query below.
    point_process = call(sound, "To PointProcess (periodic, cc)", f0min, f0max)

    # Same query arguments as before: whole time range (0, 0), then the
    # period limits 0.0001 / 0.02 and the period factor 1.3.
    jitter_args = (0, 0, 0.0001, 0.02, 1.3)
    jitter_queries = (
        "Get jitter (local)",
        "Get jitter (local, absolute)",
        "Get jitter (rap)",
        "Get jitter (ppq5)",
        "Get jitter (ddp)",
    )
    jitter = [call(point_process, query, *jitter_args) for query in jitter_queries]

    # Shimmer queries need the sound as well and take one extra argument (1.6).
    shimmer_args = jitter_args + (1.6,)
    shimmer_queries = (
        "Get shimmer (local)",
        "Get shimmer (local_dB)",
        "Get shimmer (apq3)",
        "Get shimmer (apq5)",
        "Get shimmer (apq11)",
        "Get shimmer (dda)",
    )
    shimmer = [call([sound, point_process], query, *shimmer_args) for query in shimmer_queries]

    return [hnr] + jitter + shimmer

def get_formants(wav_file_path, f0min=75, f0max=300):
    """Return mean and median of formants F1-F4 measured at the glottal pulses.

    Parameters
    ----------
    wav_file_path : path of the audio file handed to ``parselmouth.Sound``.
    f0min, f0max : pitch floor / ceiling used to locate the glottal pulses.

    Returns
    -------
    list of 8 floats:
        [f1_mean, f2_mean, f3_mean, f4_mean,
         f1_median, f2_median, f3_median, f4_median]
    A formant with no defined measurement (for example when no glottal pulse
    is found) yields NaN for both its mean and its median instead of raising
    ``statistics.StatisticsError``.
    """
    sound = parselmouth.Sound(wav_file_path)
    point_process = call(sound, "To PointProcess (periodic, cc)", f0min, f0max)
    formants = call(sound, "To Formant (burg)", 0.0025, 5, 5000, 0.025, 50)
    num_points = int(call(point_process, "Get number of points"))

    # Measure formants only at glottal pulses; the loop passes indices 1..num_points.
    pulse_times = [
        call(point_process, "Get time from index", index)
        for index in range(1, num_points + 1)
    ]

    means = []
    medians = []
    for formant_number in (1, 2, 3, 4):
        track = [
            call(formants, "Get value at time", formant_number, t, 'Hertz', 'Linear')
            for t in pulse_times
        ]
        # Undefined values come back as NaN and are dropped before averaging.
        track = [value for value in track if not np.isnan(value)]
        if track:
            means.append(statistics.mean(track))
            medians.append(statistics.median(track))
        else:
            means.append(float('nan'))
            medians.append(float('nan'))

    return means + medians


# Number of entries of the parselmouth intensity track kept per second of audio:
# get_aud_features_all pads/truncates that track to duration * PARSF entries.
PARSF = 120

def get_aud_features_all(wav_path, num_mfcc = 40, hop_length = 512, n_fft = 2048, duration = 3, pad_mode = 'wrap'):
    """Extract every audio feature used in this notebook from one file.

    The signal is loaded with librosa, limited to ``duration`` seconds and, if
    shorter, padded at the end with ``pad_mode`` so that every file yields
    arrays with the same number of frames.

    Parameters
    ----------
    wav_path : path of the audio file.
    num_mfcc : number of MFCC coefficients.
    hop_length : hop between analysis frames, applied to every frame-based
        feature so that they share the same number of columns.
    n_fft : FFT size of the dB spectrogram returned as ``mel_freq``.
    duration : clip length in seconds used for loading, padding and truncation.
    pad_mode : ``numpy.pad`` mode used for short signals / intensity tracks.

    Returns
    -------
    tuple
        (mfcc, avg_mfcc, mel_freq, avg_mel_freq, chroma, avg_chroma,
         intensity, formants, glottal, avg_tempogram)

        * ``mel_freq`` is the dB-scaled magnitude of a linear-frequency STFT
          (no mel filter bank is applied, despite the name).
        * every ``avg_*`` array is the mean over axis 0, i.e. one value per frame.
        * ``intensity`` holds ``duration * PARSF`` values of the parselmouth
          intensity contour.
        * ``formants`` / ``glottal`` are the lists returned by ``get_formants``
          and ``hm_jit_shim``.
    """
    # Honour the `duration` argument instead of a hard-coded 3 seconds.
    snd, r = librosa.load(wav_path, duration = duration)
    frames = int(r * duration)  # target number of samples
    if len(snd) < frames:
        # Pad only at the end; a scalar pad width would also prepend samples,
        # so the truncation below would keep padding instead of the signal start.
        snd = np.pad(snd, (0, frames - len(snd)), mode = pad_mode)
    snd = snd[:frames]

    # Audio is passed by keyword: recent librosa releases reject it positionally.
    mfcc = librosa.feature.mfcc(y = snd, sr = r, n_mfcc = num_mfcc, hop_length = hop_length)
    avg_mfcc = np.mean(mfcc, axis = 0)

    mel_raw = np.abs(librosa.stft(y = snd, n_fft = n_fft, hop_length = hop_length))
    mel_freq = librosa.amplitude_to_db(mel_raw, ref = np.max)
    avg_mel_freq = np.mean(mel_freq, axis = 0)

    # Chroma keeps librosa's default FFT size; only the hop follows `hop_length`.
    stft = np.abs(librosa.stft(y = snd, hop_length = hop_length))  # resolve complex values
    chroma = librosa.feature.chroma_stft(S = stft, sr = r)
    avg_chroma = np.mean(chroma, axis = 0)

    oenv = librosa.onset.onset_strength(y = snd, sr = r, hop_length = hop_length)
    tempogram = librosa.feature.tempogram(onset_envelope = oenv, sr = r, hop_length = hop_length)
    avg_tempogram = np.mean(tempogram, axis = 0)

    # Intensity contour: `.values` holds the intensity samples (one row),
    # whereas `.xs()` would only give the frame times.
    pars_aud = parselmouth.Sound(wav_path)
    intensity = pars_aud.to_intensity().values[0]
    n_intensity = int(duration * PARSF)
    if len(intensity) < n_intensity:
        intensity = np.pad(intensity, (0, n_intensity - len(intensity)), mode = pad_mode)
    intensity = intensity[:n_intensity]

    formants = get_formants(wav_path)
    glottal = hm_jit_shim(wav_path)

    return mfcc, avg_mfcc, mel_freq, avg_mel_freq, chroma, avg_chroma, intensity, formants, glottal, avg_tempogram

def get_aud_features_vec(wav_path, num_mfcc = 40, hop_length = 512):
    """Return only the vector-style features of one audio file.

    Delegates to ``get_aud_features_all`` and drops the three 2-D arrays
    (mfcc, mel_freq, chroma), keeping
    (avg_mfcc, avg_mel_freq, avg_chroma, intensity, formants, glottal, avg_tempogram).
    """
    everything = get_aud_features_all(wav_path, num_mfcc = num_mfcc, hop_length = hop_length)
    # Underscores mark the 2-D arrays that are not part of the vector features.
    (_, avg_mfcc,
     _, avg_mel_freq,
     _, avg_chroma,
     intensity, formants, glottal, avg_tempogram) = everything
    return (avg_mfcc, avg_mel_freq, avg_chroma,
            intensity, formants, glottal, avg_tempogram)


In [3]:
def get_aud_2d_feat(wav_path, num_mfcc = 40, hop_length = 512, n_fft = 2048, duration = 3, pad_mode = 'wrap'):
    """Build the 2-D feature map of one audio file.

    The MFCC, dB spectrogram and chroma arrays from ``get_aud_features_all``
    are rescaled and stacked along axis 0, so the result has
    rows(mfcc) + rows(mel_freq) + rows(chroma) rows.
    """
    extracted = get_aud_features_all(
        wav_path,
        num_mfcc = num_mfcc,
        n_fft = n_fft,
        duration = duration,
        hop_length = hop_length,
        pad_mode = pad_mode,
    )
    # Only the three 2-D arrays (positions 0, 2 and 4 of the tuple) are used.
    raw_mfcc, raw_spec, chroma = extracted[0], extracted[2], extracted[4]

    # Fixed divisors; presumably chosen to bring both arrays to a comparable
    # range - TODO confirm where 240 and -80 come from.
    scaled_mfcc = raw_mfcc / 240
    scaled_spec = raw_spec / (-80)

    return np.concatenate((scaled_mfcc, scaled_spec, chroma))

In [9]:
# Build the feature table: one row per recording, holding its 2-D feature map
# and a binary label (1 for emotion code "a", 0 for every other emotion code).

std_len = 3
frames = 22050

SAVEE_path = "AudioData/"
actors = ["DC/", "JE/", "JK/", "KL/"]
emotions = ["a", "d", "f", "h", "n", "sa", "su"]

# Recording numbers as two-character strings: "01" ... "15".
n_samples = [("0" + str(idx)) if idx < 10 else str(idx) for idx in range(1, 16)]

features = []
for actor_dir in actors:
    for emotion in emotions:
        label = 1 if emotion == "a" else 0
        for sample_id in n_samples:
            # e.g. AudioData/DC/a01.wav
            path = SAVEE_path + actor_dir + emotion + sample_id + ".wav"
            data = get_aud_2d_feat(path, hop_length=512, n_fft = 128)
            features.append([data, label])

SAVEEdf = pd.DataFrame(features, columns = ["feature", "class_label"])

In [5]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from keras.utils import to_categorical

# Turn the two DataFrame columns into numpy arrays:
# X stacks the per-file feature maps, y holds the integer class labels.
X = np.array(SAVEEdf["feature"].tolist())
y = np.array(SAVEEdf["class_label"].tolist())

# Integer-encode the labels, then one-hot encode them for the softmax output.
le = LabelEncoder()
encoded_labels = le.fit_transform(y)
yy = to_categorical(encoded_labels)

# Hold out 20% of the samples; the seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    yy,
    test_size=0.2,
    random_state = 42,
)

In [6]:
from keras.models import Sequential
from keras.layers import (
    Dense, Dropout, Activation, Flatten,
    Convolution2D, Conv2D, MaxPooling2D, GlobalAveragePooling2D,
)
from keras.optimizers import Adam
from keras.utils import np_utils
from sklearn import metrics

# Shape of one feature map as fed to the network (rows x columns x channels).
num_rows = 117
num_columns = 130
num_channels = 1

# Add the trailing channel axis expected by Conv2D.
x_train = x_train.reshape(x_train.shape[0], num_rows, num_columns, num_channels)
x_test = x_test.reshape(x_test.shape[0], num_rows, num_columns, num_channels)

num_labels = yy.shape[1]
filter_size = 2

# Four identical Conv -> MaxPool -> Dropout stages with a growing number of
# filters, followed by global average pooling and a softmax classifier.
model = Sequential()
for stage, n_filters in enumerate((16, 32, 64, 128)):
    conv_kwargs = dict(filters=n_filters, kernel_size=2, activation='relu')
    if stage == 0:
        # Only the first layer declares the input shape.
        conv_kwargs['input_shape'] = (num_rows, num_columns, num_channels)
    model.add(Conv2D(**conv_kwargs))
    model.add(MaxPooling2D(pool_size=2))
    model.add(Dropout(0.2))

model.add(GlobalAveragePooling2D())
model.add(Dense(num_labels, activation='softmax'))




(336, 117, 130)


In [7]:
# Configure training: Adam optimizer, categorical cross-entropy loss, accuracy metric.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

# Print the layer-by-layer architecture.
model.summary()

# Baseline: score the still-untrained network on the held-out split.
# `score` is [loss, accuracy]; the accuracy is reported as a percentage.
score = model.evaluate(x_test, y_test, verbose=1)
accuracy = score[1] * 100

print("Pre-training accuracy: %.4f%%" % accuracy)

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d (Conv2D)              (None, 116, 129, 16)      80        
_________________________________________________________________
max_pooling2d (MaxPooling2D) (None, 58, 64, 16)        0         
_________________________________________________________________
dropout (Dropout)            (None, 58, 64, 16)        0         
_________________________________________________________________
conv2d_1 (Conv2D)            (None, 57, 63, 32)        2080      
_________________________________________________________________
max_pooling2d_1 (MaxPooling2 (None, 28, 31, 32)        0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 28, 31, 32)        0         
_________________________________________________________________
conv2d_2 (Conv2D)            (None, 27, 30, 128)       1

In [8]:
import os
from keras.callbacks import ModelCheckpoint
from datetime import datetime

num_epochs = 72
num_batch_size = 256

# Make sure the checkpoint directory exists before the first save, so a fresh
# checkout does not fail on a missing directory when the callback writes the file.
checkpoint_path = 'saved_models/weights.best.basic_cnn.hdf5'
os.makedirs(os.path.dirname(checkpoint_path), exist_ok=True)

# Keep only the weights of the epoch with the best validation loss.
checkpointer = ModelCheckpoint(filepath=checkpoint_path,
                               verbose=1, save_best_only=True)
start = datetime.now()

# NOTE(review): the held-out split is used as validation data here, so the
# checkpoint is selected on the same samples that are reported as
# "Testing Accuracy" below.
model.fit(x_train, y_train, batch_size=num_batch_size, epochs=num_epochs, validation_data=(x_test, y_test), callbacks=[checkpointer], verbose=1)

duration = datetime.now() - start
print("Training completed in time: ", duration)

# Evaluate the model as it stands after the last epoch (the saved best
# checkpoint is not reloaded) on the training and testing sets.
score = model.evaluate(x_train, y_train, verbose=0)
print("Training Accuracy: ", score[1])

score = model.evaluate(x_test, y_test, verbose=0)
print("Testing Accuracy: ", score[1])

Epoch 1/72
Epoch 00001: val_loss improved from inf to 0.60419, saving model to saved_models/weights.best.basic_cnn.hdf5
Epoch 2/72
Epoch 00002: val_loss improved from 0.60419 to 0.53832, saving model to saved_models/weights.best.basic_cnn.hdf5
Epoch 3/72
Epoch 00003: val_loss improved from 0.53832 to 0.47748, saving model to saved_models/weights.best.basic_cnn.hdf5
Epoch 4/72
Epoch 00004: val_loss improved from 0.47748 to 0.42454, saving model to saved_models/weights.best.basic_cnn.hdf5
Epoch 5/72
Epoch 00005: val_loss improved from 0.42454 to 0.38435, saving model to saved_models/weights.best.basic_cnn.hdf5
Epoch 6/72
Epoch 00006: val_loss improved from 0.38435 to 0.36233, saving model to saved_models/weights.best.basic_cnn.hdf5
Epoch 7/72
Epoch 00007: val_loss improved from 0.36233 to 0.35703, saving model to saved_models/weights.best.basic_cnn.hdf5
Epoch 8/72
Epoch 00008: val_loss did not improve from 0.35703
Epoch 9/72
Epoch 00009: val_loss did not improve from 0.35703
Epoch 10/72


Epoch 30/72
Epoch 00030: val_loss did not improve from 0.35703
Epoch 31/72
Epoch 00031: val_loss did not improve from 0.35703
Epoch 32/72
Epoch 00032: val_loss did not improve from 0.35703
Epoch 33/72
Epoch 00033: val_loss did not improve from 0.35703
Epoch 34/72
Epoch 00034: val_loss did not improve from 0.35703
Epoch 35/72
Epoch 00035: val_loss did not improve from 0.35703
Epoch 36/72
Epoch 00036: val_loss did not improve from 0.35703
Epoch 37/72
Epoch 00037: val_loss did not improve from 0.35703
Epoch 38/72
Epoch 00038: val_loss did not improve from 0.35703
Epoch 39/72
Epoch 00039: val_loss did not improve from 0.35703
Epoch 40/72
Epoch 00040: val_loss did not improve from 0.35703
Epoch 41/72
Epoch 00041: val_loss did not improve from 0.35703
Epoch 42/72
Epoch 00042: val_loss did not improve from 0.35703
Epoch 43/72
Epoch 00043: val_loss did not improve from 0.35703
Epoch 44/72
Epoch 00044: val_loss did not improve from 0.35703
Epoch 45/72
Epoch 00045: val_loss did not improve from 

Epoch 00060: val_loss did not improve from 0.35703
Epoch 61/72
Epoch 00061: val_loss did not improve from 0.35703
Epoch 62/72
Epoch 00062: val_loss did not improve from 0.35703
Epoch 63/72
Epoch 00063: val_loss did not improve from 0.35703
Epoch 64/72
Epoch 00064: val_loss did not improve from 0.35703
Epoch 65/72
Epoch 00065: val_loss did not improve from 0.35703
Epoch 66/72
Epoch 00066: val_loss did not improve from 0.35703
Epoch 67/72
Epoch 00067: val_loss did not improve from 0.35703
Epoch 68/72
Epoch 00068: val_loss did not improve from 0.35703
Epoch 69/72
Epoch 00069: val_loss did not improve from 0.35703
Epoch 70/72
Epoch 00070: val_loss did not improve from 0.35703
Epoch 71/72
Epoch 00071: val_loss did not improve from 0.35703
Epoch 72/72
Epoch 00072: val_loss did not improve from 0.35703
Training completed in time:  0:06:30.326183
Training Accuracy:  0.8452380895614624
Testing Accuracy:  0.9047619104385376
