In [1]:
import os

In [2]:
# Root folders that are passed to get_emotion_path_list() to locate the .wav files.
augmented_root = './datasets/augmented'
padding_root = './datasets/padding'

In [3]:
# CREATE FUNCTION TO EXTRACT EMOTION LABEL
emotion = []
file_path = []

def get_emotion_path_list(root):
    """Collect every ``.wav`` file below ``root`` together with its emotion label.

    The label of a file is the name of the first directory level below
    ``root`` that contains it (``<root>/<label>/[...]/<file>.wav``). It is
    computed relative to ``root``, so the result does not depend on how many
    components ``root`` itself has or on the platform's path separator.

    Args:
        root: Directory to walk recursively.

    Returns:
        A tuple ``(emotion_list, path_list)`` of two equally long lists:
        the label of each file and the path of each file
        (``os.path.join`` of the walked directory and the file name).
        ``.wav`` files located directly in ``root`` have no label directory
        and are skipped.
    """
    emotion_list = []
    path_list = []
    for path, subdirs, files in os.walk(root):
        rel_dir = os.path.relpath(path, root)
        if rel_dir == os.curdir:
            # Files sitting directly in root are not inside any label folder.
            continue
        # First component below root is the label, even for nested folders.
        label = rel_dir.split(os.sep)[0]
        for name in files:
            if name.endswith(".wav"):
                emotion_list.append(label)
                path_list.append(os.path.join(path, name))
    return emotion_list, path_list

In [4]:
# Collect the label and file path of every .wav found under padding_root.
emotion, file_path = get_emotion_path_list(padding_root)

In [5]:
import pandas as pd

RuntimeError: module compiled against API version 0xe but this version of numpy is 0xd

In [6]:
# PUT EXTRACTED LABELS WITH FILEPATH INTO DATAFRAME
# Both columns are built in a single constructor call. Besides being shorter,
# this yields an empty frame with the expected columns when no files were
# found, instead of failing while renaming the columns of an empty frame.
padding_audio_df = pd.DataFrame({'labels': emotion, 'path': file_path})
print(len(padding_audio_df))
padding_audio_df.head()

1049


Unnamed: 0,labels,path
0,taunt,./datasets/padding/taunt/padding_re-threat-57.wav
1,taunt,./datasets/padding/taunt/padding_re-threat-43.wav
2,taunt,./datasets/padding/taunt/padding_re-threat-80.wav
3,taunt,./datasets/padding/taunt/padding_re-threat-81.wav
4,taunt,./datasets/padding/taunt/padding_re-threat-42.wav


In [None]:
# Collect the label and file path of every .wav found under augmented_root.
# This re-binds the same `emotion` / `file_path` names used for the padding scan.
emotion, file_path = get_emotion_path_list(augmented_root)

In [None]:
# PUT EXTRACTED LABELS WITH FILEPATH INTO DATAFRAME
# Both columns are built in a single constructor call. Besides being shorter,
# this yields an empty frame with the expected columns when no files were
# found, instead of failing while renaming the columns of an empty frame.
augmented_audio_df = pd.DataFrame({'labels': emotion, 'path': file_path})
augmented_audio_df.head()

In [None]:
# Stack the padding and augmented frames row-wise into one table;
# ignore_index=True renumbers the rows 0..n-1.
audio_df = pd.concat([padding_audio_df, augmented_audio_df], axis=0, ignore_index=True )
audio_df

In [None]:
# Print the number of rows for each emotion label, one count per line.
for label in ('taunt', 'upset', 'angry', 'calm'):
    rows_for_label = audio_df[audio_df.labels == label]
    print(len(rows_for_label))

In [None]:
# ENSURE COLUMN VALUES ARE CORRECT
# None means "do not truncate cell contents", so the full path is visible.
# The former -1 sentinel is deprecated since pandas 1.0 and rejected by newer releases.
pd.set_option('display.max_colwidth', None)
audio_df.sample(10)

In [None]:
# LOOK AT DISTRIBUTION OF CLASSES
# Bar chart of the number of rows per value of the `labels` column.
audio_df.labels.value_counts().plot(kind='bar')

In [7]:
import librosa
import numpy as np

In [8]:
# ITERATE OVER ALL AUDIO FILES AND BUILD ONE FEATURE VECTOR PER FILE FOR MODELING
# Each vector concatenates 15 scalar statistics (flatness, zero-crossing rate,
# magnitude, spectral centroid, pitch, tuning offset, RMS) with the per-coefficient
# MFCC mean/std/max and the time-averaged chroma, mel and spectral-contrast bands.
feature_rows = []

for path in audio_df.path:
    # sr=None keeps the file's native sampling rate; at most the first 10 s are loaded.
    X, sample_rate = librosa.load(path, res_type='kaiser_fast', duration=10.0, sr=None)

    # Magnitude spectrogram, shared by several features below.
    stft = np.abs(librosa.stft(X))

    # fmin and fmax correspond to the minimum and maximum fundamental frequency of human speech.
    # The signal is passed by keyword: newer librosa releases reject it as a positional argument.
    pitches, magnitudes = librosa.piptrack(y=X, sr=sample_rate, S=stft, fmin=70, fmax=400)
    # For every frame, take the pitch of the bin with the largest magnitude *in that frame*.
    strongest_bin = magnitudes.argmax(axis=0)
    pitch = pitches[strongest_bin, np.arange(magnitudes.shape[1])]

    pitch_tuning_offset = librosa.pitch_tuning(pitches)
    pitchmean = np.mean(pitch)
    pitchstd = np.std(pitch)
    pitchmax = np.max(pitch)

    # Spectral centroid
    cent = librosa.feature.spectral_centroid(y=X, sr=sample_rate)
    # NOTE(review): after dividing by the sum, the mean is always 1 / n_frames -
    # confirm this normalisation is intended.
    cent = cent / np.sum(cent)
    meancent = np.mean(cent)
    stdcent = np.std(cent)
    maxcent = np.max(cent)

    # Spectral flatness
    flatness = np.mean(librosa.feature.spectral_flatness(y=X))

    # MFCC features with 13 coefficients (computed once, then summarised over time)
    mfcc = librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=13)
    mfccs = np.mean(mfcc, axis=1)
    mfccsstd = np.std(mfcc, axis=1)
    mfccmax = np.max(mfcc, axis=1)

    # Chromagram
    chroma = np.mean(librosa.feature.chroma_stft(S=stft, sr=sample_rate).T, axis=0)

    # Mel-frequency spectrogram
    mel = np.mean(librosa.feature.melspectrogram(y=X, sr=sample_rate).T, axis=0)

    # Octave-band spectral contrast
    contrast = np.mean(librosa.feature.spectral_contrast(S=stft, sr=sample_rate).T, axis=0)

    # Zero-crossing rate
    zerocr = np.mean(librosa.feature.zero_crossing_rate(y=X))

    S, phase = librosa.magphase(stft)
    meanMagnitude = np.mean(S)
    stdMagnitude = np.std(S)
    maxMagnitude = np.max(S)

    # Root-mean-square energy
    rms = librosa.feature.rms(S=S)[0]
    meanrms = np.mean(rms)
    stdrms = np.std(rms)
    maxrms = np.max(rms)

    ext_features = np.array([
        flatness, zerocr, meanMagnitude, maxMagnitude, meancent, stdcent,
        maxcent, stdMagnitude, pitchmean, pitchmax, pitchstd,
        pitch_tuning_offset, meanrms, maxrms, stdrms
    ])

    ext_features = np.concatenate((ext_features, mfccs, mfccsstd, mfccmax, chroma, mel, contrast))

    feature_rows.append(ext_features)

# Build the frame once instead of growing it row by row; dtype=object keeps each
# vector as a single cell of the 'features' column, indexed 0..n-1.
df = pd.DataFrame({'features': pd.Series(feature_rows, dtype=object)})

print(len(df))
df.head()

1049


Unnamed: 0,features
0,"[-387.3323, 41.278095, -26.217411, 23.478987, ..."
1,"[-371.9092, 75.07741, -31.255543, 11.7239065, ..."
2,"[-370.3928, 39.215595, -22.409412, 15.02184, -..."
3,"[-447.38538, 29.748251, -6.235247, 5.735924, -..."
4,"[-419.0164, 65.34434, -6.7146897, 9.1543865, -..."


In [9]:
# Expand every per-file feature vector into its own numbered columns and place
# them next to the columns of audio_df (rows are matched on the index).
df_combined = pd.concat([audio_df, pd.DataFrame(df['features'].values.tolist())], axis=1)

In [10]:
# Replace any missing values with 0.
df_combined = df_combined.fillna(0)

In [11]:
# DROP PATH COLUMN FOR MODELING
# Re-bind the result instead of mutating in place, and tolerate an already
# missing column, so re-running this cell does not raise KeyError.
df_combined = df_combined.drop(columns='path', errors='ignore')

In [12]:
# CHECK TOP 5 ROWS
# Expected layout: `labels` first, followed by the numbered feature columns.
df_combined.head()

Unnamed: 0,labels,0,1,2,3,4,5,6,7,8,...,29,30,31,32,33,34,35,36,37,38
0,taunt,-387.332306,41.278095,-26.217411,23.478987,-21.221752,-3.16787,-2.555068,-7.530316,-7.81287,...,136.050446,27.569355,59.00058,48.622795,31.80101,24.387249,21.796705,15.529959,28.728931,26.144257
1,taunt,-371.90921,75.077408,-31.255543,11.723907,1.083341,-5.691078,-6.333925,3.3937,-7.370813,...,86.733742,34.921051,19.416603,25.906837,47.580502,20.488594,25.857126,31.09198,11.447498,13.655737
2,taunt,-370.392792,39.215595,-22.409412,15.02184,-19.398071,-0.971972,-0.389927,-5.652776,-8.247676,...,119.755768,19.23361,65.026779,33.770046,23.649931,7.127609,36.682632,13.555176,30.341942,22.073078
3,taunt,-447.385376,29.748251,-6.235247,5.735924,-7.789796,2.991443,-6.330763,-0.119231,-5.61409,...,89.400787,24.531925,46.112801,15.662816,31.338982,11.347013,18.708534,13.111612,19.38027,29.571518
4,taunt,-419.016388,65.344337,-6.71469,9.154387,-8.516296,2.05101,-5.422795,4.942542,-3.922849,...,103.666702,10.647741,48.276596,16.777142,35.848019,9.932405,26.47854,17.704189,8.592093,22.425457


In [None]:
# Persist the combined table to CSV; index=False leaves the row index out of the file.
df_combined.to_csv('for_lstm_features.csv', index=False)

In [13]:
# Feature matrix: every column after the first.
# Assumes column 0 is `labels` (i.e. `path` has already been dropped) - verify before running.
X = df_combined.iloc[:, 1:].values
# Target vector taken from the `labels` column.
Y = df_combined['labels'].values

In [14]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder

In [15]:
# One-hot encode the labels. The encoder expects a 2-D input, so the label
# vector is first turned into a single column, which is also printed for inspection.
encoder = OneHotEncoder()
label_column = np.array(Y).reshape(-1, 1)
print(label_column)
Y = encoder.fit_transform(label_column).toarray()

[['taunt']
 ['taunt']
 ['taunt']
 ...
 ['upset']
 ['upset']
 ['upset']]


In [16]:
# Peek at the first row of the feature matrix.
X[0]

array([-387.3323   ,   41.278095 ,  -26.217411 ,   23.478987 ,
        -21.221752 ,   -3.1678698,   -2.5550678,   -7.5303164,
         -7.81287  ,   -1.375793 ,   -5.238518 ,   -4.3855257,
         -1.7967204,  151.23877  ,   47.150234 ,   56.979397 ,
         33.466167 ,   33.72315  ,   20.807877 ,   18.469158 ,
         17.305733 ,   13.989556 ,    9.296504 ,    8.787268 ,
          8.41936  ,    8.934577 ,  -25.01341  ,  165.70148  ,
         69.390305 ,  136.05045  ,   27.569355 ,   59.00058  ,
         48.622795 ,   31.80101  ,   24.387249 ,   21.796705 ,
         15.529959 ,   28.728931 ,   26.144257 ], dtype=float32)

In [17]:
# splitting data
# Shuffled split with a fixed seed (random_state=0). No test_size is passed,
# so scikit-learn's default proportion applies.
x_train, x_test, y_train, y_test = train_test_split(X, Y, random_state=0, shuffle=True)
x_train.shape, y_train.shape, x_test.shape, y_test.shape

((786, 39), (786, 4), (263, 39), (263, 4))

In [18]:
# Peek at the first training sample.
x_train[0]

array([-524.29095   ,   52.08505   ,    4.8364167 ,   12.7117195 ,
          1.6259842 ,   13.291577  ,    2.7669728 ,    5.052408  ,
         -0.90490085,   -0.68331736,    1.7970308 ,    1.8669195 ,
          1.4591393 ,  126.71241   ,   64.488976  ,   17.640116  ,
         21.681257  ,   14.017623  ,   18.543316  ,   11.415767  ,
          9.276158  ,    8.251394  ,    7.165609  ,    7.1093373 ,
          6.762876  ,    6.9752545 , -214.26495   ,  217.60704   ,
         76.92479   ,  118.00828   ,   48.57874   ,   77.765656  ,
         28.894896  ,   50.303497  ,   19.493467  ,   17.693916  ,
         26.904854  ,   22.867046  ,   43.541748  ], dtype=float32)

In [19]:
# Fit the scaler on the training split only, then apply that same fitted
# transform to the test split.
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)
x_train.shape, y_train.shape, x_test.shape, y_test.shape

((786, 39), (786, 4), (263, 39), (263, 4))

In [20]:
# Peek at the first training sample.
x_train[0]

array([-0.89290524,  0.3572929 ,  1.280064  ,  0.11423127,  1.3901958 ,
        1.7453734 ,  1.3705877 ,  1.2546271 ,  0.8713353 ,  0.3020591 ,
        1.5129595 ,  1.6216    ,  0.72618717, -0.9500413 ,  0.6809286 ,
       -1.2027407 , -0.44562182, -1.2228798 ,  1.2412442 , -0.76055324,
       -0.53835773, -1.020421  , -0.6679094 , -0.57252115, -0.9182528 ,
       -0.19809683, -1.325526  ,  0.89036185,  1.2346404 ,  0.48670873,
        1.333638  ,  1.1848007 ,  0.32143828,  1.2232277 , -0.14829738,
       -0.80525887,  0.72423744,  0.6890617 ,  1.3716886 ], dtype=float32)

In [21]:
# Insert a new axis of length 1 at position 2 of both splits, then show all shapes.
x_train = x_train[:, :, np.newaxis]
x_test = x_test[:, :, np.newaxis]
x_train.shape, y_train.shape, x_test.shape, y_test.shape

((786, 39, 1), (786, 4), (263, 39, 1), (263, 4))

In [22]:
# Peek at the first training sample.
x_train[0]

array([[-0.89290524],
       [ 0.3572929 ],
       [ 1.280064  ],
       [ 0.11423127],
       [ 1.3901958 ],
       [ 1.7453734 ],
       [ 1.3705877 ],
       [ 1.2546271 ],
       [ 0.8713353 ],
       [ 0.3020591 ],
       [ 1.5129595 ],
       [ 1.6216    ],
       [ 0.72618717],
       [-0.9500413 ],
       [ 0.6809286 ],
       [-1.2027407 ],
       [-0.44562182],
       [-1.2228798 ],
       [ 1.2412442 ],
       [-0.76055324],
       [-0.53835773],
       [-1.020421  ],
       [-0.6679094 ],
       [-0.57252115],
       [-0.9182528 ],
       [-0.19809683],
       [-1.325526  ],
       [ 0.89036185],
       [ 1.2346404 ],
       [ 0.48670873],
       [ 1.333638  ],
       [ 1.1848007 ],
       [ 0.32143828],
       [ 1.2232277 ],
       [-0.14829738],
       [-0.80525887],
       [ 0.72423744],
       [ 0.6890617 ],
       [ 1.3716886 ]], dtype=float32)

In [67]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Activation, Flatten, LSTM, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint, TensorBoard
from sklearn import metrics
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
import numpy as np

In [78]:
# Two stacked LSTM layers followed by a softmax classifier.
# Every sample is fed as a sequence of x_train.shape[1] steps with one value per step.
# One output unit per one-hot label column, derived from the data like the input shape.
n_classes = y_train.shape[1]

model = Sequential()
# NOTE(review): a non-zero recurrent_dropout is documented to rule out the fused
# cuDNN LSTM kernel on GPU - confirm the slower path is acceptable.
model.add(LSTM(units=64, dropout=0.05, recurrent_dropout=0.20, activation="tanh", return_sequences=True, input_shape=(x_train.shape[1], 1)))
# return_sequences=False: only the final step's output is passed to the classifier.
model.add(LSTM(units=32, dropout=0.05, recurrent_dropout=0.20, activation="tanh", return_sequences=False))
model.add(Dense(n_classes, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['acc'])
model.summary()

Model: "sequential_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_8 (LSTM)                (None, 39, 64)            16896     
_________________________________________________________________
lstm_9 (LSTM)                (None, 32)                12416     
_________________________________________________________________
dense_4 (Dense)              (None, 4)                 132       
Total params: 29,444
Trainable params: 29,444
Non-trainable params: 0
_________________________________________________________________


In [79]:
# File that ModelCheckpoint overwrites whenever the monitored value improves.
model_file = "./lstm_model.hdf5"

# Training hyper-parameters: MAX_PATIENT is the ReduceLROnPlateau patience,
# MAX_EPOCHS and MAX_BATCH are passed to model.fit().
MAX_PATIENT = 12
MAX_EPOCHS = 50
MAX_BATCH = 64

print("training started")

# Multiply the learning rate by 0.4 after MAX_PATIENT epochs without improvement
# of the monitored value, never going below min_lr.
# NOTE(review): both callbacks monitor the *training* loss ('loss'), not 'val_loss',
# so the checkpoint keeps the model that fits the training data best even though
# validation data is supplied - confirm this is intended.
rlrp = ReduceLROnPlateau(patience=MAX_PATIENT, monitor='loss', factor=0.4, verbose=1, min_lr=0.0000001)
callback = [rlrp, ModelCheckpoint(filepath=model_file, monitor='loss', verbose=1, save_best_only=True)]

# NOTE(review): x_test / y_test are used as validation data here; there is no
# separate validation split in this notebook.
history = model.fit(x_train, y_train, batch_size=MAX_BATCH, epochs=MAX_EPOCHS, verbose=1, validation_data=(x_test, y_test), callbacks=callback)

print("training finished")


training started
Epoch 1/50


2022-05-04 04:10:18.339471: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.




2022-05-04 04:11:35.818864: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.



Epoch 00001: loss improved from inf to 1.35737, saving model to ./lstm_model.hdf5
Epoch 2/50

Epoch 00002: loss improved from 1.35737 to 1.20946, saving model to ./lstm_model.hdf5
Epoch 3/50

Epoch 00003: loss improved from 1.20946 to 1.14380, saving model to ./lstm_model.hdf5
Epoch 4/50

Epoch 00004: loss improved from 1.14380 to 1.12260, saving model to ./lstm_model.hdf5
Epoch 5/50

Epoch 00005: loss improved from 1.12260 to 1.08504, saving model to ./lstm_model.hdf5
Epoch 6/50

Epoch 00006: loss improved from 1.08504 to 1.06750, saving model to ./lstm_model.hdf5
Epoch 7/50

Epoch 00007: loss improved from 1.06750 to 1.03824, saving model to ./lstm_model.hdf5
Epoch 8/50

Epoch 00008: loss improved from 1.03824 to 1.01354, saving model to ./lstm_model.hdf5
Epoch 9/50

Epoch 00009: loss improved from 1.01354 to 1.01057, saving model to ./lstm_model.hdf5
Epoch 10/50

Epoch 00010: loss improved from 1.01057 to 0.99687, saving model to ./lstm_model.hdf5
Epoch 11/50

Epoch 00011: loss did


Epoch 00040: loss improved from 0.81282 to 0.80958, saving model to ./lstm_model.hdf5
Epoch 41/50

Epoch 00041: loss improved from 0.80958 to 0.80093, saving model to ./lstm_model.hdf5
Epoch 42/50

Epoch 00042: loss did not improve from 0.80093
Epoch 43/50

Epoch 00043: loss improved from 0.80093 to 0.79870, saving model to ./lstm_model.hdf5
Epoch 44/50

Epoch 00044: loss improved from 0.79870 to 0.78135, saving model to ./lstm_model.hdf5
Epoch 45/50

Epoch 00045: loss improved from 0.78135 to 0.76581, saving model to ./lstm_model.hdf5
Epoch 46/50

Epoch 00046: loss did not improve from 0.76581
Epoch 47/50

Epoch 00047: loss did not improve from 0.76581
Epoch 48/50

Epoch 00048: loss did not improve from 0.76581
Epoch 49/50

Epoch 00049: loss improved from 0.76581 to 0.75887, saving model to ./lstm_model.hdf5
Epoch 50/50

Epoch 00050: loss improved from 0.75887 to 0.74655, saving model to ./lstm_model.hdf5
training finished


In [82]:
# Display the training array.
x_train

array([[[-0.89290524],
        [ 0.3572929 ],
        [ 1.280064  ],
        ...,
        [ 0.72423744],
        [ 0.6890617 ],
        [ 1.3716886 ]],

       [[-1.8980803 ],
        [ 0.8409015 ],
        [-0.21610735],
        ...,
        [-0.37916136],
        [ 0.9522005 ],
        [-0.5098771 ]],

       [[-0.8007864 ],
        [ 0.6269899 ],
        [ 1.378972  ],
        ...,
        [ 1.3334322 ],
        [ 0.58688676],
        [ 1.3473978 ]],

       ...,

       [[-1.0099235 ],
        [-0.1820338 ],
        [-0.17817014],
        ...,
        [ 0.94541883],
        [-1.1512233 ],
        [ 0.89792573]],

       [[ 0.4543923 ],
        [-0.5785093 ],
        [-0.02434784],
        ...,
        [ 0.8425137 ],
        [-1.0830256 ],
        [-0.19975953]],

       [[ 1.4269778 ],
        [-0.537312  ],
        [-1.490688  ],
        ...,
        [ 1.2167845 ],
        [-0.29672724],
        [ 0.4474781 ]]], dtype=float32)