In [1]:
# IMPORT NECESSARY LIBRARIES
from tensorflow.keras.models import load_model
import librosa
%matplotlib inline
import matplotlib.pyplot as plt
import librosa.display
from IPython.display import Audio
import numpy as np
import tensorflow as tf
from matplotlib.pyplot import specgram
import pandas as pd
from sklearn.metrics import confusion_matrix
import IPython.display as ipd  # To play sound in the notebook
import os # interface with underlying OS that python is running on
import sys
import warnings
# ignore warnings 
if not sys.warnoptions:
    warnings.simplefilter("ignore")
warnings.filterwarnings("ignore", category=DeprecationWarning)
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
import tensorflow.keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, MaxPooling1D, AveragePooling1D
from tensorflow.keras.layers import Input, Flatten, Dropout, Activation, BatchNormalization, Dense
from sklearn.model_selection import GridSearchCV
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier
from tensorflow.keras.optimizers import SGD
from tensorflow.keras.regularizers import l2
import seaborn as sns
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
from tensorflow.keras.utils import to_categorical
from sklearn.metrics import classification_report

Init Plugin
Init Graph Optimizer
Init Kernel


In [2]:
model = load_model("models/Emotion_Model_aug.h5")

Metal device set to: Apple M1 Pro


2022-04-21 22:05:27.619746: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2022-04-21 22:05:27.619869: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


In [3]:
import os, glob

test_path_list = []
for file in glob.glob("./datasets/augmented/upset/*wav"):
    test_path_list.append(file)

In [4]:
test_path_list

['./datasets/augmented/upset/data_noise01_padding_看到我是不會打招呼哦 廢物.wav',
 './datasets/augmented/upset/data_shift16_padding_你不幫我買東西 我就不跟你當朋友 拜託.wav',
 './datasets/augmented/upset/data_shift32_padding_拜託你們 考出好成績好嗎.wav',
 './datasets/augmented/upset/data_pitch09_padding_你以為這樣子很厲害嗎.wav',
 './datasets/augmented/upset/data_shift48_padding_re-cry- (75).wav',
 './datasets/augmented/upset/data_shift16_padding_胖子少吃點我都快被你吃垮了.wav',
 './datasets/augmented/upset/data_pitch08_padding_為什麼要打我.wav',
 './datasets/augmented/upset/data_speed09_padding_可不可以坐好 一直扭來扭去的.wav',
 './datasets/augmented/upset/data_speed11_padding_你再不乖乖吃飯我只好就讓你身首異處.wav',
 './datasets/augmented/upset/data_pitch09_padding_沒有腦袋像個智障.wav',
 './datasets/augmented/upset/data_shift16_padding_可以請你安靜嗎 我在讀書.wav',
 './datasets/augmented/upset/data_shift32_padding_你不會動腦嗎.wav',
 './datasets/augmented/upset/data_pitch08_padding_re-cry- (82).wav',
 './datasets/augmented/upset/data_speed08_padding_re-cry- (33).wav',
 './datasets/augmented/upset/data_no

In [5]:
def get_features(path):

    X, sample_rate = librosa.load(path, res_type='kaiser_fast', duration=10.0, sr=None)
    
    stft = np.abs(librosa.stft(X))

    # fmin 和 fmax 對應於人類語音的最小最大基本頻率
    pitches, magnitudes = librosa.piptrack(X, sr=sample_rate, S=stft, fmin=70, fmax=400)
    pitch = []
    for i in range(magnitudes.shape[1]):
        index = magnitudes[:, 1].argmax()
        pitch.append(pitches[index, i])

    pitch_tuning_offset = librosa.pitch_tuning(pitches)
    pitchmean = np.mean(pitch)
    pitchstd = np.std(pitch)
    pitchmax = np.max(pitch)
    pitchmin = np.min(pitch)

    # 頻譜質心
    cent = librosa.feature.spectral_centroid(y=X, sr=sample_rate)
    cent = cent / np.sum(cent)
    meancent = np.mean(cent)
    stdcent = np.std(cent)
    maxcent = np.max(cent)

    # 譜平面
    flatness = np.mean(librosa.feature.spectral_flatness(y=X))

    # 使用系數為13的MFCC特徵
    mfccs = np.mean(librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=13).T, axis=0)
    mfccsstd = np.std(librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=13).T, axis=0)
    mfccmax = np.max(librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=13).T, axis=0)

    # 色譜圖
    chroma = np.mean(librosa.feature.chroma_stft(S=stft, sr=sample_rate).T, axis=0)

    # 梅爾頻率
    mel = np.mean(librosa.feature.melspectrogram(X, sr=sample_rate).T, axis=0)

    # ottava對比
    contrast = np.mean(librosa.feature.spectral_contrast(S=stft, sr=sample_rate).T, axis=0)

    # 過零率
    zerocr = np.mean(librosa.feature.zero_crossing_rate(X))

    S, phase = librosa.magphase(stft)
    meanMagnitude = np.mean(S)
    stdMagnitude = np.std(S)
    maxMagnitude = np.max(S)

    # 均方根能量
    rms = librosa.feature.rms(S=S)[0]
    meanrms = np.mean(rms)
    stdrms = np.std(rms)
    maxrms = np.max(rms)

    ext_features = np.array([
        flatness, zerocr, meanMagnitude, maxMagnitude, meancent, stdcent,
        maxcent, stdMagnitude, pitchmean, pitchmax, pitchstd,
        pitch_tuning_offset, meanrms, maxrms, stdrms
    ])

    ext_features = np.concatenate((ext_features, mfccs, mfccsstd, mfccmax, chroma, mel, contrast))
        
    return ext_features

In [12]:
scaler = StandardScaler()
encoder = OneHotEncoder()

df_features = pd.read_csv('features.csv')
df_features

Unnamed: 0,labels,0,1,2,3,4,5,6,7,8,...,191,192,193,194,195,196,197,198,199,200
0,taunt,0.349620,0.129026,0.554054,163.037979,0.003195,0.002922,0.011338,2.890938,21.750515,...,0.024860,0.005420,3.369298e-04,19.080706,19.134450,21.125617,19.423748,19.274803,21.902710,25.117049
1,taunt,0.420087,0.068244,0.560991,167.655838,0.003195,0.002847,0.011331,2.839539,7.450559,...,0.002620,0.000413,2.484302e-05,17.962222,16.911051,17.674325,17.033042,16.880189,19.637548,23.111096
2,taunt,0.498637,0.081467,0.773144,273.056702,0.003195,0.003458,0.015535,5.216930,6.002203,...,0.010938,0.002382,1.156494e-04,16.050520,19.224504,20.094982,19.420473,18.647094,21.973569,23.659706
3,taunt,0.648623,0.043919,0.402030,160.456573,0.003195,0.004484,0.017506,2.989140,7.426612,...,0.041246,0.010192,4.100692e-04,16.558265,16.364409,16.976335,16.284359,16.494198,16.978649,20.274362
4,taunt,0.542116,0.040381,0.373848,106.289726,0.003195,0.003759,0.016790,2.185241,11.056521,...,0.017987,0.003564,2.029519e-04,17.101434,13.360917,14.166413,14.598431,13.997962,14.618500,19.963319
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15730,upset,0.590074,0.039775,0.060804,64.716248,0.003195,0.004357,0.021169,0.596097,10.541844,...,0.000552,0.000190,1.256435e-05,21.841346,17.680658,18.387753,18.150996,18.145541,19.234715,22.864406
15731,upset,0.653773,0.028561,0.162391,80.988266,0.003195,0.006068,0.039710,1.285566,4.347139,...,0.004041,0.001087,4.006107e-05,18.034568,15.331230,15.971342,16.239014,16.085378,16.849852,20.068561
15732,upset,0.558546,0.022143,0.115764,57.101089,0.003195,0.004675,0.024271,0.994173,0.000000,...,0.000419,0.000086,6.196451e-06,18.136612,14.629613,16.083037,15.785649,15.428285,16.005202,19.735171
15733,upset,0.442611,0.414153,0.293616,58.416603,0.003195,0.000528,0.003832,0.763809,22.621918,...,0.010631,0.009824,9.442406e-03,13.220441,13.531459,14.359092,13.468283,13.740494,13.739564,13.453153


In [13]:
X = df_features.iloc[:, 1:].values
X = scaler.fit_transform(X)
Y = df_features['labels'].values
Y = encoder.fit_transform(np.array(Y).reshape(-1, 1)).toarray()

In [14]:
for index, element in enumerate(test_path_list):
    df = pd.DataFrame(columns=['features'])
    df.features = get_features(test_path_list[index])
    textfeature_df = pd.DataFrame(df['features'].values.tolist()).T
    X = textfeature_df.iloc[:, :].values
    X = scaler.transform(X)
    X = np.expand_dims(X, axis=2)
    pred_test = model.predict(X)
    pred = encoder.inverse_transform(pred_test)
    print(pred)

[['upset']]
[['upset']]
[['upset']]
[['upset']]
[['upset']]
[['upset']]
[['upset']]
[['upset']]
[['upset']]
[['upset']]
[['upset']]
[['upset']]
[['upset']]
[['upset']]
[['upset']]
[['upset']]
[['upset']]
[['upset']]
[['upset']]
[['upset']]
[['upset']]
[['upset']]
[['upset']]
[['upset']]
[['upset']]
[['upset']]
[['upset']]
[['upset']]
[['upset']]
[['upset']]
[['upset']]
[['upset']]
[['upset']]
[['upset']]
[['upset']]
[['upset']]
[['upset']]
[['upset']]
[['upset']]
[['upset']]
[['upset']]
[['upset']]
[['upset']]
[['upset']]
[['upset']]
[['upset']]
[['upset']]
[['upset']]
[['upset']]
[['upset']]
[['upset']]
[['upset']]
[['upset']]
[['upset']]
[['upset']]
[['upset']]
[['upset']]
[['upset']]
[['upset']]
[['upset']]
[['upset']]
[['upset']]
[['upset']]
[['upset']]
[['upset']]
[['upset']]
[['upset']]
[['upset']]
[['upset']]
[['upset']]
[['upset']]
[['upset']]
[['upset']]
[['upset']]
[['calm']]
[['upset']]
[['upset']]
[['upset']]
[['upset']]
[['upset']]
[['upset']]
[['upset']]
[['upset']]
[['up

[['upset']]
[['upset']]
[['upset']]
[['upset']]
[['upset']]
[['upset']]
[['upset']]
[['upset']]
[['upset']]
[['upset']]
[['upset']]
[['upset']]
[['upset']]
[['upset']]
[['upset']]
[['upset']]
[['upset']]
[['upset']]
[['upset']]
[['upset']]
[['upset']]
[['upset']]
[['upset']]
[['upset']]
[['upset']]
[['upset']]
[['upset']]
[['calm']]
[['upset']]
[['upset']]
[['upset']]
[['upset']]
[['upset']]
[['upset']]
[['upset']]
[['upset']]
[['upset']]
[['upset']]
[['upset']]
[['upset']]
[['upset']]
[['upset']]
[['upset']]
[['upset']]
[['upset']]
[['upset']]
[['upset']]
[['upset']]
[['taunt']]
[['upset']]
[['upset']]
[['upset']]
[['upset']]
[['upset']]
[['calm']]
[['upset']]
[['upset']]
[['upset']]
[['upset']]
[['upset']]
[['upset']]
[['upset']]
[['upset']]
[['upset']]
[['upset']]
[['upset']]
[['upset']]
[['upset']]
[['upset']]
[['upset']]
[['upset']]
[['upset']]
[['upset']]
[['upset']]
[['upset']]
[['upset']]
[['upset']]
[['upset']]
[['upset']]
[['upset']]
[['upset']]
[['upset']]
[['upset']]
[['ups

[['upset']]
[['upset']]
[['upset']]
[['upset']]
[['upset']]


In [None]:
# loading json and model architecture 
json_file = open('model_json_aug.json', 'r')
loaded_model_json = json_file.read()
json_file.close()
loaded_model = tf.keras.models.model_from_json(loaded_model_json)

# load weights into new model
loaded_model.load_weights("models/Emotion_Model_aug.h5")
print("Loaded model from disk")

In [None]:
print(type(Y_pred))
print(Y_pred.item(0))
print(type(Y_pred.item(0)))

In [None]:
# Keras optimiser
loaded_model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
loaded_model.summary()

In [None]:
import pyaudio
import wave
from datetime import datetime
import speech_recognition as sr

FORMAT = pyaudio.paInt16
CHANNELS = 1
RATE = 16000
CHUNK = 1024
RECORD_SECONDS = 10
WAVE_OUTPUT_FILENAME = ""

pa = pyaudio.PyAudio()
r = sr.Recognizer()

In [None]:
stream = pa.open(format=FORMAT, channels=CHANNELS,
                        rate=RATE, input=True, frames_per_buffer=CHUNK, input_device_index=0)
stream.start_stream()
print("開始錄音......")

frames = []

for i in range(0, int(RATE / CHUNK * RECORD_SECONDS)):
    data = stream.read(CHUNK)
    frames.append(data)
print("錄音結束!")

stream.stop_stream()
stream.close()
pa.terminate()

WAVE_OUTPUT_FILENAME = "./record/"+datetime.now().strftime("%Y-%m-%d-%H-%M-%S")+".wav"

wf = wave.open(WAVE_OUTPUT_FILENAME, 'wb')
wf.setnchannels(CHANNELS)
wf.setsampwidth(pa.get_sample_size(FORMAT))
wf.setframerate(RATE)
wf.writeframes(b''.join(frames))
wf.close()

In [None]:

with sr.AudioFile(WAVE_OUTPUT_FILENAME) as source:
    audio = r.record(source)

In [None]:
output_df = pd.DataFrame(columns=['features'])

output_df.features = get_features(WAVE_OUTPUT_FILENAME)

In [None]:
output_feature_df = pd.DataFrame(output_df['features'].values.tolist()).T


In [None]:
X = output_feature_df.iloc[:, :].values
X = scaler.fit_transform(X)
X = np.expand_dims(X, axis=2)
pred_test = model.predict(X)

In [None]:
Y_pred = encoder.inverse_transform(pred_test)
Y_pred