In [29]:
import os
from pathlib import Path
from glob import glob
from tqdm import tqdm
import numpy as np
import pandas as pd
import librosa
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from multiprocessing import Pool, cpu_count

RAW_PATH = "../RawData"
OUT_PATH = "./MFCC"
THREADS = 80

### 原始資料

In [30]:
np_CASIA = np.load('CASIA.npy', allow_pickle=True)
np_EMODB = np.load('EMODB.npy', allow_pickle=True)
np_MERGE_TRAIN = np.load('MERGE_TRAIN.npy', allow_pickle=True)
np_IEMOCAP = np.load('IEMOCAP.npy', allow_pickle=True)

In [31]:
dict_CASIA = np_CASIA.item()
dict_EMODB = np_EMODB.item()
dict_MERGE_TRAIN = np_MERGE_TRAIN.item()
dict_IEMOCAP = np_IEMOCAP.item()
x, y = dict_MERGE_TRAIN['x'], dict_IEMOCAP['y']
print(f'{"X shape":10}', x.shape)
print(f'{"y shape":10}', y.shape)

X shape    (20310, 254, 39)
y shape    (5531, 4)


### 將 mp4 轉換成 wav

In [32]:
def mp42wav(filename):
    ObjPath = Path(filename)
    filename_wo = str(ObjPath.parent.joinpath(ObjPath.stem))
    # print(f"ffmpeg -loglevel error -i {filename_wo}.mp4 -ar:a 0 -vn -y {filename_wo}.wav")
    os.system(f"ffmpeg -loglevel error -i {filename_wo}.mp4 -ar:a 0 -vn -y {filename_wo}.wav")

In [5]:
ls_mp4 = glob(os.path.join(RAW_PATH, 'meld_part*/*.mp4'))
pool = Pool(THREADS)
r = list(tqdm(pool.imap(mp42wav, ls_mp4), desc='mp42wav: ', total=len(ls_mp4)))

# for mp4_file in tqdm(ls_mp4, desc="mp42wav: "):
#     mp42wav(mp4_file)

mp42wav:  43%|████▎     | 5932/13848 [00:39<00:56, 140.92it/s][mov,mp4,m4a,3gp,3g2,mj2 @ 0x55cc21461740] moov atom not found
../RawData/meld_part1/dia125_utt3.mp4: Invalid data found when processing input
mp42wav: 100%|██████████| 13848/13848 [01:26<00:00, 159.75it/s]


### 前處理新資料

In [33]:
def get_feature(file_path: str, mfcc_len: int=39, mean_signal_length: int=130000):
    signal, fs = librosa.load(file_path)
    s_len = len(signal)
    
    if s_len < mean_signal_length:
        pad_len = mean_signal_length - s_len
        pad_rem = pad_len % 2
        pad_len //= 2
        signal = np.pad(signal, (pad_len, pad_len + pad_rem), 'constant', constant_values=0)
    else:
        pad_len = s_len - mean_signal_length
        pad_len //= 2
        signal = signal[pad_len: pad_len + mean_signal_length]

    mfcc = librosa.feature.mfcc(y=signal, sr=fs, n_mfcc=mfcc_len).T
    return mfcc

In [34]:
df_train_map = pd.read_csv(os.path.join(RAW_PATH, 'train_data.csv'))
df_test_map = pd.read_csv(os.path.join(RAW_PATH, 'test_data.csv'))

In [35]:
df_train_map['name'] = df_train_map['name'].str.replace('.mp4', '.wav', regex=False)
df_test_map['name'] = df_test_map['name'].str.replace('.mp4', '.wav', regex=False)

In [36]:
df_train_map['emotion'].value_counts()

neutral     6413
happy       3337
angry       2776
sad         2292
surprise    1782
disgust     1779
fear        1777
calm         154
Name: emotion, dtype: int64

In [37]:
df_train_map

Unnamed: 0,source,name,emotion
0,crema_d,1003_TAI_DIS_XX.wav,disgust
1,ravdess,03-01-03-02-01-02-11.wav,happy
2,meld_part1,dia551_utt3.wav,happy
3,ravdess,03-01-07-01-01-02-09.wav,disgust
4,meld_part1,dia1013_utt2.wav,neutral
...,...,...,...
20305,ravdess,03-01-02-01-01-01-19.wav,calm
20306,ravdess,03-01-06-02-01-01-18.wav,fear
20307,tess,OAF_phone_happy.wav,happy
20308,meld_part1,dia640_utt13.wav,neutral


In [38]:
lbec = LabelEncoder().fit(df_train_map['emotion'])
df_train_map['emotion'] = lbec.transform(df_train_map['emotion'])
df_test_map['emotion'] = lbec.transform(df_test_map['emotion'])
print(lbec.classes_)

ohec = OneHotEncoder().fit(df_train_map[['emotion']])
train_y = ohec.transform(df_train_map[['emotion']]).toarray()
test_y = ohec.transform(df_test_map[['emotion']]).toarray()
print(ohec.categories_)
# lbec.inverse_transform()

['angry' 'calm' 'disgust' 'fear' 'happy' 'neutral' 'sad' 'surprise']
[array([0, 1, 2, 3, 4, 5, 6, 7])]


In [39]:
df_train_map['fullpath'] = RAW_PATH + '/' + df_train_map['source'] + '/' + df_train_map['name']
df_test_map['fullpath'] = RAW_PATH + '/' + df_test_map['source'] + '/' + df_test_map['name']
ls_train_fullpath = df_train_map['fullpath'].tolist()
ls_test_fullpath = df_test_map['fullpath'].tolist()

In [40]:
pool = Pool(THREADS)
train_X = list(tqdm(pool.imap(get_feature, ls_train_fullpath), total=len(ls_train_fullpath), desc='Get Train Features'))
train_X = np.array(train_X)

test_X = list(tqdm(pool.imap(get_feature, ls_test_fullpath), total=len(ls_test_fullpath), desc='Get Test Features'))
test_X = np.array(test_X)

Get Train Features:   9%|▉         | 1874/20310 [01:11<11:38, 26.38it/s]


KeyboardInterrupt: 

In [20]:
# get_feature('../RawData/crema_d/1001_DFA_ANG_XX.wav').shape

In [14]:
train_X.shape

(20310, 196, 39)

In [22]:
np.save(os.path.join('MERGE_TRAIN.npy'), {'x': train_X, 'y': train_y}, allow_pickle=True)
np.save(os.path.join('MERGE_TEST.npy'), {'x': test_X, 'y': test_y}, allow_pickle=True)