## Fine Tuning

In [None]:
import pandas as pd
import numpy as np

import os
import sys

import librosa
import librosa.display
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import train_test_split

from IPython.display import Audio

import keras
from keras.callbacks import ReduceLROnPlateau
from keras.models import Sequential
from keras.layers import Dense, Conv1D, MaxPooling1D, Flatten, Dropout, BatchNormalization
from keras.callbacks import ModelCheckpoint, History

import warnings
# Silence every warning unless warning options were supplied when the
# interpreter was started (sys.warnoptions is non-empty in that case).
if not sys.warnoptions:
    warnings.simplefilter("ignore")
# Independently of the check above, always ignore DeprecationWarning.
warnings.filterwarnings("ignore", category=DeprecationWarning) 

In [None]:
!pip install -q transformers datasets evaluate

In [None]:
# Root directories of the four speech corpora read by the loader cells below.
# Every path ends with "/" because those loaders build file paths by plain
# string concatenation (root + name).
Ravdess = "/Data/audio_speech_actors_01-24/"
Crema = "/Data/AudioWAV/"
Tess = "/Data/toronto-emotional-speech-set-tess/tess toronto emotional speech set data/TESS Toronto emotional speech set data/"
Savee = "/Data/surrey-audiovisual-expressed-emotion-savee/ALL/"

In [None]:
# Build a DataFrame with one row per RAVDESS recording: its emotion and path.
# Layout read here: <Ravdess>/<sub-directory>/<file>; the third dash-separated
# field of the file name (before the extension) is a numeric emotion code.
ravdess_directory_list = os.listdir(Ravdess)

file_emotion = []
file_path = []
# `actor_dir` rather than `dir`, so the builtin dir() is not shadowed.
for actor_dir in ravdess_directory_list:
    for file_name in os.listdir(Ravdess + actor_dir):
        fields = file_name.split('.')[0].split('-')
        file_emotion.append(int(fields[2]))
        file_path.append(Ravdess + actor_dir + '/' + file_name)

emotion_df = pd.DataFrame(file_emotion, columns=['Emotions'])

path_df = pd.DataFrame(file_path, columns=['Path'])
Ravdess_df = pd.concat([emotion_df, path_df], axis=1)

# Translate numeric codes into label strings (codes 1 and 2 both become
# 'neutral'). The result is assigned back to the column: calling
# replace(..., inplace=True) on `Ravdess_df.Emotions` is a chained in-place
# update, which pandas warns about and which does not modify the DataFrame
# when Copy-on-Write is enabled.
Ravdess_df['Emotions'] = Ravdess_df['Emotions'].replace(
    {1: 'neutral', 2: 'neutral', 3: 'happy', 4: 'sad', 5: 'angry', 6: 'fear', 7: 'disgust', 8: 'surprise'}
)
Ravdess_df.head()

In [None]:
# Build a DataFrame with one row per CREMA recording: its emotion and path.
crema_directory_list = os.listdir(Crema)

# Emotion code (third underscore-separated field of the file name) -> label.
crema_emotion_codes = {
    'SAD': 'sad',
    'ANG': 'angry',
    'DIS': 'disgust',
    'FEA': 'fear',
    'HAP': 'happy',
    'NEU': 'neutral',
}

file_emotion = []
file_path = []

for file in crema_directory_list:
    file_path.append(Crema + file)
    part = file.split('_')
    # Any code missing from the table is recorded as 'Unknown'.
    file_emotion.append(crema_emotion_codes.get(part[2], 'Unknown'))

emotion_df = pd.DataFrame({'Emotions': file_emotion})

path_df = pd.DataFrame({'Path': file_path})
Crema_df = pd.concat([emotion_df, path_df], axis=1)
Crema_df.head()

In [None]:
# Build a DataFrame with one row per TESS recording: its emotion and path.
# Layout read here: <Tess>/<sub-directory>/<file>; the third
# underscore-separated field of the file name (before the extension) is the
# emotion label.
tess_directory_list = os.listdir(Tess)

file_emotion = []
file_path = []

# `emotion_dir` rather than `dir`, so the builtin dir() is not shadowed.
for emotion_dir in tess_directory_list:
    for file_name in os.listdir(Tess + emotion_dir):
        label = file_name.split('.')[0].split('_')[2]
        # 'ps' is relabelled 'surprise' so it matches the label used for the
        # other corpora; every other label is kept as found.
        file_emotion.append('surprise' if label == 'ps' else label)
        file_path.append(Tess + emotion_dir + '/' + file_name)

emotion_df = pd.DataFrame(file_emotion, columns=['Emotions'])

path_df = pd.DataFrame(file_path, columns=['Path'])
Tess_df = pd.concat([emotion_df, path_df], axis=1)
Tess_df.head()

In [None]:
# Build a DataFrame with one row per SAVEE recording: its emotion and path.
savee_directory_list = os.listdir(Savee)

# Emotion prefix found in the file name -> label.
savee_emotion_prefixes = {
    'a': 'angry',
    'd': 'disgust',
    'f': 'fear',
    'h': 'happy',
    'n': 'neutral',
    'sa': 'sad',
}

file_emotion = []
file_path = []

for file in savee_directory_list:
    file_path.append(Savee + file)
    part = file.split('_')[1]
    # Strip the last six characters of the second underscore-separated field;
    # what remains is the emotion prefix.
    ele = part[:-6]
    # Every prefix missing from the table is labelled 'surprise'.
    file_emotion.append(savee_emotion_prefixes.get(ele, 'surprise'))

emotion_df = pd.DataFrame({'Emotions': file_emotion})

path_df = pd.DataFrame({'Path': file_path})
Savee_df = pd.concat([emotion_df, path_df], axis=1)
Savee_df.head()

In [None]:
# Stack the four per-corpus frames row-wise into one table of (Emotions, Path).
# No ignore_index is passed, so each frame's own row labels are retained and
# index values repeat across corpora.
data_path = pd.concat([Ravdess_df, Crema_df, Tess_df, Savee_df], axis = 0)
# Persist the combined table without the index column.
data_path.to_csv("data_path.csv",index=False)
data_path.head()

In [None]:
paths = data_path.Path
emotions = data_path.Emotions

def get_waveform(file_path, sample_rate):
    """Load an audio file with librosa and return only its samples.

    Args:
        file_path: Path of the audio file to read.
        sample_rate: Passed to ``librosa.load`` as ``sr``.

    Returns:
        The sample array returned by ``librosa.load``; the sampling rate it
        also returns is discarded.
    """
    waveform, _ = librosa.load(file_path, sr=sample_rate)
    return waveform

# Load every recording at 16 kHz. A comprehension keeps the loop variable
# local, so the module-level `file_path` list built by the loader cells is not
# overwritten with the last path string.
audio_data = [get_waveform(path, sample_rate=16000) for path in paths]

In [None]:
from datasets import Dataset, DatasetDict, Features, ClassLabel, Value, Audio
from transformers import AutoFeatureExtractor
import evaluate

feature_extractor = AutoFeatureExtractor.from_pretrained("facebook/wav2vec2-base")

# Kept for compatibility with code that may reference it; the compute_metrics
# function defined later in this file relies on scikit-learn metrics instead.
accuracy = evaluate.load("accuracy")

paths = data_path.Path
emotions = data_path.Emotions

from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
encoder = LabelEncoder()
# LabelEncoder works on a 1-D array of labels; feeding it an (n, 1) column
# only works through an implicit flatten that emits a conversion warning.
encoded_emotion = encoder.fit_transform(np.asarray(emotions))

data = {"audio": audio_data, "label": encoded_emotion.astype(int)}
dataset = Dataset.from_dict(data)

def preprocess_function(examples):
    """Run the feature extractor over a batch of raw waveforms.

    Args:
        examples: Batch mapping whose ``"audio"`` entry holds the waveforms.

    Returns:
        The feature extractor output for the batch, truncated to at most
        16000 samples per example and padded to a common length.
    """
    return feature_extractor(
        list(examples["audio"]),
        sampling_rate=feature_extractor.sampling_rate,
        max_length=16000,
        truncation=True,
        padding=True,
    )


# A fixed seed makes the 80/20 split identical on every run, so metrics from
# different runs are computed on the same held-out examples.
split = dataset.train_test_split(test_size=0.2, seed=42)
encoded_split = split.map(preprocess_function, remove_columns="audio", batched=True)
# Rename the target column from "label" to "labels" for the training cells.
encoded_split = encoded_split.rename_column("label", "labels")

In [None]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, balanced_accuracy_score
from datasets import load_metric
def compute_metrics(p):
    """Compute classification metrics from a prediction object.

    Args:
        p: Object exposing ``predictions`` (per-class scores, one row per
            example) and ``label_ids`` (the reference class ids).

    Returns:
        Dict with ``accuracy``, ``balanced_accuracy``, and the
        weighted-average ``f1``, ``precision`` and ``recall``.
    """
    # The predicted class is the index of the highest score in each row.
    y_pred = np.argmax(p.predictions, axis=1)
    y_true = p.label_ids

    weighted = precision_recall_fscore_support(y_true, y_pred, average='weighted')
    precision, recall, f1 = weighted[0], weighted[1], weighted[2]

    metrics = {}
    metrics['accuracy'] = accuracy_score(y_true, y_pred)
    metrics['balanced_accuracy'] = balanced_accuracy_score(y_true, y_pred)
    metrics['f1'] = f1
    metrics['precision'] = precision
    metrics['recall'] = recall
    return metrics

In [None]:
# labels = np.unique(data_path.Emotions)
# Two-way mapping between class names and class ids, following the order of
# encoder.classes_. Ids are stored as strings on both sides.
label2id = {name: str(idx) for idx, name in enumerate(encoder.classes_)}
id2label = {idx: name for name, idx in label2id.items()}

In [None]:
from transformers import AutoModelForAudioClassification, TrainingArguments, Trainer

# Load the "facebook/wav2vec2-base" checkpoint through the audio-classification
# auto class, passing the number of emotion classes and both label mappings.
num_labels = len(id2label)
model = AutoModelForAudioClassification.from_pretrained(
    "facebook/wav2vec2-base", num_labels=num_labels, label2id=label2id, id2label=id2label
)

In [None]:
# NOTE(review): a `!` line runs in its own shell, so this export presumably
# does not reach the notebook kernel; the os.environ assignment below is what
# sets the variable for this process.
!export WANDB_DISABLED=true
import os
os.environ["WANDB_DISABLED"] = "true"

In [None]:
# Training configuration: evaluate, log and checkpoint once per epoch, and
# reload the checkpoint with the best "accuracy" when training finishes.
training_args = TrainingArguments(
    output_dir="my_awesome_mind_model",
    evaluation_strategy="epoch",
    logging_strategy="epoch",
    save_strategy="epoch",
    do_eval=True,
    learning_rate=3e-5,
    per_device_train_batch_size=16,
    # 16 samples x 4 accumulation steps = 64 samples per optimizer update per device.
    gradient_accumulation_steps=4,
    per_device_eval_batch_size=16,
    save_total_limit=3,
    num_train_epochs=20,
    warmup_ratio=0.1,
    # NOTE(review): logging_strategy is "epoch", so this step interval is
    # presumably ignored - confirm against the installed transformers version.
    logging_steps=10,
    load_best_model_at_end=True,
    # "accuracy" is one of the keys returned by compute_metrics.
    metric_for_best_model="accuracy",
    push_to_hub=False,
    report_to="none"
)

# The held-out "test" split doubles as the per-epoch evaluation set, and the
# feature extractor is handed over in the `tokenizer` slot.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=encoded_split["train"],
    eval_dataset=encoded_split["test"],
    tokenizer=feature_extractor,
    compute_metrics=compute_metrics,
)

In [None]:
# Run the fine-tuning loop configured by `training_args`.
trainer.train()

In [None]:
# Write the trainer's current model to a fixed directory under /Data.
trainer.save_model("/Data/model_saved")

In [None]:
# Evaluate on the held-out split (the same split used as eval_dataset above).
trainer.evaluate(encoded_split['test'])

In [None]:
# Keep the raw prediction output for the held-out split; later cells read
# `pred.predictions` to derive class ids.
pred = trainer.predict(encoded_split['test'])

In [None]:
# Predicted class id per test example: index of the highest score in each row.
predicted_class_ids = pred.predictions.argmax(-1)
id2label = model.config.id2label


def _label_for(class_id):
    """Return the class name for ``class_id``.

    The mapping on the model config may be keyed by int or by str depending
    on how the config was built (the demo code in this file indexes it with
    int ids), so the int key is tried first and the str key second.

    Raises:
        KeyError: If the id is present under neither key type.
    """
    key = int(class_id)
    if key in id2label:
        return id2label[key]
    return id2label[str(key)]


predicted_labels = [_label_for(class_id) for class_id in predicted_class_ids]

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, accuracy_score

true_class_ids = encoded_split['test']["labels"]

# Class names in id order: label2id was filled by enumerating the encoder's
# classes, and dicts preserve insertion order.
class_names = list(label2id)
class_ids = list(range(len(class_names)))

# Passing `labels` fixes the matrix to one row/column per known class, so it
# always lines up with `class_names` even if a class is absent from both the
# references and the predictions of this split.
cm = confusion_matrix(true_class_ids, predicted_class_ids, labels=class_ids)

# Give the display an explicit list of names instead of the label2id dict.
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=class_names)
print(f"Test Accuracy: {accuracy_score(true_class_ids, predicted_class_ids)*100:.2f}%")
disp.plot(cmap=plt.cm.Blues)
plt.show()

## Demo

In [None]:
import os
import pandas as pd
import numpy as np

Emotion_audio = "/Data/emotions-on-audio-dataset/files"  # external data

# Build a DataFrame with one row per file found one directory level below
# the root: its emotion label and its full path.
emotion_audio_directory_list = os.listdir(Emotion_audio)

file_emotion, file_path = [], []
for subdir in emotion_audio_directory_list:
    subdir_path = os.path.join(Emotion_audio, subdir)
    for clip_name in os.listdir(subdir_path):
        # The text before the first '.' of the file name is the emotion label.
        file_emotion.append(clip_name.split('.')[0])
        file_path.append(os.path.join(subdir_path, clip_name))

# Fold label variants into the label names used by the training data.
label_aliases = {'euphoric': 'happy', 'joyfully': 'happy', 'surprised': 'surprise'}

Emotion_audio_df = pd.DataFrame({"Emotions": file_emotion, "Path": file_path})
Emotion_audio_df['Emotions'] = Emotion_audio_df['Emotions'].replace(label_aliases)
Emotion_audio_df.head()

In [2]:
!pip install -q transformers datasets evaluate

In [3]:
from transformers import AutoModelForAudioClassification, AutoFeatureExtractor

# Load the fine-tuned classifier from a local directory.
model_dir = "/Data/speech-emotion-recognition-models/model_saved"
model = AutoModelForAudioClassification.from_pretrained(model_dir)
# Despite its name, `tokenizer` holds a feature extractor; it is loaded from
# the base checkpoint name rather than from `model_dir`.
tokenizer = AutoFeatureExtractor.from_pretrained("facebook/wav2vec2-base")

preprocessor_config.json:   0%|          | 0.00/159 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.84k [00:00<?, ?B/s]



In [4]:
import librosa
import torch
# import torchaudio

# def get_waveform(file_path, sample_rate):
#     waveform, sr = torchaudio.load(file_path)
#     if sr != sample_rate:
#         waveform = torchaudio.functional.resample(waveform, orig_freq=sr, new_freq=sample_rate)
#     return waveform.squeeze().numpy()

def get_waveform(file_path, sample_rate):
    """Load an audio file with librosa and return only its samples.

    Args:
        file_path: Path of the audio file to read.
        sample_rate: Passed to ``librosa.load`` as ``sr``.

    Returns:
        The sample array returned by ``librosa.load``; the accompanying
        sampling rate is dropped.
    """
    samples, _rate = librosa.load(file_path, sr=sample_rate)
    return samples

def predict_waveforms(waveforms, return_labeled=False):
    """Classify a batch of waveforms with the module-level ``model``.

    Args:
        waveforms: Waveforms to classify, passed as one batch to the
            module-level ``tokenizer``.
        return_labeled: When true, return class names looked up in
            ``model.config.id2label`` instead of class ids.

    Returns:
        A list with one class id per waveform, or one class name per
        waveform when ``return_labeled`` is true.
    """
    # Inputs are truncated to at most 16000 samples and padded to equal length.
    encoded = tokenizer(
        waveforms,
        return_tensors="pt",
        sampling_rate=tokenizer.sampling_rate,
        max_length=16_000,
        padding=True,
        truncation=True,
    )

    # Inference only: gradients are not tracked.
    with torch.no_grad():
        logits = model(**encoded).logits

    class_ids = logits.argmax(dim=-1).tolist()
    if return_labeled:
        names_by_id = model.config.id2label
        return [names_by_id[class_id] for class_id in class_ids]
    return class_ids

In [29]:
from IPython.display import display, Audio

# Play two sample clips and print the reference label next to the prediction.
for n in (0, 6):
    print(f"Index: {n}")
    clip_path = Emotion_audio_df.Path.iloc[n]
    display(Audio(clip_path))
    actual_label = Emotion_audio_df.Emotions.iloc[n]
    print(f"Actual: {actual_label}")
    # The clip is loaded at 16 kHz and classified as a batch of one.
    clip_waveform = get_waveform(Emotion_audio_df.Path.iloc[n], 16_000)
    predicted_label = predict_waveforms([clip_waveform], return_labeled=True)[0]
    print(f"Predicted: {predicted_label}")
    print("\n")

Index: 0


Actual: happy
Predicted: fear


Index: 6


Actual: sad
Predicted: sad


