## Import Libraries

In [1]:
import os
import librosa
import librosa.display
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.decomposition import PCA

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Flatten, Dense, Dropout, Activation
from tensorflow.keras.optimizers import SGD
from tensorflow.keras.utils import to_categorical

## Load Data

In [2]:
DATASET_PATH = "../data/raw/"

In [3]:
# Liste pour stocker les chemins des fichiers et leurs labels
file_paths = []
labels = []

# Map des émotions pour RAVDESS
emotion_map = {
    "01": "neutral",
    "02": "calm",
    "03": "happy",
    "04": "sad",
    "05": "angry",
    "06": "fearful",
    "07": "disgust",
    "08": "surprised"
}

# Parcours des fichiers dans le dossier
for root, dirs, files in os.walk(DATASET_PATH):
    for file in files:
        if file.endswith(".wav"):
            # Ajouter le chemin complet du fichier
            file_paths.append(os.path.join(root, file))
            
            # Extraire l'émotion à partir du nom du fichier
            emotion_code = file.split("-")[2]  # Le 3ème segment du nom correspond à l'émotion
            labels.append(emotion_map[emotion_code])

In [4]:
print(len(file_paths))

1440


In [5]:
print(len(labels))

1440


## Simple Encoding

In [34]:
def extract_features_simple(file_path):
    audio, sr = librosa.load(file_path, sr=None)  # Charger le fichier audio avec son taux d'échantillonnage d'origine
    mfccs = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=40)  # Calculer 40 coefficients MFCC
    mfccs_scaled = np.mean(mfccs.T, axis=0)  # Moyenne sur le temps pour réduire la dimensionnalité
    return mfccs_scaled

In [35]:
features = []
for file_path in file_paths:
    features.append(extract_features_simple(file_path))

In [36]:
print("Taille du vecteur représentant un fichier :", len(features[1]))
print("Nombre de fichiers :", len(features))

Taille du vecteur représentant un fichier : 40
Nombre de fichiers : 1440


In [37]:
# Convertir les données en DataFrame
data = pd.DataFrame(features)
data['label'] = labels

data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,31,32,33,34,35,36,37,38,39,label
0,-726.217224,68.54142,3.293398,12.2053,5.510278,13.667408,-2.983829,3.098029,-3.310813,-1.564384,...,-1.399109,-2.926855,0.013957,-0.490734,-0.570905,0.040399,-1.207217,-1.594982,-1.436487,neutral
1,-719.128296,70.201569,1.168397,13.122543,7.83695,14.41129,-4.11136,4.468973,-3.539367,-3.658607,...,-2.52147,-2.987673,0.409735,-0.484184,-1.398391,0.255203,-0.984978,-2.093061,-1.040791,neutral
2,-714.995728,69.689346,3.924564,11.92419,6.421723,11.011614,-2.878103,4.509558,-4.476109,-2.67155,...,-0.909152,-3.045955,-0.373294,-0.849145,-0.922105,-0.17032,-1.144423,-1.725613,-1.450561,neutral
3,-710.975281,67.56488,5.782241,13.230727,6.190846,12.628252,-1.67517,5.657494,-4.950634,-3.477545,...,-1.329651,-2.513405,-0.190276,-0.645949,-0.553919,0.459299,-1.580085,-1.647682,-1.509511,neutral
4,-759.921753,75.783524,6.023605,14.557394,6.454188,14.631508,-3.004551,4.62097,-5.200016,-0.70743,...,-2.188582,-2.835501,0.463746,-1.019167,-1.411441,0.350433,-1.519892,-1.250112,-0.613852,calm


In [38]:
# Encoder les labels en valeurs numériques
label_encoder = LabelEncoder()
data['label_encoded'] = label_encoder.fit_transform(data['label'])

data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,32,33,34,35,36,37,38,39,label,label_encoded
0,-726.217224,68.54142,3.293398,12.2053,5.510278,13.667408,-2.983829,3.098029,-3.310813,-1.564384,...,-2.926855,0.013957,-0.490734,-0.570905,0.040399,-1.207217,-1.594982,-1.436487,neutral,5
1,-719.128296,70.201569,1.168397,13.122543,7.83695,14.41129,-4.11136,4.468973,-3.539367,-3.658607,...,-2.987673,0.409735,-0.484184,-1.398391,0.255203,-0.984978,-2.093061,-1.040791,neutral,5
2,-714.995728,69.689346,3.924564,11.92419,6.421723,11.011614,-2.878103,4.509558,-4.476109,-2.67155,...,-3.045955,-0.373294,-0.849145,-0.922105,-0.17032,-1.144423,-1.725613,-1.450561,neutral,5
3,-710.975281,67.56488,5.782241,13.230727,6.190846,12.628252,-1.67517,5.657494,-4.950634,-3.477545,...,-2.513405,-0.190276,-0.645949,-0.553919,0.459299,-1.580085,-1.647682,-1.509511,neutral,5
4,-759.921753,75.783524,6.023605,14.557394,6.454188,14.631508,-3.004551,4.62097,-5.200016,-0.70743,...,-2.835501,0.463746,-1.019167,-1.411441,0.350433,-1.519892,-1.250112,-0.613852,calm,1


In [43]:
data.shape

(1440, 42)

In [45]:
data.to_csv("../data/preprocessed/simple_processed_data.csv", index=False)

In [39]:
# Diviser les données
X = data.iloc[:, :-2].values  # Toutes les colonnes sauf 'label' et 'label_encoded'
y = data['label_encoded'].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

## Advanced Encoding

In [6]:
def extract_features_advanced(file_path):
    # Charger le fichier audio
    audio, sr = librosa.load(file_path, sr=None)  
    
    # MFCCs : 100 coefficients
    mfccs = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=100)
    mfccs_scaled = np.mean(mfccs.T, axis=0)
    
    # Chroma STFT : Capturer l'énergie distribuée à travers les 12 tons
    chroma_stft = librosa.feature.chroma_stft(y=audio, sr=sr)
    chroma_stft_scaled = np.mean(chroma_stft.T, axis=0)
    
    # Spectrogramme de puissance (Mel Spectrogram) : 128 valeurs
    mel_spectrogram = librosa.feature.melspectrogram(y=audio, sr=sr)
    mel_spectrogram_db = librosa.power_to_db(mel_spectrogram)
    mel_spectrogram_scaled = np.mean(mel_spectrogram_db.T, axis=0)
    
    # STFT (Short-Time Fourier Transform) : 1025 valeurs
    stft = np.abs(librosa.stft(audio))
    stft_scaled = np.mean(stft.T, axis=0)
    
    # Concaténer toutes les caractéristiques
    combined_features = np.hstack([mfccs_scaled, chroma_stft_scaled, mel_spectrogram_scaled, stft_scaled])
    
    return combined_features

#### Problème : caractéristiques inéquitables en terme de features --> STFT va prendre toute la place.

#### Solutions :
    - StandardScaler pour normaliser toutes les valeurs
    - Réduction de dimensionnalité pour les caractéristiques dominants

In [7]:
features = []
for file_path in file_paths:
    features.append(extract_features_advanced(file_path))

In [8]:
print("Taille du vecteur représentant un fichier :", len(features[1]))
print("Nombre de fichiers :", len(features))

Taille du vecteur représentant un fichier : 1265
Nombre de fichiers : 1440


In [9]:
# Convertir les données en DataFrame
data = pd.DataFrame(features)
data.rename(columns={col: f'feature_{col}' for col in data.columns if isinstance(col, int)}, inplace=True)
data['true_label'] = labels

data.head()

Unnamed: 0,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,...,feature_1256,feature_1257,feature_1258,feature_1259,feature_1260,feature_1261,feature_1262,feature_1263,feature_1264,true_label
0,-726.217224,68.54142,3.293398,12.2053,5.510278,13.667408,-2.983828,3.098029,-3.310813,-1.564384,...,0.000278,0.000283,0.00031,0.000303,0.000316,0.000304,0.000274,0.000288,0.000286,neutral
1,-719.128296,70.201569,1.168397,13.122541,7.83695,14.41129,-4.11136,4.468973,-3.539367,-3.658607,...,0.00029,0.000306,0.00031,0.000306,0.000302,0.0003,0.000285,0.000275,0.000242,neutral
2,-714.995728,69.689346,3.924564,11.92419,6.421723,11.011614,-2.878103,4.509558,-4.476109,-2.671549,...,0.000286,0.000255,0.000259,0.000288,0.000284,0.000284,0.000261,0.000258,0.000254,neutral
3,-710.975281,67.56488,5.782241,13.230726,6.190845,12.628252,-1.675169,5.657494,-4.950634,-3.477545,...,0.000257,0.000283,0.000289,0.000293,0.00028,0.000279,0.000285,0.000272,0.000221,neutral
4,-759.921753,75.783524,6.023605,14.557394,6.454187,14.631508,-3.004551,4.62097,-5.200016,-0.70743,...,0.000268,0.000268,0.000264,0.000277,0.000279,0.000279,0.000293,0.000264,0.000221,calm


In [10]:
# Encoder les labels en valeurs numériques
label_encoder = LabelEncoder()
data['true_label_encoded'] = label_encoder.fit_transform(data['true_label'])

data.head()

Unnamed: 0,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,...,feature_1257,feature_1258,feature_1259,feature_1260,feature_1261,feature_1262,feature_1263,feature_1264,true_label,true_label_encoded
0,-726.217224,68.54142,3.293398,12.2053,5.510278,13.667408,-2.983828,3.098029,-3.310813,-1.564384,...,0.000283,0.00031,0.000303,0.000316,0.000304,0.000274,0.000288,0.000286,neutral,5
1,-719.128296,70.201569,1.168397,13.122541,7.83695,14.41129,-4.11136,4.468973,-3.539367,-3.658607,...,0.000306,0.00031,0.000306,0.000302,0.0003,0.000285,0.000275,0.000242,neutral,5
2,-714.995728,69.689346,3.924564,11.92419,6.421723,11.011614,-2.878103,4.509558,-4.476109,-2.671549,...,0.000255,0.000259,0.000288,0.000284,0.000284,0.000261,0.000258,0.000254,neutral,5
3,-710.975281,67.56488,5.782241,13.230726,6.190845,12.628252,-1.675169,5.657494,-4.950634,-3.477545,...,0.000283,0.000289,0.000293,0.00028,0.000279,0.000285,0.000272,0.000221,neutral,5
4,-759.921753,75.783524,6.023605,14.557394,6.454187,14.631508,-3.004551,4.62097,-5.200016,-0.70743,...,0.000268,0.000264,0.000277,0.000279,0.000279,0.000293,0.000264,0.000221,calm,1


In [11]:
data.shape

(1440, 1267)

In [12]:
data.to_csv("../data/ref_data.csv", index=False)

In [13]:
# Diviser les données
X = data.iloc[:, :-2].values  # Toutes les colonnes sauf 'label' et 'label_encoded'
y = data['true_label_encoded'].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

In [14]:
X_train.shape

(1152, 1265)

## Real Preprocessing

In [8]:
def extract_features_real(file_path):
    # Charger le fichier audio
    audio, sr = librosa.load(file_path, sr=None)  
    
    # MFCCs : 100 coefficients
    mfccs = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=100)
    mfccs_scaled = np.mean(mfccs.T, axis=0)
    
    # Chroma STFT : Capturer l'énergie distribuée à travers les 12 tons
    chroma_stft = librosa.feature.chroma_stft(y=audio, sr=sr)
    chroma_stft_scaled = np.mean(chroma_stft.T, axis=0)
    
    # Spectrogramme de puissance (Mel Spectrogram) : 128 valeurs
    mel_spectrogram = librosa.feature.melspectrogram(y=audio, sr=sr)
    mel_spectrogram_db = librosa.power_to_db(mel_spectrogram)
    mel_spectrogram_scaled = np.mean(mel_spectrogram_db.T, axis=0)
    
    # STFT (Short-Time Fourier Transform) : Réduction dimensionnelle et agrégation
    stft = np.abs(librosa.stft(audio))
    stft_scaled = StandardScaler().fit_transform(stft.T).T
    pca_stft = PCA(n_components=40)
    stft_reduced = pca_stft.fit_transform(stft_scaled.T)
    stft_final = np.mean(stft_reduced, axis=0).reshape(-1, 1)
    
    # Normalisation des caractéristiques agrégées
    mfccs_scaled = StandardScaler().fit_transform(mfccs_scaled.reshape(-1, 1)).flatten()
    chroma_stft_scaled = StandardScaler().fit_transform(chroma_stft_scaled.reshape(-1, 1)).flatten()
    mel_spectrogram_scaled = StandardScaler().fit_transform(mel_spectrogram_scaled.reshape(-1, 1)).flatten()
    stft_final = StandardScaler().fit_transform(stft_final).flatten()
    
    # Concaténer toutes les caractéristiques
    combined_features = np.hstack([mfccs_scaled, chroma_stft_scaled, mel_spectrogram_scaled, stft_final])
    
    return combined_features

In [9]:
features = []
for file_path in file_paths:
    features.append(extract_features_real(file_path))

In [10]:
print("Taille du vecteur représentant un fichier :", len(features[1]))
print("Nombre de fichiers :", len(features))

Taille du vecteur représentant un fichier : 280
Nombre de fichiers : 1440


In [11]:
# Convertir les données en DataFrame
data = pd.DataFrame(features)
data['label'] = labels

data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,271,272,273,274,275,276,277,278,279,label
0,-9.899629,1.040195,0.142059,0.264731,0.172574,0.284856,0.055653,0.139369,0.051152,0.075191,...,0.344707,0.087002,-0.256639,-0.697508,0.105721,0.178121,-0.133344,0.102436,0.58769,neutral
1,-9.895485,1.07216,0.112952,0.279053,0.205611,0.29696,0.03959,0.158813,0.047538,0.045881,...,-0.166417,0.741152,-0.721999,0.61934,-0.139517,-0.237171,-0.00349,0.334601,0.185698,neutral
2,-9.896739,1.071867,0.152583,0.264405,0.18749,0.251649,0.057493,0.160761,0.035156,0.060381,...,-0.062602,-0.330205,-0.267901,-0.38096,-0.424611,0.361369,-0.157882,-0.301333,0.257123,neutral
3,-9.898187,1.046844,0.178279,0.282993,0.184024,0.274523,0.07344,0.176526,0.027392,0.048101,...,1.141044,-0.65814,-1.009656,-0.195508,-0.090185,-0.634706,-0.523063,0.065426,0.039363,neutral
4,-9.89374,1.091616,0.174621,0.286798,0.180281,0.287772,0.055946,0.156183,0.027086,0.086141,...,0.542581,-0.067338,0.127221,-1.047974,-0.056026,0.068401,0.1969,-0.025711,0.267484,calm


In [12]:
# Encoder les labels en valeurs numériques
label_encoder = LabelEncoder()
data['label_encoded'] = label_encoder.fit_transform(data['label'])

data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,272,273,274,275,276,277,278,279,label,label_encoded
0,-9.899629,1.040195,0.142059,0.264731,0.172574,0.284856,0.055653,0.139369,0.051152,0.075191,...,0.087002,-0.256639,-0.697508,0.105721,0.178121,-0.133344,0.102436,0.58769,neutral,5
1,-9.895485,1.07216,0.112952,0.279053,0.205611,0.29696,0.03959,0.158813,0.047538,0.045881,...,0.741152,-0.721999,0.61934,-0.139517,-0.237171,-0.00349,0.334601,0.185698,neutral,5
2,-9.896739,1.071867,0.152583,0.264405,0.18749,0.251649,0.057493,0.160761,0.035156,0.060381,...,-0.330205,-0.267901,-0.38096,-0.424611,0.361369,-0.157882,-0.301333,0.257123,neutral,5
3,-9.898187,1.046844,0.178279,0.282993,0.184024,0.274523,0.07344,0.176526,0.027392,0.048101,...,-0.65814,-1.009656,-0.195508,-0.090185,-0.634706,-0.523063,0.065426,0.039363,neutral,5
4,-9.89374,1.091616,0.174621,0.286798,0.180281,0.287772,0.055946,0.156183,0.027086,0.086141,...,-0.067338,0.127221,-1.047974,-0.056026,0.068401,0.1969,-0.025711,0.267484,calm,1


In [13]:
data.shape

(1440, 282)

In [14]:
data.to_csv("../data/preprocessed/real_processed_data.csv", index=False)

In [15]:
# Diviser les données
X = data.iloc[:, :-2].values  # Toutes les colonnes sauf 'label' et 'label_encoded'
y = data['label_encoded'].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

In [16]:
X_train.shape

(1152, 280)

IL FAUT GARDER QUELQUES DONNÉES DE CÔTÉ POUR LES UPLOAD APRES !!!