In [1]:
import librosa.display
import os
from scipy.io import wavfile
import librosa
import os
import pandas as pd
import numpy as np
import re
import glob
import matplotlib.pyplot as plt

## Silence elimination

In [13]:
def eliminar_silencio(ruta_base: str, carpetas: list, ruta_salida: str = "../data/clean_audios") -> None:
    """
    Trims silence from the .wav files in the specified folders and saves the
    trimmed files under an output folder that mirrors the input folder layout.

    :param ruta_base: The base path where the folders with the audio files are located.
    :type ruta_base: str
    :param carpetas: A list with the names of the folders (relative to ``ruta_base``) to be processed.
    :type carpetas: list
    :param ruta_salida: Root folder for the trimmed files. Defaults to '../data/clean_audios',
        the location used by the rest of this notebook.
    :type ruta_salida: str
    :return: None
    """
    # makedirs instead of mkdir: creates missing parent folders and tolerates
    # an already existing output folder.
    os.makedirs(ruta_salida, exist_ok=True)

    for carpeta in carpetas:
        carpeta_origen = os.path.join(ruta_base, carpeta)
        patron = os.path.join(carpeta_origen, '**', '*.wav')
        for archivo in glob.glob(patron, recursive=True):
            # sr=None keeps the sampling rate stored in the file.
            audio, tasa_muestreo = librosa.load(archivo, sr=None)
            # trim() also returns the kept sample interval, which is not needed here.
            audio_sin_silencio, _ = librosa.effects.trim(audio)
            # Rebuild the path of the file relative to its dataset folder under the output root.
            ruta_limpia = os.path.join(ruta_salida, carpeta, os.path.relpath(archivo, carpeta_origen))
            os.makedirs(os.path.dirname(ruta_limpia), exist_ok=True)
            wavfile.write(ruta_limpia, tasa_muestreo, audio_sin_silencio)


In [14]:
# Trim silence for the 'iemocap' folder only.
# NOTE(review): later cells read ravdess and TESS from ../data/clean_audios as well,
# so presumably those folders were cleaned in an earlier run - confirm.
eliminar_silencio("../data/audios/",[ "iemocap"])

## Dataframe Creation

### Ravdess

In [2]:
# Folder with the silence-trimmed RAVDESS recordings.
ravdess = '../data/clean_audios/ravdess'

In [3]:
def create_rav_emotion_df(ruta_clean_audios: str) -> pd.DataFrame:
    """
    Creates a dataframe with two columns: emotion and path, using the emotion label encoded in the file names
    of the .wav files in the specified directory (searched recursively).

    The emotion code is the single character at index 7 of the file name. Files whose name is too
    short, or whose code is not one of the known codes 1-8, are skipped instead of being labelled
    with the emotion of the previously visited file.

    :param ruta_clean_audios: The path where the clean audio files are located.
    :type ruta_clean_audios: str
    :return: A pandas dataframe with the emotion label and the path of each recognised file.
    :rtype: pd.DataFrame
    """
    # Emotion code (character at index 7 of the file name) -> label used in this project.
    code_to_emotion = {
        "1": "neutral",
        "2": "neutral",
        "3": "joy",
        "4": "sadness",
        "5": "anger",
        "7": "anger",
        "6": "fear",
        "8": "surprise",
    }

    emotion_paths = []
    for root, _, files in os.walk(ruta_clean_audios):
        for file in files:
            if not file.endswith(".wav"):
                continue

            # Slicing (not indexing) yields "" for names shorter than 8 characters,
            # which then simply fails the lookup below.
            emotion = code_to_emotion.get(file[7:8])
            if emotion is None:
                continue

            emotion_paths.append((emotion, os.path.join(root, file)))

    return pd.DataFrame(emotion_paths, columns=["emotion", "path"])


In [4]:
# Build the (emotion, path) table for RAVDESS and preview the first rows.
ravdess_df=create_rav_emotion_df(ravdess)
ravdess_df.head(5)

Unnamed: 0,emotion,path
0,neutral,../data/clean_audios/ravdess\03-01-01-01-01-01...
1,neutral,../data/clean_audios/ravdess\03-01-01-01-01-01...
2,neutral,../data/clean_audios/ravdess\03-01-01-01-01-01...
3,neutral,../data/clean_audios/ravdess\03-01-01-01-01-01...
4,neutral,../data/clean_audios/ravdess\03-01-01-01-01-01...


### Iemocap

In [5]:
# CSV read by create_path_dataframe; it must provide 'path' and 'emotion' columns.
iemocapCsv = '../data/aud_em/iemo.csv'

In [6]:
def create_path_dataframe(ruta_csv):
    """
    Loads the IEMOCAP index csv and returns its 'emotion' and 'path' columns in the
    project's conventions: every path is prefixed with the clean-audio folder, the
    short emotion codes are replaced by the project labels, and rows whose emotion is
    not one of the labels to consider are dropped. The original row index is kept.

    :param ruta_csv: The path of the csv file to read.
    :type ruta_csv: str
    :return: A pandas dataframe with the columns 'emotion' and 'path'.
    :rtype: pandas.DataFrame
    """
    carpeta_base = "../data/clean_audios/iemocap/"
    etiquetas_validas = ['neutral', 'joy', 'sadness', 'anger', 'fear', 'surprise']
    # Short dataset code -> project label.
    codigo_a_etiqueta = {'neu': 'neutral', 'fru': 'anger', 'sad': 'sadness', 'sur': 'surprise',
                         'ang': 'anger', 'hap': 'joy', 'exc': 'joy', 'fea': 'fear', 'dis': 'anger'}

    def _anteponer_carpeta(ruta_relativa):
        return carpeta_base + ruta_relativa

    tabla = pd.read_csv(ruta_csv)
    tabla['path'] = tabla['path'].apply(_anteponer_carpeta)
    tabla['emotion'] = tabla['emotion'].replace(codigo_a_etiqueta)

    # Keep only rows with a recognised label, and only the two columns of interest.
    filas_validas = tabla['emotion'].isin(etiquetas_validas)
    return tabla.loc[filas_validas, ['emotion', 'path']]


In [7]:
# Build the (emotion, path) table for IEMOCAP and preview the first rows.
iemocap_df = create_path_dataframe(iemocapCsv)
iemocap_df.head(5)

Unnamed: 0,emotion,path
0,neutral,../data/clean_audios/iemocap/Session1/sentence...
1,anger,../data/clean_audios/iemocap/Session1/sentence...
3,surprise,../data/clean_audios/iemocap/Session1/sentence...
4,neutral,../data/clean_audios/iemocap/Session1/sentence...
6,anger,../data/clean_audios/iemocap/Session1/sentence...


### TESS

In [8]:
# Folder with the silence-trimmed TESS recordings (read non-recursively below).
tess = '../data/clean_audios/TESS'

In [9]:
def create_emotion_path_dataframe(ruta: str) -> pd.DataFrame:
    """
    Creates a dataframe with two columns: 'emotion' and 'path'. The function reads the names of the .wav files
    located directly in ``ruta`` (no recursion) and assigns an emotion based on the first known keyword found
    in the lower-cased file name. Files without any known keyword are left out.

    Only the file name is inspected, never the directory part of the path, so a folder name that happens to
    contain a keyword (for example '.../sadie/TESS') cannot mislabel the files. Files are visited in sorted
    name order so the resulting dataframe is the same on every platform.

    :param ruta: The path where the .wav files are located.
    :type ruta: str
    :return: A pandas dataframe with two columns: 'emotion' and 'path'.
    :rtype: pandas.DataFrame
    """
    # Keyword searched in the file name -> project label. Insertion order is the matching priority.
    emotions = {'fear': 'fear', 'ps': 'surprise', 'sad': 'sadness', 'angry': 'anger', 'disgust': 'anger', 'happy': 'joy', 'neutral': 'neutral'}
    data = {'emotion': [], 'path': []}

    for file_name in sorted(os.listdir(ruta)):
        if not file_name.endswith('.wav'):
            continue

        name_lower = file_name.lower()
        emotion = next((value for word, value in emotions.items() if word in name_lower), None)
        if emotion is None:
            continue

        data['emotion'].append(emotion)
        data['path'].append(os.path.join(ruta, file_name))

    return pd.DataFrame(data)

In [10]:
# Build the (emotion, path) table for TESS and preview the first rows.
tess_df=create_emotion_path_dataframe(tess)
tess_df.head(5)

Unnamed: 0,emotion,path
0,anger,../data/clean_audios/TESS\OAF_back_angry.wav
1,anger,../data/clean_audios/TESS\OAF_back_disgust.wav
2,fear,../data/clean_audios/TESS\OAF_back_fear.wav
3,joy,../data/clean_audios/TESS\OAF_back_happy.wav
4,neutral,../data/clean_audios/TESS\OAF_back_neutral.wav


## Dataframe Fusion

In [16]:
# Stack the three per-dataset tables (TESS, IEMOCAP, RAVDESS) into one table with a
# fresh 0..n-1 index, persist it, and preview the first rows.
merged_df = pd.concat([tess_df, iemocap_df, ravdess_df], ignore_index=True)
merged_df.to_csv("../data/aud_em/path_emotion.csv", index=False)
merged_df.head(5)

Unnamed: 0,emotion,path
0,anger,../data/clean_audios/TESS\OAF_back_angry.wav
1,anger,../data/clean_audios/TESS\OAF_back_disgust.wav
2,fear,../data/clean_audios/TESS\OAF_back_fear.wav
3,joy,../data/clean_audios/TESS\OAF_back_happy.wav
4,neutral,../data/clean_audios/TESS\OAF_back_neutral.wav


In [17]:
# Reload the merged table from the file written by the previous cell.
# NOTE(review): presumably kept so the notebook can be resumed from here without
# rebuilding the per-dataset tables - confirm.
merged_df = pd.read_csv("../data/aud_em/path_emotion.csv")

## Audio Length

In [18]:
def obtener_duraciones_df(df: pd.DataFrame, column_path: str) -> list:
    """
    Receives a pandas dataframe with a column of audio file paths and returns a list with
    the duration, in seconds, of every path that contains '.wav'.

    Paths that do not contain '.wav' are skipped, so the returned list can be shorter
    than the dataframe and is not aligned with its rows.

    :param df: The pandas dataframe with the path column.
    :type df: pandas.DataFrame
    :param column_path: The name of the column that contains the audio file paths.
    :type column_path: str
    :return: A list of durations.
    :rtype: list
    """
    # 'path=' replaces the deprecated 'filename=' alias, which emits a warning on every
    # call and is announced to be removed in librosa 1.0.
    return [
        librosa.get_duration(path=path)
        for path in df[column_path]
        if ".wav" in path
    ]


In [19]:
# Measure the duration of every .wav path in the merged table and report how many were measured.
duraciones = obtener_duraciones_df(merged_df, 'path')
print("Number of audio files found:", len(duraciones))

	This alias will be removed in version 1.0.
  duracion = librosa.get_duration(filename=path)


Number of audio files found: 11769


In [23]:
# bin(2) is the string '0b10' (4 characters), so this evaluates to 2 ** (4 - 2) == 4.
# NOTE(review): the expression does not depend on the audio data, and the name is not
# used by any later cell visible in this notebook - confirm whether this cell is still needed.
recommended_n_fft = 2 ** (len(bin(2)) - 2)
print(recommended_n_fft)

4



## Distribution and segment duration determination

In [24]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

def graficar_distribucion(duraciones:list) -> tuple:
    """
    Receives a list of durations and plots, for every whole number of seconds from 0 up to the
    longest duration, the percentage of audios that last at least that long.
    Also calculates the duration used for the segments: the 5th percentile of the durations,
    i.e. the length that 95% of the audios reach or exceed.

    The plot is drawn on a new figure, so repeated calls do not draw over each other.

    :param duraciones: The list of durations (in seconds) to plot. Must not be empty.
    :type duraciones: list
    :return: The figure and the minimum duration for the creation of the segments.
    :rtype: tuple
    :raises IndexError: If ``duraciones`` is empty.
    """
    audios_sorted = np.sort(np.asarray(duraciones, dtype=float))
    duracion_max = audios_sorted[-1]

    # One threshold per whole second, 0..int(duracion_max) inclusive. The same array is used
    # as the x axis: np.arange(duracion_max) would be one element short whenever the longest
    # duration is a whole number, making the x and y lengths differ.
    umbrales = np.arange(int(duracion_max) + 1)
    pct_per_duracion = [
        np.count_nonzero(audios_sorted >= umbral) / audios_sorted.size * 100
        for umbral in umbrales
    ]

    minima = np.percentile(audios_sorted, 5)

    fig, ax = plt.subplots()
    ax.plot(umbrales, pct_per_duracion)
    ax.set_xlabel('Duración (segundos)')
    ax.set_ylabel('Porcentaje de audios')
    ax.set_title('Distribución de duraciones')
    # Guide lines: the 95% level and the duration at which it is reached.
    ax.axhline(y=95, color='r', linestyle='--')
    ax.axvline(x=minima, color='g', linestyle='--')
    print('95% of the wav files surpases ',minima,' seconds')
    return fig, minima


In [28]:
# Select the interactive 'notebook' matplotlib backend, then draw the duration distribution.
# NOTE(review): the backend is switched in the middle of the notebook rather than in the
# top configuration cell - confirm this is intentional.
%matplotlib notebook
fig, minima = graficar_distribucion(duraciones)
fig.show()

<IPython.core.display.Javascript object>

95% of the wav files surpases  1.4877529286474973  seconds


## Features extraction

In [26]:
# Sampling rate passed to librosa.load below; extract_features_from_audio also reads it as a global.
sr = 22050
# NOTE(review): n_mfcc is not referenced by any code visible in this notebook
# (the librosa.feature.mfcc call below does not pass it) - confirm whether it should be.
n_mfcc = 13
# Segment length in seconds. This overwrites the value computed by graficar_distribucion
# (printed above as 1.4877...) with a rounded-down constant.
minima = 1.48

In [22]:
import pandas as pd
import librosa

def convert_audios_to_dataframe(df: pd.DataFrame, duration: float, sr: int) -> pd.DataFrame:
    """
    Cuts a fixed-length window from the centre of every audio file listed in ``df``.

    Rows whose path does not contain '.wav', and audios shorter than the window, are skipped,
    so the result can have fewer rows than ``df``.

    Args:
        df: DataFrame with a 'path' column (audio file paths) and an 'emotion' column (labels).
        duration: Window length in seconds.
        sr: Sampling rate the audio is loaded at.

    Returns:
        DataFrame with the columns 'audio_data' (one array of exactly int(duration * sr)
        samples per row) and 'emotion'.

    """
    data = []
    labels = []
    for path, emotion in zip(df['path'], df['emotion']):
        if ".wav" not in path:
            continue

        # Load the audio file at the requested sampling rate
        audio, tasa_muestreo = librosa.load(path, sr=sr)

        # Window length in samples
        longitud_ventana = int(duration * tasa_muestreo)
        if len(audio) < longitud_ventana:
            continue

        # Start half a window before the centre and take exactly longitud_ventana samples.
        # (centre + longitud_ventana // 2 as the end would lose one sample for odd lengths.)
        inicio_ventana = len(audio) // 2 - longitud_ventana // 2
        fin_ventana = inicio_ventana + longitud_ventana

        # Store the window and its label
        data.append(audio[inicio_ventana:fin_ventana])
        labels.append(emotion)

    # Build a new DataFrame with the windows and labels
    return pd.DataFrame({'audio_data': data, 'emotion': labels})

In [27]:
# Cut one centred window of `minima` seconds from every audio in the merged table.
audiosInCSV = convert_audios_to_dataframe(merged_df, minima, sr)


In [28]:
# Persist the segmented audios and preview the first rows.
# NOTE(review): each 'audio_data' cell holds a NumPy array; to_csv stores its string form,
# which NumPy abbreviates with '...' for long arrays, so this file likely cannot be loaded
# back into the original samples - confirm, and consider a binary format (pickle / .npy).
audiosInCSV.to_csv("../data/aud_em/segmentAudios.csv", index=False)
audiosInCSV.head(5)

Unnamed: 0,audio_data,emotion
0,"[-0.0015142439, -0.0013728759, -0.0009004156, ...",anger
1,"[0.01816618, 0.020015253, 0.022401942, 0.02322...",anger
2,"[0.004384894, -0.0017522312, -0.0026770048, 0....",fear
3,"[-0.050999276, -0.051911533, -0.05182729, -0.0...",joy
4,"[-0.0050611948, -0.0015096308, 0.0069153113, -...",neutral


In [30]:
import librosa
import numpy as np
import scipy.fft

def extract_features_from_audio(audio_data, frame_len, hop_len):
    """
    Extracts features from an audio fragment.

    Reads the module-level ``sr`` (sampling rate) for the MFCC computation.

    Args:
        audio_data: Audio samples.
        frame_len: Frame length in samples.
        hop_len: Hop length between frames in samples.

    Returns:
        Matrix of extracted features: one row per iterated slice, with the first and last
        rows removed. Each row is [mean MFCCs, mean STFT value, mean MFCCs again]; the STFT
        mean is complex, so the matrix has a complex dtype.

    """
    # Slice the audio using frame_length and hop_length
    frames = librosa.util.frame(audio_data, frame_length=frame_len, hop_length=hop_len)

    # Apply a Hanning window along the frame_len axis
    windowed_frames = np.hanning(frame_len).reshape(-1, 1) * frames

    # NOTE(review): librosa.util.frame puts the frames on the LAST axis, i.e. the array is
    # (frame_len, n_frames). Iterating over it therefore yields frame_len rows (one per
    # within-frame sample position), not one row per frame. This matches the observed
    # output of 48 = 50 - 2 rows for frame_len=50. Left unchanged because
    # process_audio_data hard-codes the resulting size (1968 = 48 * 41) - confirm intent.
    features = []
    for frame in windowed_frames:

        result = np.array([])
        # Mean MFCC coefficients
        mfcc = np.mean(librosa.feature.mfcc(y=frame, sr=sr).T, axis=0)
        result = np.hstack((result, mfcc)) # stacking horizontally

        # Mean of the short-time Fourier transform (complex value)
        dft = np.mean(librosa.stft(y=frame))
        result = np.hstack((result, dft)) # stacking horizontally

        # NOTE(review): the original computed a DCT mean here but never used it and
        # appended the MFCCs a second time. The dead DCT computation was removed; the
        # duplicated MFCCs are kept so the row width (41) stays as downstream code expects.
        # Presumably the DCT mean was meant to be appended instead - confirm.
        result = np.hstack((result, mfcc))

        features.append(result)

    # Convert the list of rows into a matrix and drop the first and last rows
    features_matrix = np.array(features)
    features_matrix = features_matrix[1:-1]

    return features_matrix

In [33]:
# Inspect the features of a single audio (row label 98 of audiosInCSV), using
# frame_len=50 and hop_len=25 - the same values process_audio_data uses below.
number = 98
audio_data = audiosInCSV.loc[number, 'audio_data']
audio_emotion = audiosInCSV.loc[number,'emotion']
print(audio_emotion)
feat = extract_features_from_audio(audio_data, 50, 25)
print(feat)
print(type(feat))
print(feat.shape)

surprise




[[-7.56659129e+02+0.j -1.78800977e+01+0.j  9.81759696e+00+0.j ...
   1.58564139e+00+0.j -3.33064382e+00+0.j  2.47999784e+00+0.j]
 [-6.23498537e+02+0.j -2.04504137e+01+0.j  8.88726710e+00+0.j ...
   2.29102088e+00+0.j -3.34119734e+00+0.j  1.47444824e+00+0.j]
 [-5.44721417e+02+0.j -2.03224626e+01+0.j  9.88765814e+00+0.j ...
   1.19541494e+00+0.j -2.69159997e+00+0.j  2.37530867e+00+0.j]
 ...
 [-5.43126036e+02+0.j -2.19848194e+01+0.j  5.10297912e+00+0.j ...
   2.35921495e+00+0.j -5.66383313e+00+0.j  3.77474892e+00+0.j]
 [-6.22397516e+02+0.j -2.20548102e+01+0.j  7.80000503e+00+0.j ...
   4.27487994e+00+0.j -1.94323136e+00+0.j  6.02151356e+00+0.j]
 [-7.57441224e+02+0.j -2.01178277e+01+0.j  1.10218134e+01+0.j ...
   4.96780965e+00+0.j -7.01399586e-01+0.j  6.29257033e+00+0.j]]
<class 'numpy.ndarray'>
(48, 41)


In [37]:
import numpy as np

# Use the feature matrix computed above, which has shape (48, 41)
arr = feat

# Flatten it into a single row with reshape (48 * 41 = 1968 values)
arr_reshaped = arr.reshape(1,1968)

# Print the shapes of both ndarrays
print("Forma original:", arr.shape)  # Output: (48, 41)
print("Forma reordenada:", arr_reshaped.shape)  # Output: (1, 1968)
print(arr_reshaped)

Forma original: (48, 41)
Forma reordenada: (1, 1968)
[[-7.56659129e+02+0.j -1.78800977e+01+0.j  9.81759696e+00+0.j ...
   4.96780965e+00+0.j -7.01399586e-01+0.j  6.29257033e+00+0.j]]


In [81]:
def process_audio_data(df):
    """
    Processes the audio data in a DataFrame: extracts the feature matrix of every row,
    flattens it into one row of numbers, and returns all rows as a feature table.

    Side effect: writes the feature columns (without the emotion) to 'numeros.csv' in the
    current working directory.

    Args:
        df: DataFrame with an 'audio_data' column (audio samples) and an 'emotion' column.

    Returns:
        DataFrame with one 'feature_<k>' column per flattened feature value plus the
        'emotion' column, one row per input row.

    """
    # Flatten each feature matrix into a (1, n_features) row. Rows are collected in a list
    # and stacked once at the end instead of growing an array on every iteration.
    filas = []
    for i, row in df.iterrows():
        features = extract_features_from_audio(row["audio_data"], frame_len=50, hop_len=25)
        filas.append(features.reshape(1, -1))
        print(i,"/",df.shape[0])

    # The width is taken from the data; 1968 (= 48 * 41, the size this notebook produces)
    # is only the fallback for an empty input.
    ancho = filas[0].shape[1] if filas else 1968
    # The leading empty float64 block keeps the dtype promotion of the previous implementation.
    output_data = np.vstack([np.empty((0, ancho))] + filas)

    # Build the output dataframe from the stacked rows
    columns = [f"feature_{k}" for k in range(ancho)]
    df_output = pd.DataFrame(output_data, columns=columns)
    df_output.to_csv("numeros.csv")
    df_output["emotion"] = df["emotion"].values

    return df_output

In [83]:
# Extract and flatten the features of every segmented audio, then preview the first rows.
matrix=process_audio_data(audiosInCSV)
matrix.head(5)



0 / 11199
1 / 11199
2 / 11199
3 / 11199
4 / 11199
5 / 11199
6 / 11199
7 / 11199
8 / 11199
9 / 11199
10 / 11199
11 / 11199
12 / 11199
13 / 11199
14 / 11199
15 / 11199
16 / 11199
17 / 11199
18 / 11199
19 / 11199
20 / 11199
21 / 11199
22 / 11199
23 / 11199
24 / 11199
25 / 11199
26 / 11199
27 / 11199
28 / 11199
29 / 11199
30 / 11199
31 / 11199
32 / 11199
33 / 11199
34 / 11199
35 / 11199
36 / 11199
37 / 11199
38 / 11199
39 / 11199
40 / 11199
41 / 11199
42 / 11199
43 / 11199
44 / 11199
45 / 11199
46 / 11199
47 / 11199
48 / 11199
49 / 11199
50 / 11199
51 / 11199
52 / 11199
53 / 11199
54 / 11199
55 / 11199
56 / 11199
57 / 11199
58 / 11199
59 / 11199
60 / 11199
61 / 11199
62 / 11199
63 / 11199
64 / 11199
65 / 11199
66 / 11199
67 / 11199
68 / 11199
69 / 11199
70 / 11199
71 / 11199
72 / 11199
73 / 11199
74 / 11199
75 / 11199
76 / 11199
77 / 11199
78 / 11199
79 / 11199
80 / 11199
81 / 11199
82 / 11199
83 / 11199
84 / 11199
85 / 11199
86 / 11199
87 / 11199
88 / 11199
89 / 11199
90 / 11199
91 / 1119

Unnamed: 0,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,...,feature_1959,feature_1960,feature_1961,feature_1962,feature_1963,feature_1964,feature_1965,feature_1966,feature_1967,emotion
0,-713.828017+0.000000j,-45.367782+0.000000j,2.092292+0.000000j,-5.445775+0.000000j,19.902062+0.000000j,-1.200404+0.000000j,1.579054+0.000000j,-6.492089+0.000000j,2.562503+0.000000j,5.996100+0.000000j,...,-1.439508+0.000000j,-4.686062+0.000000j,13.082843+0.000000j,-6.580755+0.000000j,5.764794+0.000000j,-13.006678+0.000000j,5.707501+0.000000j,-0.882507+0.000000j,5.204505+0.000000j,anger
1,-815.775057+0.000000j,-27.901613+0.000000j,35.721426+0.000000j,8.957691+0.000000j,-2.474410+0.000000j,-3.934018+0.000000j,-0.961308+0.000000j,-1.157726+0.000000j,0.020717+0.000000j,-5.818001+0.000000j,...,6.319683+0.000000j,4.207037+0.000000j,3.486640+0.000000j,3.296914+0.000000j,0.680617+0.000000j,1.094365+0.000000j,4.719805+0.000000j,-0.868390+0.000000j,0.779289+0.000000j,anger
2,-736.450602+0.000000j,15.234493+0.000000j,19.709823+0.000000j,-11.686956+0.000000j,17.780772+0.000000j,16.959531+0.000000j,-16.780493+0.000000j,13.185469+0.000000j,-22.309625+0.000000j,25.071071+0.000000j,...,4.148269+0.000000j,-11.163866+0.000000j,6.788097+0.000000j,4.069214+0.000000j,1.461103+0.000000j,6.501691+0.000000j,-1.554199+0.000000j,10.302355+0.000000j,3.350331+0.000000j,fear
3,-777.161139+0.000000j,-22.496489+0.000000j,18.079190+0.000000j,-14.778136+0.000000j,-4.668306+0.000000j,-1.567470+0.000000j,4.500020+0.000000j,-3.646335+0.000000j,4.288988+0.000000j,-11.392629+0.000000j,...,-2.881768+0.000000j,3.625926+0.000000j,5.600251+0.000000j,1.785295+0.000000j,0.074829+0.000000j,3.938529+0.000000j,-4.034892+0.000000j,1.611167+0.000000j,-3.497172+0.000000j,joy
4,-873.100758+0.000000j,-9.655969+0.000000j,31.359661+0.000000j,-7.544092+0.000000j,-6.290158+0.000000j,6.100780+0.000000j,25.866054+0.000000j,-13.405154+0.000000j,6.116602+0.000000j,7.452989+0.000000j,...,19.879887+0.000000j,-11.733910+0.000000j,17.910082+0.000000j,-8.245636+0.000000j,11.741662+0.000000j,5.346438+0.000000j,1.283021+0.000000j,-0.817700+0.000000j,7.271237+0.000000j,neutral


In [89]:
# Convert every column except the last one ('emotion') to float.
# The values are complex numbers at this point (see the '+0.000000j' in the preview above),
# not strings. NOTE(review): casting complex to float presumably drops the imaginary part
# (consistent with the warning this cell emits) - confirm that this is intended.
matrix.iloc[:, :-1] = matrix.iloc[:, :-1].astype(float)

  return arr.astype(dtype, copy=True)


In [90]:
# Persist the final feature table. No index=False is passed here, unlike the other
# to_csv calls with explicit paths in this notebook.
matrix.to_csv("../data/aud_em/features.csv")

In [91]:
# Preview the feature table after the float conversion.
matrix.head()

Unnamed: 0,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,...,feature_1959,feature_1960,feature_1961,feature_1962,feature_1963,feature_1964,feature_1965,feature_1966,feature_1967,emotion
0,-713.828017,-45.367782,2.092292,-5.445775,19.902062,-1.200404,1.579054,-6.492089,2.562503,5.9961,...,-1.439508,-4.686062,13.082843,-6.580755,5.764794,-13.006678,5.707501,-0.882507,5.204505,anger
1,-815.775057,-27.901613,35.721426,8.957691,-2.47441,-3.934018,-0.961308,-1.157726,0.020717,-5.818001,...,6.319683,4.207037,3.48664,3.296914,0.680617,1.094365,4.719805,-0.86839,0.779289,anger
2,-736.45062,15.234493,19.709823,-11.686956,17.780772,16.959531,-16.780493,13.185469,-22.309625,25.071071,...,4.148269,-11.163866,6.788097,4.069214,1.461103,6.501691,-1.554199,10.302355,3.350331,fear
3,-777.161139,-22.496489,18.07919,-14.778136,-4.668306,-1.56747,4.50002,-3.646335,4.288988,-11.392629,...,-2.881768,3.625926,5.600251,1.785295,0.074829,3.938529,-4.034892,1.611167,-3.497172,joy
4,-873.100758,-9.655969,31.359661,-7.544092,-6.290158,6.10078,25.866054,-13.405154,6.116602,7.452989,...,19.879887,-11.73391,17.910082,-8.245636,11.741662,5.346438,1.283021,-0.8177,7.271237,neutral
