In [None]:
import librosa
import librosa.display
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.layers import BatchNormalization
from tensorflow.keras.optimizers import Adam
from tensorflow.python.keras.layers.pooling import GlobalAveragePooling2D


In [None]:
def _load_split(csv_file, audio_dir):
    """Read one split's annotation CSV and build the matching audio file paths.

    Parameters
    ----------
    csv_file : path of a CSV with the columns ``Dialogue_ID``, ``Utterance_ID``,
        ``Emotion`` and ``Sentiment``.
    audio_dir : directory prefix used for every audio path (no trailing slash).

    Returns
    -------
    tuple of three lists, all in CSV row order:
        audio paths of the form ``<audio_dir>/dia<Dialogue_ID>_utt<Utterance_ID>.wav``,
        the ``Emotion`` values and the ``Sentiment`` values.
    """
    df = pd.read_csv(csv_file)
    # Column-wise zip instead of DataFrame.iterrows(): no per-row Series is built.
    file_paths = [
        "{}/dia{}_utt{}.wav".format(audio_dir, dialogue_id, utterance_id)
        for dialogue_id, utterance_id in zip(df['Dialogue_ID'], df['Utterance_ID'])
    ]
    return file_paths, df['Emotion'].tolist(), df['Sentiment'].tolist()


def extract_labels_emotions_sentiments(csv_file):
    """Return (file_paths, emotions, sentiments) for the training split (``./train_set``)."""
    return _load_split(csv_file, "./train_set")


def extract_labels_emotions_sentiments_dev(csv_file):
    """Return (file_paths, emotions, sentiments) for the dev split (``./dev_set``)."""
    return _load_split(csv_file, "./dev_set")


def extract_labels_emotions_sentiments_test(csv_file):
    """Return (file_paths, emotions, sentiments) for the test split (``./test_set``)."""
    return _load_split(csv_file, "./test_set")

# Annotation CSVs, one per split.
train_file_path, test_file_path, dev_file_path = (
    './filtered_file.csv',
    './test_sent_emo.csv',
    './dev_sent_emo.csv',
)

# Each loader returns (audio file paths, emotion labels, sentiment labels).
(train_file_paths,
 train_labels,
 train_sentiments) = extract_labels_emotions_sentiments(train_file_path)
(test_file_paths,
 test_labels,
 test_sentiments) = extract_labels_emotions_sentiments_test(test_file_path)
(dev_file_paths,
 dev_labels,
 dev_sentiments) = extract_labels_emotions_sentiments_dev(dev_file_path)


In [None]:
def plot_wave_mel(audio_path):
    """Show the waveform and the mel spectrogram (in dB) of one audio file side by side.

    Parameters
    ----------
    audio_path : path of the audio file; loaded with ``librosa.load`` and its
        default sampling rate.
    """
    signal, sample_rate = librosa.load(audio_path)
    plt.figure(figsize=(14, 5))

    # Left panel: raw waveform.
    plt.subplot(1, 2, 1)
    librosa.display.waveshow(signal, sr=sample_rate)
    plt.title('Waveplot')

    # Right panel: mel spectrogram converted to dB relative to its maximum.
    plt.subplot(1, 2, 2)
    mel_db = librosa.power_to_db(
        librosa.feature.melspectrogram(y=signal, sr=sample_rate),
        ref=np.max,
    )
    librosa.display.specshow(mel_db, sr=sample_rate, x_axis='time', y_axis='mel')
    plt.colorbar(format='%+2.0f dB')
    plt.title('Mel Spectrogram')
    plt.show()

# Print the first training audio path. Indexing the list `train_file_paths`
# (not the CSV path string `train_file_path`, whose element 0 is one character).
print(train_file_paths[0])

plot_wave_mel(train_file_paths[0])

In [None]:
# NOISE
def add_noise(data):
    """Return ``data`` plus Gaussian noise scaled to a random fraction of its peak.

    The noise amplitude is ``0.035 * u * max(data)`` with ``u`` drawn from
    ``np.random.uniform()``; the noise vector has ``data.shape[0]`` normal samples.
    The input array is not modified.
    """
    amplitude = 0.035 * np.random.uniform() * np.amax(data)
    noise = np.random.normal(size=data.shape[0])
    return data + amplitude * noise
# STRETCH
def stretch(data, rate=0.8):
    """Time-stretch an audio signal with ``librosa.effects.time_stretch``.

    Parameters
    ----------
    data : audio time series.
    rate : stretch factor forwarded to librosa (default 0.8).

    Returns
    -------
    Whatever ``librosa.effects.time_stretch`` returns for ``data``.
    """
    # `rate` is keyword-only in current librosa releases; passing it by name
    # also works on older versions that accepted it positionally.
    return librosa.effects.time_stretch(data, rate=rate)
# SHIFT
def shift(data):
    """Circularly shift ``data`` by a random number of samples.

    The offset is ``int(u * 1000)`` with ``u`` drawn uniformly from [-5, 5),
    and is applied with ``np.roll``.
    """
    offset = np.random.uniform(low=-5, high=5) * 1000
    return np.roll(data, int(offset))
# PITCH
def pitch_shift(data, sampling_rate, pitch_factor=0.7):
    """Pitch-shift an audio signal with ``librosa.effects.pitch_shift``.

    Parameters
    ----------
    data : audio time series.
    sampling_rate : sampling rate of ``data``, forwarded as ``sr``.
    pitch_factor : forwarded as ``n_steps`` (default 0.7).

    Returns
    -------
    Whatever ``librosa.effects.pitch_shift`` returns for ``data``.
    """
    # `sr` and `n_steps` are keyword-only in current librosa releases; passing
    # them by name also works on older versions that accepted them positionally.
    return librosa.effects.pitch_shift(data, sr=sampling_rate, n_steps=pitch_factor)

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Dense, Dropout
from sklearn.decomposition import PCA

def extract_features(audio_path, sr=22050, n_mfcc=13, n_mels=128, n_components=30, fixed_length=200):
    """Turn one audio file into a PCA-compressed time-frequency feature map.

    Steps: load the audio at ``sr``; stack the MFCCs, the mel spectrogram in dB
    and two constant rows (mean of the non-zero ``piptrack`` pitches, summed RMS
    energy); zero-pad or truncate the time axis to ``fixed_length`` frames; fit
    a PCA on this file's frames alone and keep ``n_components`` components.

    Parameters
    ----------
    audio_path : path handed to ``librosa.load``.
    sr : sampling rate requested from ``librosa.load``.
    n_mfcc : number of MFCC rows.
    n_mels : number of mel bands.
    n_components : number of PCA components kept.
    fixed_length : number of frames after padding/truncation.

    Returns
    -------
    numpy.ndarray
        The transposed PCA output with a trailing singleton axis, i.e.
        ``(n_components, fixed_length, 1)`` when the PCA yields
        ``n_components`` columns.
    """
    signal, sr = librosa.load(audio_path, sr=sr)

    mfccs = librosa.feature.mfcc(y=signal, sr=sr, n_mfcc=n_mfcc)
    mel_db = librosa.power_to_db(
        librosa.feature.melspectrogram(y=signal, sr=sr, n_mels=n_mels),
        ref=np.max,
    )

    # Mean of the non-zero pitch candidates; 0 when there are none.
    pitches, _ = librosa.core.piptrack(y=signal, sr=sr)
    voiced = pitches[np.nonzero(pitches)]
    pitch_mean = np.mean(voiced) if len(voiced) > 0 else 0

    energy = np.sum(librosa.feature.rms(y=signal))

    spectral = np.vstack((mfccs, mel_db))
    # Repeat the two scalars along time: one constant row each, one value per frame.
    scalar_rows = np.tile(np.array([[pitch_mean, energy]]), (spectral.shape[1], 1)).T
    stacked = np.vstack((spectral, scalar_rows))

    # Force a fixed number of frames: zero-pad short clips, cut long ones.
    n_rows, n_frames = stacked.shape
    if n_frames < fixed_length:
        stacked = np.hstack((stacked, np.zeros((n_rows, fixed_length - n_frames))))
    elif n_frames > fixed_length:
        stacked = stacked[:, :fixed_length]

    # PCA sees each frame as a sample and each feature row as a variable.
    frames_by_feature = stacked.reshape(n_rows, -1).T
    reduced = PCA(n_components=n_components).fit_transform(frames_by_feature)

    # Back to (components, frames) and add a trailing channel axis.
    return reduced.T[..., np.newaxis]

# Smoke-test the extractor on a single training utterance.
audio_file = './train_set/dia445_utt3.wav'
feat = extract_features(audio_file)

# Last expression of the cell: the notebook displays it.
feat



In [None]:
# Re-extract the features of the sample utterance.
feat = extract_features(audio_file)
feat

# One DataFrame per slice along the first axis of `feat`.
df_list = list(map(pd.DataFrame, feat))

df_list

In [None]:
# Three separate, initially empty lists, one per split.
unfound_files_train, unfound_files_test, unfound_files_dev = [], [], []

# Directory path string; only assigned here.
featurs_dir = './featuers/'
def extract_and_save_features(file_paths, feature_base_path=None):
    """Run ``extract_features`` on every path and return the results as a list.

    Parameters
    ----------
    file_paths : iterable of audio file paths.
    feature_base_path : optional and currently unused. It is accepted so that
        the notebook's existing two-argument calls do not raise ``TypeError``;
        nothing is written to disk.

    Returns
    -------
    list
        One ``extract_features`` result per input path, in input order.
    """
    return [extract_features(file_path) for file_path in file_paths]
# Directory holding the training audio files.
audio_base_path = './train_set/'

# Per-split path strings (these point at the annotation CSV files).
feature_base_path_train, feature_base_path_test, feature_base_path_dev = (
    './filtered_file.csv',
    './test_sent_emo.csv',
    './dev_sent_emo.csv',
)




In [None]:

# Pass only the audio paths: the extractor computes features from the audio
# files themselves and never reads the annotation CSV path.
X_dev = extract_and_save_features(dev_file_paths)

# Show a summary rather than dumping every feature array into the output.
len(X_dev)

In [None]:
import librosa
import librosa.display
import matplotlib.pyplot as plt
import numpy as np

def plot_msf_with_waveplot(audio_path, sr=22050, n_mels=128):
    """Render the mel spectrogram (in dB) of an audio file as a 3-D surface.

    Parameters
    ----------
    audio_path : path handed to ``librosa.load``.
    sr : sampling rate requested from ``librosa.load``.
    n_mels : number of mel bands.
    """
    signal, sr = librosa.load(audio_path, sr=sr)

    mel_power = librosa.feature.melspectrogram(y=signal, sr=sr, n_mels=n_mels)
    mel_db = librosa.power_to_db(mel_power, ref=np.max)

    figure = plt.figure(figsize=(14, 8))
    # Only the lower slot of a 2x1 grid is used; the upper slot stays empty.
    surface_ax = figure.add_subplot(2, 1, 2, projection='3d')

    # Axis grids: one time stamp per frame, one centre frequency per mel band.
    frame_times = np.linspace(0, len(signal) / sr, mel_power.shape[1])
    band_freqs = librosa.mel_frequencies(n_mels=n_mels, fmin=0, fmax=sr/2)
    time_grid, freq_grid = np.meshgrid(frame_times, band_freqs)

    surface_ax.plot_surface(time_grid, freq_grid, mel_db, cmap='viridis')
    surface_ax.set_title('MSF')
    surface_ax.set_xlabel('Time')
    surface_ax.set_ylabel('Mel Frequency')
    surface_ax.set_zlabel('Amplitude (dB)')

    plt.tight_layout()
    plt.show()

# Example: 3-D mel-spectrogram surface for one utterance.
# Note that this rebinds `audio_file` to a path in the working directory.
audio_file = './dia445_utt3.wav'
plot_msf_with_waveplot(audio_file)

In [None]:
# Stack the per-file dev feature arrays into a single ndarray.
x_dev_nparray = np.array(X_dev)

In [None]:
# Pass only the audio paths: the extractor computes features from the audio
# files themselves and never reads the annotation CSV path.
X_train = extract_and_save_features(train_file_paths)


In [None]:
# Pass only the audio paths: the extractor computes features from the audio
# files themselves and never reads the annotation CSV path.
X_test = extract_and_save_features(test_file_paths)

In [None]:
# Container types of the three feature sets before the train/test conversion.
print(
    type(X_train),
    type(X_test),
    type(x_dev_nparray),
)

In [None]:
# Stack the per-file feature arrays of the train and test splits.
x_train_nparray = np.array(X_train)
x_test_nparray = np.array(X_test)

# Report container types, then array shapes, for all three splits.
print(
    type(x_train_nparray),
    type(x_test_nparray),
    type(x_dev_nparray),
)
print(
    x_train_nparray.shape,
    x_test_nparray.shape,
    x_dev_nparray.shape,
)

In [None]:

# Sanity check: number of feature arrays vs. number of labels per split.
# Each line is printed once (the statements were previously pasted twice).
print(f"Train X: {len(X_train)}, y: {len(train_labels)}")
print(f"Test X: {len(X_test)}, y: {len(test_labels)}")
print(f"Dev X: {len(X_dev)}, y: {len(dev_labels)}")

In [None]:

# Sanity check: number of feature arrays vs. number of labels per split.
# Each line is printed once (the statements were previously pasted twice).
print(f"Train X: {len(X_train)}, y: {len(train_labels)}")
print(f"Test X: {len(X_test)}, y: {len(test_labels)}")
print(f"Dev X: {len(X_dev)}, y: {len(dev_labels)}")

# NOTE(review): these three arrays are not referenced again in the visible
# part of this notebook.
train_arr = np.array(X_train)
test_arr = np.array(X_test)
dev_arr = np.array(X_dev)

# Insert a singleton axis just before the last axis of each feature array.
X_train_model = np.expand_dims(x_train_nparray, axis=-2)
X_test_model = np.expand_dims(x_test_nparray, axis=-2)
X_dev_model = np.expand_dims(x_dev_nparray, axis=-2)


In [None]:
# Shape inspection; only the last expression of the cell is displayed.
x_train_nparray.shape
X_train_model.ndim
X_train_model.shape

# Target layout for the network: (batch, 30, 200, 1); -1 lets NumPy infer the batch size.
target_shape = (-1, 30, 200, 1)
X_train_model_res = X_train_model.reshape(target_shape)

X_train_model_res.shape

In [None]:
import os

def get_label_from_file_path(file_path):
    """Return the part of the file's base name that precedes the first underscore.

    For a name without an underscore the whole base name is returned.
    """
    base_name = os.path.basename(file_path)
    prefix, _, _ = base_name.partition('_')
    return prefix

def load_data(data_dir):
    """Extract features and labels for every ``.wav`` file directly inside ``data_dir``.

    Files are visited in ``os.listdir`` order. A file is skipped when
    ``extract_features`` returns ``None``; its label comes from
    ``get_label_from_file_path``.

    Returns
    -------
    tuple of two numpy arrays: the stacked features and the labels.
    """
    wav_paths = [
        os.path.join(data_dir, entry)
        for entry in os.listdir(data_dir)
        if entry.endswith('.wav')
    ]

    features, labels = [], []
    for wav_path in wav_paths:
        extracted = extract_features(wav_path)
        if extracted is None:
            continue
        features.append(extracted)
        labels.append(get_label_from_file_path(wav_path))

    return np.array(features), np.array(labels)

# Extract features for every .wav file found directly in the training directory.
# NOTE(review): this rebinds `X_train`, which earlier cells filled from the
# CSV-ordered file list; confirm that overwriting it here is intended.
data_dir = './train_set/'
X_train, y_train = load_data(data_dir)

print("X_train shape:", X_train.shape)

In [None]:

# Map emotion strings to integer ids; the mapping is learned on the training labels.
le_labels = LabelEncoder()
y_train_encoded = le_labels.fit_transform(train_labels)
y_test_encoded = le_labels.transform(test_labels)
y_dev_encoded = le_labels.transform(dev_labels)

# One-hot encode the ids. The encoder is fitted on the training ids only and
# then reused, so every split gets the same columns in the same order;
# re-fitting on a split that lacks a class would yield fewer columns.
encoder = OneHotEncoder()
y_train_categorical = encoder.fit_transform(np.array(y_train_encoded).reshape(-1,1)).toarray()
y_test_categorical = encoder.transform(np.array(y_test_encoded).reshape(-1,1)).toarray()
y_dev_categorical = encoder.transform(np.array(y_dev_encoded).reshape(-1,1)).toarray()


In [None]:

# Fit the one-hot encoder on the training ids only and reuse it for the test
# ids, so both matrices share the same columns in the same order.
# `.toarray()` on the default sparse output replaces `sparse=False`, a keyword
# that recent scikit-learn releases no longer accept.
encoder = OneHotEncoder()
labels_one_hot = encoder.fit_transform(y_train_encoded.reshape(-1, 1)).toarray()
labels_one_hot_test = encoder.transform(y_test_encoded.reshape(-1, 1)).toarray()

# The message reads "Shape of the labels after encoding".
print("Розмірність лейблів після кодування:", labels_one_hot.shape)

In [None]:
from tensorflow.keras import backend as K

def f1_score(y_true, y_pred):
    """F1 score computed with Keras backend ops over the whole batch.

    Values are clipped to [0, 1] and rounded before counting, and
    ``K.epsilon()`` is added to every denominator to avoid division by zero.
    """
    eps = K.epsilon()

    def _count(values):
        # Number of entries that round to 1 after clipping to [0, 1].
        return K.sum(K.round(K.clip(values, 0, 1)))

    hits = _count(y_true * y_pred)
    actual = _count(y_true)
    predicted = _count(y_pred)

    precision = hits / (predicted + eps)
    recall = hits / (actual + eps)

    return 2 * (precision * recall) / (precision + recall + eps)

model = Sequential()

# The input shape is taken from the array that is actually passed to fit():
# X_train_model_res is reshaped to (-1, 30, 200, 1), so each sample is
# (30, 200, 1). The previously hard-coded (200, 216, 1) does not match that data.
input_shape = X_train_model_res.shape[1:]

# Block 1: 5x5 convolution, 2x2 pooling.
model.add(Conv2D(128, (5, 5), activation='relu', input_shape=input_shape))
model.add(BatchNormalization())
model.add(MaxPooling2D(pool_size=(2, 2), strides=(2, 2), padding='valid'))
model.add(Dropout(0.3))

# Block 2: 3x3 convolution, 2x2 pooling.
model.add(Conv2D(128, (3, 3), activation='relu'))
model.add(BatchNormalization())
model.add(MaxPooling2D(pool_size=(2, 2), strides=(2, 2), padding='valid'))
model.add(Dropout(0.3))

# Block 3: 3x3 convolution, pooling along the first spatial axis only.
model.add(Conv2D(128, (3, 3), activation='relu'))
model.add(BatchNormalization())
model.add(MaxPooling2D(pool_size=(2, 1), strides=(2, 1), padding='valid'))
model.add(Dropout(0.3))

# Block 4: 1x1 convolution; the 1x1 pooling leaves the feature map size unchanged.
model.add(Conv2D(128, (1, 1), activation='relu'))
model.add(BatchNormalization())
model.add(MaxPooling2D(pool_size=(1, 1), strides=(1, 1), padding='valid'))
model.add(Dropout(0.3))

# Collapse the spatial axes to one value per channel.
model.add(GlobalAveragePooling2D())

model.add(Dense(64, activation='relu'))

# Seven softmax outputs; assumes the one-hot labels have seven columns — TODO confirm.
model.add(Dense(7, activation='softmax'))

initial_lr = 0.001
optimiser = Adam(learning_rate=initial_lr)
model.compile(optimizer=optimiser, loss='categorical_crossentropy', metrics=['accuracy'])

model.summary()

In [None]:
# NOTE(review): `early_stopping` is constructed but not passed to fit() below.
# With the same monitor and patience as `reduce_lr`, it would stop training at
# the epoch where the learning rate would first be reduced — confirm the intent.
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=5, min_lr=1e-6)

# The validation features need the same reshape as the training features:
# X_test_model still carries the extra axis added by np.expand_dims and would
# not match the model's 4-D input.
X_test_model_res = X_test_model.reshape(target_shape)

# NOTE(review): the test split is used as validation data here although a dev
# split is also prepared in this notebook — confirm this is intended.
history = model.fit(X_train_model_res, labels_one_hot,
                    epochs=50,
                    batch_size=64,
                    validation_data=(X_test_model_res, labels_one_hot_test),
                    callbacks=[reduce_lr])


In [None]:
def plot_history(history):
    """Plot training vs. validation accuracy and loss curves side by side.

    Parameters
    ----------
    history : object whose ``history`` attribute is a dict holding the
        per-epoch sequences ``'accuracy'``, ``'val_accuracy'``, ``'loss'`` and
        ``'val_loss'`` (as returned by ``model.fit``).
    """
    panels = (
        # (history key, panel title, y-axis label)
        ('accuracy', 'Model Accuracy', 'Accuracy'),
        ('loss', 'Model Loss', 'Loss'),
    )
    # One axis per plotted metric, so no panel is left empty.
    fig, axes = plt.subplots(1, len(panels), figsize=(12, 6))

    for ax, (metric, title, ylabel) in zip(axes, panels):
        ax.plot(history.history[metric])
        ax.plot(history.history['val_' + metric])
        ax.set_title(title)
        ax.set_ylabel(ylabel)
        ax.set_xlabel('Epoch')
        ax.legend(['Train', 'Validation'], loc='upper left')

    plt.show()

# Learning curves of the training run stored in `history`.
plot_history(history)


In [None]:
# Two sample recordings; only the second one is classified below.
sample_file_path = './dia4_utt6.wav'
sample2_file_path = './Recording2.wav'

# Same feature pipeline and input layout as the training data.
sample_features = extract_features(sample2_file_path)
reshaped_sample = sample_features.reshape(target_shape)

# Index of the highest score, mapped back to its emotion name.
prediction = model.predict(reshaped_sample)
predicted_index = np.argmax(prediction)
predicted_label = le_labels.inverse_transform([predicted_index])
print(f"Predicted Emotion: {predicted_label[0]}")

In [None]:
# Paths of three extra recordings in the working directory.
isiah_neutral, isiah_anger, pogliad = (
    './redacted_Isiah_neutral.wav',
    './redacted_Isiah.wav',
    './pogliad.wav',
)
# NOTE(review): this re-defines plot_wave_mel; an earlier cell of this notebook
# already defines a function with the same name and the same statements.
def plot_wave_mel(audio_path):
    """Show the waveform and the mel spectrogram (in dB) of one audio file side by side.

    Parameters
    ----------
    audio_path : path of the audio file; loaded with ``librosa.load`` and its
        default sampling rate.
    """
    signal, sample_rate = librosa.load(audio_path)
    plt.figure(figsize=(14, 5))

    # Left panel: raw waveform.
    plt.subplot(1, 2, 1)
    librosa.display.waveshow(signal, sr=sample_rate)
    plt.title('Waveplot')

    # Right panel: mel spectrogram converted to dB relative to its maximum.
    plt.subplot(1, 2, 2)
    mel_db = librosa.power_to_db(
        librosa.feature.melspectrogram(y=signal, sr=sample_rate),
        ref=np.max,
    )
    librosa.display.specshow(mel_db, sr=sample_rate, x_axis='time', y_axis='mel')
    plt.colorbar(format='%+2.0f dB')
    plt.title('Mel Spectrogram')
    plt.show()

# Waveform and mel spectrogram of the `pogliad` recording.
plot_wave_mel(pogliad)

In [None]:
import numpy as np
import librosa
import matplotlib.pyplot as plt

def compute_msf(y, sr, n_fft=2048, hop_length=512, n_mels=128):
    """Compute a modulation spectrum from the log-mel spectrogram of a signal.

    Parameters
    ----------
    y : audio time series.
    sr : sampling rate of ``y``.
    n_fft, hop_length, n_mels : forwarded to ``librosa.feature.melspectrogram``.

    Returns
    -------
    numpy.ndarray
        Magnitude of the FFT taken along the time axis of the dB-scaled mel
        spectrogram, keeping the first half of the bins: one row per mel band,
        ``n_frames // 2`` columns.
    """
    mel_spectrogram = librosa.feature.melspectrogram(
        y=y, sr=sr, n_fft=n_fft, hop_length=hop_length, n_mels=n_mels
    )

    # melspectrogram returns a power spectrogram by default, so the matching
    # conversion is power_to_db (as used elsewhere in this notebook);
    # amplitude_to_db would square the values again and double the dB scale.
    log_mel_spectrogram = librosa.power_to_db(mel_spectrogram)

    # FFT over time (axis 1) of every mel band.
    modulation_spectrum = np.abs(np.fft.fft(log_mel_spectrogram, axis=1))

    # Keep only the first half of the FFT bins.
    return modulation_spectrum[:, :modulation_spectrum.shape[1] // 2]

# Modulation spectrum of the `pogliad` recording.
y, sr = librosa.load(pogliad)
msf = compute_msf(y, sr)

# Integer index grids: X runs over the columns of `msf`, Y over its rows.
n_rows, n_cols = msf.shape
X, Y = np.meshgrid(np.arange(n_cols), np.arange(n_rows))

# Single 3-D axis filling the whole figure.
fig = plt.figure(figsize=(10, 7))
ax = fig.add_subplot(1, 1, 1, projection='3d')
ax.plot_surface(X, Y, msf, cmap='viridis')

ax.set_title('Modulation Spectral Features (MSF)')
ax.set_xlabel('Modulation Frequency')
ax.set_ylabel('Mel Frequency')
ax.set_zlabel('Amplitude (dB)')

plt.show()