In [None]:
import numpy as np 
import pandas as pd
import operator
import os

import warnings
warnings.filterwarnings("ignore")

from scipy import signal
from scipy.io import wavfile
import matplotlib.pyplot as plt
from matplotlib.backend_bases import RendererBase

from PIL import Image

from scipy.fftpack import fft
import librosa
import librosa.display

from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split
from tensorflow.keras import models
from tensorflow.keras import layers

In [None]:
def get_files(path):
    """Index the audio files in ``path`` by the labels encoded in their names.

    File names are split on ``"-"`` and the first three fields are read as
    ``<keyword>-<environment>-<stress>``; anything after the third dash is
    ignored for labelling purposes.

    Directory entries that are not regular files, or whose name has fewer than
    three dash-separated fields, are skipped instead of raising ``IndexError``.
    Entries are visited in sorted name order so the result does not depend on
    the arbitrary ordering of ``os.listdir``.

    Parameters
    ----------
    path : str
        Directory to scan (not recursive).

    Returns
    -------
    data_ : list of dict
        One record per accepted file with keys ``"keyword"``, ``"stress"``,
        ``"environment"`` and ``"path"``.
    count : dict
        Number of occurrences of every label string, pooled over the three
        label fields.
    df : pandas.DataFrame
        ``data_`` as a frame with columns
        ``['keyword', 'stress', 'environment', 'path']`` (empty but with these
        columns when no file was accepted).
    """
    data_ = []
    count = {}
    for file in sorted(os.listdir(path)):
        full_path = os.path.join(path, file)
        labels = file.split("-")[:3]
        # Ignore sub-directories and names that do not follow the convention.
        if len(labels) < 3 or not os.path.isfile(full_path):
            continue
        keyword, environment, stress = labels
        data_.append({
            "keyword": keyword,
            "stress": stress,
            "environment": environment,
            "path": full_path
        })
        for label in labels:
            count[label] = count.get(label, 0) + 1
    # Passing `columns` keeps the schema identical for empty and non-empty input.
    df = pd.DataFrame(data_, columns=['keyword', 'stress', 'environment', 'path'])
    return data_, count, df

# Index the recordings under the dataset directory and show a quick summary.
path = 'House/'
dataset, count, df = get_files(path)

# Same frame without its last column (the file path).
df_nopath = df.loc[:, df.columns[:-1]]

print(count, end="\n\n")
print(df.head())
print("Total Number of samples: ", df.shape[0])

In [None]:
def extract_foreground_and_mfcc(file_path, duration=3, sr=16000, n_mfcc=13, max_frames=150):
    """Compute fixed-length MFCC features from the foreground of an audio clip.

    The magnitude spectrogram is compared with a nearest-neighbour median
    filtered version of itself (``librosa.decompose.nn_filter``); a soft mask
    keeps the part of the signal that exceeds that filtered estimate, and
    MFCCs are computed from the masked spectrogram.

    Parameters
    ----------
    file_path : str
        Audio file to load.
    duration : float
        Maximum number of seconds to load.
    sr : int
        Sampling rate the audio is resampled to.
    n_mfcc : int
        Number of MFCC coefficients to keep.
    max_frames : int
        Number of time frames in the output; shorter clips are zero-padded at
        the end, longer ones are truncated.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(max_frames, n_mfcc)`` (time-major).
    """
    # Load audio file
    y, sr = librosa.load(file_path, sr=sr, duration=duration)

    # A single STFT is sufficient; only its magnitude is used below.
    S_full, _ = librosa.magphase(librosa.stft(y))

    # Estimate the repeating/background part and cap it at the observed magnitude.
    S_filter = librosa.decompose.nn_filter(S_full,
                                           aggregate=np.median,
                                           metric='cosine',
                                           width=int(librosa.time_to_frames(1, sr=sr)))
    S_filter = np.minimum(S_full, S_filter)

    # Soft mask that favours the residual (foreground) over the filtered estimate.
    margin_v = 10
    power = 2
    mask_v = librosa.util.softmask(S_full - S_filter,
                                   margin_v * S_filter,
                                   power=power)
    S_foreground = mask_v * S_full

    # librosa.feature.mfcc(S=...) expects a log-power *mel* spectrogram, so the
    # linear-frequency magnitude must go through the mel filterbank first.
    mel_power = librosa.feature.melspectrogram(S=S_foreground ** 2, sr=sr)
    mfccs = librosa.feature.mfcc(S=librosa.power_to_db(mel_power), sr=sr, n_mfcc=n_mfcc)

    # Pad or truncate to a fixed number of frames
    n_frames = mfccs.shape[1]
    if n_frames < max_frames:
        mfccs = np.pad(mfccs, ((0, 0), (0, max_frames - n_frames)), mode='constant')
    else:
        mfccs = mfccs[:, :max_frames]

    return mfccs.T

# Example usage. The path below is a placeholder: point it at a real recording.
# The call is skipped when the file is missing so that running the notebook
# top to bottom does not stop at this cell.
file_path = "your_audio_file_path.wav"
if os.path.isfile(file_path):
    mfcc_features = extract_foreground_and_mfcc(file_path)
else:
    mfcc_features = None
    print(f"Skipping MFCC example: {file_path!r} not found")


In [None]:
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split
from tensorflow.keras import models
from tensorflow.keras import layers

# Encode labels. Each target gets its own encoder so that both label mappings
# remain available after this cell (a single encoder refit on the second
# column would forget the first one).
stress_encoder = LabelEncoder()
env_encoder = LabelEncoder()
df['stress_encoded'] = stress_encoder.fit_transform(df['stress'])
df['environment_encoded'] = env_encoder.fit_transform(df['environment'])
# Backward-compatible name: as before, `label_encoder` ends up fitted on 'environment' only.
label_encoder = env_encoder

stress_output_classes = len(stress_encoder.classes_)
env_output_classes = len(env_encoder.classes_)

# Build the feature tensor and one-hot targets, then split into training and test sets
X = np.array([extract_foreground_and_mfcc(file) for file in df['path']])
y_stress = to_categorical(df['stress_encoded'], num_classes=stress_output_classes)
y_env = to_categorical(df['environment_encoded'], num_classes=env_output_classes)
X_train, X_test, y_stress_train, y_stress_test, y_env_train, y_env_test = train_test_split(
    X, y_stress, y_env, test_size=0.2, random_state=42)

# Define the neural network model: a shared LSTM trunk with two softmax heads.
# A Sequential stack would chain the two Dense layers and expose a single
# output, which cannot be trained against two named targets.
input_shape = X_train.shape[1:]
inputs = layers.Input(shape=input_shape)
x = layers.LSTM(128, return_sequences=True)(inputs)
x = layers.LSTM(128)(x)
stress_head = layers.Dense(stress_output_classes, activation='softmax', name='stress_output')(x)
env_head = layers.Dense(env_output_classes, activation='softmax', name='env_output')(x)
# Output order is [stress, env]; model.predict returns predictions in this order.
model = models.Model(inputs=inputs, outputs=[stress_head, env_head])

# Compile the model with one loss and one accuracy metric per named output
model.compile(
    optimizer='adam',
    loss={'stress_output': 'categorical_crossentropy', 'env_output': 'categorical_crossentropy'},
    metrics={'stress_output': ['accuracy'], 'env_output': ['accuracy']},
)

# Train the model
train_targets = {'stress_output': y_stress_train, 'env_output': y_env_train}
test_targets = {'stress_output': y_stress_test, 'env_output': y_env_test}
model.fit(X_train, train_targets, validation_data=(X_test, test_targets), epochs=10)

# Evaluate the model; return_dict=True labels every reported loss/metric by name
print(model.evaluate(X_test, test_targets, return_dict=True))



In [None]:
# Prediction example
sample_path = 'House/bacho-safe-calm-20190712132253.wav'
sample_features = extract_foreground_and_mfcc(sample_path)
sample_features = np.expand_dims(sample_features, axis=0)  # add the batch axis
# NOTE(review): assumes `model` has two outputs ordered (stress, environment) — confirm against the training cell.
stress_pred, env_pred = model.predict(sample_features)

# Decode each head against its own label set. LabelEncoder assigns indices in
# sorted order of the unique labels, so the sorted unique values of each
# column reproduce the index -> label mapping used for the training targets.
stress_classes = sorted(df['stress'].unique())
env_classes = sorted(df['environment'].unique())
predicted_stress = stress_classes[int(np.argmax(stress_pred))]
predicted_env = env_classes[int(np.argmax(env_pred))]
print(f'Predicted Stress: {predicted_stress}, Predicted Environment: {predicted_env}')