In [57]:
import os
import pandas as pd
import numpy as np
import librosa
from tqdm import tqdm
from datetime import datetime

# Sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.utils.class_weight import compute_class_weight

# TensorFlow
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping
from tensorflow.keras.optimizers import Adam

In [58]:
metadata = pd.read_csv('train_dataset.csv')

In [59]:
def features_extractor(file_name):
    try:
        audio, sample_rate = librosa.load(file_name, sr=22050)

        mfcc = librosa.feature.mfcc(y=audio, sr=sample_rate, n_mfcc=40)
        delta = librosa.feature.delta(mfcc)
        delta2 = librosa.feature.delta(mfcc, order=2)

        combined = np.vstack((mfcc, delta, delta2))

        return np.mean(combined.T, axis=0)  # shape = (120,)

    except Exception as e:
        return None

In [60]:
extracted_features = []

for index, row in tqdm(metadata.iterrows(), total=len(metadata)):
    file_name = row["path"]
    label = row["label"]

    data = features_extractor(file_name)

    if data is not None:
        extracted_features.append([data, label])

# Convert to DataFrame
extracted_features_df = pd.DataFrame(
    extracted_features, columns=['feature', 'class']
)

print("\nClass distribution:")
print(extracted_features_df['class'].value_counts())

100%|██████████| 20252/20252 [03:04<00:00, 109.75it/s]


Class distribution:
class
1    15051
0     5201
Name: count, dtype: int64





In [61]:
X = np.array(extracted_features_df['feature'].tolist())
y = np.array(extracted_features_df['class'].tolist())

print("Feature shape:", X.shape)  # Should be (samples, 120)

Feature shape: (20252, 120)


In [62]:
scaler = StandardScaler()
X = scaler.fit_transform(X)

In [63]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

In [64]:
class_weights = compute_class_weight(
    class_weight='balanced',
    classes=np.unique(y_train),
    y=y_train
)

class_weight_dict = dict(enumerate(class_weights))
print("Class weights:", class_weight_dict)

Class weights: {0: np.float64(1.9467676039413602), 1: np.float64(0.6727990033222592)}


In [65]:
model = Sequential()

model.add(Dense(128, activation='relu', input_shape=(120,)))
model.add(Dropout(0.3))

model.add(Dense(256, activation='relu'))
model.add(Dropout(0.3))

model.add(Dense(128, activation='relu'))
model.add(Dropout(0.3))

model.add(Dense(1, activation='sigmoid'))

model.compile(
    loss='binary_crossentropy',
    optimizer=Adam(learning_rate=0.001),
    metrics=['accuracy']
)

model.summary()

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [66]:
os.makedirs("saved_models", exist_ok=True)

checkpointer = ModelCheckpoint(
    filepath="saved_models/audio_classification.h5",
    monitor='val_loss',
    verbose=1,
    save_best_only=True
)

early_stop = EarlyStopping(
    monitor='val_loss',
    patience=10,
    restore_best_weights=True
)


In [67]:
start = datetime.now()

history = model.fit(
    X_train,
    y_train,
    validation_data=(X_test, y_test),
    epochs=100,
    batch_size=32,
    callbacks=[checkpointer, early_stop],
    class_weight=class_weight_dict,
    verbose=1
)

duration = datetime.now() - start
print("Training completed in time:", duration)

Epoch 1/100
[1m482/507[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 2ms/step - accuracy: 0.5433 - loss: 0.7142
Epoch 1: val_loss improved from None to 0.68741, saving model to saved_models/audio_classification.h5





Epoch 1: finished saving model to saved_models/audio_classification.h5
[1m507/507[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3ms/step - accuracy: 0.5479 - loss: 0.6995 - val_accuracy: 0.5349 - val_loss: 0.6874
Epoch 2/100
[1m499/507[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 3ms/step - accuracy: 0.5732 - loss: 0.6770
Epoch 2: val_loss improved from 0.68741 to 0.67659, saving model to saved_models/audio_classification.h5





Epoch 2: finished saving model to saved_models/audio_classification.h5
[1m507/507[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.5706 - loss: 0.6796 - val_accuracy: 0.5759 - val_loss: 0.6766
Epoch 3/100
[1m506/507[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 2ms/step - accuracy: 0.6041 - loss: 0.6618
Epoch 3: val_loss did not improve from 0.67659
[1m507/507[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.5811 - loss: 0.6697 - val_accuracy: 0.5623 - val_loss: 0.6774
Epoch 4/100
[1m492/507[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 2ms/step - accuracy: 0.6117 - loss: 0.6575
Epoch 4: val_loss improved from 0.67659 to 0.64763, saving model to saved_models/audio_classification.h5





Epoch 4: finished saving model to saved_models/audio_classification.h5
[1m507/507[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.5975 - loss: 0.6598 - val_accuracy: 0.5989 - val_loss: 0.6476
Epoch 5/100
[1m501/507[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 3ms/step - accuracy: 0.6044 - loss: 0.6473
Epoch 5: val_loss did not improve from 0.64763
[1m507/507[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.6001 - loss: 0.6512 - val_accuracy: 0.5616 - val_loss: 0.6742
Epoch 6/100
[1m505/507[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 3ms/step - accuracy: 0.6250 - loss: 0.6306
Epoch 6: val_loss did not improve from 0.64763
[1m507/507[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - accuracy: 0.6035 - loss: 0.6451 - val_accuracy: 0.4981 - val_loss: 0.7037
Epoch 7/100
[1m486/507[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 2ms/step - accuracy: 0.6193 - loss: 0.6266
Epoch 7: val_loss 




Epoch 8: finished saving model to saved_models/audio_classification.h5
[1m507/507[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.6280 - loss: 0.6271 - val_accuracy: 0.6092 - val_loss: 0.6441
Epoch 9/100
[1m503/507[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 2ms/step - accuracy: 0.6446 - loss: 0.6130
Epoch 9: val_loss did not improve from 0.64413
[1m507/507[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.6292 - loss: 0.6201 - val_accuracy: 0.5120 - val_loss: 0.7008
Epoch 10/100
[1m504/507[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 2ms/step - accuracy: 0.6410 - loss: 0.6052
Epoch 10: val_loss did not improve from 0.64413
[1m507/507[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.6305 - loss: 0.6140 - val_accuracy: 0.5579 - val_loss: 0.6726
Epoch 11/100
[1m485/507[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 2ms/step - accuracy: 0.6617 - loss: 0.5947
Epoch 11: val_l

In [68]:
y_pred = (model.predict(X_test) > 0.5).astype(int)

print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))

print("\nClassification Report:")
print(classification_report(y_test, y_pred))

[1m127/127[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step

Confusion Matrix:
[[ 577  463]
 [1120 1891]]

Classification Report:
              precision    recall  f1-score   support

           0       0.34      0.55      0.42      1040
           1       0.80      0.63      0.70      3011

    accuracy                           0.61      4051
   macro avg       0.57      0.59      0.56      4051
weighted avg       0.68      0.61      0.63      4051

