In [7]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
import librosa as lb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Read the annotation table from disk.
annotated_data = pd.read_csv('/Users/rachelwang/Downloads/modified_data_corrected.csv')

# Hold out 25% of the rows for validation. The split is stratified on the
# `quality` column and seeded so it is reproducible between runs.
Xtrain, Xval, ytrain, yval = train_test_split(
    annotated_data,
    annotated_data['quality'],
    test_size=0.25,
    random_state=42,
    stratify=annotated_data['quality'],
)

# Fit the label encoder on the training labels only, then map both splits
# to integer class ids with that same fitted encoder.
le = LabelEncoder()
le.fit(ytrain)
ytrain = le.transform(ytrain)
yval = le.transform(yval)

# Feature extraction function
def pad_or_truncate(feature, max_len):
    """Force the second axis of a 2-D array to exactly ``max_len`` entries.

    Args:
        feature: 2-D array; only its second axis is resized.
        max_len: Target length of the second axis.

    Returns:
        The first ``max_len`` columns of ``feature`` when it already has at
        least that many; otherwise a copy zero-padded on the right up to
        ``max_len`` columns.
    """
    missing = max_len - feature.shape[1]
    if missing <= 0:
        # Long enough already: keep only the leading columns.
        return feature[:, :max_len]
    # Too short: append `missing` zero columns, leaving the first axis alone.
    return np.pad(feature, ((0, 0), (0, missing)), mode='constant')

def getFeatures(path, max_len=259):
    """Load an audio file and return its MFCC matrix at a fixed width.

    The file is read with ``librosa.load`` and the MFCCs are computed with
    ``librosa.feature.mfcc``, both using librosa's default settings.

    Args:
        path: Location of the audio file to read.
        max_len: Number of columns the returned matrix is padded or
            truncated to (via ``pad_or_truncate``).

    Returns:
        The MFCC array with its second axis resized to ``max_len``.
    """
    waveform, sr = lb.load(path)
    coefficients = lb.feature.mfcc(y=waveform, sr=sr)
    return pad_or_truncate(coefficients, max_len)

# Custom dataset class
class AudioDataset(Dataset):
    """In-memory dataset pairing per-file MFCC arrays with encoded labels.

    Every feature matrix is computed eagerly in the constructor, so building
    the dataset reads each audio file once and iteration afterwards is just
    list/array indexing.
    """

    def __init__(self, df):
        """Extract features and labels for every row of ``df``.

        Args:
            df: DataFrame with a ``'file'`` column (audio paths passed to
                ``getFeatures``) and a ``'quality'`` column (raw labels).
        """
        self.df = df
        # One MFCC array per row, in row order.
        self.features = [getFeatures(audio_path) for audio_path in df['file']]
        # Raw labels are mapped to integer ids by the module-level encoder
        # `le`, which must already be fitted when a dataset is constructed.
        self.labels = le.transform(list(df['quality']))

    def __len__(self):
        """Return the number of rows in the backing DataFrame."""
        return len(self.df)

    def __getitem__(self, idx):
        """Return the ``(features, label)`` pair stored at position ``idx``."""
        return self.features[idx], self.labels[idx]

# Build both datasets up front; each constructor extracts the features for
# every row of its split.
train_dataset, val_dataset = AudioDataset(Xtrain), AudioDataset(Xval)

# Batches of 32: the training loader reshuffles each epoch, the validation
# loader keeps a fixed order.
train_loader = DataLoader(
    train_dataset,
    shuffle=True,
    batch_size=32,
)
val_loader = DataLoader(
    val_dataset,
    shuffle=False,
    batch_size=32,
)

# Define the model
class MFCCNet(nn.Module):
    """Four-stage convolutional classifier over single-channel 2-D inputs.

    Each of the first three stages is Conv2d -> BatchNorm2d -> ReLU ->
    MaxPool2d; the fourth replaces the pooling with a global max pool that
    collapses the spatial axes to 1x1. A three-layer fully connected head
    with dropout then produces one logit per class.

    Args:
        num_classes: Width of the final linear layer, i.e. the number of
            logits returned per sample. Defaults to 8, the value that was
            previously hard-coded.
    """

    def __init__(self, num_classes=8):
        super().__init__()
        self.conv1 = nn.Conv2d(1, 32, kernel_size=(5, 5), stride=(1, 3), padding=2)
        self.bn1 = nn.BatchNorm2d(32)
        self.pool1 = nn.MaxPool2d(2)

        self.conv2 = nn.Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 2), padding=1)
        self.bn2 = nn.BatchNorm2d(64)
        self.pool2 = nn.MaxPool2d(2)

        self.conv3 = nn.Conv2d(64, 96, kernel_size=(2, 2), padding=1)
        self.bn3 = nn.BatchNorm2d(96)
        self.pool3 = nn.MaxPool2d(2)

        self.conv4 = nn.Conv2d(96, 128, kernel_size=(2, 2), padding=1)
        self.bn4 = nn.BatchNorm2d(128)
        self.gmp = nn.AdaptiveMaxPool2d((1, 1))

        self.fc1 = nn.Linear(128, 50)
        self.fc2 = nn.Linear(50, 25)
        self.fc3 = nn.Linear(25, num_classes)
        self.dropout = nn.Dropout(0.3)
        # ReLU is stateless, so one shared instance serves every layer and
        # adds no entries to the state_dict (saved checkpoints stay loadable).
        self.relu = nn.ReLU()

    def forward(self, x):
        """Return class logits for a batch shaped (batch, 1, height, width).

        The output has shape (batch, num_classes) and is not softmaxed.
        """
        x = self.pool1(self.relu(self.bn1(self.conv1(x))))
        x = self.pool2(self.relu(self.bn2(self.conv2(x))))
        x = self.pool3(self.relu(self.bn3(self.conv3(x))))
        x = self.gmp(self.relu(self.bn4(self.conv4(x))))
        # (batch, 128, 1, 1) -> (batch, 128) for the linear head.
        x = torch.flatten(x, 1)
        x = self.dropout(self.relu(self.fc1(x)))
        x = self.dropout(self.relu(self.fc2(x)))
        return self.fc3(x)

# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

model = MFCCNet()
model = model.to(device)

# Cross-entropy over the raw logits, optimised with Adam.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Early stopping: give up after this many consecutive epochs without a new
# best validation loss.
early_stop_patience = 50
early_stop_counter = 0
best_val_loss = float('inf')

# Where the weights of the best-scoring epoch are written.
best_model_path = 'best_model.pth'

# Training loop
num_epochs = 100
for epoch in range(num_epochs):
    # ---- Training pass -------------------------------------------------
    model.train()
    running_loss = 0.0
    correct = 0
    total = 0

    for features, labels in train_loader:
        # The DataLoader's default collation already yields tensors;
        # torch.tensor() on a tensor copies it and emits a UserWarning.
        # torch.as_tensor() reuses the data (and still accepts arrays).
        # unsqueeze(1) adds the single channel axis the first conv expects.
        mfcc = torch.as_tensor(features).unsqueeze(1).float().to(device)
        # CrossEntropyLoss needs int64 class indices.
        labels = torch.as_tensor(labels, dtype=torch.long).to(device)

        optimizer.zero_grad()
        outputs = model(mfcc)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        # Predicted class = index of the largest logit per sample.
        predicted = outputs.detach().argmax(dim=1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

    # Loss is averaged over batches; accuracy is over individual samples.
    epoch_loss = running_loss / len(train_loader)
    epoch_accuracy = 100 * correct / total

    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {epoch_loss:.4f}, Accuracy: {epoch_accuracy:.2f}%")

    # ---- Validation pass -----------------------------------------------
    model.eval()
    val_loss = 0.0
    val_correct = 0
    val_total = 0

    with torch.no_grad():
        for features, labels in val_loader:
            mfcc = torch.as_tensor(features).unsqueeze(1).float().to(device)
            labels = torch.as_tensor(labels, dtype=torch.long).to(device)

            outputs = model(mfcc)
            loss = criterion(outputs, labels)

            val_loss += loss.item()
            predicted = outputs.argmax(dim=1)
            val_total += labels.size(0)
            val_correct += (predicted == labels).sum().item()

    val_loss /= len(val_loader)
    val_accuracy = 100 * val_correct / val_total

    print(f"Validation Loss: {val_loss:.4f}, Validation Accuracy: {val_accuracy:.2f}%")

    # ---- Early stopping ------------------------------------------------
    # Checkpoint on every new best validation loss; otherwise count the
    # stale epoch and stop once the patience budget is used up.
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        early_stop_counter = 0
        torch.save(model.state_dict(), best_model_path)
    else:
        early_stop_counter += 1
        if early_stop_counter >= early_stop_patience:
            print("Early stopping triggered.")
            break

print("Training complete.")
print(f"Best model saved at: {best_model_path}")

  mfcc = torch.tensor(features).unsqueeze(1).float().to(device)
  labels = torch.tensor(labels).to(device)


Epoch [1/100], Loss: 1.3674, Accuracy: 61.44%
Validation Loss: 0.9306, Validation Accuracy: 76.50%


  mfcc = torch.tensor(features).unsqueeze(1).float().to(device)
  labels = torch.tensor(labels).to(device)


Epoch [2/100], Loss: 0.9371, Accuracy: 72.95%
Validation Loss: 0.8150, Validation Accuracy: 76.50%
Epoch [3/100], Loss: 0.8294, Accuracy: 74.62%
Validation Loss: 0.7944, Validation Accuracy: 76.50%
Epoch [4/100], Loss: 0.7459, Accuracy: 76.13%
Validation Loss: 0.7500, Validation Accuracy: 76.50%
Epoch [5/100], Loss: 0.6929, Accuracy: 75.13%
Validation Loss: 0.7081, Validation Accuracy: 76.50%
Epoch [6/100], Loss: 0.7053, Accuracy: 76.79%
Validation Loss: 0.6697, Validation Accuracy: 76.50%
Epoch [7/100], Loss: 0.6813, Accuracy: 75.63%
Validation Loss: 0.7017, Validation Accuracy: 76.50%
Epoch [8/100], Loss: 0.6477, Accuracy: 77.13%
Validation Loss: 0.7303, Validation Accuracy: 76.50%
Epoch [9/100], Loss: 0.6119, Accuracy: 77.63%
Validation Loss: 0.6671, Validation Accuracy: 76.50%
Epoch [10/100], Loss: 0.6199, Accuracy: 76.96%
Validation Loss: 0.6511, Validation Accuracy: 76.50%
Epoch [11/100], Loss: 0.5749, Accuracy: 77.63%
Validation Loss: 0.7922, Validation Accuracy: 76.50%
Epoch [1