In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [28]:
# Install Dependencies
!pip install transformers torchaudio --quiet

import os
import numpy as np
import pandas as pd
import torch
import torchaudio
from torch import nn
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from scipy.stats import pearsonr
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings("ignore")

In [49]:
# Paths & constants
DATA_PATH = '/kaggle/input/shl-intern-hiring-assessment/Dataset'
AUDIO_PATH = f'{DATA_PATH}/audios'
TRAIN_AUDIO_DIR = f'{AUDIO_PATH}/train'
TEST_AUDIO_DIR = f'{AUDIO_PATH}/test'
SAMPLE_RATE = 16000
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
BATCH_SIZE = 1
EPOCHS = 5

In [50]:
from sklearn.model_selection import train_test_split

train_df = pd.read_csv(f'{DATA_PATH}/train.csv')
test_df = pd.read_csv(f'{DATA_PATH}/test.csv')

# Split 80-20 for validation
train_df_, val_df = train_test_split(train_df, test_size=0.2, random_state=42)

print(f"Train: {len(train_df_)} | Val: {len(val_df)} | Test: {len(test_df)}")

Train: 355 | Val: 89 | Test: 204


In [51]:
# Preprocessing
def preprocess_audio(file_path, target_sr=SAMPLE_RATE):
    waveform, sr = torchaudio.load(file_path)

    # Stereo → Mono
    if waveform.shape[0] > 1:
        waveform = waveform.mean(dim=0, keepdim=True)

    # Silence trim
    waveform, _ = torchaudio.transforms.Vad(sample_rate=sr)(waveform)

    # Resample
    if sr != target_sr:
        waveform = torchaudio.transforms.Resample(sr, target_sr)(waveform)

    # Normalize
    waveform = waveform / waveform.abs().max()

    return waveform

In [52]:
# Custom dataset

class GrammarDataset(Dataset):
    def __init__(self, df, audio_dir, labels=True):
        self.df = df
        self.audio_dir = audio_dir
        self.labels = labels

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        path = os.path.join(self.audio_dir, row['filename'])
        waveform, sr = torchaudio.load(path)

        # Mono
        if waveform.shape[0] > 1:
            waveform = waveform.mean(dim=0, keepdim=True)

        # Normalize
        waveform = waveform / waveform.abs().max()

        # Resample
        if sr != 16000:
            waveform = torchaudio.transforms.Resample(orig_freq=sr, new_freq=16000)(waveform)

        if self.labels:
            return waveform, torch.tensor(row['label'], dtype=torch.float32)
        else:
            return waveform, row['filename']

In [53]:
# Model with Frozen HuBERT + Trainable Head

from transformers import HubertModel

class HuBERTRegressor(nn.Module):
    def __init__(self):
        super().__init__()
        self.backbone = HubertModel.from_pretrained("facebook/hubert-base-ls960")
        for p in self.backbone.parameters():
            p.requires_grad = False

        self.regressor = nn.Sequential(
            nn.Linear(768, 256),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(256, 64),
            nn.ReLU(),
            nn.Linear(64, 1)
        )

    def forward(self, x):  # x: [B, T]
        with torch.no_grad():
            feats = self.backbone(x).last_hidden_state  # [B, T', 768]
            pooled = feats.mean(dim=1)  # [B, 768]
        return self.regressor(pooled).squeeze(1)

In [54]:
# Training and Evaluation

def train_model(model, train_loader, val_loader, epochs=EPOCHS):
    model.to(DEVICE)
    optimizer = torch.optim.Adam(model.regressor.parameters(), lr=1e-3)
    loss_fn = nn.MSELoss()

    for epoch in range(epochs):
        model.train()
        total_loss = 0

        for inputs, labels in tqdm(train_loader, desc=f"Epoch {epoch+1}"):
            inputs, labels = inputs.to(DEVICE), labels.to(DEVICE)
            preds = model(inputs)
            loss = loss_fn(preds, labels)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        print(f"Epoch {epoch+1} - Loss: {total_loss / len(train_loader):.4f}")

        # Validation
        model.eval()
        preds, targets = [], []
        with torch.no_grad():
            for inputs, labels in val_loader:
                inputs, labels = inputs.to(DEVICE), labels.to(DEVICE)
                output = model(inputs)
                preds.extend(output.cpu().numpy())
                targets.extend(labels.cpu().numpy())

        rmse = mean_squared_error(targets, preds, squared=False)
        pearson = pearsonr(targets, preds)[0]
        print(f"📉 Val RMSE: {rmse:.4f} | Pearson: {pearson:.4f}")

In [55]:
from torch.nn.utils.rnn import pad_sequence

def collate_fn_train_val(batch):
    waveforms = [x[0].squeeze(0) for x in batch]  # shape: [T]
    padded = pad_sequence(waveforms, batch_first=True)  # shape: [B, T]
    labels = torch.tensor([x[1] for x in batch], dtype=torch.float32)
    return padded, labels

def collate_fn_test(batch):
    waveforms = [x[0].squeeze(0) for x in batch]  # shape: [T]
    padded = pad_sequence(waveforms, batch_first=True)  # shape: [B, T]
    filenames = [x[1] for x in batch]
    return padded, filenames


train_set = GrammarDataset(train_df_, TRAIN_AUDIO_DIR, labels=True)
val_set = GrammarDataset(val_df, TRAIN_AUDIO_DIR, labels=True)
test_set = GrammarDataset(test_df, TEST_AUDIO_DIR, labels=False)

train_loader = DataLoader(train_set, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn_train_val)
val_loader = DataLoader(val_set, batch_size=BATCH_SIZE, collate_fn=collate_fn_train_val)
test_loader = DataLoader(test_set, batch_size=BATCH_SIZE, collate_fn=collate_fn_test)
    
model = HuBERTRegressor()
train_model(model, train_loader, val_loader)

Epoch 1: 100%|██████████| 355/355 [04:37<00:00,  1.28it/s]


Epoch 1 - Loss: 2.0002
📉 Val RMSE: 0.9553 | Pearson: 0.6557


Epoch 2: 100%|██████████| 355/355 [04:22<00:00,  1.35it/s]


Epoch 2 - Loss: 1.0071
📉 Val RMSE: 0.8806 | Pearson: 0.7074


Epoch 3: 100%|██████████| 355/355 [04:24<00:00,  1.34it/s]


Epoch 3 - Loss: 0.8017
📉 Val RMSE: 1.0041 | Pearson: 0.6915


Epoch 4: 100%|██████████| 355/355 [04:23<00:00,  1.35it/s]


Epoch 4 - Loss: 0.7919
📉 Val RMSE: 0.7916 | Pearson: 0.7422


Epoch 5: 100%|██████████| 355/355 [04:23<00:00,  1.34it/s]


Epoch 5 - Loss: 0.7627
📉 Val RMSE: 0.9664 | Pearson: 0.6504


In [58]:

# Define test dataset and loader (with proper collate_fn)
test_set = GrammarDataset(test_df, TEST_AUDIO_DIR, labels=False)
test_loader = DataLoader(test_set, batch_size=1, collate_fn=collate_fn_test)  # BATCH_SIZE = 1

# Inference
model.eval()
all_preds, all_fnames = [], []

with torch.no_grad():
    for inputs, fnames in test_loader:
        inputs = inputs.to(DEVICE)  # shape: [1, T]
        preds = model(inputs).cpu().numpy()
        all_preds.extend(preds)
        all_fnames.extend(fnames)

In [59]:
submission = pd.DataFrame({
    "filename": all_fnames,
    "label": all_preds
})

submission.to_csv("/kaggle/working/submission.csv", index=False)
submission.head()

Unnamed: 0,filename,label
0,audio_804.wav,3.718309
1,audio_1028.wav,2.745551
2,audio_865.wav,3.728491
3,audio_774.wav,3.355473
4,audio_1138.wav,3.974254
