In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [3]:
# Install Dependencies
!pip install transformers torchaudio --quiet

import os
import numpy as np
import pandas as pd
import torch
import torchaudio
from torch import nn
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from scipy.stats import pearsonr
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings("ignore")

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m0:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m0:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m0:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m30.3 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m127.9/127.9 MB[0m [31m13.1 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m207.5/207.5 MB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m0:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m21.1/21.1 MB[0m [31m84.1 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25h[31mERROR: pip's 

In [6]:
# Paths & constants
DATA_PATH = '/kaggle/input/shl-hiring-assessment/Dataset'
AUDIO_PATH = f'{DATA_PATH}/audios'
TRAIN_AUDIO_DIR = f'{AUDIO_PATH}/train'
TEST_AUDIO_DIR = f'{AUDIO_PATH}/test'
SAMPLE_RATE = 16000
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
BATCH_SIZE = 2
EPOCHS = 6

In [7]:
from sklearn.model_selection import train_test_split

train_df = pd.read_csv(f'{DATA_PATH}/train.csv')
test_df = pd.read_csv(f'{DATA_PATH}/test.csv')

# Split 80-20 for validation
train_df_, val_df = train_test_split(train_df, test_size=0.2, random_state=42)

print(f"Train: {len(train_df_)} | Val: {len(val_df)} | Test: {len(test_df)}")

Train: 355 | Val: 89 | Test: 204


In [8]:
# Preprocessing
def preprocess_audio(file_path, target_sr=SAMPLE_RATE):
    waveform, sr = torchaudio.load(file_path)

    # Stereo → Mono
    if waveform.shape[0] > 1:
        waveform = waveform.mean(dim=0, keepdim=True)

    # Silence trim
    waveform, _ = torchaudio.transforms.Vad(sample_rate=sr)(waveform)

    # Resample
    if sr != target_sr:
        waveform = torchaudio.transforms.Resample(sr, target_sr)(waveform)

    # Normalize
    waveform = waveform / waveform.abs().max()

    return waveform

In [9]:
# Custom dataset

class GrammarDataset(Dataset):
    def __init__(self, df, audio_dir, labels=True):
        self.df = df
        self.audio_dir = audio_dir
        self.labels = labels

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        path = os.path.join(self.audio_dir, row['filename'])
        waveform, sr = torchaudio.load(path)

        # Mono
        if waveform.shape[0] > 1:
            waveform = waveform.mean(dim=0, keepdim=True)

        # Normalize
        waveform = waveform / waveform.abs().max()

        # Resample
        if sr != 16000:
            waveform = torchaudio.transforms.Resample(orig_freq=sr, new_freq=16000)(waveform)

        if self.labels:
            return waveform, torch.tensor(row['label'], dtype=torch.float32)
        else:
            return waveform, row['filename']

In [10]:
# Model with Frozen HuBERT + Trainable Head

from transformers import HubertModel

class HuBERTRegressor(nn.Module):
    def __init__(self):
        super().__init__()
        self.backbone = HubertModel.from_pretrained("facebook/hubert-base-ls960")
        for p in self.backbone.parameters():
            p.requires_grad = False

        self.regressor = nn.Sequential(
            nn.Linear(768, 256),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(256, 64),
            nn.ReLU(),
            nn.Linear(64, 1)
        )

    def forward(self, x):  # x: [B, T]
        with torch.no_grad():
            feats = self.backbone(x).last_hidden_state  # [B, T', 768]
            pooled = feats.mean(dim=1)  # [B, 768]
        return self.regressor(pooled).squeeze(1)

2025-05-05 15:34:20.004725: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1746459260.449009      31 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1746459260.574006      31 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [11]:
# Training and Evaluation

def train_model(model, train_loader, val_loader, epochs=EPOCHS):
    model.to(DEVICE)
    optimizer = torch.optim.Adam(model.regressor.parameters(), lr=1e-3)
    loss_fn = nn.MSELoss()

    for epoch in range(epochs):
        model.train()
        total_loss = 0

        for inputs, labels in tqdm(train_loader, desc=f"Epoch {epoch+1}"):
            inputs, labels = inputs.to(DEVICE), labels.to(DEVICE)
            preds = model(inputs)
            loss = loss_fn(preds, labels)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        print(f"Epoch {epoch+1} - Loss: {total_loss / len(train_loader):.4f}")

        # Validation
        model.eval()
        preds, targets = [], []
        with torch.no_grad():
            for inputs, labels in val_loader:
                inputs, labels = inputs.to(DEVICE), labels.to(DEVICE)
                output = model(inputs)
                preds.extend(output.cpu().numpy())
                targets.extend(labels.cpu().numpy())

        rmse = mean_squared_error(targets, preds, squared=False)
        pearson = pearsonr(targets, preds)[0]
        print(f"📉 Val RMSE: {rmse:.4f} | Pearson: {pearson:.4f}")

In [12]:
from torch.nn.utils.rnn import pad_sequence

def collate_fn_train_val(batch):
    waveforms = [x[0].squeeze(0) for x in batch]  # shape: [T]
    padded = pad_sequence(waveforms, batch_first=True)  # shape: [B, T]
    labels = torch.tensor([x[1] for x in batch], dtype=torch.float32)
    return padded, labels

def collate_fn_test(batch):
    waveforms = [x[0].squeeze(0) for x in batch]  # shape: [T]
    padded = pad_sequence(waveforms, batch_first=True)  # shape: [B, T]
    filenames = [x[1] for x in batch]
    return padded, filenames


train_set = GrammarDataset(train_df_, TRAIN_AUDIO_DIR, labels=True)
val_set = GrammarDataset(val_df, TRAIN_AUDIO_DIR, labels=True)
test_set = GrammarDataset(test_df, TEST_AUDIO_DIR, labels=False)

train_loader = DataLoader(train_set, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn_train_val)
val_loader = DataLoader(val_set, batch_size=BATCH_SIZE, collate_fn=collate_fn_train_val)
test_loader = DataLoader(test_set, batch_size=BATCH_SIZE, collate_fn=collate_fn_test)
    
model = HuBERTRegressor()
train_model(model, train_loader, val_loader)

config.json:   0%|          | 0.00/1.39k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/378M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/378M [00:00<?, ?B/s]


Epoch 1:   0%|          | 0/178 [00:00<?, ?it/s][A
Epoch 1:   1%|          | 1/178 [00:02<06:29,  2.20s/it][A
Epoch 1:   1%|          | 2/178 [00:02<03:59,  1.36s/it][A
Epoch 1:   2%|▏         | 3/178 [00:03<03:11,  1.09s/it][A
Epoch 1:   2%|▏         | 4/178 [00:04<02:49,  1.03it/s][A
Epoch 1:   3%|▎         | 5/178 [00:05<02:36,  1.11it/s][A
Epoch 1:   3%|▎         | 6/178 [00:08<04:55,  1.72s/it][A
Epoch 1:   4%|▍         | 7/178 [00:09<04:04,  1.43s/it][A
Epoch 1:   4%|▍         | 8/178 [00:10<03:28,  1.23s/it][A
Epoch 1:   5%|▌         | 9/178 [00:10<03:01,  1.07s/it][A
Epoch 1:   6%|▌         | 10/178 [00:14<05:04,  1.81s/it][A
Epoch 1:   6%|▌         | 11/178 [00:15<04:13,  1.52s/it][A
Epoch 1:   7%|▋         | 12/178 [00:16<03:37,  1.31s/it][A
Epoch 1:   7%|▋         | 13/178 [00:16<03:13,  1.17s/it][A
Epoch 1:   8%|▊         | 14/178 [00:20<05:20,  1.95s/it][A
Epoch 1:   8%|▊         | 15/178 [00:24<06:50,  2.52s/it][A
Epoch 1:   9%|▉         | 16/178 [00:25<0

Epoch 1 - Loss: 2.6210
📉 Val RMSE: 1.0215 | Pearson: 0.5624


Epoch 2: 100%|██████████| 178/178 [05:36<00:00,  1.89s/it]


Epoch 2 - Loss: 0.9685
📉 Val RMSE: 1.0270 | Pearson: 0.6012


Epoch 3: 100%|██████████| 178/178 [05:41<00:00,  1.92s/it]


Epoch 3 - Loss: 0.9273
📉 Val RMSE: 0.8836 | Pearson: 0.6553


Epoch 4: 100%|██████████| 178/178 [05:26<00:00,  1.83s/it]


Epoch 4 - Loss: 0.8485
📉 Val RMSE: 1.1335 | Pearson: 0.6727


Epoch 5: 100%|██████████| 178/178 [05:25<00:00,  1.83s/it]


Epoch 5 - Loss: 0.8229
📉 Val RMSE: 0.9547 | Pearson: 0.6866


Epoch 6: 100%|██████████| 178/178 [05:32<00:00,  1.87s/it]


Epoch 6 - Loss: 0.9419
📉 Val RMSE: 0.9580 | Pearson: 0.7008


In [14]:
# Define test dataset and loader (with proper collate_fn)
test_set = GrammarDataset(test_df, TEST_AUDIO_DIR, labels=False)
test_loader = DataLoader(test_set, batch_size=2, collate_fn=collate_fn_test)  # BATCH_SIZE = 1

# Inference
model.eval()
all_preds, all_fnames = [], []

with torch.no_grad():
    for inputs, fnames in test_loader:
        inputs = inputs.to(DEVICE)  # shape: [1, T]
        preds = model(inputs).cpu().numpy()
        all_preds.extend(preds)
        all_fnames.extend(fnames)

In [15]:
submission = pd.DataFrame({
    "filename": all_fnames,
    "label": all_preds
})

submission.to_csv("/kaggle/working/submission.csv", index=False)
submission.head()

Unnamed: 0,filename,label
0,audio_804.wav,3.374985
1,audio_1028.wav,3.156743
2,audio_865.wav,3.766367
3,audio_774.wav,3.155712
4,audio_1138.wav,3.443486


In [16]:
df = pd.read_csv("/kaggle/working/submission.csv")

In [17]:
print(df.head())
print(df.shape)

         filename     label
0   audio_804.wav  3.374985
1  audio_1028.wav  3.156743
2   audio_865.wav  3.766367
3   audio_774.wav  3.155712
4  audio_1138.wav  3.443487
(204, 2)
