In [None]:
# CELL 1: Install Libraries

!pip install -q torch torchvision torchaudio transformers librosa soundfile sklearn pandas numpy matplotlib seaborn lightgbm

import os
import warnings
warnings.filterwarnings('ignore')

# Data Handling
import pandas as pd
import numpy as np
from pathlib import Path

# Audio processing
import librosa
import soundfile as sf

# Deep Learning / Embeddings
import torch
from transformers import Wav2Vec2Processor, Wav2Vec2Model, WhisperProcessor, WhisperModel

# ML
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
import lightgbm as lgb

# Plotting
import matplotlib.pyplot as plt
import seaborn as sns

# Run the models on the GPU when PyTorch reports one, otherwise on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using device: {device}")


  [1;31merror[0m: [1msubprocess-exited-with-error[0m
  
  [31m×[0m [32mpython setup.py egg_info[0m did not run successfully.
  [31m│[0m exit code: [1;36m1[0m
  [31m╰─>[0m See above for output.
  
  [1;35mnote[0m: This error originates from a subprocess, and is likely not a problem with pip.
  Preparing metadata (setup.py) ... [?25l[?25herror
[1;31merror[0m: [1mmetadata-generation-failed[0m

[31m×[0m Encountered error while generating package metadata.
[31m╰─>[0m See above for output.

[1;35mnote[0m: This is an issue with the package mentioned above, not pip.
[1;36mhint[0m: See above for details.
Using device: cuda


In [None]:

# CELL 2: Kaggle Setup & Data Load
# Installs the Kaggle CLI, registers an uploaded API token, then downloads and
# unpacks the competition archive. Colab-only: relies on `google.colab.files`.

!pip install kaggle

from google.colab import files
print("Please upload your kaggle.json file now:")
# The `mv` below expects the upload to be named exactly `kaggle.json` in the
# current working directory.
files.upload() # Upload kaggle.json

# Move the token into ~/.kaggle and make it readable/writable by the owner only.
!mkdir -p ~/.kaggle
!mv kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

# NOTE(review): the archive is extracted into `data/` (relative to the working
# directory), but the data-loading cell reads from `/content/dataset/...`.
# Confirm the archive layout and extraction target so that a fresh
# Restart & Run All finds the files.
!kaggle competitions download -c shl-intern-hiring-assessment-2025
!unzip -q shl-intern-hiring-assessment-2025.zip -d data/


Please upload your kaggle.json file now:


Saving kaggle.json to kaggle.json
Downloading shl-intern-hiring-assessment-2025.zip to /content
 99% 1.23G/1.23G [00:11<00:00, 139MB/s]
100% 1.23G/1.23G [00:11<00:00, 114MB/s]


In [None]:

# CELL 2b: Load Data
# Reads the train/test metadata tables and records where the audio files live.

# The download cell extracts the archive into `data/`, whereas this cell used to
# read from a hard-coded `/content/dataset`. Probe the known locations (original
# one first, so existing sessions behave as before) and derive every path from a
# single root instead of repeating the absolute prefix.
_DATA_ROOT_CANDIDATES = [Path('/content/dataset'), Path('data/dataset'), Path('data')]
DATA_ROOT = next(
    (root for root in _DATA_ROOT_CANDIDATES if (root / 'csvs' / 'train.csv').is_file()),
    None,
)
if DATA_ROOT is None:
    # Fail early with the list of probed locations rather than an opaque read error.
    raise FileNotFoundError(
        "Could not find csvs/train.csv under any of: "
        + ", ".join(str(root) for root in _DATA_ROOT_CANDIDATES)
    )

train_df = pd.read_csv(DATA_ROOT / 'csvs' / 'train.csv')
test_df = pd.read_csv(DATA_ROOT / 'csvs' / 'test.csv')

train_audio_dir = DATA_ROOT / 'audios' / 'train'
test_audio_dir = DATA_ROOT / 'audios' / 'test'

print(f"Train samples: {len(train_df)}, Test samples: {len(test_df)}")


Train samples: 409, Test samples: 197


In [None]:
# CELL 3: Load Pretrained Models
# Both networks are only used for inference (the feature extractor calls them
# under `torch.no_grad()`), so they are switched to eval mode immediately.
# `.to()` and `.eval()` return the module itself, so the setup is a single
# assignment per model and the cell no longer ends on a bare expression that
# prints the entire model architecture into the output.

# Wav2Vec2
wav2vec_processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")
wav2vec_model = Wav2Vec2Model.from_pretrained("facebook/wav2vec2-base-960h").to(device).eval()

# Whisper Encoder
whisper_processor = WhisperProcessor.from_pretrained("openai/whisper-base")
whisper_model = WhisperModel.from_pretrained("openai/whisper-base").to(device).eval()


preprocessor_config.json:   0%|          | 0.00/159 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/163 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

vocab.json:   0%|          | 0.00/291 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/85.0 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/378M [00:00<?, ?B/s]

Some weights of Wav2Vec2Model were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


preprocessor_config.json: 0.00B [00:00, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

normalizer.json: 0.00B [00:00, ?B/s]

added_tokens.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/290M [00:00<?, ?B/s]

WhisperModel(
  (encoder): WhisperEncoder(
    (conv1): Conv1d(80, 512, kernel_size=(3,), stride=(1,), padding=(1,))
    (conv2): Conv1d(512, 512, kernel_size=(3,), stride=(2,), padding=(1,))
    (embed_positions): Embedding(1500, 512)
    (layers): ModuleList(
      (0-5): 6 x WhisperEncoderLayer(
        (self_attn): WhisperAttention(
          (k_proj): Linear(in_features=512, out_features=512, bias=False)
          (v_proj): Linear(in_features=512, out_features=512, bias=True)
          (q_proj): Linear(in_features=512, out_features=512, bias=True)
          (out_proj): Linear(in_features=512, out_features=512, bias=True)
        )
        (self_attn_layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (activation_fn): GELUActivation()
        (fc1): Linear(in_features=512, out_features=2048, bias=True)
        (fc2): Linear(in_features=2048, out_features=512, bias=True)
        (final_layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      )


In [None]:

# CELL 4: Feature Extraction

def extract_audio_features(audio_path):
    """Turn one audio file into a single flat feature vector.

    The file is loaded at 16 kHz and described by, in this order:
    per-coefficient mean and std of 13 MFCCs, per-bin mean and std of the
    chromagram, six scalars (spectral centroid, bandwidth, roll-off,
    zero-crossing rate, duration, RMS energy), the time-averaged Wav2Vec2
    hidden state, and the time-averaged Whisper encoder hidden state.

    Relies on the module-level `wav2vec_processor`, `wav2vec_model`,
    `whisper_processor`, `whisper_model` and `device`.

    Args:
        audio_path: Path of the audio file to read.

    Returns:
        1-D numpy array with all feature groups concatenated.
    """
    target_sr = 16000
    signal, rate = librosa.load(audio_path, sr=target_sr)

    # Frame-level descriptors, each summarised over time by mean and std.
    mfcc_frames = librosa.feature.mfcc(y=signal, sr=rate, n_mfcc=13)
    chroma_frames = librosa.feature.chroma_stft(y=signal, sr=rate)
    frame_summaries = [
        mfcc_frames.mean(axis=1),
        mfcc_frames.std(axis=1),
        chroma_frames.mean(axis=1),
        chroma_frames.std(axis=1),
    ]

    # Scalar descriptors; the order here fixes their column positions.
    scalar_summaries = [
        librosa.feature.spectral_centroid(y=signal, sr=rate).mean(),
        librosa.feature.spectral_bandwidth(y=signal, sr=rate).mean(),
        librosa.feature.spectral_rolloff(y=signal, sr=rate).mean(),
        librosa.feature.zero_crossing_rate(signal).mean(),
        librosa.get_duration(y=signal, sr=rate),
        librosa.feature.rms(y=signal).mean(),
    ]

    with torch.no_grad():
        # Wav2Vec2: average the last hidden state over the time axis.
        wav_batch = wav2vec_processor(
            signal, return_tensors="pt", sampling_rate=target_sr
        ).input_values.to(device)
        wav_hidden = wav2vec_model(wav_batch).last_hidden_state
        wav_emb = wav_hidden.mean(dim=1).squeeze().cpu().numpy()

        # Whisper: only the encoder is run, then pooled the same way.
        # NOTE(review): the Whisper processor is believed to pad/truncate its
        # input to a fixed-length window, so long clips may be cut and short
        # clips averaged over padding - confirm against the processor docs.
        mel_batch = whisper_processor(
            signal, sampling_rate=target_sr, return_tensors="pt"
        ).input_features.to(device)
        whisper_hidden = whisper_model.encoder(mel_batch).last_hidden_state
        whisper_emb = whisper_hidden.mean(dim=1).squeeze().cpu().numpy()

    return np.concatenate([*frame_summaries, scalar_summaries, wav_emb, whisper_emb])



In [None]:
# CELL 5: Prepare Train Embeddings
# Builds the training feature matrix `X` (one row per entry of `train_df`) and
# the target vector `y`.

train_embeddings = []
for filename in train_df['filename'].astype(str):
    audio_path = train_audio_dir / filename  # use as-is

    if not audio_path.suffix:  # if no extension, add .wav
        audio_path = audio_path.with_suffix(".wav")

    if audio_path.exists():
        emb = extract_audio_features(audio_path)
    else:
        print(f"File not found: {audio_path}, using zero vector")
        emb = None  # placeholder; filled in below once the feature length is known
    train_embeddings.append(emb)

# The fallback vector has to be as long as a real feature vector, otherwise
# np.vstack raises on the first missing file. Take the length from the vectors
# that were actually extracted instead of hard-coding it.
feature_dim = next((len(emb) for emb in train_embeddings if emb is not None), None)
if feature_dim is None:
    raise FileNotFoundError(f"No training audio files were found under {train_audio_dir}")
train_embeddings = [np.zeros(feature_dim) if emb is None else emb for emb in train_embeddings]

X = np.vstack(train_embeddings)
y = train_df['label'].values
print(f"Train embeddings shape: {X.shape}, Target shape: {y.shape}")


Train embeddings shape: (409, 1336), Target shape: (409,)


In [None]:
# Show the first five training filenames to check how they are spelled in the CSV.
print(train_df['filename'].iloc[:5])

0    audio_173
1    audio_138
2    audio_127
3     audio_95
4     audio_73
Name: filename, dtype: object


## Test-set feature extraction

In [None]:
# CELL 6: Prepare Test Embeddings
# Builds the test feature matrix `X_test` (one row per entry of `test_df`).

test_embeddings = []
for filename in test_df['filename'].astype(str):
    audio_path = test_audio_dir / filename

    # If no extension is present, add .wav
    if not audio_path.suffix:
        audio_path = audio_path.with_suffix(".wav")

    if audio_path.exists():
        emb = extract_audio_features(audio_path)
    else:
        print(f"File not found: {audio_path}, using zero vector")
        emb = None  # placeholder; filled in below once the feature length is known
    test_embeddings.append(emb)

# The fallback vector has to be as long as a real feature vector, otherwise
# np.vstack raises on the first missing file. Take the length from the vectors
# that were actually extracted instead of hard-coding it.
feature_dim = next((len(emb) for emb in test_embeddings if emb is not None), None)
if feature_dim is None:
    raise FileNotFoundError(f"No test audio files were found under {test_audio_dir}")
test_embeddings = [np.zeros(feature_dim) if emb is None else emb for emb in test_embeddings]

X_test = np.vstack(test_embeddings)
print(f"Test embeddings shape: {X_test.shape}")


Test embeddings shape: (197, 1336)


In [None]:
# Show the first five test filenames to check how they are spelled in the CSV.
print(test_df['filename'].iloc[:5])

0    audio_141
1    audio_114
2     audio_17
3     audio_76
4    audio_156
Name: filename, dtype: object


In [None]:

# CELL 7: LightGBM K-Fold Training
# Trains one LightGBM regressor per fold with early stopping on the held-out
# part, records each fold's validation RMSE, and averages the per-fold test
# predictions into `test_preds`.

# Booster settings are the same for every fold, so they are declared once.
params = dict(
    objective='regression',
    metric='rmse',
    learning_rate=0.01,
    num_leaves=31,
    min_data_in_leaf=20,
    feature_fraction=0.8,
    bagging_fraction=0.8,
    bagging_freq=5,
    verbosity=-1,
    seed=42,
)

kf = KFold(n_splits=5, shuffle=True, random_state=42)
val_rmse_list = []
test_preds = np.zeros(X_test.shape[0])

for fold, (train_idx, val_idx) in enumerate(kf.split(X)):
    print(f"\n--- Fold {fold+1} ---")
    X_train, y_train = X[train_idx], y[train_idx]
    X_val, y_val = X[val_idx], y[val_idx]

    train_data = lgb.Dataset(X_train, label=y_train)
    val_data = lgb.Dataset(X_val, label=y_val)

    # Up to 5000 rounds; stop once the validation score has not improved for
    # 100 rounds, and log the metrics every 100 rounds.
    model = lgb.train(
        params,
        train_data,
        num_boost_round=5000,
        valid_sets=[train_data, val_data],
        callbacks=[lgb.early_stopping(100), lgb.log_evaluation(100)],
    )
    best_iter = model.best_iteration

    # Score the held-out part using the best iteration found by early stopping.
    val_pred = model.predict(X_val, num_iteration=best_iter)
    rmse = np.sqrt(mean_squared_error(y_val, val_pred))
    print(f"Fold {fold+1} Val RMSE: {rmse:.4f}")
    val_rmse_list.append(rmse)

    # Each fold contributes an equal share to the final test prediction.
    test_preds = test_preds + model.predict(X_test, num_iteration=best_iter) / kf.n_splits

print(f"\nMean CV RMSE: {np.mean(val_rmse_list):.4f}")


--- Fold 1 ---
Training until validation scores don't improve for 100 rounds
[100]	training's rmse: 0.523075	valid_1's rmse: 0.669739
[200]	training's rmse: 0.369	valid_1's rmse: 0.61882
[300]	training's rmse: 0.276287	valid_1's rmse: 0.598016
[400]	training's rmse: 0.210378	valid_1's rmse: 0.587334
[500]	training's rmse: 0.163512	valid_1's rmse: 0.581976
[600]	training's rmse: 0.128666	valid_1's rmse: 0.579362
[700]	training's rmse: 0.104115	valid_1's rmse: 0.57835
Early stopping, best iteration is:
[680]	training's rmse: 0.10797	valid_1's rmse: 0.577424
Fold 1 Val RMSE: 0.5774

--- Fold 2 ---
Training until validation scores don't improve for 100 rounds
[100]	training's rmse: 0.538276	valid_1's rmse: 0.637513
[200]	training's rmse: 0.377099	valid_1's rmse: 0.602794
[300]	training's rmse: 0.27817	valid_1's rmse: 0.589197
[400]	training's rmse: 0.204493	valid_1's rmse: 0.576308
[500]	training's rmse: 0.155225	valid_1's rmse: 0.569866
[600]	training's rmse: 0.120364	valid_1's rmse: 0.5

## Write the submission file

In [None]:

# CELL 8: Save Submission
# Clamps the averaged predictions to the interval [1.0, 5.0], pairs them with
# the test filenames, and writes the result to `submission.csv`.

test_preds = test_preds.clip(1.0, 5.0)
submission_df = test_df[['filename']].assign(label=test_preds)
submission_df.to_csv('submission.csv', index=False)
print("Submission saved: submission.csv")


Submission saved: submission.csv
