In [1]:
# Third-party packages for this notebook (versions are not pinned).
!pip install transformers librosa soundfile accelerate pytorch-lightning language-tool-python xgboost textstat sentence-transformers
# --no-deps: install openai-whisper itself without letting pip install or change its dependencies.
!pip install openai-whisper --no-deps
# Presumably the Whisper runtime dependencies skipped by --no-deps above — TODO confirm.
!pip install tiktoken ffmpeg-python
!pip install Levenshtein
# Java 17 JDK; JAVA_HOME is pointed at this install in the next cell
# (presumably needed by language_tool_python's LanguageTool — confirm).
!apt-get update
!apt-get install -y openjdk-17-jdk-headless
!java -version

Collecting language-tool-python
  Downloading language_tool_python-3.1.0-py3-none-any.whl.metadata (17 kB)
Collecting textstat
  Downloading textstat-0.7.12-py3-none-any.whl.metadata (15 kB)
Collecting pyphen (from textstat)
  Downloading pyphen-0.17.2-py3-none-any.whl.metadata (3.2 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cubl

In [2]:
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-17-openjdk-amd64"

import pandas as pd
import numpy as np
import torch
import whisper
import librosa
import Levenshtein
import language_tool_python
import textstat
import xgboost as xgb
from tqdm.notebook import tqdm
from sentence_transformers import SentenceTransformer
from transformers import pipeline
from sklearn.decomposition import PCA
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from scipy.stats import pearsonr
import warnings

warnings.filterwarnings('ignore')

2025-12-15 21:13:04.385539: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1765833184.794905      47 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1765833184.893145      47 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

In [3]:
# Dataset locations: the train/test recordings live under <BASE_PATH>/audios/.
BASE_PATH = "/kaggle/input/shl-intern-hiring-assessment-2025/dataset"
TRAIN_AUDIO_DIR, TEST_AUDIO_DIR = (
    f"{BASE_PATH}/audios/{split}" for split in ("train", "test")
)

# Whisper "medium.en" checkpoint; transcribe_data reads this module-level name.
model = whisper.load_model("medium.en")

def transcribe_data(df, audio_dir):
    """Transcribe every audio file listed in ``df`` with the module-level Whisper ``model``.

    Args:
        df: DataFrame with a ``filename`` column. A ``.wav`` suffix is appended
            to any name that does not already end with it.
        audio_dir: Directory that contains the audio files.

    Returns:
        list[str]: One whitespace-stripped transcription per row, in row order.
        A file that fails to transcribe contributes ``""`` (after the error is
        printed) so the list stays aligned with ``df``.
    """
    transcriptions = []
    print(f"Transcribing {len(df)} files...")
    for filename in tqdm(df['filename'], total=len(df)):
        # str() so a numeric-looking filename column does not crash on .endswith
        filename = str(filename)
        if not filename.endswith('.wav'):
            filename += '.wav'
        path = os.path.join(audio_dir, filename)

        try:
            # beam_size=1: intended to keep Whisper from over-correcting the speaker's grammar
            result = model.transcribe(path, beam_size=1, language="en")
            transcriptions.append(result['text'].strip())
        except Exception as e:
            # Best-effort: keep going, but report the failure instead of hiding it.
            # `Exception` (not a bare except) lets KeyboardInterrupt stop a long run.
            print(f"Error {filename}: {e}")
            transcriptions.append("")
    return transcriptions

# Load Data
train_df = pd.read_csv(f"{BASE_PATH}/csvs/train.csv")
test_df = pd.read_csv(f"{BASE_PATH}/csvs/test.csv")

# Run Transcription (reuses cached CSVs to save time).
# Both cache files are read below, so both must exist before the cache is trusted;
# otherwise the transcription is redone and both files are rewritten.
if os.path.exists("train_transcribed.csv") and os.path.exists("test_transcribed.csv"):
    train_df = pd.read_csv("train_transcribed.csv")
    test_df = pd.read_csv("test_transcribed.csv")
    print("Loaded saved transcriptions.")
else:
    train_df['transcription'] = transcribe_data(train_df, TRAIN_AUDIO_DIR)
    test_df['transcription'] = transcribe_data(test_df, TEST_AUDIO_DIR)
    train_df.to_csv("train_transcribed.csv", index=False)
    test_df.to_csv("test_transcribed.csv", index=False)

# Fill NaNs: empty transcriptions come back from the CSV round-trip as NaN
train_df['transcription'] = train_df['transcription'].fillna("")
test_df['transcription'] = test_df['transcription'].fillna("")

100%|██████████████████████████████████████| 1.42G/1.42G [00:08<00:00, 178MiB/s]


Transcribing 409 files...


  0%|          | 0/409 [00:00<?, ?it/s]

Transcribing 197 files...


  0%|          | 0/197 [00:00<?, ?it/s]

In [20]:
# --- 1. Re-Transcribe with Wav2Vec2 ---
from transformers import pipeline
import librosa
import os
import pandas as pd
from tqdm.notebook import tqdm

# wav2vec2-base-960h ASR pipeline (author's note: acoustic model, preserves errors).
# Runs on GPU 0 when CUDA is available, otherwise on CPU (-1).
if torch.cuda.is_available():
    _asr_device = 0
else:
    _asr_device = -1
asr_dumb = pipeline(
    "automatic-speech-recognition",
    model="facebook/wav2vec2-base-960h",
    device=_asr_device,
)

def transcribe_wav2vec(df, audio_dir):
    """Transcribe each file named in ``df['filename']`` with the ``asr_dumb`` pipeline.

    Returns a list holding one lower-cased transcript per row, in row order.
    When loading or transcribing a file raises, the error is printed and ""
    is stored for that row.
    """
    print(f"Re-transcribing {len(df)} files in {audio_dir}...")

    def _transcribe_one(fname):
        # Normalise the name to end in .wav before building the path
        if not fname.endswith('.wav'):
            fname = fname + '.wav'
        wav_path = os.path.join(audio_dir, fname)
        try:
            # Load resampled to 16 kHz (the rate the original author notes Wav2Vec2 requires)
            samples, _ = librosa.load(wav_path, sr=16000)
            return asr_dumb(samples, chunk_length_s=30)['text'].lower()
        except Exception as e:
            print(f"Error {fname}: {e}")
            return ""

    return [_transcribe_one(name) for name in tqdm(df['filename'], total=len(df))]

# 1. Run Transcription: store the wav2vec2 output in a new 'transcription_raw' column
train_df['transcription_raw'] = transcribe_wav2vec(train_df, TRAIN_AUDIO_DIR)
test_df['transcription_raw'] = transcribe_wav2vec(test_df, TEST_AUDIO_DIR)

# 2. Swap columns (so the existing code that reads 'transcription' keeps working).
# NOTE: this overwrites the previous contents of 'transcription' in place; from
# here on 'transcription' and 'transcription_raw' hold the same wav2vec2 text.
train_df['transcription'] = train_df['transcription_raw']
test_df['transcription'] = test_df['transcription_raw']

# 3. Save immediately (written to the working directory, index column omitted)
train_df.to_csv("train_wav2vec.csv", index=False)
test_df.to_csv("test_wav2vec.csv", index=False)
print("Saved new raw transcriptions!")

Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Device set to use cuda:0


Re-transcribing 409 files in /kaggle/input/shl-intern-hiring-assessment-2025/dataset/audios/train...


  0%|          | 0/409 [00:00<?, ?it/s]

Re-transcribing 197 files in /kaggle/input/shl-intern-hiring-assessment-2025/dataset/audios/test...


  0%|          | 0/197 [00:00<?, ?it/s]

Saved new raw transcriptions!


In [21]:
# --- 3. Advanced Feature Engineering ---

# Shared tools for the feature step: rule-based checker, T5 grammar corrector, sentence embedder.
# The T5 pipeline runs on GPU 0 when CUDA is available, otherwise on CPU (-1).
_gec_device = 0 if torch.cuda.is_available() else -1

tool = language_tool_python.LanguageTool('en-US')
gec_pipe = pipeline(
    "text2text-generation",
    model="vennify/t5-base-grammar-correction",
    device=_gec_device,
)
embedder = SentenceTransformer('all-MiniLM-L6-v2')

# Registers the pandas `progress_apply` methods used when extracting features
tqdm.pandas()

def extract_features(row, audio_dir):
    """Compute prosody and grammar features for one transcribed recording.

    Args:
        row: Series/mapping with ``filename`` and ``transcription`` entries.
            A ``.wav`` suffix is appended to the filename when missing.
        audio_dir: Directory that contains the audio file.

    Returns:
        pd.Series with, in this order: ``speech_rate``, ``articulation_rate``,
        ``silence_ratio``, ``duration``, ``grammar_distance``, ``error_count``,
        ``error_density``, ``complexity``. Each feature group is best-effort:
        if its computation raises, the error is printed and that group's
        features stay at 0.
    """
    filename = str(row['filename'])
    raw_text = row['transcription']
    # Treat a missing transcription (None / NaN) as empty text rather than the
    # literal word "nan"; NaN is the only value that is not equal to itself.
    if raw_text is None or raw_text != raw_text:
        text = ""
    else:
        text = str(raw_text)
    word_count = len(text.split())

    # Defaults double as the fallback values when a feature group fails.
    features = {
        'speech_rate': 0,
        'articulation_rate': 0,
        'silence_ratio': 0,
        'duration': 0,
    }

    # --- 1. PROSODY (Audio) ---
    path = os.path.join(audio_dir, filename if filename.endswith('.wav') else f"{filename}.wav")
    try:
        y, sr = librosa.load(path, sr=None)  # sr=None keeps the file's native sample rate
        duration = librosa.get_duration(y=y, sr=sr)

        # Non-silent intervals (in samples) give the true speaking time;
        # top_db=20 is the silence threshold in dB below the reference level.
        non_silent = librosa.effects.split(y, top_db=20)
        non_silent_time = sum(end - start for start, end in non_silent) / sr

        if duration > 0:
            features['speech_rate'] = word_count / duration
            features['silence_ratio'] = (duration - non_silent_time) / duration
        if non_silent_time > 0:
            features['articulation_rate'] = word_count / non_silent_time
        features['duration'] = duration
    except Exception as e:
        print(f"Error {filename} (prosody): {e}")

    # --- 2. NEURAL GRAMMAR (GEC Distance) ---
    features['grammar_distance'] = 0
    if len(text) > 1:
        try:
            # Ask T5 to fix the grammar
            fixed_text = gec_pipe(f"grammar: {text}")[0]['generated_text']
            # Normalized distance: how much did T5 change the original text?
            # NOTE(review): if the pipeline's generation length limit is shorter than
            # the transcript, truncation would inflate this distance — confirm.
            features['grammar_distance'] = Levenshtein.distance(text, fixed_text) / len(text)
        except Exception as e:
            print(f"Error {filename} (grammar_distance): {e}")

    # --- 3. RULE-BASED GRAMMAR ---
    features['error_count'] = 0
    features['error_density'] = 0
    features['complexity'] = 0
    try:
        matches = tool.check(text)
        features['error_count'] = len(matches)
        features['error_density'] = len(matches) / word_count if word_count > 0 else 0
    except Exception as e:
        print(f"Error {filename} (language tool): {e}")
    # Readability is computed separately so a LanguageTool failure does not zero it
    # (and a textstat failure does not discard valid error counts).
    try:
        features['complexity'] = textstat.flesch_kincaid_grade(text)
    except Exception as e:
        print(f"Error {filename} (complexity): {e}")

    return pd.Series(features)

# Row-wise (axis=1) feature extraction with a progress bar; extract_features
# returns a pd.Series per row, so each result has one column per feature.
print("Extracting Advanced Features (Train)...")
train_feats = train_df.progress_apply(lambda row: extract_features(row, TRAIN_AUDIO_DIR), axis=1)

print("Extracting Advanced Features (Test)...")
test_feats = test_df.progress_apply(lambda row: extract_features(row, TEST_AUDIO_DIR), axis=1)

# --- 4. EMBEDDINGS (Semantic) ---
# Encode each transcription (missing values replaced by "") with the sentence
# embedder; presumably yields one embedding row per transcription — confirm.
print("Generating Embeddings...")
X_train_emb = embedder.encode(train_df['transcription'].fillna("").tolist(), show_progress_bar=True)
X_test_emb = embedder.encode(test_df['transcription'].fillna("").tolist(), show_progress_bar=True)

Device set to use cuda:0


Extracting Advanced Features (Train)...


  0%|          | 0/409 [00:00<?, ?it/s]

Extracting Advanced Features (Test)...


  0%|          | 0/197 [00:00<?, ?it/s]

Generating Embeddings...


Batches:   0%|          | 0/13 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

In [22]:
# Vocabulary richness: share of distinct tokens among all whitespace-separated tokens
def get_unique_ratio(text):
    """Return (number of distinct lower-cased words) / (number of words).

    ``text`` is converted with ``str()`` first; when it contains no words the
    result is 0.
    """
    tokens = str(text).lower().split()
    return len(set(tokens)) / len(tokens) if tokens else 0

# Attach the vocabulary-richness score to the already-built feature tables
print("Adding missing 'unique_ratio' column...")
train_feats['unique_ratio'] = train_df['transcription'].map(get_unique_ratio)
test_feats['unique_ratio'] = test_df['transcription'].map(get_unique_ratio)

print("Done! Columns are now:", list(train_feats.columns))

Adding missing 'unique_ratio' column...
Done! Columns are now: ['speech_rate', 'articulation_rate', 'silence_ratio', 'duration', 'grammar_distance', 'error_count', 'error_density', 'complexity', 'unique_ratio']


In [23]:
# Concatenate explicit features + raw embeddings.
# The embedding frames reuse the feature frames' index so the column-wise concat
# pairs rows by position instead of depending on both sides sharing a default index
# (a mismatch would silently introduce NaN rows).
X_train_full = pd.concat([train_feats, pd.DataFrame(X_train_emb, index=train_feats.index)], axis=1)
X_test_full = pd.concat([test_feats, pd.DataFrame(X_test_emb, index=test_feats.index)], axis=1)

# Ensure string column names for XGBoost (embedding columns are integers otherwise)
X_train_full.columns = X_train_full.columns.astype(str)
X_test_full.columns = X_test_full.columns.astype(str)

y_train = train_df['label']

# FEATURE SELECTION (single-pass importance ranking, not recursive elimination)
# We train one quick model on all features and keep the highest-importance columns.
# NOTE(review): the ranking uses every training label, so cross-validation scores
# computed afterwards on the selected columns are optimistic.
print("Running Feature Selection...")
selector_model = xgb.XGBRegressor(n_estimators=100, random_state=42, n_jobs=-1)
selector_model.fit(X_train_full, y_train)

# Get feature importances
importances = selector_model.feature_importances_
indices = np.argsort(importances)[::-1] # Sort descending

# Keep the top 100 columns by importance (may be any mix of embedding dimensions
# and hand-crafted features)
top_n = 100
top_indices = indices[:top_n]
selected_cols = X_train_full.columns[top_indices]

print(f"Selected Top {top_n} features.")

# Filter datasets: the same columns, in the same order, for train and test
X_train_selected = X_train_full[selected_cols]
X_test_selected = X_test_full[selected_cols]


Running Feature Selection...
Selected Top 100 features.


In [24]:
from scipy.stats import pearsonr

# 3. ROBUST TRAINING (5-Fold CV)
kf = KFold(n_splits=5, shuffle=True, random_state=42)
n_folds = kf.get_n_splits()                   # used to average the test predictions
oof_preds = np.zeros(len(X_train_selected))   # out-of-fold predictions, one per training row
test_preds = np.zeros(len(X_test_selected))   # running mean of the per-fold test predictions
rmse_scores = []
pearson_scores = []  # Track Pearson scores

print("\nTraining Final Model...")

for fold, (train_idx, val_idx) in enumerate(kf.split(X_train_selected, y_train)):
    X_tr, X_val = X_train_selected.iloc[train_idx], X_train_selected.iloc[val_idx]
    y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[val_idx]

    # Named `fold_model` so the module-level Whisper `model` that transcribe_data
    # reads is not overwritten by this loop.
    # Balanced Parameters: Less aggressive regularization than before
    fold_model = xgb.XGBRegressor(
        n_estimators=2000,
        learning_rate=0.02,    # Slightly higher to allow learning
        max_depth=4,           # Depth 4 captures more interaction than 3
        subsample=0.8,
        colsample_bytree=0.8,  # Allow access to more features
        n_jobs=-1,
        random_state=42,
        # Passed to the constructor: the fit() keyword of the same name is
        # deprecated in XGBoost's scikit-learn interface.
        early_stopping_rounds=100,
    )

    # NOTE(review): the validation fold also drives early stopping, so the fold
    # metrics below are slightly optimistic.
    fold_model.fit(
        X_tr, y_tr,
        eval_set=[(X_val, y_val)],
        verbose=False
    )

    # Clip predictions to valid range 0-5
    val_pred = np.clip(fold_model.predict(X_val), 0, 5)
    oof_preds[val_idx] = val_pred
    # Average over the actual number of folds rather than a hard-coded 5
    test_preds += fold_model.predict(X_test_selected) / n_folds

    # Calculate metrics
    rmse = np.sqrt(mean_squared_error(y_val, val_pred))
    pearson = pearsonr(y_val, val_pred)[0]

    rmse_scores.append(rmse)
    pearson_scores.append(pearson)

    print(f"Fold {fold+1} | RMSE: {rmse:.4f} | Pearson: {pearson:.4f}")

print(f"\nAverage CV RMSE: {np.mean(rmse_scores):.4f}")
print(f"Average CV Pearson: {np.mean(pearson_scores):.4f}")


Training Final Model...
Fold 1 | RMSE: 0.6691 | Pearson: 0.5464
Fold 2 | RMSE: 0.5555 | Pearson: 0.6192
Fold 3 | RMSE: 0.6620 | Pearson: 0.6833
Fold 4 | RMSE: 0.5364 | Pearson: 0.6554
Fold 5 | RMSE: 0.5860 | Pearson: 0.6607

Average CV RMSE: 0.6018
Average CV Pearson: 0.6330


In [25]:
# Clamp the fold-averaged test predictions to the 0-5 range
final_test_preds = np.clip(test_preds, 0, 5)

# Two-column submission: test filenames alongside their predicted labels
submission = test_df[['filename']].assign(label=final_test_preds)

submission.to_csv("submission.csv", index=False)
print("Saved submission.csv")
print(submission.head())

Saved submission.csv
    filename     label
0  audio_141  2.819240
1  audio_114  3.113828
2   audio_17  2.865006
3   audio_76  4.273753
4  audio_156  2.971206
