# Section 1: Environment Setup and Imports

In [None]:
import os
import subprocess
from IPython.display import FileLink, display

# Select the pure-Python protobuf backend to sidestep TensorFlow proto conflicts.
# This cell sits ahead of the heavy third-party imports, so the variable is
# already in the environment when those libraries are loaded.
os.environ.update({"PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION": "python"})

In [None]:
import numpy as np
import pandas as pd
import torch
import torchaudio
from torchaudio.transforms import Resample
import librosa
import whisper
import matplotlib.pyplot as plt
import seaborn as sns

from tqdm import tqdm
import warnings
import random
import gc

from scipy.stats import pearsonr
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# Regressors (choose one or experiment with several)
from xgboost import XGBRegressor
from sklearn.neural_network import MLPRegressor

# For text embeddings
from sentence_transformers import SentenceTransformer

# Silence library warnings so cell output stays readable.
warnings.filterwarnings('ignore')

# Seed every random number generator the notebook touches with the same value.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# Default to the CPU; switch to the GPU (and seed its generators) when CUDA is present.
device = torch.device("cpu")
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)
    device = torch.device("cuda")
print("Using device:", device)

# Section 2: Configuration and Utility Functions

In [None]:
# Central notebook settings. The paths point at the Kaggle dataset mount;
# adjust them for your own Kaggle environment.
CONFIG = dict(
    train_csv="/kaggle/input/shl-dataset/dataset/train.csv",
    test_csv="/kaggle/input/shl-dataset/dataset/test.csv",
    audios_train="/kaggle/input/shl-dataset/dataset/audios_train",
    audios_test="/kaggle/input/shl-dataset/dataset/audios_test",
    sample_submission="/kaggle/input/shl-dataset/dataset/sample_submission.csv",
    output_submission="/kaggle/working/submission.csv",
    # Audio processing
    target_sample_rate=16000,
    max_audio_length=10,  # seconds
)
# Derived setting: the maximum clip length expressed in samples instead of seconds.
CONFIG.update(
    max_audio_length_samples=CONFIG["target_sample_rate"] * CONFIG["max_audio_length"]
)

# Utility download function (for Kaggle output)
def download_file(path, download_file_name):
    """Zip ``path`` and display a download link for the archive.

    The process working directory is changed to ``/kaggle/working/`` and the
    archive is written there as ``<download_file_name>.zip`` using the
    external ``zip`` program in recursive mode.

    Args:
        path: File or directory to archive. It is handed to ``zip`` as a
            single literal argument (no shell expansion is performed).
        download_file_name: Archive name without the ``.zip`` extension.

    Returns:
        None. On success a ``FileLink`` to the archive is displayed; on
        failure a message and the error details are printed instead.
    """
    os.chdir('/kaggle/working/')
    zip_name = f"{download_file_name}.zip"
    # Pass an argument list and skip the shell: names containing spaces or
    # shell metacharacters reach zip verbatim instead of being re-parsed.
    command = ["zip", "-r", zip_name, path]
    try:
        result = subprocess.run(command, capture_output=True, text=True)
    except OSError as exc:
        # Without a shell, a missing zip binary raises instead of yielding a
        # non-zero return code; report it the same way as any other failure.
        print("Unable to run zip command!")
        print(exc)
        return
    if result.returncode != 0:
        print("Unable to run zip command!")
        print(result.stderr)
        return
    display(FileLink(zip_name))

# Section 3: Hybrid Feature Extraction Functions

In [None]:
# 3.1 Compute deep acoustic features from audio using Whisper's encoder
def extract_acoustic_features(audio_path, whisper_model):
    """Return a mean-pooled Whisper encoder embedding for one audio file.

    The audio is loaded with Whisper's own loader, padded or trimmed with
    ``whisper.pad_or_trim``, converted to a log-Mel spectrogram and passed
    through ``whisper_model.encoder``. The encoder output is averaged over
    the frame axis to give one fixed-size vector per file.

    Args:
        audio_path: Path of the audio file to process.
        whisper_model: Loaded Whisper model; its ``encoder`` and ``device``
            attributes are used.

    Returns:
        1-D NumPy array holding the pooled encoder features.

    Raises:
        Any exception raised by Whisper while loading or encoding the audio
        propagates to the caller (no fallback vector is produced here).
    """
    # Load and pad audio via Whisper utilities
    audio = whisper.load_audio(audio_path)
    audio = whisper.pad_or_trim(audio)
    # Place the spectrogram on the model's own device instead of relying on a
    # notebook-level global, so the function works wherever the model lives.
    mel = whisper.log_mel_spectrogram(audio).to(whisper_model.device)
    with torch.no_grad():
        # Add a batch axis; encoder output shape: [batch, frames, hidden_size]
        encoded = whisper_model.encoder(mel.unsqueeze(0))
    # Drop the batch axis and mean-pool over frames to get a single vector
    acoustic_feature = encoded.squeeze(0).mean(dim=0).cpu().numpy()
    return acoustic_feature

In [None]:
# 3.2 Extract hand-crafted acoustic features using librosa
def extract_handcrafted_features(audio_path, sr=16000):
    """Summarise one audio file as a 30-value vector of classic descriptors.

    Layout of the returned vector:
      - 13 MFCC means, then 13 MFCC standard deviations (n_mfcc=13)
      - zero crossing rate: mean, std
      - RMS energy: mean, std

    Args:
        audio_path: Path of the audio file to load with librosa.
        sr: Sample rate passed to ``librosa.load`` and to the MFCC computation.

    Returns:
        1-D NumPy array of features. If anything fails while loading or
        computing, the error is printed and a zero vector of length 30 is
        returned instead.
    """
    def _mean_and_std(track):
        # Collapse a per-frame track into its two summary statistics.
        return [np.mean(track), np.std(track)]

    try:
        signal, _ = librosa.load(audio_path, sr=sr)
        coefficients = librosa.feature.mfcc(y=signal, sr=sr, n_mfcc=13)
        crossing_rate = librosa.feature.zero_crossing_rate(signal)[0]
        energy = librosa.feature.rms(y=signal)[0]
        return np.concatenate([
            np.mean(coefficients, axis=1),
            np.std(coefficients, axis=1),
            _mean_and_std(crossing_rate),
            _mean_and_std(energy),
        ])
    except Exception as e:
        print(f"Error extracting handcrafted features from {audio_path}: {e}")
        # Fallback keeps the vector length fixed: 13 + 13 + 2 + 2 = 30
        return np.zeros(30)


In [None]:
# 3.3 Extract linguistic (text) features from the Whisper transcript
def _text_embedding_dim(text_encoder, default=768):
    """Return the encoder's embedding width, or ``default`` if it cannot be determined."""
    get_dim = getattr(text_encoder, "get_sentence_embedding_dimension", None)
    try:
        dim = get_dim() if callable(get_dim) else None
    except Exception:
        dim = None
    return dim if isinstance(dim, int) and dim > 0 else default


def extract_text_features(audio_path, whisper_model, text_encoder):
    """Transcribe an audio file with Whisper and embed the transcript.

    Args:
        audio_path: Path of the audio file to transcribe.
        whisper_model: Object exposing ``transcribe(path, fp16=False)`` that
            returns a mapping with a ``'text'`` entry.
        text_encoder: Object exposing ``encode(text)``; when it also exposes
            ``get_sentence_embedding_dimension()`` that value sizes the
            fallback vector.

    Returns:
        The embedding returned by ``text_encoder.encode``. If transcription
        or encoding fails, the error is printed and a zero vector is returned
        whose length matches the encoder's embedding dimension (768 when the
        encoder cannot report one), so feature rows keep a consistent width.
    """
    try:
        result = whisper_model.transcribe(audio_path, fp16=False)
        transcript = result['text']
        text_embed = text_encoder.encode(transcript)
    except Exception as e:
        print(f"Error transcribing or encoding text from {audio_path}: {e}")
        # Size the fallback from the encoder itself rather than assuming 768,
        # otherwise a different encoder would produce ragged feature rows.
        text_embed = np.zeros(_text_embedding_dim(text_encoder))
    return text_embed

In [None]:
# 3.4 Compute clip duration (in seconds) using librosa
def compute_duration(audio_path, sr=16000):
    """Return the length of an audio file in seconds.

    The file is loaded with librosa at sample rate ``sr`` and the duration is
    the number of loaded samples divided by ``sr``.

    Args:
        audio_path: Path of the audio file.
        sr: Sample rate used for loading and for the samples-to-seconds
            conversion.

    Returns:
        Duration in seconds, or 0.0 (after printing the error) if loading or
        the computation fails.
    """
    try:
        samples, _ = librosa.load(audio_path, sr=sr)
        return len(samples) / sr
    except Exception as e:
        print(f"Error computing duration for {audio_path}: {e}")
        return 0.0


In [None]:
# 3.5 Hybrid feature extraction: combine all feature groups per file
def extract_hybrid_features(df, audio_folder, whisper_model, text_encoder):
    """Build the combined feature matrix for every file listed in ``df``.

    For each entry of ``df['filename']`` the following are computed and
    concatenated, in this order, into one row:
      1. deep acoustic features (Whisper encoder)
      2. handcrafted acoustic features (MFCCs, ZCR, RMS)
      3. text features (Whisper transcription + text encoder embedding)
      4. duration in seconds (single value)

    Args:
        df: DataFrame with a ``'filename'`` column.
        audio_folder: Directory that the filenames are relative to.
        whisper_model: Whisper model used for both encoding and transcription.
        text_encoder: Sentence encoder used for the transcript embedding.

    Returns:
        NumPy array with one combined feature row per file.
    """
    def _featurize(filename):
        # All four extractors work from the same on-disk path.
        clip_path = os.path.join(audio_folder, filename)
        deep = extract_acoustic_features(clip_path, whisper_model)
        handcrafted = extract_handcrafted_features(clip_path, sr=CONFIG["target_sample_rate"])
        linguistic = extract_text_features(clip_path, whisper_model, text_encoder)
        seconds = compute_duration(clip_path, sr=CONFIG["target_sample_rate"])
        # Duration is left in raw seconds; any normalisation is up to the training step.
        return np.concatenate([deep, handcrafted, linguistic, [seconds]])

    filenames = tqdm(df['filename'], desc="Extracting hybrid features")
    return np.array([_featurize(name) for name in filenames])


# Section 4: Data Loading and Preparation

In [None]:
# Read the train and test manifests
train_df = pd.read_csv(CONFIG["train_csv"])
test_df = pd.read_csv(CONFIG["test_csv"])

# Attach the full on-disk path of each clip as a 'file_path' column
train_df['file_path'] = train_df['filename'].map(
    lambda name: os.path.join(CONFIG["audios_train"], name)
)
test_df['file_path'] = test_df['filename'].map(
    lambda name: os.path.join(CONFIG["audios_test"], name)
)

# Feature-extraction models: Whisper serves both the acoustic encoder and the
# transcription step; the SentenceTransformer embeds the transcripts.
print("Loading Whisper model (for both acoustic and transcription) ...")
whisper_model = whisper.load_model("base")
whisper_model = whisper_model.to(device)
print("Loading SentenceTransformer model for text embeddings ...")
text_encoder = SentenceTransformer("all-mpnet-base-v2")

# Training matrix (one hybrid feature row per clip) and the target labels
print("Extracting hybrid features for training ...")
X = extract_hybrid_features(
    train_df,
    CONFIG["audios_train"],
    whisper_model,
    text_encoder,
)
y = train_df['label'].values

# Section 5: Train-Validation Split and Regressor Training

In [None]:
print("Splitting data into training and validation sets ...")
# Hold out 30% of the rows for validation; reuse the notebook-wide SEED so
# every stochastic step is controlled from one place.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.3, random_state=SEED)

# Choose a regressor; for example, XGBoost
print("Training XGBoost Regressor on hybrid features ...")
model_xgb = XGBRegressor(n_estimators=400, learning_rate=0.009, max_depth=6, random_state=SEED)
model_xgb.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=False)
val_preds = model_xgb.predict(X_val)
# RMSE is taken as sqrt(MSE): the `squared=False` argument of
# mean_squared_error is deprecated and has been removed from recent
# scikit-learn releases, while this form works on every version.
rmse = np.sqrt(mean_squared_error(y_val, val_preds))
print(f"Validation RMSE (XGBoost): {rmse:.4f}")

# Alternatively, you can try an MLP:
print("Training MLP Regressor on hybrid features ...")
model_mlp = MLPRegressor(hidden_layer_sizes=(512,464,256), activation='tanh',
                         solver='sgd', max_iter=500000, random_state=SEED)
model_mlp.fit(X_train, y_train)
val_preds_mlp = model_mlp.predict(X_val)
rmse_mlp = np.sqrt(mean_squared_error(y_val, val_preds_mlp))
print(f"Validation RMSE (MLP Neural Net): {rmse_mlp:.4f}")

# Section 6: Inference on Test Set and Submission Creation

In [None]:
print("Extracting hybrid features for test set ...")
X_test = extract_hybrid_features(
    test_df,
    CONFIG["audios_test"],
    whisper_model,
    text_encoder,
)

print("Predicting on test set with XGBoost ...")
# Clip the raw predictions to [0, 5], then shrink them 10% toward the mean
# label of the training split (simple smoothing).
test_preds = 0.9 * np.clip(model_xgb.predict(X_test), 0, 5) + 0.1 * y_train.mean()

# Assemble the submission table (filename + predicted label) and write it out
submission_df = test_df[["filename"]].assign(label=test_preds)
submission_df.to_csv(CONFIG["output_submission"], index=False)
print("Submission file saved!")
print("Submission file path:", os.path.abspath(CONFIG["output_submission"]))

# Zip the submission and show a download link in the notebook output
download_file(CONFIG["output_submission"], "out")