<a href="https://colab.research.google.com/github/InfiSmile/SHL_Assignment/blob/main/Muskan_SHL_Assignment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#Text + Audio + Rules Ensemble


---



At first, I just combined text and audio features and trained one model. Then I tried using only text models like Sentence Transformer and DeBERTa to see how well they perform, but that showed how important audio features actually are. I used NVIDIA’s Parakeet-TDT-0.6B-v2 model to transcribe the audios for better text data.

Later, I explored few articles that I have mentioned later and found that ensembling can give better results. So, I built a setup where text, audio, and rule-based models are trained separately and then combined using NNLS and confidence-based blending. This way, each model contributes its strengths, text for meaning, audio for tone, and rules for structure ,making the final predictions more reliable.


*Installation*

---



1.   !pip install -U transformers huggingface_hub
2.   !pip install git+https://github.com/openai/whisper.git




#Imports and Config

In [1]:

import os, math, warnings, random, re
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd

import torch
import torch.nn as nn
from torch.utils.data import Dataset
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.isotonic import IsotonicRegression
from transformers import AutoTokenizer, AutoModel, get_cosine_schedule_with_warmup

#Ensemble Methods
try:
    import lightgbm as lgb
    LGB_AVAILABLE = True
except Exception:
    from sklearn.ensemble import RandomForestRegressor
    LGB_AVAILABLE = False

try:
    from torch.optim.swa_utils import AveragedModel, SWALR
    SWA_AVAILABLE = True
except Exception:
    SWA_AVAILABLE = False

#---For Audio------------
import librosa, whisper
from scipy.stats import pearsonr

#Config
SEED = 42
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
TEXT_MODEL = "microsoft/deberta-v3-small"
MAX_LEN = 256
BS = 8
EPOCHS_TEXT = 5
EPOCHS_AUDIO = 8
CLIP_RANGE = (0.0, 5.0)
USE_ZSCORE = True
AUDIO_SR = 16000

# Paths
TRAIN_CSV = "csvs/train.csv"
TEST_CSV  = "csvs/test.csv"
AUDIO_ROOT = "audios"
OUT_DIR = "output"
os.makedirs(OUT_DIR, exist_ok=True)
# ================================

def set_seed(s=SEED):
    random.seed(s); np.random.seed(s); torch.manual_seed(s);
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(s)

set_seed()
print(f"Device: {DEVICE}")


Device: cuda


# Utility Functions
$\text{MAE} = \frac{1}{N}\sum_{i=1}^{N} |y_i - \hat{y}_i|$

$\text{RMSE} = \sqrt{\frac{1}{N}\sum_{i=1}^{N} (y_i - \hat{y}_i)^2}$

$r (Pearson correlation coefficient)  = \frac{\sum_{i=1}^{N} (y_i - \bar{y})(\hat{y}_i - \bar{\hat{y}})}
{\sqrt{\sum_{i=1}^{N} (y_i - \bar{y})^2} \sqrt{\sum_{i=1}^{N} (\hat{y}_i - \bar{\hat{y}})^2}}$


In [2]:

def metrics(y, p):
    mae = mean_absolute_error(y, p)
    rmse = math.sqrt(mean_squared_error(y, p))
    r = pearsonr(y, p)[0] if len(np.unique(y)) > 1 else np.nan
    return mae, rmse, r

#As mentioned in the assignment that it should be in the range 0 to 5
def clip01_5(x):
    return np.clip(x, CLIP_RANGE[0], CLIP_RANGE[1])

def rank_scale(x: np.ndarray) -> np.ndarray:
    idx = np.argsort(np.argsort(x))
    return idx.astype(np.float32) / max(1, len(x)-1)


# Transcript Preprocessing

In [3]:
def clean_text(s: str) -> str:
    s = re.sub(r"\b(\w+)(\s+\1\b)+", r"\1", s, flags=re.I)  # Since there are repeated tokens like I like I like
    s = re.sub(r"\s+", " ", s).strip()
    return s

#Several disfluencies in the audio (explored few of them)
DISFLUENCIES = {"uh","um","erm","hmm","you know","like","sort of"}

def extract_rule_feats(texts: list[str]) -> pd.DataFrame:
    rows = []
    for t in texts:
        s = t or ""
        tokens = s.split()
        tok_n = len(tokens)
        chars = len(s)
        avg_tok = (chars / max(1, tok_n))
        commas = s.count(","); periods = s.count("."); qmarks = s.count("?"); exc = s.count("!")
        caps_ratio = sum(ch.isupper() for ch in s) / max(1, len(s))
        repeats = sum(1 for i in range(1, tok_n) if tokens[i].lower()==tokens[i-1].lower())
        disfluency_hits = sum(1 for w in DISFLUENCIES if w in s.lower())
        rows.append(dict(
            tok_n=tok_n, chars=chars, avg_tok=avg_tok,
            commas=commas, periods=periods, qmarks=qmarks, exclam=exc,
            caps_ratio=caps_ratio, repeats=repeats, disfluencies=disfluency_hits
        ))
    return pd.DataFrame(rows)

## Rule-based feature engineering: These are later used by a simple regressor(Light GBM) for rule based model
def enrich_rules(df):
    df = df.copy()
    df["punct_rate"] = (df["commas"]+df["periods"]+df["qmarks"]+df["exclam"]) / np.maximum(1, df["tok_n"])
    df["repeat_rate"] = df["repeats"] / np.maximum(1, df["tok_n"])
    df["disfluency_rate"] = df["disfluencies"] / np.maximum(1, df["tok_n"])
    df["chars_per_tok"] = df["chars"] / np.maximum(1, df["tok_n"])
    df["caps_x_punct"] = df["caps_ratio"] * df["punct_rate"]
    df["avgTok_x_punct"] = df["avg_tok"] * df["punct_rate"]
    return df


#Audio Features

In [4]:

class AudioFeaturizer:
    def __init__(self, device=None):
        self.device = device or ("cuda" if torch.cuda.is_available() else "cpu")
        self.model = whisper.load_model("tiny", device=self.device)

    def _encode(self, wav, sr):
        if sr != 16000: #frequency
            wav = librosa.resample(wav, orig_sr=sr, target_sr=16000)
        wav = whisper.pad_or_trim(torch.tensor(wav)) #Since whisper expects a fixed-length audio input.
        #Convert waveform to log-Mel spectrogram for Whisper
        mel = whisper.log_mel_spectrogram(wav).to(self.device)
        with torch.no_grad():
            #Encoding features
            hs = self.model.encoder(mel.unsqueeze(0))  # [1,T,384]
        #averaging over time steps to obtain one fixed-size 384-dim audio embedding
        return hs.squeeze(0).float().cpu().mean(dim=0)  # [384]

    def __call__(self, wav_path: str):
        ''' Load, normalize, and clean audio, then extract basic prosodic features (duration, RMS, ZCR) '''
        wav, sr = librosa.load(wav_path, sr=AUDIO_SR, mono=True)
        wav = librosa.util.normalize(wav)
        wav, _ = librosa.effects.trim(wav, top_db=20)
        dur = len(wav) / AUDIO_SR
        rms = float(librosa.feature.rms(y=wav).mean())
        zcr = float(librosa.feature.zero_crossing_rate(y=wav).mean())
        # pitch features
        try:
            f0 = librosa.yin(wav, fmin=80, fmax=400, sr=AUDIO_SR)
            f0 = f0[np.isfinite(f0)]
            f0_mean = float(np.nanmean(f0)) if f0.size else 0.0
            f0_std  = float(np.nanstd(f0))  if f0.size else 0.0
            voiced_ratio = float(np.mean((f0 > 0).astype(float))) if f0.size else 0.0
        except Exception:
            f0_mean = f0_std = voiced_ratio = 0.0
        enc = self._encode(wav, sr)  # [384]
        vec = torch.cat([enc, torch.tensor([dur, rms, zcr, f0_mean, f0_std, voiced_ratio], dtype=torch.float32)], dim=0)  # [390]
        return vec.numpy()


#Datasets


1.   Text Model: "microsoft/deberta-v3-small"
2.   Audio Model: openai- whisper tiny model



In [5]:
#For loading transcripts
class TextDataset(Dataset):
    def __init__(self, texts): self.texts = texts
    def __len__(self): return len(self.texts)
    def __getitem__(self, i): return {"text": self.texts[i]}

def text_collate(batch): return {"text": [b["text"] for b in batch]}

class TextRegressor(nn.Module):
    '''  Text regression model using DeBERTa encoder and MLP head (CLS + mean pooled features)
       [CLS] captures global sentence-level semantics learned during pretraining whereas,
     Mean pooling adds information from all tokens, giving a more smoother context '''
    def __init__(self, model_name=TEXT_MODEL):
        super().__init__()
        self.tok = AutoTokenizer.from_pretrained(model_name, trust_remote_code=False)
        self.txt = AutoModel.from_pretrained(model_name, trust_remote_code=False)
        hid = self.txt.config.hidden_size
        self.head = nn.Sequential(
            nn.Linear(2*hid, 256), nn.ReLU(), nn.Dropout(0.2),
            nn.Linear(256, 64), nn.ReLU(),
            nn.Linear(64, 1)
        )

    def forward(self, texts):
        tok = self.tok(texts, padding=True, truncation=True, max_length=MAX_LEN, return_tensors="pt").to(DEVICE)
        out = self.txt(**tok).last_hidden_state  # [B,L,H]
        cls  = out[:, 0]
        mean = out.mean(dim=1)
        emb  = torch.cat([cls, mean], dim=1)
        return self.head(emb).squeeze(-1)

#Since we would be using Layer-Wise Learning Rate Decay (LLRD) —
#we need to assign a different learning rate to each transformer layer.
def _get_layers(model):
    #Check if the model has encoder.layer Like deBERTa
    if hasattr(model, "encoder") and hasattr(model.encoder, "layer"):
        return list(model.encoder.layer)
    if hasattr(model, "transformer") and hasattr(model.transformer, "layer"):
        return list(model.transformer.layer)
    raise AttributeError("Unknown transformer layers path.")

#It helps fine-tune large pretrained models more safely like upper layers adapts to the new task
#Lower layers changes slowly. So basically it shouldn't forget it's pretrained knowledge
def llrd_params(model: TextRegressor, base_lr=3e-5, head_lr=1e-3, decay=0.9):
    groups = [{"params": model.head.parameters(), "lr": head_lr}]
    layers = _get_layers(model.txt)
    lr = base_lr
    for i in reversed(range(len(layers))):
        groups.append({"params": layers[i].parameters(), "lr": lr})
        lr *= decay
    if hasattr(model.txt, "embeddings"):
        groups.append({"params": model.txt.embeddings.parameters(), "lr": lr})
    return groups

#Small MLP model for audio embeddings which predicts a single regression value (as we need a value between 0 to 5)
class AudioMLP(nn.Module):
    def __init__(self, in_dim=390):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(in_dim, 512), nn.ReLU(), nn.Dropout(0.2),
            nn.Linear(512, 128), nn.ReLU(),
            nn.Linear(128, 1)
        )
    def forward(self, x): return self.net(x).squeeze(-1)

# R-Drop loss: combines supervised loss with consistency loss (for stability under dropout)
def rdrop_loss(pred1, pred2, target, base_loss, alpha=2.0):
    sup = base_loss(pred1, target) + base_loss(pred2, target)
    cons = torch.mean((pred1 - pred2) ** 2)
    return 0.5 * sup + alpha * cons

# Maintains a moving average of model weights for smoother, more stable training
class EMA:
    def __init__(self, model, decay=0.999):
        self.decay = decay
        self.shadow = {k: v.detach().clone() for k, v in model.state_dict().items()}
    def update(self, model):
        with torch.no_grad():
            for k, v in model.state_dict().items():
                self.shadow[k].mul_((self.decay)).add_(v.detach(), alpha=1-self.decay)
    def apply_to(self, model):
        model.load_state_dict(self.shadow, strict=True)

# It helps regularize the model . So basically it blends random pairs of samples and labels
def mixup(x, y, alpha=0.2):
    if alpha <= 0: return x, y
    lam = np.random.beta(alpha, alpha)
    idx = torch.randperm(x.size(0), device=x.device)
    x_mix = lam * x + (1-lam) * x[idx]
    y_mix = lam * y + (1-lam) * y[idx]
    return x_mix, y_mix


#Training Loops

In [6]:

# - Train text regression model (DeBERTa + MLP) using gradual unfreezing + EMA + R-Drop
# - Starts by freezing encoder layers, then unfreezes deeper ones each epoch
# - Applies layer-wise learning rate decay (LLRD)
# - Uses R-Drop for regularization and EMA for stable weight tracking
def train_text(texts, y, idx_tr, idx_va, epochs=EPOCHS_TEXT):
    model = TextRegressor().to(DEVICE)
    for p in model.txt.parameters(): p.requires_grad = False

    opt = torch.optim.AdamW(llrd_params(model, base_lr=3e-5, head_lr=1e-3, decay=0.9), weight_decay=1e-3)
    loss_fn = nn.HuberLoss(delta=1.0)
    steps = math.ceil(len(idx_tr)/BS) * epochs
    sch = get_cosine_schedule_with_warmup(opt, int(0.1*steps), steps)
    ema = EMA(model, decay=0.999)

    mu = float(y[idx_tr].mean()); sigma = float(y[idx_tr].std() + 1e-6) if USE_ZSCORE else 1.0
    def z(v): return (v - mu) / sigma if USE_ZSCORE else v
    def uz(v): return v * sigma + mu if USE_ZSCORE else v

    layers = _get_layers(model.txt)
    for ep in range(epochs):
        layers_to_unfreeze = min(2 + ep, len(layers))
        for p in model.txt.parameters(): p.requires_grad = False
        for i in range(len(layers) - layers_to_unfreeze, len(layers)):
            for p in layers[i].parameters(): p.requires_grad = True
        if hasattr(model.txt, "embeddings"):
            for p in model.txt.embeddings.parameters(): p.requires_grad = False

        model.train()
        order = np.random.permutation(idx_tr)
        for start in range(0, len(order), BS):
            bix = order[start:start+BS]
            b_texts = [texts[i] for i in bix]
            b_labels = torch.tensor(z(y[bix]), dtype=torch.float32, device=DEVICE)

            opt.zero_grad(set_to_none=True)
            preds1 = model(b_texts)
            preds2 = model(b_texts)
            loss = rdrop_loss(preds1, preds2, b_labels, loss_fn, alpha=2.0)
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            opt.step(); sch.step()
            ema.update(model)

    ema.apply_to(model)

    model.eval()
    va_texts = [texts[i] for i in idx_va]
    va_preds = []
    with torch.no_grad():
        for s in range(0, len(va_texts), BS):
            chunk = va_texts[s:s+BS]
            p = model(chunk).detach().cpu().numpy()
            va_preds.append(p)
    va_preds = uz(np.concatenate(va_preds))
    return va_preds, model, (mu, sigma)

def train_audio(vecs, y, idx_tr, idx_va, epochs=EPOCHS_AUDIO):
    model = AudioMLP(in_dim=vecs.shape[1]).to(DEVICE)
    opt = torch.optim.AdamW(model.parameters(), lr=1e-3, weight_decay=1e-3)
    loss_fn = nn.HuberLoss(delta=1.0)

    mu = float(y[idx_tr].mean()); sigma = float(y[idx_tr].std() + 1e-6) if USE_ZSCORE else 1.0
    def z(v): return (v - mu) / sigma if USE_ZSCORE else v
    def uz(v): return v * sigma + mu if USE_ZSCORE else v

    use_swa = SWA_AVAILABLE and (epochs >= 3)
    if use_swa:
        swa_model = AveragedModel(model)
        swa_start = epochs - 3
        swa_scheduler = SWALR(opt, swa_lr=5e-4)

    for ep in range(epochs):
        model.train()
        order = np.random.permutation(idx_tr)
        for s in range(0, len(order), BS):
            bix = order[s:s+BS]
            bx = torch.tensor(vecs[bix], dtype=torch.float32, device=DEVICE)
            by = torch.tensor(z(y[bix]), dtype=torch.float32, device=DEVICE)
            bx, by = mixup(bx, by, alpha=0.15)

            opt.zero_grad(set_to_none=True)
            pred = model(bx)
            loss = loss_fn(pred, by)
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            opt.step()
            if use_swa and ep >= swa_start:
                swa_model.update_parameters(model)
                swa_scheduler.step()

    if use_swa:
        for p, sp in zip(model.parameters(), swa_model.parameters()):
            p.data.copy_(sp.data)

    model.eval()
    va_preds = []
    with torch.no_grad():
        for s in range(0, len(idx_va), BS):
            bix = idx_va[s:s+BS]
            bx = torch.tensor(vecs[bix], dtype=torch.float32, device=DEVICE)
            p = model(bx).detach().cpu().numpy()
            va_preds.append(p)
    va_preds = uz(np.concatenate(va_preds))
    return va_preds, model, (mu, sigma)


#Monte Carlo (MC) dropout inference for text model. Keep dropout active during inference.
def mc_pred_text(model, texts, n=8):
    preds = []
    model.eval()
    for m in model.head.modules():
        if isinstance(m, nn.Dropout): m.train()
    with torch.no_grad():
        for _ in range(n):
            out = []
            for i in range(0, len(texts), BS):
                batch = texts[i:i+BS]
                tokd = model.tok(batch, padding=True, truncation=True, max_length=MAX_LEN, return_tensors="pt").to(DEVICE)
                hidden = model.txt(**tokd).last_hidden_state
                cls, mean = hidden[:,0], hidden.mean(dim=1)
                emb = torch.cat([cls, mean], dim=1)
                p = model.head(emb).squeeze(-1).detach().cpu().numpy()
                out.append(p)
            preds.append(np.concatenate(out))
    preds = np.stack(preds, 0)
    return preds.mean(0), preds.std(0)

#Monte Carlo (MC) dropout inference for audio model
def mc_pred_audio(model, X, n=8):
    preds = []
    model.eval()
    for m in model.modules():
        if isinstance(m, nn.Dropout): m.train()
    with torch.no_grad():
        for _ in range(n):
            out = []
            for i in range(0, len(X), BS):
                b = torch.tensor(X[i:i+BS], dtype=torch.float32, device=DEVICE)
                out.append(model(b).detach().cpu().numpy())
            preds.append(np.concatenate(out))
    preds = np.stack(preds, 0)
    return preds.mean(0), preds.std(0)


# Dataloading and Feature EXtraction

In [8]:
import librosa
import numpy as np

def extract_features(filepath):
    audio, sr = librosa.load(filepath, sr=16000)
    mfcc = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=40)
    return np.mean(mfcc.T, axis=0)


In [10]:
import os
import pandas as pd

label = []
for file_name in os.listdir('audios/train'):
    label.append(f'audios/train/{file_name}')

# Correct way to create DataFrame
df_file = pd.DataFrame(label, columns=['filename'])
train = pd.read_csv('csvs/train.csv')

import pandas as pd

df_file['filename_clean'] = df_file['filename'].apply(lambda x: x.split('/')[-1].replace('.wav', ''))

df_merged = pd.merge(df_file, train, left_on='filename_clean', right_on='filename', how='left')

df_merged.drop(columns=['filename_clean'], inplace=True)

In [11]:
df = df_merged.drop(columns='filename_y')

In [12]:
df

Unnamed: 0,filename_x,label
0,audios/train/audio_1.wav,3.0
1,audios/train/audio_10.wav,3.0
2,audios/train/audio_100.wav,3.0
3,audios/train/audio_101.wav,3.5
4,audios/train/audio_102.wav,3.0
...,...,...
404,audios/train/audio_95.wav,2.0
405,audios/train/audio_96.wav,3.0
406,audios/train/audio_97.wav,2.0
407,audios/train/audio_98.wav,3.0


In [14]:
train_df = df.copy()
train_df

Unnamed: 0,filename_x,label
0,audios/train/audio_1.wav,3.0
1,audios/train/audio_10.wav,3.0
2,audios/train/audio_100.wav,3.0
3,audios/train/audio_101.wav,3.5
4,audios/train/audio_102.wav,3.0
...,...,...
404,audios/train/audio_95.wav,2.0
405,audios/train/audio_96.wav,3.0
406,audios/train/audio_97.wav,2.0
407,audios/train/audio_98.wav,3.0


In [18]:
X = np.vstack(train_df["filename_x"].apply(extract_features))
y = train_df["label"].values

NoBackendError: 

In [23]:
train_df = pd.read_csv(TRAIN_CSV)
test_df  = pd.read_csv(TEST_CSV)

# Clean transcripts
# train_df["text"] = train_df["text"].map(clean_text)
# test_df["text"]  = test_df["text"].map(clean_text)

y_all = train_df["label"].values.astype(np.float32)

# print("Extracting rule features...")
# train_rules = enrich_rules(extract_rule_feats(train_df["text"].tolist()))
# test_rules  = enrich_rules(extract_rule_feats(test_df["text"].tolist()))

# print("Extracting audio vectors ")
# fe = AudioFeaturizer(device=None)  # auto-select device
# def wav_path(mode, fn): return f"{AUDIO_ROOT}/{mode}/{fn}.wav"
# train_audio_vecs = np.stack([fe(wav_path("train", fn)) for fn in train_df["filename"]])  # [N,390]
# test_audio_vecs  = np.stack([fe(wav_path("test",  fn)) for fn in test_df["filename"]])
# # print("Shapes:", train_audio_vecs.shape, test_audio_vecs.shape)


# K-Fold Training


1. The training loop performs 5-fold cross-validation, splitting data into train and validation sets in each fold.
2. It trains separate text, audio, and rule-based models, generates out-of-fold predictions for validation, and also makes test predictions for each fold to later average for final results.




In [19]:
# Load pre-computed audio features
print("Loading pre-computed audio features...")
train_audio_vecs = np.load('train_audio_features.npy')
test_audio_vecs = np.load('test_audio_features.npy')
print(f"✓ Loaded: train={train_audio_vecs.shape}, test={test_audio_vecs.shape}")


Loading pre-computed audio features...
✓ Loaded: train=(409, 390), test=(197, 390)


In [24]:
#Prepare stratified folds based on target distribution
bins = pd.qcut(y_all, q=min(10, max(2, len(y_all)//30)), labels=False, duplicates="drop")
K = 5
kf = StratifiedKFold(n_splits=K, shuffle=True, random_state=SEED)
#Initialize arrays for out-of-fold (OOF) and test predictions
oof_text  = np.zeros(len(train_df), dtype=np.float32)
oof_audio = np.zeros(len(train_df), dtype=np.float32)
oof_rules = np.zeros(len(train_df), dtype=np.float32)
#Store predictions and uncertainties for each fold
test_text_mean  = np.zeros((K, len(test_df)), dtype=np.float32)
test_text_std   = np.zeros((K, len(test_df)), dtype=np.float32)
test_audio_mean = np.zeros((K, len(test_df)), dtype=np.float32)
test_audio_std  = np.zeros((K, len(test_df)), dtype=np.float32)
test_rules_preds = np.zeros((K, len(test_df)), dtype=np.float32)

 #K-Fold training loop
fold_idx = 0
for tr_idx, va_idx in kf.split(train_df, bins):
    fold_idx += 1
    print(f"\n===== Fold {fold_idx}/{K} =====")
    y = y_all

    # Train text regression model and get validation preds
    text_va_pred, text_model, (mu_t, sig_t) = train_text(train_df["text"].values, y, tr_idx, va_idx, epochs=EPOCHS_TEXT)
    oof_text[va_idx] = text_va_pred

    #  MC Dropout to get mean + std predictions for test set
    mean_t, std_t = mc_pred_text(text_model, test_df["text"].tolist(), n=8)
    mean_t = mean_t * (sig_t if USE_ZSCORE else 1.0) + (mu_t if USE_ZSCORE else 0.0)
    test_text_mean[fold_idx-1] = mean_t
    test_text_std[fold_idx-1]  = std_t

    # Train audio MLP model and get validation preds
    audio_va_pred, audio_model, (mu_a, sig_a) = train_audio(train_audio_vecs, y, tr_idx, va_idx, epochs=EPOCHS_AUDIO)
    oof_audio[va_idx] = audio_va_pred

    #Run MC Dropout for test audio embeddings
    mean_a, std_a = mc_pred_audio(audio_model, test_audio_vecs, n=8)
    mean_a = mean_a * (sig_a if USE_ZSCORE else 1.0) + (mu_a if USE_ZSCORE else 0.0)
    test_audio_mean[fold_idx-1] = mean_a
    test_audio_std[fold_idx-1]  = std_a

    # Rule based model
    X_tr_rules = train_rules.iloc[tr_idx].values
    y_tr = y[tr_idx].astype(float)
    X_va = train_rules.iloc[va_idx].values
    y_va = y[va_idx].astype(float)

    if LGB_AVAILABLE:
        feature_cols = list(train_rules.columns)
        neg_names = {"repeats", "disfluencies", "caps_ratio", "repeat_rate", "disfluency_rate", "caps_x_punct"}
        monotone_constraints = [(-1 if c in neg_names else 0) for c in feature_cols]

        lgbm = lgb.LGBMRegressor(
            n_estimators=1400,
            learning_rate=0.02,
            num_leaves=63,
            min_child_samples=25,
            subsample=0.85,
            colsample_bytree=0.85,
            reg_alpha=0.05,
            reg_lambda=0.05,
            random_state=SEED,
            verbosity=-1,
            monotone_constraints=monotone_constraints,
        )
        # Train LGBM and predict validation
        lgbm.fit(
            X_tr_rules, y_tr,
            eval_set=[(X_va, y_va)],
            eval_metric="l2",
            callbacks=[
                lgb.early_stopping(stopping_rounds=50),
                lgb.log_evaluation(period=0),
            ],
        )
        best_iter = getattr(lgbm, "best_iteration_", None)
        oof_rules[va_idx] = lgbm.predict(X_va, num_iteration=best_iter)
        test_rules_preds[fold_idx-1] = lgbm.predict(test_rules.values, num_iteration=best_iter)
    else:
        rf = RandomForestRegressor(n_estimators=500, random_state=SEED, n_jobs=-1)
        rf.fit(X_tr_rules, y_tr)
        oof_rules[va_idx] = rf.predict(X_va)
        test_rules_preds[fold_idx-1] = rf.predict(test_rules.values)
    #Combine text, audio, and rule-based predictions
    base_stack = np.vstack([oof_text[va_idx], oof_audio[va_idx], oof_rules[va_idx]]).T
    avg_pred = clip01_5(base_stack.mean(axis=1))
    # Compute evaluation metrics for the current fold
    mae, rmse, r = metrics(y[va_idx], avg_pred)
    print(f"Fold {fold_idx} base-avg -> MAE {mae:.3f} RMSE {rmse:.3f} r {r:.3f}")



===== Fold 1/5 =====


KeyError: 'text'

# Meta Blending
Got this idea reading this - [Meta Ensembling](https://medium.com/ml-research-lab/stacking-ensemble-meta-algorithms-for-improve-predictions-f4b4cf3b9237)
After training individual text, audio, and rule-based models, this step performs meta-ensembling ,
It combines their predictions intelligently.
Using NNLS (Non-Negative Least Squares) and rank-based blending, Found the optimal weights for each model’s contribution.
Then, isotonic regression is applied for calibration, ensuring final predictions are well-aligned and smooth.

[Isotonic Regression](https://stats.stackexchange.com/questions/660622/why-isotonic-regression-for-model-calibration) -- I explored this while I was trying to improve my model and It worked.


In [None]:

# optimal ensemble weights using NNLS (Non-Negative Least Squares)
def nnls_sum_to_one(X, y, iters=3000, lr=1e-2):
    w = np.ones(X.shape[1], dtype=np.float32) / X.shape[1]
    for _ in range(iters):
        grad = (2.0 / len(y)) * (X.T @ (X @ w - y))
        w = w - lr * grad
        w = np.maximum(w, 0.0)
        s = w.sum()
        if s > 0: w /= s
    return w

# Stack model predictions (text, audio, rules)
X_meta = np.vstack([oof_text, oof_audio, oof_rules]).T
y_meta = y_all

# Value based NNLS Ensemble :Finds the best non-negative combination of model outputs that minimizes prediction error.
w_val = nnls_sum_to_one(X_meta, y_meta)
oof_meta_val = clip01_5(X_meta @ w_val)
print("\nNNLS value weights:", np.round(w_val, 4))
mae, rmse, r = metrics(y_meta, oof_meta_val)
print(f"NNLS VALUE (OOF) -> MAE {mae:.3f} | RMSE {rmse:.3f} | r {r:.3f}")

#Rank based : Learns weights that best preserve the correct order of predictions.
X_meta_rank = np.vstack([rank_scale(oof_text), rank_scale(oof_audio), rank_scale(oof_rules)]).T
y_rank = rank_scale(y_meta)
w_rank = nnls_sum_to_one(X_meta_rank, y_rank)
oof_meta_rank = X_meta_rank @ w_rank
print("NNLS rank weights:", np.round(w_rank, 4))

#MIximng up both Value based and Rank based
alpha = 0.3
oof_meta_blend = clip01_5((1 - alpha) * oof_meta_val + alpha * oof_meta_rank)
mae, rmse, r = metrics(y_meta, oof_meta_blend)
print(f"RANK-BLEND OOF -> MAE {mae:.3f} | RMSE {rmse:.3f} | r {r:.3f}")


iso = IsotonicRegression(y_min=CLIP_RANGE[0], y_max=CLIP_RANGE[1], out_of_bounds="clip")
iso.fit(oof_meta_blend, y_meta)

#Apply learned weights to test predictions
test_text_m  = test_text_mean.mean(axis=0)
test_text_s  = test_text_std.mean(axis=0) + 1e-6
test_audio_m = test_audio_mean.mean(axis=0)
test_audio_s = test_audio_std.mean(axis=0) + 1e-6
test_rules_m = test_rules_preds.mean(axis=0)

# Compute test-level weighted ensemble
X_test_val = np.vstack([test_text_m, test_audio_m, test_rules_m]).T
test_meta_val = clip01_5(X_test_val @ w_val)

X_test_rank = np.vstack([rank_scale(test_text_m), rank_scale(test_audio_m), rank_scale(test_rules_m)]).T
test_meta_rank = X_test_rank @ w_rank

test_meta = clip01_5((1 - alpha) * test_meta_val + alpha * test_meta_rank)

# giving more importance to models that are more confident
w_conf = np.array([
    1.0 / test_text_s.mean(),
    1.0 / test_audio_s.mean(),
    1.0,  # rules has no std
], dtype=np.float32)
w_conf = w_conf / w_conf.sum()
conf_ens = clip01_5(w_conf[0]*test_text_m + w_conf[1]*test_audio_m + w_conf[2]*test_rules_m)

beta = 0.25
test_meta = clip01_5((1 - beta) * test_meta + beta * conf_ens)

# Final standardization and isotonic calibration
tm = (test_meta - test_meta.mean()) / (test_meta.std() + 1e-6)
tm = tm * (y_meta.std() + 1e-6) + y_meta.mean()
test_meta = clip01_5(0.7 * test_meta + 0.3 * tm)

test_meta = clip01_5(iso.predict(test_meta))



NNLS value weights: [0.3312 0.3307 0.3381]
NNLS VALUE (OOF) -> MAE 0.569 | RMSE 0.767 | r -0.015
NNLS rank weights: [0.5395 0.4605 0.    ]
RANK-BLEND OOF -> MAE 0.830 | RMSE 1.074 | r -0.020


# SAVING SUBMISSION.CSV

In [None]:

pd.DataFrame({
    "filename": test_df["filename"],
    "label": test_meta
}).to_csv(os.path.join(OUT_DIR, "stacked_predictions.csv"), index=False)

pd.DataFrame({
    "oof_text": oof_text,
    "oof_audio": oof_audio,
    "oof_rules": oof_rules,
    "oof_meta_value": oof_meta_val,
    "oof_meta_rankblend": oof_meta_blend,
    "label": y_all,
}).to_csv(os.path.join(OUT_DIR, "oof_features.csv"), index=False)

print("\nSaved:")
print(" -", os.path.join(OUT_DIR, "stacked_predictions.csv"))



Saved:
 - output\stacked_predictions.csv
