In [None]:
!pip install --upgrade huggingface_hub
!hf download distilbert-base-uncased --local-dir /kaggle/working/distilbert-base-uncased --quiet



# ============================================
# Hybrid DistilBERT + Tabular Price Prediction (v3)
# Using Optuna-tuned hyperparameters
# ============================================

import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

import re
import random
import math
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

from transformers import DistilBertTokenizerFast, DistilBertModel, get_cosine_schedule_with_warmup
from torch.optim import AdamW
from torch import amp

# -------------------------------
# Reproducibility
# -------------------------------
def set_seed(seed=42):
    """Seed the random number generators used by this script.

    Python's ``random`` module, NumPy's global generator and PyTorch's
    default generator are seeded with the same value; when CUDA is
    available every CUDA device is seeded as well.

    Args:
        seed: Integer seed handed to each generator.
    """
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)


# Seed once at import time so the rest of the script is repeatable.
set_seed(42)

# -------------------------------
# Load tokenizer/model locally
# -------------------------------
# Directory holding the pre-downloaded DistilBERT files.
# NOTE(review): this is a relative path, while the download command at the top
# of the notebook writes to /kaggle/working/distilbert-base-uncased. The two
# only coincide when the working directory is /kaggle/working - confirm.
LOCAL_BERT_DIR = "./distilbert-base-uncased"
# Tokenizer and encoder weights are both read from the same local directory.
tokenizer = DistilBertTokenizerFast.from_pretrained(LOCAL_BERT_DIR)
bert_model = DistilBertModel.from_pretrained(LOCAL_BERT_DIR)
print("✅ DistilBERT loaded locally.")

# -------------------------------
# Paths
# -------------------------------
# CSV files read by the preprocessing section below.
TRAIN_PATH = "/kaggle/input/mldataset/dataset/train.csv"
TEST_PATH  = "/kaggle/input/mldataset/dataset/test.csv"

# -------------------------------
# SMAPE + LogCosh loss
# -------------------------------
class HybridLoss(nn.Module):
    """Weighted blend of SMAPE and log-cosh regression losses.

    The value returned by :meth:`forward` is::

        alpha * SMAPE(y_pred, y_true) + (1 - alpha) * mean(log(cosh(y_pred - y_true)))

    Args:
        eps: Small constant added to the SMAPE denominator so it cannot be
            exactly zero when both prediction and target are zero.
        alpha: Weight of the SMAPE term; the log-cosh term gets ``1 - alpha``.
    """

    def __init__(self, eps=1e-6, alpha=0.7):
        super().__init__()
        self.eps, self.alpha = eps, alpha

    def forward(self, y_pred, y_true):
        """Return the scalar blended loss for a batch.

        Args:
            y_pred: Tensor of predictions.
            y_true: Tensor of targets, broadcastable against ``y_pred``.

        Returns:
            Zero-dimensional tensor holding the mean loss.
        """
        abs_diff = torch.abs(y_pred - y_true)
        smape = torch.mean(abs_diff /
                           ((torch.abs(y_true) + torch.abs(y_pred) + self.eps) / 2))
        # log(cosh(x)) == |x| + softplus(-2|x|) - log(2).  Evaluating it this
        # way never forms cosh(x) itself, which overflows to inf in float32
        # once |x| reaches roughly 89 and would turn the whole loss into inf.
        logcosh = torch.mean(
            abs_diff + nn.functional.softplus(-2.0 * abs_diff) - math.log(2.0)
        )
        return self.alpha * smape + (1 - self.alpha) * logcosh

# -------------------------------
# Data preprocessing
# -------------------------------
def extract_value_unit(text):
    """Parse the ``Value:`` number and ``Unit:`` label out of catalog text.

    Args:
        text: Catalog content; anything that is not a string is converted
            with ``str()`` first.

    Returns:
        A ``(value, unit)`` tuple. ``value`` is the float following
        ``Value:``, or ``0.0`` when the label is missing or no number can be
        recovered from the captured token. ``unit`` is the stripped text
        following ``Unit:``, or ``'Unknown'`` when the label is missing.
    """
    text = str(text)
    value_match = re.search(r'Value:\s*([0-9\.]+)', text)
    unit_match = re.search(r'Unit:\s*([A-Za-z0-9 %]+)', text)

    value = 0.0
    if value_match:
        token = value_match.group(1)
        try:
            value = float(token)
        except ValueError:
            # The capture class also accepts tokens such as ".", "12.0." or
            # "1.2.3", which float() rejects.  Keep the first well-formed
            # number inside the token instead of crashing the whole apply().
            number = re.search(r'\d+(?:\.\d+)?|\.\d+', token)
            value = float(number.group(0)) if number else 0.0

    unit = unit_match.group(1).strip() if unit_match else 'Unknown'
    return value, unit

def clean_text(text):
    """Normalise catalog text for tokenisation.

    The input is converted with ``str()`` and lower-cased, every character
    other than ``a-z``, ``0-9``, ``.``, ``%`` and space is replaced by a
    space, and runs of whitespace are collapsed to a single space with the
    ends trimmed.
    """
    lowered = str(text).lower()
    allowed_only = re.sub(r'[^a-z0-9.% ]', ' ', lowered)
    return re.sub(r'\s+', ' ', allowed_only).strip()

def _value_unit_columns(frame):
    """Return a two-column frame of (value, unit) parsed from 'catalog_content'."""
    return frame['catalog_content'].apply(
        lambda content: pd.Series(extract_value_unit(content))
    )


def _image_flag(link):
    """Return 1 when *link* is a string longer than five characters, else 0."""
    return 1 if isinstance(link, str) and len(link) > 5 else 0


def _add_text_features(frame):
    """Add the cleaned text and the derived numeric columns to *frame* in place."""
    frame['text'] = frame['catalog_content'].apply(clean_text)
    frame['text_len'] = frame['text'].apply(len)
    frame['word_count'] = frame['text'].apply(lambda s: len(s.split()))
    frame['has_image'] = frame['image_link'].apply(_image_flag)
    frame['Value_log'] = np.log1p(frame['Value'])
    # +1 keeps the denominators non-zero for rows whose cleaned text is empty.
    frame['value_unit_ratio'] = frame['Value_log'] / (frame['word_count'] + 1)
    frame['text_len_per_word'] = frame['text_len'] / (frame['word_count'] + 1)
    frame['is_short_text'] = (frame['word_count'] < 5).astype(int)


# Missing cells are replaced by empty strings right after loading.
train_df = pd.read_csv(TRAIN_PATH).fillna('')
test_df = pd.read_csv(TEST_PATH).fillna('')

train_df[['Value', 'Unit']] = _value_unit_columns(train_df)
test_df[['Value', 'Unit']] = _value_unit_columns(test_df)

for df in (train_df, test_df):
    _add_text_features(df)

# Numeric columns fed to the model; the scaler is fitted on train only.
num_features = [
    'Value_log', 'text_len', 'word_count', 'has_image',
    'value_unit_ratio', 'text_len_per_word', 'is_short_text',
]
scaler = StandardScaler()
X_num_train = scaler.fit_transform(train_df[num_features])
X_num_test = scaler.transform(test_df[num_features])

# Unit vocabulary comes from train; units not seen there share one extra
# index equal to len(unit2idx).
train_units = train_df['Unit'].unique()
unit2idx = {unit: position for position, unit in enumerate(train_units)}
unknown_unit_idx = len(unit2idx)
for df in (train_df, test_df):
    df['Unit_idx'] = df['Unit'].map(unit2idx).fillna(unknown_unit_idx).astype(int)
unit_vocab_size = len(unit2idx)

# -------------------------------
# Dataset
# -------------------------------
class PriceDataset(Dataset):
    """Dataset that tokenises all texts once, at construction time.

    Args:
        texts: List of strings passed to the tokenizer.
        num_feats: Numeric features, converted to a float32 tensor.
        unit_idx: Unit indices, converted to a long tensor.
        targets: Optional targets; stored as a float32 tensor with a trailing
            dimension of size 1. When omitted, items carry no ``'target'``.
        tokenizer: Callable invoked with the texts and the keyword arguments
            ``padding``, ``truncation``, ``max_length`` and ``return_tensors``;
            its result is indexed by ``'input_ids'`` and ``'attention_mask'``.
        max_len: Value passed to the tokenizer as ``max_length``.
        aug: When true, every text goes through ``_augment_text`` once,
            before tokenisation.
    """

    def __init__(self, texts, num_feats, unit_idx, targets=None, tokenizer=None, max_len=128, aug=False):
        self.num_feats = torch.tensor(num_feats, dtype=torch.float32)
        self.unit_idx = torch.tensor(unit_idx, dtype=torch.long)
        if targets is None:
            self.targets = None
        else:
            self.targets = torch.tensor(targets, dtype=torch.float32).unsqueeze(1)
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.texts = [self._augment_text(t) for t in texts] if aug else texts

        encoded = self.tokenizer(
            self.texts,
            padding='max_length',
            truncation=True,
            max_length=max_len,
            return_tensors='pt',
        )
        self.input_ids = encoded['input_ids']
        self.attention_mask = encoded['attention_mask']

    def _augment_text(self, text):
        """With probability 0.3, drop one random word from texts longer than five words."""
        tokens = text.split()
        # The length check comes first so short texts never consume a random draw.
        should_drop = len(tokens) > 5 and random.random() < 0.3
        if should_drop:
            del tokens[random.choice(range(len(tokens)))]
        return " ".join(tokens)

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        sample = {
            'input_ids': self.input_ids[idx],
            'attention_mask': self.attention_mask[idx],
            'num_feats': self.num_feats[idx],
            'unit_idx': self.unit_idx[idx],
        }
        if self.targets is None:
            return sample
        sample['target'] = self.targets[idx]
        return sample

# -------------------------------
# Train/val split
# -------------------------------
# Targets are modelled in log1p space.
y_all = np.log1p(train_df['price'].values).astype(np.float32)

# 90/10 split over row positions of train_df.
all_rows = np.arange(len(train_df))
train_idx, val_idx = train_test_split(all_rows, test_size=0.1, random_state=42)


def _dataset_for_rows(rows, aug=False):
    """Build a PriceDataset from the given row positions of the training frame."""
    return PriceDataset(
        train_df['text'].iloc[rows].tolist(),
        X_num_train[rows],
        train_df['Unit_idx'].iloc[rows].values,
        y_all[rows],
        tokenizer,
        aug=aug,
    )


# Only the training portion goes through text augmentation.
train_ds = _dataset_for_rows(train_idx, aug=True)
val_ds = _dataset_for_rows(val_idx)
test_ds = PriceDataset(
    test_df['text'].tolist(),
    X_num_test,
    test_df['Unit_idx'].values,
    tokenizer=tokenizer,
)

# Optuna best params
BATCH_SIZE = 32

# Worker / pinning settings shared by all three loaders.
_loader_opts = {'num_workers': 2, 'pin_memory': True}
train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True, **_loader_opts)
val_loader = DataLoader(val_ds, batch_size=BATCH_SIZE * 2, shuffle=False, **_loader_opts)
test_loader = DataLoader(test_ds, batch_size=BATCH_SIZE * 2, shuffle=False, **_loader_opts)

# -------------------------------
# Model with Optuna-tuned params
# -------------------------------
class HybridPriceModel(nn.Module):
    """Regression head on top of a text encoder plus tabular features.

    The encoder's token states are summarised four ways (first token, masked
    mean, masked max, learned attention pooling). Those summaries are
    concatenated with the numeric features and a unit embedding and passed
    through an MLP that outputs one value per row.

    Args:
        bert_model: Encoder module. It is called with ``input_ids`` and
            ``attention_mask`` keywords, must return an object exposing
            ``last_hidden_state``, and must expose ``config.dim``.
        num_feat_dim: Width of the numeric feature vector.
        unit_vocab_size: Number of known units; one extra embedding row is
            allocated on top of this.
        emb_dim: Width of the unit embedding.
        dropout: Dropout probability used after each hidden layer.
        hidden1: Width of the first hidden layer.
        hidden2: Width of the second hidden layer.
    """

    def __init__(self, bert_model, num_feat_dim, unit_vocab_size,
                 emb_dim=11, dropout=0.3740734052386133, hidden1=808, hidden2=272):
        super().__init__()
        self.bert = bert_model
        self.unit_emb = nn.Embedding(unit_vocab_size+1, emb_dim)
        bert_dim = bert_model.config.dim
        # Four pooled views of the encoder output, then tabular inputs.
        total_dim = bert_dim*4 + num_feat_dim + emb_dim
        self.attn_vec = nn.Linear(bert_dim, 1)
        self.head = nn.Sequential(
            nn.Linear(total_dim, hidden1), nn.LayerNorm(hidden1), nn.GELU(), nn.Dropout(dropout),
            nn.Linear(hidden1, hidden2), nn.LayerNorm(hidden2), nn.GELU(), nn.Dropout(dropout),
            nn.Linear(hidden2, 128), nn.LayerNorm(128), nn.GELU(), nn.Dropout(dropout),
            nn.Linear(128, 1)
        )

    def forward(self, input_ids, attention_mask, num_feats, unit_idx):
        """Return the head's output for a batch (one value per row).

        Args:
            input_ids: Token ids forwarded to the encoder.
            attention_mask: Mask forwarded to the encoder; positions equal to
                0 are excluded from the mean, max and attention pooling.
            num_feats: Numeric features concatenated to the pooled text.
            unit_idx: Indices looked up in the unit embedding.
        """
        out = self.bert(input_ids=input_ids, attention_mask=attention_mask).last_hidden_state
        mask = attention_mask.unsqueeze(-1).float()
        token_count = mask.sum(1)
        mean_pool = (out * mask).sum(1) / token_count.clamp(min=1e-9)

        # Masked max: padding must never win the max.  Multiplying by the mask
        # turns padded positions into 0, which beats every real token in any
        # dimension whose real values are all negative, so padding is filled
        # with the most negative representable value instead.
        pad_fill = torch.finfo(out.dtype).min
        max_pool = out.masked_fill(mask == 0, pad_fill).max(dim=1).values
        # A row with no valid token at all falls back to zeros rather than
        # feeding the fill value into the head.
        max_pool = torch.where(token_count > 0, max_pool, torch.zeros_like(max_pool))
        cls = out[:,0,:]

        # Attention pooling: softmax is computed in float32 and masked
        # positions receive a large negative logit.
        attn_logits = self.attn_vec(out).squeeze(-1).float()
        attn_logits = attn_logits.masked_fill(attention_mask == 0, -1e9)
        attn_w = torch.softmax(attn_logits, dim=1)
        attn_w = attn_w.to(out.dtype).unsqueeze(1)
        attn_pool = torch.bmm(attn_w, out).squeeze(1)

        x = torch.cat([cls, mean_pool, max_pool, attn_pool, num_feats, self.unit_emb(unit_idx)], dim=1)
        return self.head(x)

# -------------------------------
# Training setup
# -------------------------------
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = HybridPriceModel(bert_model, X_num_train.shape[1], unit_vocab_size).to(device)
if torch.cuda.device_count() > 1:
    model = nn.DataParallel(model)

# Split parameters by name: anything containing 'bert' belongs to the encoder,
# everything else to the pooling / head layers.
bert_params, head_params = [], []
for param_name, param in model.named_parameters():
    bucket = bert_params if 'bert' in param_name else head_params
    bucket.append(param)

# The encoder gets a much smaller learning rate than the head layers.
param_groups = [
    {'params': bert_params, 'lr': 5.545732304370043e-06, 'weight_decay': 1e-4},
    {'params': head_params, 'lr': 0.004846331266624619, 'weight_decay': 1e-4},
]
optimizer = AdamW(param_groups)

# Cosine schedule stepped once per batch, with the first 10% of steps as warmup.
EPOCHS = 8
total_steps = EPOCHS * len(train_loader)
warmup_steps = max(1, int(0.1 * total_steps))
scheduler = get_cosine_schedule_with_warmup(
    optimizer,
    num_warmup_steps=warmup_steps,
    num_training_steps=total_steps,
)

criterion = HybridLoss()
# Gradient scaling is only used on CUDA; None selects the full-precision path.
if device == "cuda":
    scaler = amp.GradScaler()
else:
    scaler = None

# Early-stopping bookkeeping.
best_val = np.inf
patience = 4
early_stop_count = 0

# -------------------------------
# Training loop
# -------------------------------
# Each epoch: one pass over train_loader with per-batch scheduler steps, then
# a validation pass; the best checkpoint (lowest validation loss) is written
# to disk and training stops after `patience` epochs without improvement.
for epoch in range(EPOCHS):
    model.train()
    running_loss = 0.0
    pbar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{EPOCHS}", leave=False)
    for batch in pbar:
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        num_feats = batch['num_feats'].to(device)
        unit_idx  = batch['unit_idx'].to(device)
        targets = batch['target'].to(device)

        if scaler is not None:
            # Mixed-precision path: forward and loss run under autocast.
            with amp.autocast(device_type='cuda'):
                preds = model(input_ids, attention_mask, num_feats, unit_idx)
                loss = criterion(preds, targets)
            scaler.scale(loss).backward()
            # Gradients are unscaled first so the clip threshold applies to
            # their true magnitude.
            scaler.unscale_(optimizer)
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            scaler.step(optimizer)
            scaler.update()
        else:
            # Full-precision path (no GradScaler).
            preds = model(input_ids, attention_mask, num_feats, unit_idx)
            loss = criterion(preds, targets)
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()

        # The learning-rate schedule advances once per batch.
        scheduler.step()
        # Weight the batch mean by batch size so the epoch average is per sample.
        running_loss += loss.item() * input_ids.size(0)
        pbar.set_postfix({'batch_loss': f"{loss.item():.6f}"})

    train_loss = running_loss / len(train_loader.dataset)

    # Validation pass: same criterion, no gradient tracking.
    model.eval()
    val_loss_acc = 0.0
    with torch.no_grad():
        for b in val_loader:
            ids = b['input_ids'].to(device)
            mask = b['attention_mask'].to(device)
            numf = b['num_feats'].to(device)
            unit = b['unit_idx'].to(device)
            tgt = b['target'].to(device)
            if scaler is not None:
                with amp.autocast(device_type='cuda'):
                    out = model(ids, mask, numf, unit)
            else:
                out = model(ids, mask, numf, unit)
            val_loss_acc += criterion(out, tgt).item() * ids.size(0)

    val_loss = val_loss_acc / len(val_loader.dataset)
    print(f"Epoch {epoch+1} - Train Loss: {train_loss:.6f} - Val Loss: {val_loss:.6f}")

    if val_loss < best_val:
        best_val = val_loss
        early_stop_count = 0
        save_path = "best_model.pth"
        # Save the unwrapped module's weights so the checkpoint keys carry no
        # DataParallel prefix.
        if isinstance(model, nn.DataParallel):
            torch.save(model.module.state_dict(), save_path)
        else:
            torch.save(model.state_dict(), save_path)
        print(f"✅ Saved best model -> {save_path} | Val Loss: {best_val:.6f}")
    else:
        # NOTE(review): a NaN val_loss also lands here because the comparison
        # above is False; if that happens every epoch no checkpoint is written.
        early_stop_count += 1
        if early_stop_count >= patience:
            print("⏹️ Early stopping triggered.")
            break

# -------------------------------
# Load best model
# -------------------------------
# Restore the best checkpoint into the underlying (unwrapped) module, then
# switch to evaluation mode for inference.
_target_module = model.module if isinstance(model, nn.DataParallel) else model
_target_module.load_state_dict(torch.load("best_model.pth"))
model.eval()

# -------------------------------
# Prediction
# -------------------------------
# Run the model over the test loader and collect its raw outputs, which are
# in the same log1p space as the training targets.
preds_list = []
with torch.no_grad():
    for batch in tqdm(test_loader, desc="Predicting"):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        num_feats = batch['num_feats'].to(device)
        unit_idx  = batch['unit_idx'].to(device)
        if scaler is not None:
            with amp.autocast(device_type='cuda'):
                out = model(input_ids, attention_mask, num_feats, unit_idx)
        else:
            out = model(input_ids, attention_mask, num_feats, unit_idx)
        # Under autocast the output is half precision.  Upcast before leaving
        # torch so that expm1 below runs in float32: in float16 the resulting
        # prices would be rounded coarsely and overflow to inf above ~65504.
        out = out.float().cpu().numpy().flatten()
        preds_list.append(out)

preds = np.concatenate(preds_list, axis=0)
# Invert the log1p transform applied to the targets, then forbid negative prices.
preds = np.expm1(preds)
preds = np.clip(preds, 0, None)

submission = pd.DataFrame({
    'sample_id': test_df['sample_id'],
    'price': preds
})
submission.to_csv('submission.csv', index=False)
print(f"✅ Saved submission.csv | Best Val Loss: {best_val:.6f}")
