# 🚀 Minimal Transformer Fine-tuning

## Configuration
- Model: DeBERTa-v3-small (~44M backbone parameters; ~142M including the token-embedding matrix)
- Batch size: 2
- Epochs: 2
- Folds: 2
- Custom training loop (no HuggingFace Trainer)

This version is optimized to prevent kernel crashes.

In [1]:
# --------------------------------------------------------------
# Kernel-crash guard (Windows): this cell runs first, ahead of the
# cell that imports numpy / torch.
# --------------------------------------------------------------
import os

os.environ.update({
    # Setting used by this notebook as part of its crash workaround.
    "KMP_DUPLICATE_LIB_OK": "TRUE",
    # Cap every native thread pool at two threads.
    "OMP_NUM_THREADS": "2",
    "MKL_NUM_THREADS": "2",
    "NUMEXPR_NUM_THREADS": "2",
})

print("‚úÖ Crash prevention environment variables set")

‚úÖ Crash prevention environment variables set


In [2]:
# Core imports
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
from pathlib import Path
from typing import Dict, List, Optional
from dataclasses import dataclass

# Progress bar
from tqdm.auto import tqdm

# ML
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score, accuracy_score

# PyTorch
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW
from torch.cuda.amp import autocast, GradScaler

# Transformers (NO Trainer!)
from transformers import (
    AutoTokenizer,
    AutoModel,
    AutoConfig,
    get_cosine_schedule_with_warmup
)

# Reproducibility: a single seed shared by NumPy and PyTorch
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

# Pick the compute device; CUDA generators are only seeded when CUDA exists
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f"üñ•Ô∏è  Device: {device}")

# Keep PyTorch's intra-op thread pool at two threads
torch.set_num_threads(2)

print("‚úÖ All imports successful")

üñ•Ô∏è  Device: cpu
‚úÖ All imports successful


In [4]:
@dataclass
class Config:
    """Hyper-parameters and paths shared by the notebook's cells.

    The defaults are deliberately small (tiny batch, two epochs, two folds)
    because the notebook's stated goal is to avoid kernel crashes.
    """

    # --- Backbone / tokenisation ---
    # Checkpoint name handed to the Hugging Face `from_pretrained` loaders.
    model_name: str = "microsoft/deberta-v3-small"
    # Every example is padded / truncated to this many tokens.
    max_length: int = 128

    # --- Optimisation ---
    epochs: int = 2
    # Per-step batch size; the validation / test loaders use twice this.
    batch_size: int = 2
    # Number of batches whose gradients are accumulated per optimizer step.
    accumulation_steps: int = 4
    learning_rate: float = 2e-5
    weight_decay: float = 0.01
    # Fraction of the scheduler steps spent in warm-up.
    warmup_ratio: float = 0.1
    # Gradient-norm clipping threshold.
    max_grad_norm: float = 1.0

    # --- Cross-validation ---
    n_folds: int = 2

    # --- Filesystem ---
    # Directory holding train.csv / test.csv.
    data_dir: Path = Path("data")
    # Directory that receives submission.csv.
    output_dir: Path = Path("outputs")

# Instantiate the shared configuration used by all later cells
config = Config()

# parents=True so a nested output_dir (e.g. Path('outputs/run1')) is created
# as well; exist_ok=True keeps the cell safe to re-run.
config.output_dir.mkdir(parents=True, exist_ok=True)

# Echo the settings that matter most for runtime / memory
print("‚öôÔ∏è Configuration:")
print(f"   Model: {config.model_name}")
print(f"   Batch size: {config.batch_size}")
print(f"   Epochs: {config.epochs}")
print(f"   Folds: {config.n_folds}")

‚öôÔ∏è Configuration:
   Model: microsoft/deberta-v3-small
   Batch size: 2
   Epochs: 2
   Folds: 2


In [5]:
# Load data
train_df = pd.read_csv(config.data_dir / 'train.csv')
test_df = pd.read_csv(config.data_dir / 'test.csv')

# Fail fast on schema problems: later cells read 'body' and 'rule' (dataset),
# 'rule_violation' (labels / stratification) and 'row_id' (submission).
# Without this check a missing column would only surface after training or
# after the long test-set inference pass.
assert {'body', 'rule', 'rule_violation'} <= set(train_df.columns), \
    "train.csv must contain the columns 'body', 'rule' and 'rule_violation'"
assert {'row_id', 'body', 'rule'} <= set(test_df.columns), \
    "test.csv must contain the columns 'row_id', 'body' and 'rule'"

print(f"üìä Data loaded:")
print(f"   Train: {len(train_df):,}")
print(f"   Test:  {len(test_df):,}")
print(f"\n   Class distribution:")
print(train_df['rule_violation'].value_counts())

üìä Data loaded:
   Train: 2,029
   Test:  54,059

   Class distribution:
rule_violation
1    1031
0     998
Name: count, dtype: int64


In [6]:
class SimpleDataset(Dataset):
    """Minimal dataset for text classification.

    Each item encodes one row as a (body, rule) sentence pair and returns
    1-D 'input_ids' and 'attention_mask' tensors, plus a float 'labels'
    scalar when labels are requested and available.

    Args:
        data: frame with 'body' and 'rule' columns (and 'rule_violation'
            when labels are needed). The index is reset internally.
        tokenizer: callable with the Hugging Face tokenizer call signature
            (text, text_pair, max_length=..., padding=..., truncation=...,
            return_tensors=...) returning 'input_ids' and 'attention_mask'
            with a leading batch dimension of 1.
        max_length: length every example is padded / truncated to.
        mode: 'train' attaches labels (if the column exists); any other
            value omits them.
    """

    def __init__(self, data: pd.DataFrame, tokenizer, max_length: int, mode: str = 'train'):
        self.data = data.reset_index(drop=True)
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.mode = mode

    def __len__(self):
        return len(self.data)

    @staticmethod
    def _to_text(value) -> str:
        """Return a cell as a string; missing cells (NaN / None) become ''."""
        return '' if pd.isna(value) else str(value)

    def __getitem__(self, idx):
        row = self.data.iloc[idx]

        # Encode body and rule as a sentence pair instead of one concatenated
        # string. With a single string, truncation cuts from the right, so a
        # body longer than max_length removed the rule from the input
        # entirely. In pair mode truncation=True trims the longer segment.
        encoding = self.tokenizer(
            self._to_text(row['body']),
            self._to_text(row['rule']),
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        # Drop the batch dimension added by return_tensors='pt'
        item = {
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0)
        }

        if self.mode == 'train' and 'rule_violation' in self.data.columns:
            # Float dtype because the model trains with BCEWithLogitsLoss
            item['labels'] = torch.tensor(row['rule_violation'], dtype=torch.float)

        return item

print("‚úÖ Dataset class defined")

‚úÖ Dataset class defined


In [7]:
class SimpleTransformerClassifier(nn.Module):
    """Pretrained transformer encoder topped with a small MLP head.

    The head maps the first-token hidden state to a single logit per example.

    Args:
        model_name: checkpoint name passed to AutoConfig / AutoModel.
        dropout: dropout probability used before each linear layer of the head.
    """

    def __init__(self, model_name: str, dropout: float = 0.1):
        super().__init__()

        # Backbone
        backbone_config = AutoConfig.from_pretrained(model_name)
        self.transformer = AutoModel.from_pretrained(model_name, config=backbone_config)

        width = self.transformer.config.hidden_size
        half_width = width // 2

        # Head: dropout -> linear -> ReLU -> dropout -> linear (one logit)
        head_layers = [
            nn.Dropout(dropout),
            nn.Linear(width, half_width),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(half_width, 1),
        ]
        self.classifier = nn.Sequential(*head_layers)

    def forward(self, input_ids, attention_mask, labels=None):
        """Return {'loss': ..., 'logits': ...}; loss is None when no labels are given."""
        encoded = self.transformer(input_ids=input_ids, attention_mask=attention_mask)

        # Hidden state at position 0 serves as the sequence representation
        first_token = encoded.last_hidden_state[:, 0]

        # One logit per example: trailing singleton dimension removed
        logits = self.classifier(first_token).squeeze(-1)

        if labels is None:
            return {'loss': None, 'logits': logits}

        # Binary cross-entropy on raw logits
        criterion = nn.BCEWithLogitsLoss()
        return {'loss': criterion(logits, labels), 'logits': logits}

print("‚úÖ Model class defined")

‚úÖ Model class defined


In [8]:
class CustomTrainer:
    """Custom training loop - NO HuggingFace Trainer.

    Trains with AdamW, a cosine schedule with warm-up, gradient accumulation
    and gradient-norm clipping. After every epoch the model is validated and,
    whenever the validation AUC improves, its weights are written to
    'best_model.pt' in the working directory.

    Args:
        model: module whose forward accepts the batch keys as keyword
            arguments and returns a dict with 'loss' and 'logits'.
        train_loader: sized iterable of dict batches used for optimisation.
        val_loader: sized iterable of dict batches; must include 'labels'.
        config: object exposing learning_rate, weight_decay, epochs,
            accumulation_steps, warmup_ratio and max_grad_norm.
        device: torch.device (or device string) the model is moved to.
    """

    def __init__(self, model, train_loader, val_loader, config, device):
        self.model = model.to(device)
        self.train_loader = train_loader
        self.val_loader = val_loader
        self.config = config
        self.device = device

        # Optimizer
        self.optimizer = AdamW(
            model.parameters(),
            lr=config.learning_rate,
            weight_decay=config.weight_decay
        )

        # Scheduler: one scheduler step per optimizer update. The count uses
        # a ceiling division because the trailing partial accumulation
        # window of every epoch also triggers an update (see train_epoch).
        num_training_steps = (
            self._updates_per_epoch(len(train_loader), config.accumulation_steps)
            * config.epochs
        )
        num_warmup_steps = int(num_training_steps * config.warmup_ratio)
        self.scheduler = get_cosine_schedule_with_warmup(
            self.optimizer,
            num_warmup_steps=num_warmup_steps,
            num_training_steps=num_training_steps
        )

        # Mixed precision: only meaningful on CUDA. On any other device both
        # autocast and the scaler are explicitly disabled, which turns the
        # scaler calls below into plain backward / step calls.
        self.use_amp = torch.device(device).type == 'cuda'
        self.scaler = GradScaler(enabled=self.use_amp)

        # Tracking
        self.best_auc = 0.0
        self.history = {'train_loss': [], 'val_loss': [], 'val_auc': []}

    @staticmethod
    def _updates_per_epoch(num_batches: int, accumulation_steps: int) -> int:
        """Number of optimizer updates per epoch, counting a trailing partial window."""
        return (num_batches + accumulation_steps - 1) // accumulation_steps

    def train_epoch(self):
        """Train for one epoch and return the mean per-batch training loss."""
        self.model.train()
        total_loss = 0

        accum = self.config.accumulation_steps
        num_batches = len(self.train_loader)
        # Size of the final, incomplete accumulation window (0 if none)
        tail = num_batches % accum

        self.optimizer.zero_grad()

        for step, batch in enumerate(tqdm(self.train_loader, desc='Training', leave=False)):
            batch = {k: v.to(self.device) for k, v in batch.items()}

            # Batches in the trailing window are averaged over the window's
            # real size so their update is not under-weighted.
            window = tail if step >= num_batches - tail else accum

            # Forward pass (mixed precision when enabled)
            with autocast(enabled=self.use_amp):
                outputs = self.model(**batch)
                raw_loss = outputs['loss']
                loss = raw_loss / window

            # Backward
            self.scaler.scale(loss).backward()

            # Update weights every `accum` batches, and once more at the end
            # of the epoch. Previously the gradients of a trailing partial
            # window were never applied: they were discarded by the
            # zero_grad() call at the start of the next epoch.
            if (step + 1) % accum == 0 or (step + 1) == num_batches:
                # Unscale first so clipping sees the true gradient norms
                self.scaler.unscale_(self.optimizer)
                torch.nn.utils.clip_grad_norm_(self.model.parameters(), self.config.max_grad_norm)
                self.scaler.step(self.optimizer)
                self.scaler.update()
                self.scheduler.step()
                self.optimizer.zero_grad()

            # Track the un-divided loss so the epoch mean is per batch
            total_loss += raw_loss.item()

        return total_loss / len(self.train_loader)

    @torch.no_grad()
    def validate(self):
        """Validate model; returns (mean per-batch loss, ROC AUC)."""
        self.model.eval()
        total_loss = 0
        all_preds = []
        all_labels = []

        for batch in tqdm(self.val_loader, desc='Validating', leave=False):
            batch = {k: v.to(self.device) for k, v in batch.items()}

            outputs = self.model(**batch)
            loss = outputs['loss']
            logits = outputs['logits']

            total_loss += loss.item()

            # Logits -> probabilities for the AUC computation
            probs = torch.sigmoid(logits).cpu().numpy()
            all_preds.extend(probs)
            all_labels.extend(batch['labels'].cpu().numpy())

        avg_loss = total_loss / len(self.val_loader)
        auc = roc_auc_score(all_labels, all_preds)

        return avg_loss, auc

    def fit(self):
        """Train for all epochs and return the history dict.

        The history maps 'train_loss', 'val_loss' and 'val_auc' to one value
        per epoch. The model keeps its last-epoch weights when this returns;
        the best-AUC weights are on disk in 'best_model.pt'.
        """
        print(f"\nüöÄ Starting training...\n")

        for epoch in range(self.config.epochs):
            print(f"Epoch {epoch + 1}/{self.config.epochs}")
            print("-" * 60)

            # Train
            train_loss = self.train_epoch()

            # Validate
            val_loss, val_auc = self.validate()

            # Save history
            self.history['train_loss'].append(train_loss)
            self.history['val_loss'].append(val_loss)
            self.history['val_auc'].append(val_auc)

            print(f"Train Loss: {train_loss:.4f}")
            print(f"Val Loss:   {val_loss:.4f}")
            print(f"Val AUC:    {val_auc:.4f}")

            # Save best
            if val_auc > self.best_auc:
                self.best_auc = val_auc
                torch.save(self.model.state_dict(), 'best_model.pt')
                print(f"üíæ Best model saved! (AUC: {val_auc:.4f})")

            print()

        print(f"‚úÖ Training complete! Best AUC: {self.best_auc:.4f}")
        return self.history

print("‚úÖ Custom Trainer defined")

‚úÖ Custom Trainer defined


In [9]:
def train_with_cv(train_df, config, n_folds=2):
    """Train with k-fold cross-validation.

    For every stratified fold a fresh SimpleTransformerClassifier is trained
    with CustomTrainer, then out-of-fold (OOF) probabilities are computed for
    that fold's validation rows.

    Args:
        train_df: frame with 'body', 'rule' and 'rule_violation' columns.
        config: configuration object (model_name, max_length, batch_size and
            the fields read by CustomTrainer).
        n_folds: number of stratified folds.

    Returns:
        dict with
          'fold_results'    - list of {'fold', 'best_auc', 'history'} dicts,
          'oof_predictions' - array of OOF probabilities aligned with train_df,
          'cv_auc'          - ROC AUC of the OOF probabilities.
    """

    print(f"\n{'='*60}")
    print(f"Training with {n_folds}-Fold CV")
    print(f"{'='*60}\n")

    # Load tokenizer
    tokenizer = AutoTokenizer.from_pretrained(config.model_name)

    # K-fold
    skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=SEED)

    fold_results = []
    oof_predictions = np.zeros(len(train_df))

    for fold, (train_idx, val_idx) in enumerate(skf.split(train_df, train_df['rule_violation'])):
        print(f"\n{'='*60}")
        print(f"Fold {fold + 1}/{n_folds}")
        print(f"{'='*60}")

        # Split data
        fold_train = train_df.iloc[train_idx].reset_index(drop=True)
        fold_val = train_df.iloc[val_idx].reset_index(drop=True)

        print(f"Train: {len(fold_train):,} | Val: {len(fold_val):,}")

        # Create datasets (validation also uses mode='train' so labels are attached)
        train_dataset = SimpleDataset(fold_train, tokenizer, config.max_length, mode='train')
        val_dataset = SimpleDataset(fold_val, tokenizer, config.max_length, mode='train')

        # Create loaders
        train_loader = DataLoader(
            train_dataset,
            batch_size=config.batch_size,
            shuffle=True,
            num_workers=0,  # 0 to prevent crashes
            pin_memory=True if torch.cuda.is_available() else False
        )

        val_loader = DataLoader(
            val_dataset,
            batch_size=config.batch_size * 2,
            shuffle=False,
            num_workers=0,
            pin_memory=True if torch.cuda.is_available() else False
        )

        # Initialize model
        model = SimpleTransformerClassifier(config.model_name)

        # Train
        trainer = CustomTrainer(model, train_loader, val_loader, config, device)
        history = trainer.fit()

        # Restore the best-epoch weights before predicting. After fit() the
        # model holds the last-epoch weights, so without this step the OOF
        # predictions (and the OOF AUC) would not correspond to the
        # best_auc reported for the fold. best_auc starts at 0.0 and only
        # changes when a checkpoint is written, so a positive value means
        # 'best_model.pt' belongs to this fold rather than a previous one.
        if trainer.best_auc > 0.0:
            model.load_state_dict(torch.load('best_model.pt', map_location=device))

        # Get OOF predictions (val_loader is not shuffled, so order matches val_idx)
        model.eval()
        with torch.no_grad():
            val_preds = []
            for batch in val_loader:
                batch = {k: v.to(device) for k, v in batch.items()}
                outputs = model(batch['input_ids'], batch['attention_mask'])
                probs = torch.sigmoid(outputs['logits']).cpu().numpy()
                val_preds.extend(probs)

        oof_predictions[val_idx] = val_preds

        fold_results.append({
            'fold': fold + 1,
            'best_auc': trainer.best_auc,
            'history': history
        })

        # Clean up before the next fold allocates a new model
        del model, trainer, train_loader, val_loader
        torch.cuda.empty_cache()

    # Calculate overall CV score
    cv_auc = roc_auc_score(train_df['rule_violation'], oof_predictions)

    print(f"\n{'='*60}")
    print(f"Cross-Validation Results")
    print(f"{'='*60}")
    for result in fold_results:
        print(f"Fold {result['fold']}: AUC = {result['best_auc']:.4f}")

    avg_auc = np.mean([r['best_auc'] for r in fold_results])
    print(f"\nMean AUC: {avg_auc:.4f}")
    print(f"OOF AUC:  {cv_auc:.4f}")
    print(f"{'='*60}")

    return {
        'fold_results': fold_results,
        'oof_predictions': oof_predictions,
        'cv_auc': cv_auc
    }

print("‚úÖ Training function defined")

‚úÖ Training function defined


In [10]:
# Run the cross-validated training (one freshly initialised model per fold)
results = train_with_cv(train_df, config, n_folds=config.n_folds)

# Headline metric: AUC over the out-of-fold predictions
print("\nüèÜ Final CV AUC: {:.4f}".format(results['cv_auc']))


Training with 2-Fold CV



: 

In [None]:
@torch.no_grad()
def generate_test_predictions(test_df, config):
    """Generate predictions on test set.

    Rebuilds the classifier, loads the weights stored in 'best_model.pt'
    (the file CustomTrainer.fit overwrites whenever validation AUC improves,
    so it holds the checkpoint of the most recently trained fold) and scores
    every row of test_df.

    Args:
        test_df: frame with 'body' and 'rule' columns.
        config: configuration object (model_name, max_length, batch_size).

    Returns:
        1-D numpy array of probabilities, in the row order of test_df.
    """

    print("\nüîÆ Generating test predictions...")

    # Load tokenizer and model
    tokenizer = AutoTokenizer.from_pretrained(config.model_name)
    model = SimpleTransformerClassifier(config.model_name)
    # map_location remaps the stored tensors onto the current device, so a
    # checkpoint written on a GPU machine still loads on a CPU-only kernel.
    model.load_state_dict(torch.load('best_model.pt', map_location=device))
    model.to(device)
    model.eval()

    # Create dataset (mode='test' -> no labels in the batches)
    test_dataset = SimpleDataset(test_df, tokenizer, config.max_length, mode='test')
    test_loader = DataLoader(
        test_dataset,
        batch_size=config.batch_size * 2,
        shuffle=False,  # keep predictions aligned with test_df rows
        num_workers=0
    )

    # Predict
    predictions = []
    for batch in tqdm(test_loader, desc='Predicting'):
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(batch['input_ids'], batch['attention_mask'])
        probs = torch.sigmoid(outputs['logits']).cpu().numpy()
        predictions.extend(probs)

    return np.array(predictions)

# Score the test set with the saved checkpoint
test_predictions = generate_test_predictions(test_df, config)

# Quick sanity summary of the predicted probabilities
print(
    "\n‚úÖ Predictions generated!",
    f"   Min: {test_predictions.min():.4f}",
    f"   Max: {test_predictions.max():.4f}",
    f"   Mean: {test_predictions.mean():.4f}",
    sep="\n",
)

In [None]:
# Create submission: one probability per test row, keyed by row_id
submission = pd.DataFrame({
    'row_id': test_df['row_id'],
    'rule_violation': test_predictions
})

# Build the destination once so the file that is written and the path that
# is reported cannot drift apart when config.output_dir is changed.
submission_path = config.output_dir / 'submission.csv'
submission.to_csv(submission_path, index=False)

print(f"‚úÖ Submission saved to {submission_path.as_posix()}")
print(f"\nüìä Submission preview:")
print(submission.head(10))

print(f"\nüéâ ALL DONE!")
print(f"   CV AUC: {results['cv_auc']:.4f}")
print(f"   Model: {config.model_name}")
print(f"   File: {submission_path.as_posix()}")