In [14]:
import sys
import os

# 1. Make protobuf use its pure-Python implementation
os.environ['PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION'] = 'python'

# 2. Key fix: block TensorFlow
# A None entry in sys.modules makes any later `import tensorflow` raise ImportError,
# which keeps transformers from importing tensorflow and avoids the protobuf version conflict
sys.modules["tensorflow"] = None 

In [None]:
import gc
from collections import Counter
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from transformers import AutoTokenizer, AutoModel, AutoConfig
from tqdm.auto import tqdm

In [16]:
# Walk the input directory and print every file path to inspect the file structure
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _subdirs, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_file_path in input_file_paths:
    print(input_file_path)

/kaggle/input/google-quest-challenge/sample_submission.csv
/kaggle/input/google-quest-challenge/train.csv
/kaggle/input/google-quest-challenge/test.csv
/kaggle/input/quest-finetuned-weights/best-deberta-v3-base-1.pth
/kaggle/input/quest-finetuned-weights/best_deberta-v3-base-2.pth
/kaggle/input/deberta-v3-base-offline/spm.model
/kaggle/input/deberta-v3-base-offline/config.json
/kaggle/input/deberta-v3-base-offline/tokenizer.json
/kaggle/input/deberta-v3-base-offline/tokenizer_config.json
/kaggle/input/deberta-v3-base-offline/model.safetensors
/kaggle/input/deberta-v3-base-offline/special_tokens_map.json
/kaggle/input/deberta-v3-base-offline/added_tokens.json


In [None]:
# ==========================================
# 1. Configuration
# ==========================================
class Config:
    """Static settings read by the dataset, model and inference cells."""

    # Backbone checkpoint directory (also used to load the tokenizer)
    model_name = "/kaggle/input/deberta-v3-base-offline"
    # Fixed sequence length every example is truncated/padded to
    max_len = 512

    # DataLoader settings; the batch size can be raised for inference
    batch_size = 16
    num_workers = 2
    seed = 42

    # Competition files (Kaggle environment paths)
    train_csv = "/kaggle/input/google-quest-challenge/train.csv"
    test_csv = "/kaggle/input/google-quest-challenge/test.csv"
    sample_submission_csv = "/kaggle/input/google-quest-challenge/sample_submission.csv"

    # Folder holding the fine-tuned fold checkpoints, and how many to ensemble
    models_dir = "/kaggle/input/quest-finetuned-weights"
    n_folds = 5

    # The 30 label columns, in submission order
    target_cols = [
        # question-level targets
        'question_asker_intent_understanding',
        'question_body_critical',
        'question_conversational',
        'question_expect_short_answer',
        'question_fact_seeking',
        'question_has_commonly_accepted_answer',
        'question_interestingness_others',
        'question_interestingness_self',
        'question_multi_intent',
        'question_not_really_a_question',
        'question_opinion_seeking',
        'question_type_choice',
        'question_type_compare',
        'question_type_consequence',
        'question_type_definition',
        'question_type_entity',
        'question_type_instructions',
        'question_type_procedure',
        'question_type_reason_explanation',
        'question_type_spelling',
        'question_well_written',
        # answer-level targets
        'answer_helpful',
        'answer_level_of_information',
        'answer_plausible',
        'answer_relevance',
        'answer_satisfaction',
        'answer_type_instructions',
        'answer_type_procedure',
        'answer_type_reason_explanation',
        'answer_well_written',
    ]

def seed_everything(seed=42):
    """Seed every random number generator the notebook can touch.

    Seeds Python's ``random`` module, NumPy's legacy global generator and
    PyTorch (CPU and all CUDA devices), and asks cuDNN for deterministic
    algorithm selection.

    Args:
        seed: Integer seed applied to every generator.
    """
    # Local import: the notebook's import cell does not bring in `random`.
    import random

    # Exported for any child processes; the running interpreter reads
    # PYTHONHASHSEED only at start-up.
    os.environ['PYTHONHASHSEED'] = str(seed)

    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Covers every visible GPU, not just the current one; safe without CUDA.
    torch.cuda.manual_seed_all(seed)

    torch.backends.cudnn.deterministic = True
    # Auto-tuning may pick different kernels from run to run, so switch it off.
    torch.backends.cudnn.benchmark = False

seed_everything(Config.seed)

# Echo the active settings so the run log records what was used
config_report_lines = [
    "‚úÖ Configuration loaded successfully!\n",
    f"ü§ñ Model: {Config.model_name}",
    f"üìè Max Length: {Config.max_len}",
    f"üì¶ Batch Size: {Config.batch_size}",
    f"üéØ Number of Folds: {Config.n_folds}",
    f"üé≤ Random Seed: {Config.seed}",
    f"\nüìä Number of target labels: {len(Config.target_cols)}",
]
for config_report_line in config_report_lines:
    print(config_report_line)

‚úÖ Configuration loaded successfully!

ü§ñ Model: /kaggle/input/deberta-v3-base-offline
üìè Max Length: 512
üì¶ Batch Size: 16
üé≤ Random Seed: 42

üìä Number of target labels: 30


In [18]:
# ==========================================
# 2. Dataset Class
# ==========================================
class QuestDataset(Dataset):
    """Question/answer rows encoded as fixed-length token id tensors.

    Each item is laid out as ``[CLS] title [SEP] body [SEP] answer [SEP]`` and
    right-padded with the tokenizer's pad id up to ``max_len``. For any
    ``mode`` other than ``"test"`` the ``Config.target_cols`` values of the
    row are returned as a float ``labels`` tensor as well.
    """

    def __init__(self, df, tokenizer, max_len=512, mode="test"):
        self.df = df
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.mode = mode

        # Pull the text columns out as arrays once, up front
        self.titles, self.bodies, self.answers = (
            df[column].values
            for column in ('question_title', 'question_body', 'answer')
        )

        if self.mode != "test":
            self.targets = df[Config.target_cols].values

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        tok = self.tokenizer

        # Question side is "title <sep> body"; the answer is encoded on its own
        question_text = " ".join(
            [str(self.titles[idx]), tok.sep_token, str(self.bodies[idx])]
        )
        answer_text = str(self.answers[idx])

        question_tokens = tok.tokenize(question_text)
        answer_tokens = tok.tokenize(answer_text)

        # Three positions are reserved for [CLS] and the two [SEP] markers
        room = self.max_len - 3
        if len(question_tokens) + len(answer_tokens) > room:
            half = room // 2
            if len(answer_tokens) <= half:
                # Answer fits in its half: the question takes what is left
                question_tokens = question_tokens[:room - len(answer_tokens)]
            elif len(question_tokens) <= half:
                # Question fits in its half: the answer takes what is left
                answer_tokens = answer_tokens[:room - len(question_tokens)]
            else:
                # Both sides overflow: cut the answer to half, then the question
                answer_tokens = answer_tokens[:half]
                question_tokens = question_tokens[:room - len(answer_tokens)]

        token_ids = [
            tok.cls_token_id,
            *tok.convert_tokens_to_ids(question_tokens),
            tok.sep_token_id,
            *tok.convert_tokens_to_ids(answer_tokens),
            tok.sep_token_id,
        ]

        # Real tokens get mask 1, padding gets mask 0
        real_count = len(token_ids)
        pad_count = self.max_len - real_count
        token_ids = token_ids + [tok.pad_token_id] * pad_count
        attention = [1] * real_count + [0] * pad_count

        item = {
            'input_ids': torch.tensor(token_ids, dtype=torch.long),
            'attention_mask': torch.tensor(attention, dtype=torch.long),
        }

        if self.mode != "test":
            item['labels'] = torch.tensor(self.targets[idx], dtype=torch.float)

        return item

In [19]:
# ==========================================
# 3. Model Class
# ==========================================
class QuestDebertaModel(nn.Module):
    """Transformer backbone with a learned mix of per-layer [CLS] vectors.

    The first-token vector of every hidden-state tensor is combined with
    softmax-normalised learnable weights; the pooled vector goes through five
    dropout branches sharing one linear head, the branch logits are averaged,
    and a sigmoid maps them to per-label scores.
    """

    def __init__(self, model_name=Config.model_name, num_labels=30):
        super().__init__()
        backbone_config = AutoConfig.from_pretrained(model_name)
        # Ask the backbone to return every hidden-state tensor, not just the last
        backbone_config.output_hidden_states = True
        self.config = backbone_config
        self.model = AutoModel.from_pretrained(model_name, config=backbone_config)

        # One mixing weight per hidden-state tensor (num_hidden_layers + 1).
        # All start at -3 except the last, which starts at 0, so the softmax
        # initially favours the final hidden state.
        layer_count = self.config.num_hidden_layers + 1
        initial_mix = torch.zeros(layer_count, dtype=torch.float32)
        initial_mix[:-1].fill_(-3)
        self.layer_weights = nn.Parameter(initial_mix)

        # Five dropout branches feeding a single shared classification head
        self.dropouts = nn.ModuleList(nn.Dropout(0.5) for _ in range(5))
        self.fc = nn.Linear(self.config.hidden_size, num_labels)
        self.sigmoid = nn.Sigmoid()

    def forward(self, input_ids, attention_mask):
        backbone_out = self.model(input_ids=input_ids, attention_mask=attention_mask)

        # First-token vector of each hidden state, stacked along a new layer axis (dim=1)
        per_layer_cls = [states[:, 0, :] for states in backbone_out.hidden_states]
        cls_stack = torch.stack(per_layer_cls, dim=1)

        # Softmax-weighted sum over the layer axis
        mix = torch.softmax(self.layer_weights, dim=0).view(1, -1, 1)
        pooled = (mix * cls_stack).sum(dim=1)

        # Average the head's logits over the dropout branches
        branch_logits = torch.stack(
            [self.fc(branch(pooled)) for branch in self.dropouts], dim=0
        )
        return self.sigmoid(branch_logits.mean(dim=0))

In [None]:
# ==========================================
# 4. Post-processing Utilities (Winning Solution Approach)
# ==========================================
def postprocess_single_column(target, ref):
    """Bucket one column of predictions to mirror the training value distribution.

    Predictions are ranked, then consecutive runs of the ranking are assigned
    to buckets whose sizes are proportional to how often each distinct value
    occurs in ``ref``. Every prediction in a bucket receives the same score
    (the bucket index), and the scores are finally divided by their maximum.

    Rounding the bucket sizes can leave the tail of the ranking unassigned.
    Those are the highest predictions, so they are folded into the top bucket
    rather than being left at the initial score of 0 (the lowest rank).

    Args:
        target: Predicted values for a single column (1-D numpy array).
        ref: Training values for the same column (1-D numpy array).

    Returns:
        Array shaped like ``target`` holding bucketed scores in [0, 1]. It is
        all zeros when ``target`` or ``ref`` is empty, or when every
        prediction lands in the first bucket.
    """
    target = np.asarray(target)
    ref = np.asarray(ref)

    scores = np.zeros_like(target)
    n_target = len(target)

    # Distinct training values come back sorted ascending, with their counts
    _, value_counts = np.unique(ref, return_counts=True)
    if n_target == 0 or len(value_counts) == 0:
        return scores

    # Indices of the predictions from lowest to highest
    order = np.argsort(target)

    last_pos = 0
    for bucket, count in enumerate(value_counts):
        # Share of the test set proportional to this value's training frequency;
        # every bucket claims at least one position
        width = int(round(int(count) / len(ref) * n_target))
        next_pos = last_pos + max(width, 1)
        scores[order[last_pos:next_pos]] = bucket
        last_pos = next_pos

    # Positions lost to rounding belong to the highest predictions
    if last_pos < n_target:
        scores[order[last_pos:]] = len(value_counts) - 1

    # Normalize to [0, 1]
    top = scores.max()
    if top > 0:
        return scores / top
    return scores


def postprocess_predictions(predictions, train_df, target_cols, use_distribution_matching=True):
    """Post-process a prediction matrix column by column.

    For each target column, optionally replace the predictions with
    distribution-matched scores (only for a fixed set of column names), then
    min-max scale the column to [0, 1]. A column whose values are all equal is
    set to 0.5. The input array is not modified.

    Args:
        predictions: numpy array of shape (n_samples, n_targets).
        train_df: Training dataframe containing the target columns; only read
            for columns that are distribution-matched.
        target_cols: Column names, in the same order as the prediction columns.
        use_distribution_matching: When False, only the min-max scaling runs.

    Returns:
        A new numpy array with the post-processed predictions.
    """
    # Columns that get distribution matching. Per the original author's notes,
    # the first group comes from the winning solution and the second group
    # adds further sparse / imbalanced targets.
    matched_columns = frozenset((
        'question_conversational',
        'question_type_compare',
        'question_type_definition',
        'question_type_entity',
        'question_has_commonly_accepted_answer',
        'question_type_consequence',
        'question_type_spelling',
        'question_type_choice',
        'question_not_really_a_question',
        'question_multi_intent',
        'question_type_procedure',
        'question_type_instructions',
        'answer_type_procedure',
        'answer_type_instructions',
        'question_expect_short_answer',
        'answer_type_reason_explanation',
    ))

    result = predictions.copy()

    for col_idx, col_name in enumerate(target_cols):
        if use_distribution_matching and col_name in matched_columns:
            result[:, col_idx] = postprocess_single_column(
                result[:, col_idx],
                train_df[col_name].values,
            )

        # Min-max scale the (possibly matched) column
        column = result[:, col_idx]
        low, high = column.min(), column.max()
        if not high > low:
            # No spread to scale: use a neutral constant
            result[:, col_idx] = 0.5
            continue
        result[:, col_idx] = (column - low) / (high - low)

    return result

In [None]:
# ==========================================
# 5. Inference Pipeline (5-Fold Ensemble)
# ==========================================
@torch.no_grad()
def generate_predictions(model, test_loader, device):
    """Run the model over every batch of the loader and collect its outputs.

    The model is put in eval mode and gradients are disabled. Each batch's
    ``input_ids`` and ``attention_mask`` are moved to ``device``; the outputs
    are brought back to the CPU and concatenated along the first axis.

    Returns:
        numpy array with the stacked model outputs for the whole loader.
    """
    model.eval()

    progress = tqdm(test_loader, desc="Inference", leave=False)
    batch_outputs = [
        model(
            batch['input_ids'].to(device),
            batch['attention_mask'].to(device),
        ).cpu().numpy()
        for batch in progress
    ]

    return np.concatenate(batch_outputs)


def _resolve_checkpoint_paths(models_dir, n_folds):
    """Return ``(fold_number, path)`` pairs for the checkpoints to ensemble.

    The conventional ``best_model_fold{N}.pth`` names are preferred. If none of
    them exists, fall back to the ``.pth`` files actually present in
    ``models_dir`` (sorted by name, at most ``n_folds`` of them) so that weights
    uploaded under other names are still picked up.
    """
    expected = [
        (fold, os.path.join(models_dir, f"best_model_fold{fold}.pth"))
        for fold in range(1, n_folds + 1)
    ]
    if any(os.path.exists(path) for _, path in expected) or not os.path.isdir(models_dir):
        return expected

    discovered = sorted(
        name for name in os.listdir(models_dir) if name.lower().endswith(".pth")
    )[:n_folds]
    if not discovered:
        return expected
    return [
        (fold, os.path.join(models_dir, name))
        for fold, name in enumerate(discovered, start=1)
    ]


def inference_pipeline(use_postprocessing=True):
    """
    Complete inference pipeline: fold ensemble, post-processing, submission file.

    Steps: load the test CSV, tokenize it, run every available fold checkpoint
    over it, average the fold predictions, optionally apply distribution
    matching against the training CSV, and write ``submission.csv``.

    Args:
        use_postprocessing: If True, apply distribution matching post-processing

    Returns:
        The submission DataFrame (``qa_id`` followed by the target columns), or
        None when the test CSV does not exist.

    Raises:
        ValueError: If no checkpoint file could be found.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    use_cuda = device.type == "cuda"
    print(f"üñ•Ô∏è  Using device: {device}")
    if use_cuda:
        print(f"   Available GPUs: {torch.cuda.device_count()}\n")

    # Load test data
    print("üìÇ Loading test data...")
    if not os.path.exists(Config.test_csv):
        print(f"‚ùå Error: Test file not found at {Config.test_csv}")
        return None

    test_df = pd.read_csv(Config.test_csv)
    print(f"   Test samples: {len(test_df):,}\n")

    # Load tokenizer
    print("üî§ Loading tokenizer...")
    tokenizer = AutoTokenizer.from_pretrained(Config.model_name)
    print("   ‚úì Tokenizer ready\n")

    # Prepare test dataloader; honour the configured sequence length instead of
    # silently relying on the dataset's own default
    test_dataset = QuestDataset(test_df, tokenizer, max_len=Config.max_len, mode="test")
    test_loader = DataLoader(
        test_dataset,
        batch_size=Config.batch_size * 2,  # Larger batch for inference
        shuffle=False,
        num_workers=Config.num_workers,
        pin_memory=use_cuda,  # pinned host memory only helps host-to-GPU copies
    )

    # ========================================
    # Ensemble predictions from all fold models
    # ========================================
    print(f"üéØ Loading and ensembling {Config.n_folds} fold models...")
    fold_preds = []

    for fold, model_path in _resolve_checkpoint_paths(Config.models_dir, Config.n_folds):
        if not os.path.exists(model_path):
            print(f"   ‚ö†Ô∏è  Warning: Model fold {fold} not found at {model_path}")
            continue

        print(f"   üì¶ Loading fold {fold} model...")
        print(f"      Checkpoint: {model_path}")
        model = QuestDebertaModel()
        # Load onto the CPU first so only one copy of the weights reaches the GPU
        state_dict = torch.load(model_path, map_location="cpu")
        model.load_state_dict(state_dict)
        model.to(device)

        # Generate predictions for this fold
        preds = generate_predictions(model, test_loader, device)
        fold_preds.append(preds)
        print(f"      ‚úì Fold {fold} predictions generated")

        # Drop the references first, then hand the freed blocks back to the GPU
        del model, state_dict
        gc.collect()
        if use_cuda:
            torch.cuda.empty_cache()

    if not fold_preds:
        raise ValueError("‚ùå No models found for inference!")

    # Average predictions across all folds
    avg_preds = np.mean(fold_preds, axis=0)
    print(f"\n‚úÖ Ensemble complete: Averaged predictions from {len(fold_preds)} fold model(s)")
    print(f"   Final predictions shape: {avg_preds.shape}")

    # ========================================
    # Post-processing with distribution matching
    # ========================================
    train_available = os.path.exists(Config.train_csv)
    if use_postprocessing and train_available:
        print("\nüîß Applying distribution matching post-processing...")
        train_df = pd.read_csv(Config.train_csv)
        final_preds = postprocess_predictions(
            avg_preds,
            train_df,
            Config.target_cols,
            use_distribution_matching=True
        )
        print("   ‚úì Distribution matching applied to selected columns")
    else:
        final_preds = avg_preds
        if not train_available:
            print("\n‚ö†Ô∏è  train.csv not found, skipping post-processing")
        else:
            print("\n‚ö†Ô∏è  Post-processing disabled")

    # ========================================
    # Create submission file
    # ========================================
    print("\nüìù Creating submission file...")
    submission = pd.DataFrame(final_preds, columns=Config.target_cols)
    # Assign by position: predictions follow the row order of test_df, so the
    # ids must not be re-aligned on index labels
    submission['qa_id'] = test_df['qa_id'].values
    submission = submission[['qa_id'] + Config.target_cols]
    submission.to_csv("submission.csv", index=False)
    print("   ‚úì Submission saved to submission.csv")

    # Display sample predictions
    print("\nüìä Sample predictions (first 3 rows):")
    print(submission.head(3))

    return submission

In [None]:
# ==========================================
# 6. Run Inference (5-Fold Ensemble)
# ==========================================
banner = "=" * 60

print(banner)
print("Google Quest Q&A Labeling - Inference with 5-Fold Ensemble")
print(banner)
print("Configuration:")
print(f"  ‚Ä¢ Model: {Config.model_name}")
print(f"  ‚Ä¢ Ensemble: {Config.n_folds} fold models")
print("  ‚Ä¢ Post-processing: Distribution Matching (Winning Solution)")
print(f"  ‚Ä¢ Batch Size: {Config.batch_size}")
print(banner + "\n")

submission = inference_pipeline(use_postprocessing=True)

print("\n" + banner)
# inference_pipeline returns None when the test CSV is missing, so only
# report success when a submission was actually produced
if submission is None:
    print("Inference pipeline did not produce a submission - see the messages above.")
else:
    print("‚úÖ Inference pipeline completed successfully!")
print(banner)

Using device: cuda

Loading test data...
Test samples: 476

Loading model and tokenizer...
‚úì Loaded weights from /kaggle/input/quest-finetuned-weights/best_deberta-v3-base-2.pth
‚úì Model and tokenizer ready

Generating predictions...


Inference:   0%|          | 0/30 [00:00<?, ?it/s]


Applying post-processing...
‚úì Predictions snapped to valid values
‚úì Added epsilon noise to prevent constant columns

Creating submission file...
‚úì Submission saved to submission.csv
