In [None]:
import os
import random
# SECURITY: a literal Weights & Biases API key used to be hard-coded on this
# line. It has been removed from the source; the exposed key should be revoked.
# Provide a fresh key through the WANDB_API_KEY environment variable (or the
# platform's secret store) before starting the notebook if W&B logging is
# wanted.
os.environ["WANDB_PROJECT"] = "lora-agnews"

# Install and import required libraries
!pip install transformers datasets evaluate accelerate peft trl bitsandbytes nvidia-ml-py3 scikit-learn matplotlib seaborn
!pip install nvidia-ml-py3

import os
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.nn import functional as F
from transformers import (
    RobertaModel, 
    RobertaTokenizer, 
    TrainingArguments, 
    Trainer, 
    DataCollatorWithPadding, 
    RobertaForSequenceClassification,
    RobertaConfig,
    get_linear_schedule_with_warmup
)
from transformers.trainer_callback import TrainerCallback
from peft import LoraConfig, get_peft_model, PeftModel
from datasets import load_dataset, Dataset, ClassLabel
import pickle
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
import seaborn as sns

# Set random seed for reproducibility
def set_seed(seed):
    """Seed every random number generator this notebook relies on.

    Covers Python's ``random`` module, NumPy's global generator, and PyTorch
    (``torch.manual_seed`` plus ``torch.cuda.manual_seed_all``).
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for seed_rng in seeders:
        seed_rng(seed)


# Fix the seed once, up front, before any data splitting or model creation.
set_seed(42)

## Load Tokenizer and Preprocess Data
# Checkpoint name shared by the tokenizer and the classification model.
base_model = 'roberta-base'

# Only the 'train' split of AG News is loaded here; it is divided into
# training and validation subsets later in this cell.
dataset = load_dataset('ag_news', split='train')
tokenizer = RobertaTokenizer.from_pretrained(base_model)

# Enhanced text cleaning with word dropout (disabled for initial debugging)
def clean_text(text, apply_dropout=False, dropout_prob=0.05):
    """Normalise whitespace and optionally apply word dropout.

    Args:
        text: Raw input string.
        apply_dropout: When True, each word is removed independently with
            probability ``dropout_prob``. Off by default, so callers that do
            not ask for it get the plain cleaned text.
        dropout_prob: Per-word removal probability used when
            ``apply_dropout`` is True.

    Returns:
        The text with leading/trailing whitespace removed and internal
        whitespace runs collapsed to single spaces. With dropout enabled, a
        random subset of the words; if every word would be dropped, the full
        cleaned text is returned so dropout alone never empties the result.
    """
    # str.split() with no arguments both trims the ends and collapses runs.
    words = text.split()

    if apply_dropout and dropout_prob > 0 and words:
        kept = [word for word in words if random.random() >= dropout_prob]
        # Fall back to all words rather than returning an empty string.
        words = kept or words

    return ' '.join(words)

def preprocess(examples):
    """Clean and tokenize a batch of examples for ``Dataset.map(batched=True)``.

    Args:
        examples: Batch mapping that contains a ``'text'`` list of strings.

    Returns:
        The tokenizer output (``input_ids`` and ``attention_mask``), truncated
        to at most 512 tokens and left unpadded.
    """
    # Word dropout stays off here, matching the baseline configuration.
    cleaned_texts = [clean_text(text, apply_dropout=False)
                     for text in examples['text']]

    # Padding is deliberately not applied at this stage. Every consumer of the
    # tokenized data in this notebook batches through DataCollatorWithPadding,
    # which pads each batch to its own longest sequence. Padding every example
    # to 512 tokens here made every batch pay for full-length sequences.
    return tokenizer(
        cleaned_texts,
        truncation=True,
        max_length=512,
        return_token_type_ids=False,
        return_attention_mask=True,
    )

# Apply preprocessing
# Tokenize the whole split in batches; the raw "text" column is dropped once
# it has been encoded.
tokenized_dataset = dataset.map(preprocess, batched=True, remove_columns=["text"])
# Rename the target column from "label" to "labels".
tokenized_dataset = tokenized_dataset.rename_column("label", "labels")

# Extract the number of classes and their names
# Read the class count and class names from the dataset's label feature.
label_feature = dataset.features['label']
num_labels = label_feature.num_classes
class_names = label_feature.names
print(f"Number of labels: {num_labels}")
print(f"The labels: {class_names}")

# Two-way lookup between integer class ids and class names.
id2label = dict(enumerate(class_names))
label2id = {label: index for index, label in enumerate(class_names)}

# Collator that pads each batch and returns PyTorch tensors.
data_collator = DataCollatorWithPadding(return_tensors="pt", tokenizer=tokenizer)

# For initial debugging, start with the standard model
# Load the pretrained checkpoint as a sequence-classification model, passing
# the AG News label mappings built above. The run log reports the
# `classifier.*` weights as newly initialised, so they must be trained.
model = RobertaForSequenceClassification.from_pretrained(
    base_model,
    id2label=id2label,
    label2id=label2id
)

# Split the original training set with more validation data
# Hold out 10% of the tokenized training data for validation (fixed seed).
split_datasets = tokenized_dataset.train_test_split(seed=42, test_size=0.1)
train_dataset, eval_dataset = split_datasets['train'], split_datasets['test']

print(f"Training examples: {len(train_dataset)}")
print(f"Validation examples: {len(eval_dataset)}")

def print_trainable_parameters(model):
    """Print and return the number of trainable parameters in ``model``.

    Args:
        model: Any object exposing ``named_parameters()`` (for example a
            ``torch.nn.Module``).

    Returns:
        The number of parameter elements whose ``requires_grad`` is True.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        num_params = param.numel()
        # A parameter reporting zero elements may carry its real size in a
        # ``ds_numel`` attribute; use that when present.
        if num_params == 0 and hasattr(param, "ds_numel"):
            num_params = param.ds_numel

        all_param += num_params
        if param.requires_grad:
            trainable_params += num_params

    # A model without parameters would otherwise raise ZeroDivisionError.
    trainable_pct = 100 * trainable_params / all_param if all_param else 0.0
    print(f"\ntrainable params: {trainable_params} || all params: {all_param} || trainable%: {trainable_pct:.4f}")
    return trainable_params

# Create LoRA config with reduced dropout
# PEFT Config
# LoRA is attached only to the four attention projections named in
# `target_modules`: query and key of encoder layer 0, and query of encoder
# layers 5 and 10.
# NOTE(review): `lora_dropout` is 0.25 here, but DropoutScheduler overwrites
# the probability of every nn.Dropout module at the start of each epoch —
# confirm whether the LoRA dropout layers are meant to be affected by that.
peft_config = LoraConfig(
    r=36,
    lora_alpha=32,
    lora_dropout=0.25,
    bias='none',
    target_modules=["roberta.encoder.layer.0.attention.self.query",
    "roberta.encoder.layer.0.attention.self.key",
    "roberta.encoder.layer.5.attention.self.query",
    "roberta.encoder.layer.10.attention.self.query",
    ],
    task_type="SEQ_CLS",
)

# Apply PEFT to the base model
peft_model = get_peft_model(model, peft_config)

# Print the trainable parameters
trainable_params = print_trainable_parameters(peft_model)

# Enforce the 1M trainable-parameter budget with an explicit exception:
# an `assert` statement is skipped when Python runs with -O.
if trainable_params >= 1000000:
    raise ValueError(f"Trainable parameters ({trainable_params}) exceed 1M limit!")

## Training Setup with Improved Parameters
def compute_metrics(pred):
    """Compute accuracy and weighted precision/recall/F1 for the Trainer.

    Also prints how the predictions are distributed over the classes and
    warns when every prediction falls into a single class.

    Args:
        pred: Object exposing ``label_ids`` and ``predictions``; the class is
            taken as the argmax over the last axis of ``predictions``.

    Returns:
        Dict with the keys ``accuracy``, ``precision``, ``recall`` and ``f1``.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)

    scores = {'accuracy': accuracy_score(y_true, y_pred)}
    weighted_scorers = (
        ('precision', precision_score),
        ('recall', recall_score),
        ('f1', f1_score),
    )
    for key, scorer in weighted_scorers:
        scores[key] = scorer(y_true, y_pred, average='weighted')

    # Per-class share of the predictions, useful for spotting collapse.
    total = len(y_pred)
    print("\nPrediction distribution:")
    for class_id, class_label in id2label.items():
        hits = (y_pred == class_id).sum()
        print(f"  {class_label}: {hits} ({hits/total*100:.2f}%)")

    if len(np.unique(y_pred)) == 1:
        print("WARNING: Model is predicting only one class!")

    return scores

# Dynamic dropout scheduler callback with reduced rates
class DropoutScheduler(TrainerCallback):
    """Linearly anneal dropout from ``initial_dropout`` to ``final_dropout``.

    At the start of every epoch the ``p`` of each ``nn.Dropout`` module in the
    model is overwritten with the scheduled value. The first epoch uses
    ``initial_dropout`` and the last epoch uses ``final_dropout``.

    NOTE(review): every ``nn.Dropout`` instance is updated, whatever
    probability it was configured with — confirm that this is intended for
    all dropout layers in the model.

    Args:
        initial_dropout: Dropout probability for the first epoch.
        final_dropout: Dropout probability for the last epoch.
    """

    def __init__(self, initial_dropout=0.15, final_dropout=0.05):
        self.initial_dropout = initial_dropout
        self.final_dropout = final_dropout

    def on_epoch_begin(self, args, state, control, model=None, **kwargs):
        """Set the scheduled dropout probability on all ``nn.Dropout`` modules."""
        if model is None:
            return

        # Treat a missing epoch counter as the very beginning of training.
        epoch = state.epoch or 0.0

        # Dividing by num_train_epochs stops short of 1.0 at the start of the
        # last epoch (0.75 for 4 epochs), so final_dropout was never reached.
        # Scale by the index of the last epoch instead, and clamp to [0, 1].
        last_epoch = max(args.num_train_epochs - 1, 1)
        progress = min(max(epoch / last_epoch, 0.0), 1.0)
        current_dropout = self.initial_dropout - progress * (self.initial_dropout - self.final_dropout)

        # Update dropout in all modules
        for module in model.modules():
            if isinstance(module, nn.Dropout):
                module.p = current_dropout

        print(f"Epoch {epoch:.2f}: Setting dropout to {current_dropout:.4f}")

# Setup Training args with optimized parameters
# Single location for checkpoints, the saved model, and the generated plots.
# Previously TrainingArguments ignored this variable and wrote checkpoints to
# a separate, stale-named "./results_lora_r16" directory.
output_dir = "results_improved_with_dropout_debug"
os.makedirs(output_dir, exist_ok=True)

training_args = TrainingArguments(
    output_dir=output_dir,
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    num_train_epochs=4,
    weight_decay=0.01,
    eval_strategy="epoch",
    save_strategy="epoch",          # must match eval_strategy for load_best_model_at_end
    load_best_model_at_end=True,
    push_to_hub=False,
    logging_dir='./logs_lora_r16',
    logging_steps=100,
    report_to="none",
    warmup_ratio=0.1,
    # The Trainer cannot infer label names for a PEFT-wrapped model (it warns
    # "No label_names provided ..." and falls back to an empty list). Without
    # labels, evaluation yields neither eval_loss nor compute_metrics output,
    # which load_best_model_at_end depends on. Name the label column explicitly.
    label_names=["labels"],
    # bf16=True, # Keep commented unless base model loaded appropriately
    # optim="adamw_torch",
)

def get_trainer(model):
    """Build a Trainer for ``model`` from the notebook-level configuration.

    Uses the module-level ``training_args``, train/validation datasets,
    ``data_collator`` and ``compute_metrics``, and registers a fresh
    ``DropoutScheduler`` callback.
    """
    trainer_kwargs = dict(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
        callbacks=[DropoutScheduler(initial_dropout=0.15, final_dropout=0.05)],
    )
    return Trainer(**trainer_kwargs)

### Start Training
# Build the trainer around the LoRA-wrapped model.
peft_lora_finetuning_trainer = get_trainer(peft_model)

# Train the model
print("Starting training...")
result = peft_lora_finetuning_trainer.train()

# Print training metrics
print(f"Training completed. Training loss: {result.training_loss}")

# Evaluate the model on the validation split the trainer was built with
eval_results = peft_lora_finetuning_trainer.evaluate()
print("\nEvaluation Results:")
for key, value in eval_results.items():
    print(f"{key}: {value}")

# Debugging: Check model predictions more thoroughly
# Run the trainer's prediction loop over the validation split.
predictions = peft_lora_finetuning_trainer.predict(eval_dataset)
labels = predictions.label_ids
preds = predictions.predictions.argmax(-1)

# Print confusion matrix
cm = confusion_matrix(labels, preds)
print("\nConfusion Matrix:")
print(cm)

# Per-class accuracy; classes with no validation examples are skipped.
print("\nClass-wise accuracy:")
for class_id, class_label in id2label.items():
    in_class = labels == class_id
    support = int(in_class.sum())
    if support == 0:
        continue
    correct = (preds[in_class] == class_id).sum()
    print(f"  {class_label}: {correct / support:.4f}")

# Save the fine-tuned model
# Persist the trained model under <output_dir>/final_model via the trainer.
peft_model_path = os.path.join(output_dir, "final_model")
peft_lora_finetuning_trainer.save_model(peft_model_path)
print(f"Model saved to {peft_model_path}")

# Function to visualize confusion matrix
def plot_confusion_matrix(trainer, dataset):
    """Predict on ``dataset`` and save a confusion-matrix heatmap.

    The figure is written to ``confusion_matrix.png`` inside ``output_dir``
    and closed afterwards; nothing is returned.
    """
    output = trainer.predict(dataset)
    matrix = confusion_matrix(output.label_ids, output.predictions.argmax(-1))

    figure, axes = plt.subplots(figsize=(10, 8))
    sns.heatmap(
        matrix,
        annot=True,
        fmt='d',
        cmap='Blues',
        xticklabels=class_names,
        yticklabels=class_names,
        ax=axes,
    )
    axes.set_xlabel('Predicted')
    axes.set_ylabel('True')
    axes.set_title('Confusion Matrix')
    figure.savefig(os.path.join(output_dir, 'confusion_matrix.png'))
    plt.close(figure)


# Generate and save confusion matrix
plot_confusion_matrix(peft_lora_finetuning_trainer, eval_dataset)

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting trl
  Downloading trl-0.16.1-py3-none-any.whl.metadata (12 kB)
Collecting bitsandbytes
  Downloading bitsandbytes-0.45.5-py3-none-manylinux_2_24_x86_64.whl.metadata (5.0 kB)
Collecting nvidia-ml-py3
  Downloading nvidia-ml-py3-7.352.0.tar.gz (19 kB)
  Preparing metadata (setup.py) ... done
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3

2025-04-20 13:27:16.729797: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1745155636.915553      31 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1745155636.965223      31 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


README.md:   0%|          | 0.00/8.07k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/18.6M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/1.23M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/120000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/7600 [00:00<?, ? examples/s]

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

Map:   0%|          | 0/120000 [00:00<?, ? examples/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


Number of labels: 4
The labels: ['World', 'Sports', 'Business', 'Sci/Tech']


model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Training examples: 108000
Validation examples: 12000

trainable params: 814852 || all params: 125463560 || trainable%: 0.6495


No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Starting training...
Epoch 0.00: Setting dropout to 0.1500


Epoch,Training Loss,Validation Loss


In [None]:
## Evaluate Finetuned Model
# Function for performing inference on custom input
def classify(model, tokenizer, text):
    """Classify a single piece of text and print the prediction.

    Args:
        model: Sequence-classification model whose output exposes ``.logits``.
        tokenizer: Tokenizer matching ``model``.
        text: Raw input string; it is cleaned with ``clean_text`` first.

    Returns:
        ``(label, confidence)``: the predicted class name looked up in
        ``id2label`` and its softmax probability.
    """
    # Run on whatever device the model already lives on. Picking "cuda"
    # whenever it is available fails if the model is still on the CPU.
    device = next(model.parameters()).device

    # Clean the text first
    text = clean_text(text)

    # Same truncation limit as `preprocess` (512), so inference sees inputs
    # cut the same way as the training data.
    inputs = tokenizer(
        text,
        truncation=True,
        padding=True,
        max_length=512,
        return_tensors="pt"
    ).to(device)

    # Dropout must be inactive for a deterministic prediction; restore the
    # previous mode afterwards so the caller's training state is untouched.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            output = model(**inputs)
    finally:
        if was_training:
            model.train()

    # Get prediction scores and softmax probabilities
    logits = output.logits
    probs = torch.nn.functional.softmax(logits, dim=-1)
    prediction = logits.argmax(dim=-1).item()
    confidence = probs[0][prediction].item()

    print(f'\nClass: {prediction}, Label: {id2label[prediction]}, Confidence: {confidence:.4f}')
    print(f'Text: {text}')
    return id2label[prediction], confidence

# Test the model on a few examples
# Four hand-picked example texts used as a quick sanity check of the model.
test_texts = [
    "Wall St. Bears Claw Back Into the Black. Short-sellers, Wall Street's dwindling band of ultra-cynics, are seeing green again.",
    "Kederis proclaims innocence. Olympic champion Kostas Kederis today left hospital ahead of his date with IOC inquisitors.",
    "US plans to send more troops to Iraq next year, despite calls to withdraw forces.",
    "NASA's new space telescope captures stunning images of distant galaxies."
]

print("\nTesting model on example texts:")
for text in test_texts:
    # classify() prints its own result; the returned values are not used here.
    pred_label, confidence = classify(peft_model, tokenizer, text)

# Function to evaluate model on a dataset
from torch.utils.data import DataLoader
import evaluate
from tqdm import tqdm

def evaluate_model(inference_model, dataset, labelled=True, batch_size=32, data_collator=None):
    """
    Run batched inference with a model over a dataset and optionally score it.

    Args:
        inference_model: Model to evaluate; it is moved to the selected device
            and switched to eval mode (neither is undone afterwards).
        dataset: Dataset fed to a ``DataLoader``. Each batch must be a mapping
            of tensors, and must contain a "labels" entry when ``labelled``
            is True.
        labelled: Whether to collect labels and compute metrics.
        batch_size: Batch size for the ``DataLoader``.
        data_collator: ``collate_fn`` passed to the ``DataLoader``.

    Returns:
        When ``labelled`` is True, a 3-tuple
        ``(metrics, all_predictions, all_labels)`` where ``metrics`` has the
        keys 'accuracy', 'precision', 'recall' and 'f1'. Otherwise only the
        tensor of predicted class ids.

    Side effects (labelled mode): prints the metrics, prints up to 10 randomly
    sampled misclassified examples, and writes ``confusion_matrix.png`` into
    the module-level ``output_dir``.
    """
    # Create the DataLoader
    eval_dataloader = DataLoader(dataset, batch_size=batch_size, collate_fn=data_collator)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    inference_model.to(device)
    inference_model.eval()

    all_predictions = []
    all_labels = []
    all_probs = []  # Added to track prediction probabilities
    
    # Loop over the DataLoader
    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        # Move each tensor in the batch to the device
        batch = {k: v.to(device) for k, v in batch.items()}
        with torch.no_grad():
            # The whole batch (including "labels" when present) is passed to the model.
            outputs = inference_model(**batch)
        
        # Get both predictions and probabilities
        logits = outputs.logits
        probs = torch.nn.functional.softmax(logits, dim=-1)
        predictions = logits.argmax(dim=-1)
        
        # Results are accumulated on the CPU.
        all_predictions.append(predictions.cpu())
        all_probs.append(probs.cpu())
        
        if labelled:
            # Expecting that labels are provided under the "labels" key.
            references = batch["labels"]
            all_labels.append(references.cpu())

    # Concatenate predictions and probabilities from all batches
    # NOTE(review): torch.cat is given an empty list if the dataset yields no
    # batches — confirm callers never pass an empty dataset.
    all_predictions = torch.cat(all_predictions, dim=0)
    all_probs = torch.cat(all_probs, dim=0)
    
    if labelled:
        all_labels = torch.cat(all_labels, dim=0)
        
        # Calculate metrics (precision/recall/F1 are weighted averages)
        accuracy = accuracy_score(all_labels, all_predictions)
        precision = precision_score(all_labels, all_predictions, average='weighted')
        recall = recall_score(all_labels, all_predictions, average='weighted')
        f1 = f1_score(all_labels, all_predictions, average='weighted')
        
        print(f"\nEvaluation Metrics:")
        print(f"Accuracy: {accuracy:.4f}")
        print(f"Precision: {precision:.4f}")
        print(f"Recall: {recall:.4f}")
        print(f"F1 Score: {f1:.4f}")
        
        # Create confusion matrix and save it as a heatmap image
        cm = confusion_matrix(all_labels, all_predictions)
        plt.figure(figsize=(10, 8))
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', 
                   xticklabels=class_names, 
                   yticklabels=class_names)
        plt.xlabel('Predicted')
        plt.ylabel('True')
        plt.title('Confusion Matrix')
        plt.savefig(os.path.join(output_dir, 'confusion_matrix.png'))
        plt.close()
        
        # Add error analysis for misclassified examples
        print("\nAnalyzing misclassifications...")
        misclassified_indices = torch.where(all_predictions != all_labels)[0]
        if len(misclassified_indices) > 0:
            # Show at most 10 misclassified examples, drawn without replacement
            # from NumPy's global random generator.
            sample_size = min(10, len(misclassified_indices))
            sample_indices = np.random.choice(misclassified_indices, sample_size, replace=False)
            
            print(f"\nSample of misclassified examples ({sample_size}/{len(misclassified_indices)}):")
            for idx in sample_indices:
                pred = all_predictions[idx].item()
                true = all_labels[idx].item()
                # Probability the model assigned to the (wrong) predicted class
                prob = all_probs[idx][pred].item()
                print(f"Example {idx}: Predicted {id2label[pred]} ({prob:.4f}), True {id2label[true]}")
        
        return {'accuracy': accuracy, 'precision': precision, 'recall': recall, 'f1': f1}, all_predictions, all_labels
    else:
        return all_predictions

# Check evaluation accuracy
print("\nEvaluating model on validation dataset...")
# Labelled mode returns three values: the metrics dict, predictions, and labels.
metrics, all_predictions, all_labels = evaluate_model(peft_model, eval_dataset, True, 32, data_collator)

# # Check evaluation accuracy
# print("\nEvaluating model on validation dataset...")
# metrics, _ = evaluate_model(peft_model, eval_dataset, True, 32, data_collator)

### Run Inference on unlabelled dataset
# Load unlabelled data
# Best-effort inference on the unlabelled test set: any failure is reported
# and this step is skipped rather than aborting the notebook.
try:
    print("\nLoading unlabelled test data...")
    # NOTE(review): hard-coded absolute path; this only works on the machine
    # where that file exists.
    # NOTE(review): unpickling executes arbitrary code — only load pickle
    # files from a trusted source.
    unlabelled_dataset = pd.read_pickle("/home/ps5218/test_unlabelled.pkl")
    
    # Apply the same preprocessing as in training
    test_dataset = Dataset.from_pandas(unlabelled_dataset)
    test_dataset = test_dataset.map(preprocess, batched=True, remove_columns=["text"])
    
    # Run inference and save predictions
    print("Running inference on test dataset...")
    # Unlabelled mode returns only the tensor of predicted class ids.
    preds = evaluate_model(peft_model, test_dataset, False, 32, data_collator)
    df_output = pd.DataFrame({
        'ID': range(len(preds)),
        'Label': preds.numpy()  # or preds.tolist()
    })
    
    # Save predictions to CSV
    output_path = os.path.join(output_dir, "inference_output.csv")
    df_output.to_csv(output_path, index=False)
    print(f"Inference complete. Predictions saved to {output_path}")
    
    # Plot label distribution in predictions
    plt.figure(figsize=(10, 6))
    sns.countplot(data=df_output, x='Label')
    plt.xticks(range(len(class_names)), class_names, rotation=45)
    plt.title('Label Distribution in Predictions')
    plt.tight_layout()
    plt.savefig(os.path.join(output_dir, 'prediction_distribution.png'))
    plt.close()
    
except Exception as e:
    print(f"Error loading or processing unlabelled data: {e}")
    print("Skipping unlabelled data inference.")

# Save the final model with proper naming
# Save the PEFT model and the tokenizer side by side in one directory.
model_save_path = os.path.join(output_dir, "final_model_95percent")
peft_model.save_pretrained(model_save_path)
tokenizer.save_pretrained(model_save_path)
print(f"Model saved to {model_save_path}")

# Print final parameter count
print("\nFinal model details:")
print_trainable_parameters(peft_model)  # Use the function defined earlier
print(f"Number of classes: {num_labels}")
print(f"Class names: {class_names}")
# NOTE(review): `metrics` comes from evaluate_model() on eval_dataset, i.e.
# these are validation metrics even though the message says "training".
print(f"Final training metrics: {metrics}")
print("Training complete!")

# Optional: Class-wise accuracy analysis
if 'accuracy' in metrics:
    print("\nClass-wise performance:")
    for idx, class_name in enumerate(class_names):
        # Filter for examples of this class
        class_indices = torch.where(all_labels == idx)[0]
        class_examples = len(class_indices)
        # Skip classes without validation examples: the mean over an empty
        # tensor is NaN and the line would read "Accuracy nan (0/0)".
        if class_examples == 0:
            continue

        class_preds = all_predictions[class_indices]
        class_true = all_labels[class_indices]
        correct = class_preds == class_true
        class_accuracy = correct.float().mean().item()

        print(f"Class {idx} ({class_name}): Accuracy {class_accuracy:.4f} ({int(correct.sum())}/{class_examples})")

In [None]:
# Fix for loading and processing unlabelled data
# Load the unlabelled test set (trying several storage formats), run inference
# and write the predictions plus a label-distribution plot into `output_dir`.
try:
    print("\nLoading unlabelled test data...")
    # Option 1: If you have a pickle file with a DataFrame
    try:
        # Try loading as a pandas DataFrame first
        # NOTE(review): unpickling executes arbitrary code — only load pickle
        # files from a trusted source.
        unlabelled_df = pd.read_pickle("/kaggle/input/test-proj2/test_unlabelled.pkl")
        
        # Convert DataFrame to Dataset
        from datasets import Dataset
        test_dataset = Dataset.from_pandas(unlabelled_df)
        
    except Exception as e:
        print(f"Could not load as DataFrame: {e}")
        
        # Option 2: If it's already a Dataset object saved as pickle
        try:
            import pickle
            with open("/kaggle/input/test-proj2/test_unlabelled.pkl", "rb") as f:
                test_dataset = pickle.load(f)
        except Exception:
            # `except Exception` instead of a bare `except:` so that
            # KeyboardInterrupt / SystemExit still stop the notebook.
            # Option 3: Try loading directly as a Dataset
            from datasets import load_from_disk
            try:
                test_dataset = load_from_disk("test_unlabelled")
            except Exception:
                # Option 4: Create a dummy test set from a subset of the original test set
                print("Creating a simulated unlabelled test set from original test data...")
                # `dataset` holds only the AG News *train* split, so indexing
                # it with 'test' always failed. Load the real test split, take
                # a small subset, and remove the labels.
                test_dataset = load_dataset('ag_news', split='test').select(range(100))
                test_dataset = test_dataset.remove_columns(['label'])
    
    # Check the dataset format
    print(f"Test dataset format: {test_dataset}")
    print(f"Test dataset features: {test_dataset.features}")
    
    # Apply preprocessing (make sure to handle potential differences in column names)
    if 'text' in test_dataset.features:
        # Apply the same preprocessing as in training
        processed_test = test_dataset.map(preprocess, batched=True, remove_columns=["text"])
    else:
        # If already preprocessed or has different column names
        print("Dataset doesn't have 'text' column. Checking if already tokenized...")
        required_cols = ['input_ids', 'attention_mask']
        if all(col in test_dataset.features for col in required_cols):
            print("Dataset appears to be already tokenized.")
            processed_test = test_dataset
        else:
            print(f"Available columns: {list(test_dataset.features.keys())}")
            raise ValueError("Cannot find text data or tokenized inputs in the dataset.")
    
    # Run inference and save predictions
    print("Running inference on test dataset...")
    preds = evaluate_model(peft_model, processed_test, False, 32, data_collator)
    
    # Convert to numpy if it's a torch tensor
    if hasattr(preds, 'numpy'):
        preds_numpy = preds.numpy()
    else:
        preds_numpy = preds
    
    # Create a DataFrame with predictions
    df_output = pd.DataFrame({
        'ID': range(len(preds_numpy)),
        'Label': preds_numpy
    })
    
    # Map numerical labels to text labels
    df_output['LabelText'] = df_output['Label'].map(id2label)
    
    # Save predictions to CSV
    output_path = os.path.join(output_dir, "inference_output.csv")
    df_output.to_csv(output_path, index=False)
    print(f"Inference complete. Predictions saved to {output_path}")
    
    # Plot label distribution in predictions
    plt.figure(figsize=(10, 6))
    sns.countplot(data=df_output, x='Label')
    plt.xticks(range(len(class_names)), class_names, rotation=45)
    plt.title('Label Distribution in Predictions')
    plt.tight_layout()
    plt.savefig(os.path.join(output_dir, 'prediction_distribution.png'))
    plt.close()
    
except Exception as e:
    print(f"Error loading or processing unlabelled data: {e}")
    print("Detailed error information:", flush=True)
    import traceback
    traceback.print_exc()
    print("\nSkipping unlabelled data inference.")
    
    # Creating a simulated test set for demonstration
    # WARNING: the labels written below are random placeholders, NOT model
    # predictions. They go to a separate "sample_" file so they cannot be
    # mistaken for real inference output.
    print("\nCreating a sample test prediction file instead...")
    # Generate some sample predictions
    sample_size = 100
    sample_preds = np.random.randint(0, num_labels, size=sample_size)
    df_output = pd.DataFrame({
        'ID': range(sample_size),
        'Label': sample_preds,
        'LabelText': [id2label[pred] for pred in sample_preds]
    })
    
    # Save sample predictions to CSV
    output_path = os.path.join(output_dir, "sample_inference_output.csv")
    df_output.to_csv(output_path, index=False)
    print(f"Sample predictions saved to {output_path}")