In [None]:
!pip install datasets evaluate

In [None]:
import os
import json

import shutil


def write_kaggle_credentials(username, key, config_dir="~/.config/kaggle/"):
    """Write a Kaggle API credential file readable only by the current user.

    Args:
        username (str): Kaggle account name.
        key (str): Kaggle API key.
        config_dir (str): Directory that receives ``kaggle.json`` (``~`` is expanded).

    Returns:
        str: Path of the written ``kaggle.json``.
    """
    config_dir = os.path.expanduser(config_dir)
    # Ensure the Kaggle config directory exists
    os.makedirs(config_dir, exist_ok=True)
    cred_path = os.path.join(config_dir, "kaggle.json")
    # Create the file with owner-only permissions from the start, so the key is
    # never world-readable, not even briefly.
    fd = os.open(cred_path, os.O_WRONLY | os.O_CREAT | os.O_TRUNC, 0o600)
    with os.fdopen(fd, "w") as f:
        json.dump({"username": username, "key": key}, f)
    # A pre-existing file keeps its old mode on open(); tighten it explicitly.
    os.chmod(cred_path, 0o600)
    return cred_path


# Kaggle credentials must never be hardcoded in a notebook. They are read from the
# KAGGLE_USERNAME / KAGGLE_KEY environment variables instead.
# NOTE: the API key that used to be embedded in this cell has been published and
# must be treated as compromised -- revoke it in the Kaggle account settings.
kaggle_creds = {
    "username": os.environ.get("KAGGLE_USERNAME"),
    "key": os.environ.get("KAGGLE_KEY"),
}

if all(kaggle_creds.values()):
    write_kaggle_credentials(kaggle_creds["username"], kaggle_creds["key"])
else:
    print("KAGGLE_USERNAME / KAGGLE_KEY are not set; relying on an existing kaggle.json (if any).")

# Remove the "data/" directory if it exists (pure Python instead of shelling out to rm -rf)
shutil.rmtree("data", ignore_errors=True)

In [None]:


import os
import pickle
import zipfile
import torch
import pandas as pd
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, DataCollatorWithPadding
from kaggle.api.kaggle_api_extended import KaggleApi

try:
    from datasets import Dataset as HFDataset # Use an alias to avoid conflict with torch.utils.data.Dataset
    from datasets import load_dataset
except ImportError:
    print("Please install the 'datasets' library: pip install datasets")
    HFDataset = None


In [None]:

# --- Custom Dataset for the Competition Test File ---
class AGNewsTestDataset(Dataset):
    """
    Map-style dataset over the unlabelled AGNEWS competition test texts.

    The texts are read once from a pickle file, which may contain a Hugging Face
    ``Dataset``, a plain Python list, or a dict holding a list of texts.
    Tokenization happens lazily, one item at a time, in ``__getitem__``.

    Args:
        pkl_file (str): Path to the pickle file containing the test data.
        tokenizer (callable): Tokenizer instance (e.g., from Hugging Face).
        max_length (int): Maximum sequence length for tokenization.
        text_column (str): Column (HF Dataset) or preferred key (dict) that holds the text.
            Defaults to 'text'.

    Raises:
        FileNotFoundError: If ``pkl_file`` does not exist.
        ValueError: If the text column/key is missing or no texts were loaded.
        TypeError: If the pickled object is not a supported container.
    """

    # Keys tried, after ``text_column``, when the pickle holds a plain dict.
    _FALLBACK_DICT_KEYS = ("text", "data", "description")

    def __init__(self, pkl_file, tokenizer, max_length=512, text_column="text"):
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.text_column = text_column
        self.texts = []  # Stays empty if loading fails

        try:
            # SECURITY: pickle.load can execute arbitrary code embedded in the file.
            # Only load pickle files obtained from a trusted source.
            with open(pkl_file, 'rb') as f:
                loaded_object = pickle.load(f)

            self.texts = self._extract_texts(loaded_object, self.text_column, HFDataset)

            # Final check if texts were actually loaded
            if not self.texts:
                raise ValueError(f"Failed to load any text data from the pickle file: {pkl_file}")

        except FileNotFoundError:
            print(f"Error: Test pickle file not found at {pkl_file}")
            raise
        except Exception as e:
            print(f"Error loading or processing pickle file {pkl_file}: {e}")
            raise

    @classmethod
    def _extract_texts(cls, loaded_object, text_column, hf_dataset_cls=None):
        """Return the list of texts stored in an unpickled object.

        Args:
            loaded_object: Object returned by ``pickle.load``.
            text_column (str): Column name / preferred dict key of the text data.
            hf_dataset_cls (type | None): Hugging Face ``Dataset`` class, or None when
                the ``datasets`` library is unavailable (the HF branch is then skipped).

        Returns:
            list: The texts.

        Raises:
            ValueError: If the expected column/key cannot be found.
            TypeError: If ``loaded_object`` is of an unsupported type.
        """
        # --- Hugging Face Dataset ---
        if hf_dataset_cls is not None and isinstance(loaded_object, hf_dataset_cls):
            print("Pickle file contained a Hugging Face Dataset object.")
            if text_column not in loaded_object.column_names:
                raise ValueError(f"Loaded Dataset object does not contain the expected text column '{text_column}'. "
                                 f"Available columns: {loaded_object.column_names}")
            # Materialise the column: depending on the `datasets` version, column access
            # may return a lazy column view instead of a plain list.
            texts = list(loaded_object[text_column])
            print(f"Successfully extracted '{text_column}' column ({len(texts)} items).")
            return texts

        # --- Plain list of texts ---
        if isinstance(loaded_object, list):
            print("Pickle file contained a standard Python list.")
            return loaded_object

        # --- Dict holding a list of texts ---
        if isinstance(loaded_object, dict):
            print("Pickle file contained a standard Python dict.")
            # Honour the caller's text_column first, then the well-known keys; skip
            # keys whose value is not a list instead of failing on the first match.
            candidate_keys = list(dict.fromkeys((text_column, *cls._FALLBACK_DICT_KEYS)))
            data_key = next(
                (k for k in candidate_keys if isinstance(loaded_object.get(k), list)),
                None,
            )
            if data_key is None:
                raise ValueError(f"Could not find a list of texts in pkl dictionary. Keys found: {list(loaded_object.keys())}")
            print(f"Assuming text data is under key '{data_key}'.")
            return loaded_object[data_key]

        # --- Anything else is unsupported ---
        raise TypeError(f"Unsupported data type loaded from pickle file: {type(loaded_object)}. "
                        "Expected Hugging Face Dataset, list, or dict containing a list.")

    def __len__(self):
        """Number of test texts."""
        return len(self.texts)

    def __getitem__(self, index):
        """Tokenize one text and return its features plus its position.

        Returns:
            dict: Tokenizer outputs (without 'token_type_ids') and an 'index' entry
            holding ``index``, so predictions can be mapped back to the submission rows.
        """
        text = self.texts[index]
        # Ensure text is a string (might be redundant if extracted from HF Dataset, but safe)
        if not isinstance(text, str):
            text = str(text)

        # Tokenize the text
        encoding = self.tokenizer(
            text,
            truncation=True,
            padding=False,  # Padding will be handled by the collator
            max_length=self.max_length,
            return_tensors=None,  # Return python lists/ints, collator handles tensor conversion
        )

        # Remove 'token_type_ids' if your model doesn't use them (like RoBERTa)
        item = {k: v for k, v in encoding.items() if k != 'token_type_ids'}
        item['index'] = index  # Include original index

        return item

# --- Data Module for AGNEWS ---
class AGNewsDataModule:
    """
    Data module for AGNEWS dataset (train/val from Hugging Face, test from competition file).

    Args:
        model_name_or_path (str): Identifier for the tokenizer (e.g., "roberta-base").
        data_dir (str): Directory for the Hugging Face cache and the competition files.
        competition_name (str): Name of the Kaggle competition for downloading test data.
        batch_size (int): Training batch size.
        test_batch_size (int): Testing/Validation batch size.
        num_workers (int): Number of workers for data loading.
        max_seq_length (int): Maximum sequence length for tokenizer.
        val_split_percentage (float): Fraction (e.g. 0.1) of the training data held out for
            validation. With 0 the standard AGNEWS test split is used for validation instead.
        seed (int | None): Seed for the train/validation split so it is reproducible
            across runs. Pass None for a different random split on every call.
    """
    def __init__(self,
                 model_name_or_path="roberta-base",
                 data_dir="./data_agnews",
                 competition_name="deep-learning-spring-2025-project-2", # UPDATE IF NEEDED
                 batch_size=16,
                 test_batch_size=32,
                 num_workers=2,
                 max_seq_length=512,
                 val_split_percentage=0.1, # Use 10% of train for validation
                 seed=42):

        self.model_name_or_path = model_name_or_path
        self.data_dir = data_dir
        self.competition_name = competition_name
        self.batch_size = batch_size
        self.test_batch_size = test_batch_size
        self.num_workers = num_workers
        self.max_seq_length = max_seq_length
        self.val_split_percentage = val_split_percentage
        self.seed = seed

        # Paths for competition data
        self.competition_path = os.path.join(self.data_dir, self.competition_name)
        self.zip_path = os.path.join(self.competition_path, f"{self.competition_name}.zip")
        self.test_pkl = os.path.join(self.competition_path, "test_unlabelled.pkl")

        # Initialize tokenizer and data collator
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name_or_path)
        # Data collator handles dynamic padding within each batch
        self.data_collator = DataCollatorWithPadding(tokenizer=self.tokenizer)

        # Populated by setup()
        self.train_dataset = None
        self.val_dataset = None
        self.predict_dataset = None

    def _tokenize_function(self, examples):
        """Tokenize the 'text' field of a batch; padding is left to the collator."""
        return self.tokenizer(
            examples["text"],
            truncation=True,
            padding=False,
            max_length=self.max_seq_length
        )

    def _load_tokenized_agnews(self):
        """Load AGNEWS from Hugging Face and return it tokenized in torch format."""
        dataset = load_dataset("ag_news", cache_dir=os.path.join(self.data_dir, "hf_cache"))

        tokenized_dataset = dataset.map(self._tokenize_function, batched=True)

        # Drop the raw text; only tensors the model/collator needs are exposed.
        tokenized_dataset = tokenized_dataset.remove_columns(["text"])
        tokenized_dataset.set_format("torch", columns=["input_ids", "attention_mask", "label"])
        return tokenized_dataset

    def prepare_data(self):
        """Downloads the AGNEWS dataset and the competition data if needed."""
        # `datasets` downloads AGNEWS on first use and caches it under data_dir.
        print("Checking/downloading AGNEWS dataset from Hugging Face...")
        load_dataset("ag_news", cache_dir=os.path.join(self.data_dir, "hf_cache"))
        print("Checking/downloading competition test data...")
        self.download_competition_data()

    def setup(self, stage=None):
        """Loads and preprocesses the datasets needed for ``stage``.

        Args:
            stage (str | None): "fit" (train + validation), "validate", "test"
                (competition test file) or None for all of them.
        """
        # AGNEWS is only loaded/tokenized when a stage actually needs it; the "test"
        # stage works purely from the competition pickle.
        if stage in (None, "fit", "validate"):
            tokenized_dataset = self._load_tokenized_agnews()

            if stage == "fit" or stage is None:
                ag_train_data = tokenized_dataset["train"]
                if self.val_split_percentage > 0:
                    split = ag_train_data.train_test_split(
                        test_size=self.val_split_percentage,
                        seed=self.seed,  # reproducible train/validation split
                    )
                    self.train_dataset = split['train']
                    self.val_dataset = split['test']
                    print(f"Using {len(self.train_dataset)} samples for training, {len(self.val_dataset)} for validation.")
                else:
                    # Use standard AGNEWS test set as validation if no split is requested
                    self.train_dataset = ag_train_data
                    self.val_dataset = tokenized_dataset["test"]
                    print(f"Using {len(self.train_dataset)} samples for training, {len(self.val_dataset)} (standard test set) for validation.")

            if stage == "validate" or stage is None:
                if self.val_dataset is None:  # setup was not run with 'fit' before
                    self.val_dataset = tokenized_dataset["test"]
                    print(f"Loaded {len(self.val_dataset)} (standard test set) for validation.")

        if stage == "test" or stage is None:
            # Setup competition test dataset
            print(f"Setting up competition test dataset from: {self.test_pkl}")
            self.predict_dataset = AGNewsTestDataset(
                self.test_pkl,
                self.tokenizer,
                self.max_seq_length
            )
            print(f"Loaded {len(self.predict_dataset)} samples for competition prediction.")

    def get_train_loader(self):
        """Returns the shuffled training DataLoader, running setup('fit') if needed."""
        if self.train_dataset is None:
            self.setup("fit")
        return DataLoader(
            self.train_dataset,
            batch_size=self.batch_size,
            shuffle=True,
            num_workers=self.num_workers,
            collate_fn=self.data_collator # Use collator for dynamic padding
        )

    def get_val_loader(self):
        """Returns the validation DataLoader, running setup('validate') if needed."""
        if self.val_dataset is None:
            self.setup("validate")
        return DataLoader(
            self.val_dataset,
            batch_size=self.test_batch_size,
            shuffle=False,
            num_workers=self.num_workers,
            collate_fn=self.data_collator # Use collator for dynamic padding
        )

    def get_competition_test_loader(self):
        """Gets the DataLoader for the competition's unlabelled test set."""
        if self.predict_dataset is None:
            self.setup("test")
        return DataLoader(
            self.predict_dataset,
            batch_size=self.test_batch_size,
            shuffle=False, # Important: Keep order for submission
            num_workers=self.num_workers,
            collate_fn=self.data_collator # Use collator for dynamic padding
        )

    def download_competition_data(self):
        """Downloads and extracts competition test data using Kaggle API.

        Download problems are reported but not raised; the final existence check
        decides whether the call succeeds.

        Raises:
            FileNotFoundError: If the test pickle is still missing afterwards.
        """
        if not os.path.exists(self.test_pkl):
            print(f"Competition test file not found at {self.test_pkl}. Attempting download...")
            os.makedirs(self.competition_path, exist_ok=True)
            try:
                # Imported lazily so a missing 'kaggle' package ends up in the
                # manual-download hint below.
                from kaggle.api.kaggle_api_extended import KaggleApi
                api = KaggleApi()
                api.authenticate() # Make sure kaggle.json is set up
                api.competition_download_files(self.competition_name, path=self.competition_path)

                if os.path.exists(self.zip_path):
                    print(f"Extracting {self.zip_path}...")
                    with zipfile.ZipFile(self.zip_path, 'r') as zip_ref:
                        zip_ref.extractall(self.competition_path)
                    os.remove(self.zip_path) # Clean up the zip file
                    print("Extraction complete.")
                else:
                    print(f"Warning: Zip file {self.zip_path} not found after download attempt.")

            except ImportError:
                print("Warning: 'kaggle' library not found. Cannot download competition data automatically.")
                print("Please download the 'test_unlabelled.pkl' manually from the Kaggle competition page")
                print(f"and place it in: {self.competition_path}")
            except Exception as e:
                print(f"An error occurred during Kaggle download/extraction: {e}")
                print("Please check your Kaggle API setup and competition name.")

        if not os.path.exists(self.test_pkl):
            # Raise error only after attempting download
            raise FileNotFoundError(
                f"Competition test file '{os.path.basename(self.test_pkl)}' not found in '{self.competition_path}'. "
                "Please ensure it is downloaded and extracted correctly."
            )
        else:
            print(f"Competition test file found: {self.test_pkl}")


In [None]:
# --- Data configuration (small values keep the demo fast) ---
MODEL_ID = "roberta-base"
COMPETITION_ID = "deep-learning-spring-2025-project-2"  # Kaggle competition slug; double-check this ID
DATA_DIR = "./agnews_data"
BATCH_SIZE = 8        # Small batch size for demo
TEST_BATCH_SIZE = 16
MAX_LEN = 128         # Shorter length for faster demo processing
NUM_WORKERS = 2       # DataLoader worker processes; set to 0 for easier debugging

# Instantiate the data module
data_module = AGNewsDataModule(
    model_name_or_path=MODEL_ID,
    data_dir=DATA_DIR,
    competition_name=COMPETITION_ID,
    batch_size=BATCH_SIZE,
    test_batch_size=TEST_BATCH_SIZE,
    max_seq_length=MAX_LEN,
    num_workers=NUM_WORKERS,
)

data_module.prepare_data() # Downloads HF data and competition data if needed

data_module.setup() # Sets up train, val, and test

In [None]:
# --- Preview: first rows of the training split ---
print("\n--- Train Set Samples (First 5) ---")
try:
    if not data_module.train_dataset:
        print("Train dataset not loaded or empty.")
    else:
        # Take at most five rows straight from the Hugging Face dataset
        train_samples = data_module.train_dataset.select(
            range(min(5, len(data_module.train_dataset)))
        )

        train_data_for_df = []
        for sample in train_samples:
            # A missing 'label' becomes None; tensor labels are unwrapped to Python numbers
            label = sample.get('label', None)
            label = label.item() if isinstance(label, torch.Tensor) else label
            # Turn the token ids back into readable text
            text = data_module.tokenizer.decode(sample['input_ids'], skip_special_tokens=True)
            train_data_for_df.append({'Decoded Text': text, 'Label': label})

        train_df = pd.DataFrame(train_data_for_df)
        print(train_df)
except Exception as e:
    print(f"Error displaying train samples: {e}")

In [None]:
# --- Preview: first rows of the validation split ---
print("\n--- Validation Set Samples (First 5) ---")
try:
    if not data_module.val_dataset:
        print("Validation dataset not loaded or empty.")
    else:
        # Take at most five rows straight from the Hugging Face dataset
        val_samples = data_module.val_dataset.select(
            range(min(5, len(data_module.val_dataset)))
        )

        val_data_for_df = []
        for sample in val_samples:
            # A missing 'label' becomes None; tensor labels are unwrapped to Python numbers
            label = sample.get('label', None)
            label = label.item() if isinstance(label, torch.Tensor) else label
            # Turn the token ids back into readable text
            text = data_module.tokenizer.decode(sample['input_ids'], skip_special_tokens=True)
            val_data_for_df.append({'Decoded Text': text, 'Label': label})

        val_df = pd.DataFrame(val_data_for_df)
        print(val_df)
except Exception as e:
    print(f"Error displaying validation samples: {e}")

In [None]:
# --- Preview: first rows of the competition test set ---
print("\n--- Competition Test Set Samples (First 5) ---")
try:
    if not data_module.predict_dataset:
        print("Competition test dataset not loaded or empty.")
    else:
        test_data_for_df = []
        # The custom dataset tokenizes on access, so index it item by item
        num_samples_to_show = min(5, len(data_module.predict_dataset))
        for i in range(num_samples_to_show):
            sample = data_module.predict_dataset[i]  # dict of token ids plus 'index'
            # A missing 'index' becomes None
            original_index = sample.get('index', None)
            text = data_module.tokenizer.decode(sample['input_ids'], skip_special_tokens=True)
            test_data_for_df.append({'Decoded Text': text, 'Original Index': original_index})

        test_df = pd.DataFrame(test_data_for_df)
        print(test_df)
except Exception as e:
    print(f"Error displaying competition test samples: {e}")

In [None]:
import torch
import numpy as np
import evaluate # Hugging Face evaluate library
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding # Make sure this is imported
)
from peft import LoraConfig, TaskType, get_peft_model



# Base RoBERTa Model Import

In [None]:
# AGNewsDataModule is expected to be defined earlier in this notebook. If it is moved
# into its own module, import it here instead:
# from your_data_module_file import AGNewsDataModule

# --- Configuration ---
model_name = "roberta-base"          # Base checkpoint that receives the LoRA adapter
num_labels = 4                       # Size of the classification head (presumably the four AG News classes)
lora_r = 8                           # Passed to LoraConfig as r
lora_alpha = 16                      # Passed to LoraConfig as lora_alpha
lora_dropout = 0.1                   # Passed to LoraConfig as lora_dropout
target_modules = ["query", "value"]  # Module names that get LoRA adapters
output_dir = "./results/roberta-lora-agnews" # Directory to save checkpoints and logs
training_log_dir = "./logs/roberta-lora-agnews" # Directory for TensorBoard/logging
adapter_save_dir = "./trained_adapters/roberta-lora-agnews" # Directory to save final adapter

# --- Hyperparameters for Training ---
# These are crucial and require tuning!
# NOTE(review): the batch sizes below are the ones handed to TrainingArguments; they are
# independent of BATCH_SIZE / TEST_BATCH_SIZE used when building the data module.
learning_rate = 2e-4 # LoRA might tolerate/need higher LR than full finetuning
train_batch_size = 16 # Adjust based on GPU memory
eval_batch_size = 32  # Adjust based on GPU memory
num_train_epochs = 3  # Start with a few epochs, increase as needed
weight_decay = 0.01
warmup_ratio = 0.1 # Fraction (not percent) of training steps used for learning rate warmup



In [None]:
# --- 1. Load Base Model ---
# Loads the pretrained checkpoint named by `model_name` with a sequence-classification
# head sized for `num_labels` classes.
print(f"Loading base model '{model_name}'...")
base_model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=num_labels
)

In [None]:
# --- 2. Define LoRA Config ---
# Collects the LoRA hyperparameters from the configuration cell into a PEFT config
# for a sequence-classification task.
print("Defining LoRA configuration...")
lora_config = LoraConfig(
    task_type=TaskType.SEQ_CLS,
    r=lora_r,
    lora_alpha=lora_alpha,
    target_modules=target_modules,
    lora_dropout=lora_dropout,
    bias="none",
)


In [None]:
# --- 3. Apply LoRA to the model ---
# Wraps the base model with the adapter described by `lora_config`; `model` is the
# object that is trained and saved by the cells below.
print("Applying LoRA adapter to the model...")
model = get_peft_model(base_model, lora_config)
model.print_trainable_parameters() # Verify parameter count is low


In [None]:
# --- 4. Collect the Trainer inputs from the data module ---
# Read the datasets directly instead of building throwaway DataLoaders just to reach
# their `.dataset` attribute; run the "fit" setup only if they are not prepared yet.
if data_module.train_dataset is None or data_module.val_dataset is None:
    data_module.setup("fit")

train_dataset = data_module.train_dataset   # Tokenized training split
val_dataset = data_module.val_dataset       # Tokenized validation split
tokenizer = data_module.tokenizer           # Get the tokenizer
data_collator = data_module.data_collator   # Get the data collator

In [None]:
# Sanity check: report how many samples the Trainer will train and evaluate on.
print(f"Train dataset size: {len(train_dataset)}")
print(f"Validation dataset size: {len(val_dataset)}")

In [None]:
# --- 5. Define Compute Metrics Function ---
# Load the accuracy metric
accuracy_metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    """Computes accuracy on evaluation predictions.

    Args:
        eval_pred: Pair ``(predictions, labels)``. ``predictions`` holds the logits
            (class scores on the last axis), or a tuple of model outputs whose first
            element is the logits.

    Returns:
        dict: ``{"accuracy": <float>}``.
    """
    predictions, labels = eval_pred
    # Some models return several outputs; the logits come first.
    if isinstance(predictions, tuple):
        predictions = predictions[0]
    # Convert logits to the predicted class index (class scores live on the last axis)
    preds = np.argmax(predictions, axis=-1)
    # Calculate accuracy
    acc = accuracy_metric.compute(predictions=preds, references=labels)
    return {"accuracy": acc["accuracy"]}


In [None]:
# --- 6. Define Training Arguments ---
# All values come from the configuration cell; checkpoints go to `output_dir`,
# logs to `training_log_dir`.
training_args = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=num_train_epochs,
    learning_rate=learning_rate,
    per_device_train_batch_size=train_batch_size,
    per_device_eval_batch_size=eval_batch_size,
    weight_decay=weight_decay,
    warmup_ratio=warmup_ratio,

    # Evaluation and Saving Strategy
    # (evaluation and saving use the same schedule so the best checkpoint can be reloaded)
    eval_strategy="epoch", # Evaluate at the end of each epoch
    save_strategy="epoch",       # Save checkpoint at the end of each epoch
    load_best_model_at_end=True, # Load the best model found during training
    metric_for_best_model="accuracy", # Key returned by compute_metrics
    greater_is_better=True,      # Higher accuracy is better

    # Logging
    logging_dir=training_log_dir,
    logging_strategy="steps",
    logging_steps=50,            # Log metrics every 50 steps

    # Other potentially useful args
    # fp16=True,                 # Enable mixed precision training if GPU supports it (requires accelerate)
    # gradient_accumulation_steps=2, # If batch size needs to be effectively larger than fits in memory
    report_to="tensorboard",     # Report logs to TensorBoard (can also use "wandb")
    save_total_limit=2,          # Cap on retained checkpoints -- NOTE(review): confirm how the best checkpoint counts toward this limit
    # push_to_hub=False,         # Set to True to push model to Hugging Face Hub
)

In [None]:
# --- 7. Initialize Trainer ---
# NOTE(review): newer transformers releases appear to deprecate the `tokenizer=`
# argument in favour of `processing_class=` -- confirm against the installed version.
trainer = Trainer(
    model=model,                         # The PEFT model
    args=training_args,                  # Training arguments
    train_dataset=train_dataset,         # Training dataset
    eval_dataset=val_dataset,            # Validation dataset
    tokenizer=tokenizer,                 # Tokenizer (needed for padding/saving)
    data_collator=data_collator,         # Data collator for dynamic padding
    compute_metrics=compute_metrics,     # Function to compute metrics
)


In [None]:
# --- 8. Start Training ---
print("\nStarting Training...")
train_result = trainer.train()
# --- 9. Save Training Stats ---
# Only the training metrics are logged and saved in this cell; the adapter weights
# themselves are written by the next cell.
print("\nTraining finished. Saving metrics and final adapter...")
# Logs the training metrics and persists them via the Trainer
trainer.log_metrics("train", train_result.metrics)
trainer.save_metrics("train", train_result.metrics)


In [None]:
# Save the trained LoRA adapter explicitly to `adapter_save_dir`.
# `model` is the PEFT-wrapped model, so this is presumably adapter weights only
# rather than the full base model -- TODO confirm.
model.save_pretrained(adapter_save_dir)
# Store the tokenizer next to the adapter so both can be loaded from one directory later
tokenizer.save_pretrained(adapter_save_dir)
print(f"LoRA adapter weights saved to: {adapter_save_dir}")

In [None]:
# --- 10. Evaluate Final Model (Optional but recommended) ---
# `eval_metrics` is reused further down: its 'eval_accuracy' entry names the submission file.
print("\nEvaluating the best model on the validation set...")
eval_metrics = trainer.evaluate(eval_dataset=val_dataset) # Use the same validation set
trainer.log_metrics("eval", eval_metrics)
trainer.save_metrics("eval", eval_metrics)
print(f"Final Evaluation Metrics: {eval_metrics}")

print("\nLoRA Training Setup Complete.")

In [None]:
import torch
import pandas as pd
import numpy as np
import os # Import os for path joining if reading metrics from file
# import json # Import json if reading metrics from file
from tqdm.auto import tqdm # For progress bar
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from peft import PeftModel, PeftConfig
from torch.utils.data import DataLoader
# Assume AGNewsDataModule and AGNewsTestDataset are defined
# from your_data_module_file import AGNewsDataModule, AGNewsTestDataset

# --- Inference settings ---
base_model_name = "roberta-base"
adapter_path = "./trained_adapters/roberta-lora-agnews"  # Where the trained adapter was saved
num_labels = 4
eval_batch_size = 32
max_seq_length = 128

# --- Tag the submission file with the validation accuracy ---
# Requires `eval_metrics` from the final-evaluation cell.
final_accuracy = eval_metrics['eval_accuracy']
print(f"Using final accuracy for filename: {final_accuracy:.4f}")

output_csv_path = f"submission_acc_{final_accuracy:.4f}.csv"
print(f"Output file will be saved as: {output_csv_path}")

# --- Pick the compute device: GPU when available, CPU otherwise ---
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if device.type == "cuda":
    print(f"Using CUDA device: {torch.cuda.get_device_name(0)}")
else:
    print("Using CPU device.")



In [None]:
# --- 1. Load Tokenizer ---
# NOTE: this cell rebinds `tokenizer`, `base_model` and `model` from the training
# cells above; afterwards they refer to the freshly loaded inference objects.
print(f"Loading tokenizer from '{base_model_name}'...")
tokenizer = AutoTokenizer.from_pretrained(base_model_name)

# --- 2. Load Base Model ---
print(f"Loading base model '{base_model_name}'...")
base_model = AutoModelForSequenceClassification.from_pretrained(
    base_model_name,
    num_labels=num_labels,
    return_dict=True
)

# --- 3. Load LoRA Adapter Weights ---
# The adapter stored at `adapter_path` is attached to the fresh base model; a failure
# is reported and then re-raised so the notebook stops here.
print(f"Loading LoRA adapter weights from '{adapter_path}'...")
try:
    model = PeftModel.from_pretrained(base_model, adapter_path)
    print("Successfully loaded LoRA adapter.")
except Exception as e:
    print(f"Error loading PEFT model from {adapter_path}: {e}")
    raise

# --- 4. Prepare Model for Inference ---
model = model.to(device)
model.eval()
print("Model moved to device and set to evaluation mode.")




In [None]:

# --- 5. Build the loader over the competition test set ---
competition_loader = data_module.get_competition_test_loader()
print(f"Test data loaded. Number of batches: {len(competition_loader)}")

# --- 6. Run Inference ---
all_predictions, all_indices = [], []

print("\nStarting prediction loop...")
with torch.no_grad():
    for batch in tqdm(competition_loader, desc="Predicting"):
        # Forward only the tensors the model accepts; 'index' stays on the host side
        model_inputs = {
            k: v.to(device)
            for k, v in batch.items()
            if k in tokenizer.model_input_names
        }
        outputs = model(**model_inputs)
        logits = outputs.logits
        # The highest-scoring class is the prediction
        predictions = torch.argmax(logits, dim=-1)
        predictions_np = predictions.cpu().numpy()
        indices = batch['index'].cpu().numpy()
        all_predictions.extend(predictions_np)
        all_indices.extend(indices)
print("Prediction loop finished.")



In [None]:
# --- 7. Create the submission DataFrame ---
# One row per test sample: 'ID' is the sample's position in the test set and 'Label'
# the predicted class index. The CSV itself is written in a later cell.
submission_df = pd.DataFrame({
    'ID': all_indices,
    'Label': all_predictions
})

In [None]:
# Join the preview rows of the test set with their predictions.
# Keys: 'Original Index' in test_df and 'ID' in submission_df.
# test_df only holds the few preview rows built in the visualization cell, so the
# inner join returns just those rows -- not one row per submission entry.
view_df = pd.merge(
    test_df,
    submission_df,
    left_on='Original Index',  # Key column in the left DataFrame (test_df)
    right_on='ID',             # Key column in the right DataFrame (submission_df)
    how='inner',               # Keep only rows present in both frames
    validate='one_to_one',     # Fail loudly if either key column contains duplicates
)

# --- Display the Result ---
print("\n--- Combined View: Test Text with Predictions ---")
# Set display options for better readability
pd.set_option('display.max_colwidth', 150) # Show more of the text
pd.set_option('display.width', 1000)      # Wider display

view_df.head(10)

In [None]:
# Write the submission (columns 'ID' and 'Label') without the DataFrame index column.
print(f"Saving submission file to '{output_csv_path}'...")
submission_df.to_csv(output_csv_path, index=False)

In [None]:
# def submit_to_kaggle(filename, message):
#     api = KaggleApi()
#     api.authenticate()
#     api.competition_submit(filename, f"{message}", 'deep-learning-spring-2025-project-2')

In [None]:
# submit_to_kaggle(output_csv_path, f"{final_accuracy:.4f}%")