# Model used: google/flan-t5-base.

**Install dependencies and create the training script (`train.py`)**

In [None]:
# Quietly (-q) install the packages this notebook relies on. transformers,
# accelerate and datasets are imported by the training script, and evaluate
# by the test cell. sentencepiece and rouge_score are never imported
# directly here -- presumably they back the T5 tokenizer and the ROUGE
# metric (TODO confirm).
!pip install -q transformers accelerate evaluate datasets sentencepiece rouge_score

# Source code of the standalone fine-tuning script. It is kept in a string so
# it can be written to train.py and started with `accelerate launch`.
# NOTE: the string is delimited by triple double quotes, so the embedded
# script must only use '#' comments (or single-quoted docstrings).
script_content = """
import pandas as pd
import torch
import logging
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, TrainingArguments, Trainer, DataCollatorForSeq2Seq
from datasets import Dataset
from accelerate import Accelerator

def main():
    # Fine-tune MODEL_CHECKPOINT on the prompt/response pairs stored in
    # DATA_PATH and save the best model (lowest eval loss) together with its
    # tokenizer to FINETUNED_MODEL_DIR.
    accelerator = Accelerator()

    logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

    DATA_PATH = "/kaggle/input/modeldata/generated_data.csv"
    PROMPT_COLUMN_NAME = 'prompt'
    RESPONSE_COLUMN_NAME = 'response'
    MODEL_CHECKPOINT = "google/flan-t5-base"
    FINETUNED_MODEL_DIR = "/kaggle/working/chatbot_finetuned_stable"
    TRAINING_EPOCHS = 15
    LEARNING_RATE = 3e-5

    # Load the tokenizer and model once on the main process first, then let
    # every process continue after the barrier.
    if accelerator.is_main_process:
        AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)
        AutoModelForSeq2SeqLM.from_pretrained(MODEL_CHECKPOINT)
    accelerator.wait_for_everyone()

    tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)

    df = pd.read_csv(DATA_PATH)
    df.columns = df.columns.str.strip()

    # pandas reads empty cells as NaN floats, which the tokenizer rejects, so
    # rows with a missing prompt or response are dropped and the remaining
    # values are coerced to text before the dataset is built.
    text_columns = [PROMPT_COLUMN_NAME, RESPONSE_COLUMN_NAME]
    rows_before = len(df)
    df = df.dropna(subset=text_columns).reset_index(drop=True)
    df[text_columns] = df[text_columns].astype(str)
    if len(df) < rows_before:
        logging.warning("Dropped %d rows with a missing prompt or response.", rows_before - len(df))

    dataset = Dataset.from_pandas(df)
    # Fixed seed keeps the 90/10 train/eval split reproducible between runs.
    split_dataset = dataset.train_test_split(test_size=0.1, seed=42)
    train_dataset = split_dataset["train"]
    eval_dataset = split_dataset["test"]

    def tokenize_function(examples):
        # Tokenize a batch: prompts become the model inputs and responses
        # become the labels, each truncated to 128 tokens.
        model_inputs = tokenizer(examples[PROMPT_COLUMN_NAME], max_length=128, truncation=True)
        # `text_target` is the supported way to tokenize targets; it replaces
        # the deprecated target-tokenizer context manager.
        labels = tokenizer(text_target=examples[RESPONSE_COLUMN_NAME], max_length=128, truncation=True)
        model_inputs["labels"] = labels["input_ids"]
        return model_inputs

    tokenized_train_dataset = train_dataset.map(tokenize_function, batched=True, remove_columns=train_dataset.column_names)
    tokenized_eval_dataset = eval_dataset.map(tokenize_function, batched=True, remove_columns=eval_dataset.column_names)

    model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_CHECKPOINT)
    data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

    # Evaluate and checkpoint once per epoch; only two checkpoints are kept
    # and the one with the lowest eval loss is reloaded when training ends.
    training_args = TrainingArguments(
        output_dir=FINETUNED_MODEL_DIR,
        num_train_epochs=TRAINING_EPOCHS,
        learning_rate=LEARNING_RATE,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        weight_decay=0.01,
        logging_dir=f"{FINETUNED_MODEL_DIR}/logs",
        logging_strategy="epoch",
        eval_strategy="epoch",
        save_strategy="epoch",
        save_total_limit=2,
        load_best_model_at_end=True,
        metric_for_best_model="eval_loss",
        greater_is_better=False,
        fp16=False,
        max_grad_norm=1.0,
        report_to="none"
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_train_dataset,
        eval_dataset=tokenized_eval_dataset,
        tokenizer=tokenizer,
        data_collator=data_collator
    )

    trainer.train()

    if accelerator.is_main_process:
        trainer.save_model(FINETUNED_MODEL_DIR)
        tokenizer.save_pretrained(FINETUNED_MODEL_DIR)
        logging.info(f"Best fine-tuned model saved to: {FINETUNED_MODEL_DIR}")

if __name__ == '__main__':
    main()
"""

# Persist the training script to disk so a later cell can start it with
# `accelerate launch`.
with open('train.py', mode='w') as f:
    # end='' keeps the file content exactly equal to script_content.
    print(script_content, end='', file=f)

print("File 'train.py' created with stability fixes.")

**Training**

In [None]:
# Run the generated train.py through the Accelerate launcher.
!accelerate launch train.py

**General Tests**

In [None]:
import torch
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import os
import evaluate

def load_inference_model(model_path):
    """Load a fine-tuned seq2seq model and its tokenizer for inference.

    Args:
        model_path: Directory holding the saved model and tokenizer files.

    Returns:
        A ``(model, tokenizer, device)`` tuple, where ``device`` is ``"cuda"``
        when CUDA is available and ``"cpu"`` otherwise. When the directory
        does not exist, or loading raises, an error is printed and
        ``(None, None, None)`` is returned instead.
    """
    print(f"Loading fine-tuned model for inference from: {model_path}")
    nothing_loaded = (None, None, None)

    # Guard clause: without the directory there is nothing to load.
    if not os.path.exists(model_path):
        print(f"ERROR: Model directory not found at '{model_path}'.")
        print("Please run the fine-tuning script successfully before testing.")
        return nothing_loaded

    try:
        loaded_tokenizer = AutoTokenizer.from_pretrained(model_path)
        loaded_model = AutoModelForSeq2SeqLM.from_pretrained(model_path)
        device = "cpu"
        if torch.cuda.is_available():
            device = "cuda"
        loaded_model.to(device)
        print(f"Model loaded successfully to device: {device}")
    except Exception as e:
        # Best-effort loader: report the failure to the user instead of raising.
        print(f"ERROR: An error occurred while loading the model: {e}")
        return nothing_loaded

    return loaded_model, loaded_tokenizer, device

def generate_response(prompt, model, tokenizer, device):
    """Generate the model's reply to a single prompt.

    Args:
        prompt: Text passed to the tokenizer.
        model: Object exposing ``generate``; called with the encoded prompt.
        tokenizer: Callable used to encode the prompt and decode the output.
        device: Device the encoded prompt is moved to before generation.

    Returns:
        The first generated sequence, decoded with special tokens skipped.
    """
    encoded_prompt = tokenizer(prompt, return_tensors="pt")
    encoded_prompt = encoded_prompt.to(device)

    # Generation settings: up to 150 new tokens, 5 beams, early stopping.
    generation_settings = {
        "max_new_tokens": 150,
        "num_beams": 5,
        "early_stopping": True,
    }
    generated_sequences = model.generate(**encoded_prompt, **generation_settings)

    first_sequence = generated_sequences[0]
    return tokenizer.decode(first_sequence, skip_special_tokens=True)

def run_test_with_evaluation(model, tokenizer, device, test_file_path, prompt_column, response_column, num_samples=10):
    """Generate responses for a random sample of prompts and print ROUGE scores.

    A reproducible random sample (``random_state=42``) is drawn from the CSV
    at ``test_file_path``. For each sampled prompt the generated response is
    printed next to the reference, with its ROUGE-L score. Scores over the
    whole sample are printed at the end.

    Args:
        model: Model forwarded to ``generate_response``.
        tokenizer: Tokenizer forwarded to ``generate_response``.
        device: Device forwarded to ``generate_response``.
        test_file_path: Path of the CSV file holding the test data.
        prompt_column: Name of the column containing the prompts.
        response_column: Name of the column containing reference responses.
        num_samples: Maximum number of rows to evaluate. Fewer rows are used
            when the file does not contain that many usable rows.

    Returns:
        None. Results are printed. If the file is missing or empty, a required
        column is absent, or no row can be evaluated, an error is printed and
        the function returns early.
    """
    print("\n" + "="*50)
    print("Starting LLM Functionality and Correctness Test")
    print("="*50 + "\n")

    try:
        df = pd.read_csv(test_file_path)
    except FileNotFoundError:
        print(f"ERROR: Test file not found at '{test_file_path}'.")
        return
    except pd.errors.EmptyDataError:
        print(f"ERROR: Test file at '{test_file_path}' is empty.")
        return
    df.columns = df.columns.str.strip()

    required_columns = [prompt_column, response_column]
    if not all(col in df.columns for col in required_columns):
        print(f"ERROR: Test file must contain '{prompt_column}' and '{response_column}' columns.")
        return

    # Empty cells are read as NaN, which can be neither tokenized nor scored.
    df = df.dropna(subset=required_columns)

    # DataFrame.sample raises when asked for more rows than exist, so the
    # request is capped at the number of usable rows.
    sample_count = min(num_samples, len(df))
    if sample_count < 1:
        print("ERROR: No rows available to evaluate (empty test data or num_samples < 1).")
        return

    sample_df = df.sample(n=sample_count, random_state=42)
    # Coerce to str so purely numeric cells are still valid text inputs.
    prompts = sample_df[prompt_column].astype(str).tolist()
    reference_responses = sample_df[response_column].astype(str).tolist()

    generated_responses = []
    rouge_metric = evaluate.load('rouge')

    for i, (prompt, reference) in enumerate(zip(prompts, reference_responses), start=1):
        header = f"--- Test Case {i}/{sample_count} ---"
        print(header)
        print(f"PROMPT: {prompt}")
        print(f"REFERENCE RESPONSE: {reference}")

        bot_response = generate_response(prompt, model, tokenizer, device)
        generated_responses.append(bot_response)
        print(f"CHATBOT RESPONSE: {bot_response}")

        individual_score = rouge_metric.compute(
            predictions=[bot_response],
            references=[reference]
        )
        correctness_percentage = individual_score.get('rougeL', 0.0) * 100
        print(f"CORRECTNESS SCORE (ROUGE-L): {correctness_percentage:.2f}%")
        # Closing rule has the same width as the test-case header.
        print("-" * len(header) + "\n")

    print("\n" + "="*50)
    print("Overall Performance Evaluation")
    print("="*50 + "\n")

    total_scores = rouge_metric.compute(
        predictions=generated_responses,
        references=reference_responses
    )

    for key, value in total_scores.items():
        total_percentage = value * 100
        print(f"Total Average {key.upper()}: {total_percentage:.2f}%")

if __name__ == '__main__':
    # NOTE(review): the checkpoint step is hard-coded; update it to the
    # checkpoint actually produced by the training run.
    FINETUNED_MODEL_DIR = "/kaggle/working/chatbot_finetuned_stable/checkpoint-11310"
    # NOTE(review): this is the same CSV the training script reads, so sampled
    # rows may have been seen during training and the scores may be
    # optimistic -- confirm whether a held-out file should be used instead.
    TEST_DATA_PATH = "/kaggle/input/modeldata/generated_data.csv"
    PROMPT_COLUMN = 'prompt'
    RESPONSE_COLUMN = 'response'

    model, tokenizer, device = load_inference_model(FINETUNED_MODEL_DIR)

    # load_inference_model signals failure with None values. Compare against
    # None explicitly rather than relying on the truthiness of the model and
    # tokenizer objects.
    if model is not None and tokenizer is not None:
        run_test_with_evaluation(
            model=model,
            tokenizer=tokenizer,
            device=device,
            test_file_path=TEST_DATA_PATH,
            prompt_column=PROMPT_COLUMN,
            response_column=RESPONSE_COLUMN,
            num_samples=15
        )

**Identify the best checkpoint (fallback for when the final save step of the training script fails).**

In [None]:
import json
import os

# Directory the training script writes its checkpoints to.
output_dir = "/kaggle/working/chatbot_finetuned_stable"


def checkpoint_step(name):
    """Return the step number of a ``checkpoint-<step>`` name, or None.

    Names that do not consist of the ``checkpoint-`` prefix followed only by
    decimal digits (for example ``checkpoint-tmp``) yield None, so they can
    be filtered out instead of breaking the numeric sort.
    """
    prefix = "checkpoint-"
    suffix = name[len(prefix):]
    if name.startswith(prefix) and suffix.isdecimal():
        return int(suffix)
    return None


all_checkpoints = []
# isdir (not exists) so a stray file at this path is never passed to listdir.
if os.path.isdir(output_dir):
    all_checkpoints = [d for d in os.listdir(output_dir) if checkpoint_step(d) is not None]

if not all_checkpoints:
    print(f"WARNING: No checkpoint directory found in '{output_dir}'.")
else:
    # The checkpoint with the highest step is the most recent one; its
    # trainer_state.json is read to look up the best checkpoint.
    latest_checkpoint = max(all_checkpoints, key=checkpoint_step)
    trainer_state_path = os.path.join(output_dir, latest_checkpoint, "trainer_state.json")

    try:
        with open(trainer_state_path, 'r') as f:
            state = json.load(f)

        best_model_path = state.get("best_model_checkpoint")

        if best_model_path:
            print("="*50)
            print(f"The best model is located at: {best_model_path}")
            print("Use this path in your test/chat cell.")
            print("="*50)
        else:
            print(f"WARNING: Unable to find the 'best_model_checkpoint' key. Using the latest checkpoint as an alternative: {os.path.join(output_dir, latest_checkpoint)}")

    except FileNotFoundError:
        print(f"ERROR: trainer_state.json file not found in {trainer_state_path}. Unable to determine the best model.")
    except Exception as e:
        # Top-level notebook cell: report any other failure instead of raising.
        print(f"ERROR: An error occurred while reading the state file: {e}")

**Compress the selected checkpoint so it can be downloaded.**

In [None]:
# Pack the checkpoint directory into a gzip-compressed tar archive
# (checkpoint-11310.tar.gz), listing each file as it is added (-v).
# NOTE(review): the checkpoint step is hard-coded; replace it with the
# checkpoint you actually want to download.
!tar -czvf checkpoint-11310.tar.gz /kaggle/working/chatbot_finetuned_stable/checkpoint-11310