# In [1]:
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset
from transformers import (GPT2LMHeadModel, GPT2Tokenizer, Trainer,
                          TrainingArguments, pipeline)

# --- 1. Configuration ---
# Constants shared by the data-loading, training, and inference steps below.

# Pre-trained checkpoint to fine-tune; GPT-2 is used for text generation.
MODEL_NAME = "gpt2"

# The provided dataset file.
DATA_FILE = "conversationfile.xlsx - userAuserB.csv"

# Directory where the fine-tuned model is saved.
OUTPUT_DIR = "./chat_reply_model"

# Fraction of the data held out for validation (10%).
TEST_SIZE = 0.1

# Seed for the train/validation split, for reproducibility.
RANDOM_STATE = 42

# --- 2. Data Loading and Preprocessing ---
def load_and_prepare_data(file_path, tokenizer):
    """
    Loads the conversation data, creates prompt-reply pairs, and formats it for GPT-2.

    A training pair is a row sent by 'User B' that is immediately followed by
    a row sent by 'User A'. Pairs in which either message is missing or empty
    after cleaning are skipped, so no "nan" placeholder text ends up in the
    training data.

    Args:
        file_path (str): The path to the CSV data file. The file must contain
            'Sender' and 'Message' columns.
        tokenizer (GPT2Tokenizer): The tokenizer to use for encoding text.
            Only its ``eos_token`` attribute is read here.

    Returns:
        list: A list of strings, where each string is a formatted conversation turn
              for training (e.g., "User B's message<|endoftext|>User A's reply<|endoftext|>").
              Returns None (after printing an error) when the file is missing,
              empty, lacks the required columns, or contains no usable pairs.
    """
    try:
        df = pd.read_csv(file_path)
    except FileNotFoundError:
        print(f"Error: The file '{file_path}' was not found.")
        print("Please make sure the dataset is in the same directory as the script.")
        return None
    except pd.errors.EmptyDataError:
        print(f"Error: The file '{file_path}' is empty.")
        return None

    missing_columns = [col for col in ("Sender", "Message") if col not in df.columns]
    if missing_columns:
        print(
            f"Error: The file '{file_path}' is missing required column(s): "
            f"{', '.join(missing_columns)}."
        )
        return None

    # Clean up the message text by removing extra quotes. Missing cells are
    # mapped to "" first so they can be filtered out below instead of being
    # stringified as "nan"; astype(str) keeps .str usable for any column dtype.
    raw_messages = df["Message"]
    messages = (
        raw_messages.astype(object)
        .where(raw_messages.notna(), "")
        .astype(str)
        .str.replace('"""', "", regex=False)
        .str.strip()
    )

    # Create pairs of (User B message -> User A reply).
    # shift(-1) lines each row up with the row that follows it, so the mask is
    # True exactly where a User B row is directly followed by a User A row.
    senders = df["Sender"]
    next_messages = messages.shift(-1).fillna("")
    is_pair = (senders == "User B") & (senders.shift(-1) == "User A")
    has_text = (messages != "") & (next_messages != "")
    keep = is_pair & has_text

    # The end-of-sentence token marks where the prompt ends and the reply
    # begins, and where the whole sequence ends.
    eos = tokenizer.eos_token
    training_texts = [
        f"{prompt}{eos}{reply}{eos}"
        for prompt, reply in zip(messages[keep], next_messages[keep])
    ]

    if not training_texts:
        print("Error: No User B -> User A conversation pairs found in the data.")
        return None

    print(f"Successfully created {len(training_texts)} training pairs.")
    return training_texts


# --- 3. Custom Dataset for PyTorch ---
class ChatDataset(Dataset):
    """
    A custom PyTorch Dataset to hold and serve our tokenized chat data.

    Args:
        encodings: Mapping from feature name to a sequence of per-example
            values. It must contain 'input_ids'; when it also contains
            'attention_mask', padded positions are excluded from the loss.
    """

    # Label value that Hugging Face causal-LM heads skip when computing the loss.
    IGNORE_INDEX = -100

    def __init__(self, encodings):
        self.encodings = encodings

    def __len__(self):
        # The number of items in the dataset is the number of conversations.
        return len(self.encodings['input_ids'])

    def __getitem__(self, idx):
        """Return the example at ``idx`` as a dict of tensors, plus 'labels'."""
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}

        # The labels for language modeling are the input_ids themselves: the
        # model has to predict the next token in the sequence.
        labels = item['input_ids'].clone()

        # Padding positions must not contribute to the loss. The mask is taken
        # from attention_mask rather than the pad token id, so a pad token that
        # doubles as a real token (e.g. EOS) is still learned where it is real.
        attention_mask = item.get('attention_mask')
        if attention_mask is not None:
            labels[attention_mask == 0] = self.IGNORE_INDEX

        item['labels'] = labels
        return item


# --- 4. Main Execution Block ---
def main():
    """
    Main function to run the entire pipeline:
    - Load tokenizer and model.
    - Load and process data.
    - Set up and run the training process.
    - Save the fine-tuned model.
    - Demonstrate how to use the model for inference.

    Returns early (after an error message has been printed) when the data
    cannot be loaded or is too small to split into train/validation sets.
    """
    # Initialize tokenizer and model from the pre-trained "gpt2" checkpoint.
    # The problem statement specifies an offline environment, so these models
    # are assumed to be pre-loaded or cached.
    print(f"Loading pre-trained model and tokenizer: '{MODEL_NAME}'...")
    tokenizer = GPT2Tokenizer.from_pretrained(MODEL_NAME)
    model = GPT2LMHeadModel.from_pretrained(MODEL_NAME)

    # GPT-2's default tokenizer doesn't have a padding token. We set it to the
    # end-of-sentence token. This is a standard practice.
    tokenizer.pad_token = tokenizer.eos_token

    # Load and preprocess the conversation data.
    print(f"Loading and preparing data from '{DATA_FILE}'...")
    formatted_texts = load_and_prepare_data(DATA_FILE, tokenizer)
    if formatted_texts is None:
        return  # Stop execution if data loading failed.

    # train_test_split raises when either side of the split would be empty,
    # so a single pair cannot be divided into train and validation sets.
    if len(formatted_texts) < 2:
        print("Error: At least two conversation pairs are needed for a train/validation split.")
        return

    # Split the data into training and validation sets.
    # The validation set is used to monitor the model's performance on unseen data
    # during training to prevent overfitting.
    train_texts, val_texts = train_test_split(
        formatted_texts, test_size=TEST_SIZE, random_state=RANDOM_STATE
    )
    print(f"Data split: {len(train_texts)} training samples, {len(val_texts)} validation samples.")

    # Tokenize the datasets. Each set is padded to its own longest sequence
    # and truncated to at most 128 tokens.
    train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=128)
    val_encodings = tokenizer(val_texts, truncation=True, padding=True, max_length=128)

    # Create PyTorch Dataset objects.
    train_dataset = ChatDataset(train_encodings)
    val_dataset = ChatDataset(val_encodings)

    # Define training arguments. These control the fine-tuning process.
    # NOTE(review): newer transformers releases appear to rename
    # `evaluation_strategy` to `eval_strategy` - confirm against the installed version.
    training_args = TrainingArguments(
        output_dir=OUTPUT_DIR,              # Directory to save model checkpoints.
        num_train_epochs=4,                # A few epochs are usually enough for fine-tuning.
        per_device_train_batch_size=2,     # Small batch size for a small model.
        per_device_eval_batch_size=2,
        warmup_steps=100,                  # Number of steps for linear warmup.
        weight_decay=0.01,                 # Regularization.
        logging_dir='./logs',              # Directory for TensorBoard logs.
        logging_steps=10,
        evaluation_strategy="epoch",       # Evaluate at the end of each epoch.
        save_strategy="epoch",             # Save a checkpoint at the end of each epoch.
        load_best_model_at_end=True,       # Load the best performing model after training.
    )

    # Initialize the Trainer.
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
    )

    # Start the fine-tuning process.
    print("\n--- Starting Model Fine-Tuning ---")
    trainer.train()
    print("--- Fine-Tuning Complete ---\n")

    # Save the final model and tokenizer to the specified directory.
    print(f"Saving the fine-tuned model to '{OUTPUT_DIR}'...")
    trainer.save_model(OUTPUT_DIR)
    tokenizer.save_pretrained(OUTPUT_DIR)
    print("Model saved successfully.")

    # --- 5. Inference Demonstration ---
    # Show how to use the fine-tuned model to generate a reply.
    print("\n--- Demonstrating Reply Generation ---")

    # Load the fine-tuned model using the Hugging Face pipeline for easy inference.
    generator = pipeline('text-generation', model=OUTPUT_DIR, tokenizer=OUTPUT_DIR)

    # Example prompt from User B.
    user_b_prompt = "Did you see the client's feedback on the mockups?"

    print(f"\nInput from User B: '{user_b_prompt}'")

    # Generate a reply. The prompt is followed by the eos_token, mirroring the
    # "<prompt><eos><reply><eos>" layout used for the training texts.
    generated_reply = generator(
        user_b_prompt + tokenizer.eos_token,
        # Budget for the reply alone; max_length would also count the prompt
        # tokens and could leave no room for a reply to a long prompt.
        max_new_tokens=50,
        num_return_sequences=1,
        # Return only the newly generated text, so the reply does not have to
        # be recovered by splitting the output on the EOS string.
        return_full_text=False,
        pad_token_id=tokenizer.eos_token_id,  # Suppress padding token warning
    )[0]['generated_text']

    reply_text = generated_reply.strip()

    print(f"Predicted Reply from User A: '{reply_text}'")
    print("----------------------------------------")

# Run the full pipeline only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()

# [notebook output] ModuleNotFoundError: No module named 'pandas'