In [10]:
import subprocess
import sys

# Upgrade `accelerate` BEFORE `transformers` is imported. When the install ran
# after the import, `Trainer` still raised
# "requires accelerate>=0.20.1" in the same session, i.e. the freshly
# installed package was not picked up by the already-imported library.
# `sys.executable -m pip` guarantees the package lands in the interpreter that
# is running this code rather than whatever `pip` happens to be first on PATH.
# NOTE(review): if `transformers` was already imported earlier in this kernel,
# a kernel restart is still needed for the upgrade to take effect.
_pip_result = subprocess.run(
    [sys.executable, "-m", "pip", "install", "accelerate", "-U"]
)
if _pip_result.returncode != 0:
    # Keep going (the package may already be present), but do not hide the failure.
    print(
        f"warning: 'pip install accelerate -U' exited with code {_pip_result.returncode}",
        file=sys.stderr,
    )

# These imports intentionally follow the install step above.
import pandas as pd
import hashlib
import time
from collections import defaultdict
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments
from datasets import Dataset

# Function to concatenate personas with dialogues
def process_example(example):
    """Fold both personas and the conversation of one record into one string.

    Args:
        example: Mapping-like record providing the keys 'user 1 personas',
            'user 2 personas' and 'Best Generated Conversation'.

    Returns:
        A dict with the single key 'dialogue' whose value is
        "User1 Persona: <p1> User2 Persona: <p2> <conversation>".
    """
    template = "User1 Persona: {} User2 Persona: {} {}"
    merged_text = template.format(
        example['user 1 personas'],
        example['user 2 personas'],
        example['Best Generated Conversation'],
    )
    return {'dialogue': merged_text}

# Read the raw persona/conversation table from disk
df = pd.read_csv('synthetic_valid.csv')

# Row by row, merge the two personas and the conversation into one record
merged_rows = df.apply(process_example, axis=1)
processed_data = merged_rows.tolist()

# Wrap the merged records in a HuggingFace Dataset
processed_frame = pd.DataFrame(processed_data)
dataset = Dataset.from_pandas(processed_frame)

# Pretrained checkpoint shared by the model and its tokenizer
model_checkpoint = 'distilgpt2'
model = AutoModelForCausalLM.from_pretrained(model_checkpoint)
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

# Register '[PAD]' as the padding token, then resize the model's token
# embeddings to the tokenizer's new length so the added token has a row
tokenizer.add_special_tokens({'pad_token': '[PAD]'})
model.resize_token_embeddings(len(tokenizer))

# Tokenizing the dataset
def tokenize_function(example):
    """Tokenize dialogue text and attach causal-LM training labels.

    The text under ``example['dialogue']`` is truncated/padded to exactly 512
    tokens. A ``labels`` entry is added as a copy of ``input_ids`` so that a
    causal language model can compute a loss during training; positions that
    are padding (attention mask 0) are set to -100 so they are ignored by the
    loss instead of teaching the model to emit the pad token.

    Args:
        example: Mapping with a 'dialogue' entry holding either a list of
            strings (batched ``Dataset.map``) or a single string.

    Returns:
        The tokenizer output (``input_ids``, ``attention_mask``) extended with
        a ``labels`` entry of the same shape as ``input_ids``.
    """
    encoded = tokenizer(example['dialogue'], truncation=True, padding="max_length", max_length=512)
    input_ids = encoded['input_ids']
    attention_mask = encoded['attention_mask']

    def _mask_padding(ids, mask):
        # Keep real tokens as targets; mark padded positions with the ignore index.
        return [token if keep else -100 for token, keep in zip(ids, mask)]

    if input_ids and isinstance(input_ids[0], list):
        # Batched call: one list of token ids per dialogue.
        encoded['labels'] = [
            _mask_padding(ids, mask) for ids, mask in zip(input_ids, attention_mask)
        ]
    else:
        # Single dialogue: flat list of token ids.
        encoded['labels'] = _mask_padding(input_ids, attention_mask)
    return encoded

# Tokenize the whole dataset in batches and time it; `desc` labels the
# progress bar that `Dataset.map` displays.
start_time = time.time()
tokenized_dataset = dataset.map(tokenize_function, batched=True, desc="Tokenizing")
end_time = time.time()

print(f"Tokenization completed in {end_time - start_time:.2f} seconds.")

# `evaluation_strategy="epoch"` below makes the Trainer evaluate after every
# epoch, which requires an evaluation set. Hold out 10% of the examples for
# that purpose; the fixed seed keeps the split reproducible between runs.
split_dataset = tokenized_dataset.train_test_split(test_size=0.1, seed=42)

# Directory that receives checkpoints and the final model/tokenizer
finetuned_dir = './models/gpt2-finetuned'

# Set up training arguments
training_args = TrainingArguments(
    output_dir=finetuned_dir,
    evaluation_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=4,
    weight_decay=0.01,
    save_total_limit=1,
    num_train_epochs=3,
    logging_dir='./logs',
)

# Define Trainer
# NOTE(review): the Trainer can only compute a loss if every example carries
# `labels`; confirm tokenize_function (or a data collator) supplies them.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=split_dataset["train"],
    eval_dataset=split_dataset["test"],
    tokenizer=tokenizer,
)

# Train the model
trainer.train()

# Save the model and tokenizer side by side so they can be reloaded together
model.save_pretrained(finetuned_dir)
tokenizer.save_pretrained(finetuned_dir)

# Load the fine-tuned model back from disk (verifies the saved artifacts and
# gives the generation code below a freshly loaded copy)
model = AutoModelForCausalLM.from_pretrained(finetuned_dir)
tokenizer = AutoTokenizer.from_pretrained(finetuned_dir)

# Generate a response using the fine-tuned model
def generate_response(prompt):
    """Generate a continuation of ``prompt`` with the fine-tuned model.

    Args:
        prompt: Text to continue. It is truncated to at most 512 tokens.

    Returns:
        The decoded first generated sequence with special tokens removed.
        The returned text starts with the prompt itself, followed by the
        model's continuation. ``max_length=512`` bounds prompt and
        continuation together, so a long prompt leaves little room for
        new tokens.
    """
    inputs = tokenizer(prompt, return_tensors="pt", max_length=512, truncation=True)
    outputs = model.generate(
        **inputs,
        max_length=512,
        # Tell generate() which id is padding instead of letting it guess;
        # this uses the pad token that the tokenizer was configured with.
        pad_token_id=tokenizer.pad_token_id,
    )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Example prompt: two persona descriptions followed by the opening turns of a
# conversation, ending on "User2:" so the model writes User2's reply.
prompt_parts = [
    "User1 Persona: I love to bake cookies. I have a dog.",
    "The county wide bake sale is where I feel most at home. Knitting is my passion.",
    "User2 Persona: I am a boy. I can move objects with my mind. I had to have a transplant.",
    "I was born with my heart outside my body. User1: Hi! User2: Hello! User1: What is your favorite thing to do? User2:",
]
prompt = " ".join(prompt_parts)

# Run generation and show the resulting text
generated_text = generate_response(prompt)
print(generated_text)


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




Tokenizing:   0%|          | 0/54 [00:00<?, ? examples/s]

Tokenization completed in 0.05 seconds.


ImportError: Using the `Trainer` with `PyTorch` requires `accelerate>=0.20.1`: Please run `pip install transformers[torch]` or `pip install accelerate -U`