In [23]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer, DataCollatorForLanguageModeling, Trainer, TrainingArguments
from torch.utils.data import Dataset, random_split
import torch
import pandas as pd

# Step 1: Load CSV File
df = pd.read_csv("math_riddles.csv")  # Use the uploaded CSV file path

# Step 2: Convert CSV data to text format
riddles_data = [f"Riddle: {row['riddle']} Answer: {row['solution']}" for _, row in df.iterrows()]

# Step 3: Create a custom dataset
class RiddlesDataset(Dataset):
    def __init__(self, texts, tokenizer, block_size=128):
        self.examples = tokenizer(texts, truncation=True, padding="max_length", 
                                  max_length=block_size, return_tensors="pt")["input_ids"]
    
    def __len__(self):
        return len(self.examples)  # Ensure correct length
    
    def __getitem__(self, i):
        return {"input_ids": self.examples[i], "labels": self.examples[i]}

# Step 4: Load GPT-2 model and tokenizer
model_name = "gpt2"
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)

# Move model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Set pad_token_id to eos_token_id to avoid warnings
tokenizer.pad_token = tokenizer.eos_token
tokenizer.pad_token_id = tokenizer.eos_token_id

# Step 5: Prepare dataset for fine-tuning
dataset = RiddlesDataset(riddles_data, tokenizer)

# Ensure the dataset is not empty
assert len(dataset) > 0, "Dataset is empty! Check CSV file and tokenization."

# Step 6: Split dataset into train and validation sets
train_size = int(0.8 * len(dataset))  # 80% Training, 20% Validation
val_size = len(dataset) - train_size
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

# Data collator for dynamic batching
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Step 7: Set up training arguments with validation tracking
training_args = TrainingArguments(
    output_dir="./riddle_factory",
    overwrite_output_dir=True,
    num_train_epochs=10,           
    per_device_train_batch_size=2,
    learning_rate=5e-5,           
    warmup_steps=10,              
    save_steps=10_000,
    save_total_limit=2,
    logging_steps=5,              
    evaluation_strategy="epoch",  # Evaluate validation loss at the end of each epoch
)

# Step 8: Initialize Trainer with training and validation datasets
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,  
)

# Step 9: Train and track validation loss
trainer.train()

# Step 10: Save the fine-tuned model
model.save_pretrained("./riddle-factory-model")
tokenizer.save_pretrained("./riddle-factory-model")




Epoch,Training Loss,Validation Loss
1,3.1046,2.149464
2,1.7817,1.508261
3,1.2351,1.305371
4,1.0925,1.255582
5,0.8409,1.215276
6,0.7483,1.245532
7,0.763,1.23675
8,0.6059,1.266319
9,0.7094,1.291729
10,0.5672,1.304875


('./riddle-factory-model\\tokenizer_config.json',
 './riddle-factory-model\\special_tokens_map.json',
 './riddle-factory-model\\vocab.json',
 './riddle-factory-model\\merges.txt',
 './riddle-factory-model\\added_tokens.json')

In [26]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import torch
import re

# Load the fine-tuned model and tokenizer
model_name = "./riddle-factory-model"
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)

# Move model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Set pad_token_id to eos_token_id
tokenizer.pad_token = tokenizer.eos_token

def generate_riddle(prompt="Riddle:", max_length=100, num_return_sequences=5):
    input_ids = tokenizer.encode(prompt, return_tensors="pt").to(device)
    output = model.generate(
        input_ids, 
        max_length=max_length, 
        num_return_sequences=num_return_sequences,
        do_sample=True,  
        top_k=40,        
        top_p=0.9,      
        temperature=0.7, 
        no_repeat_ngram_size=2,
        pad_token_id=tokenizer.eos_token_id
    )
    
    riddles_with_answers = [tokenizer.decode(riddle, skip_special_tokens=True) for riddle in output]
    
    for i, text in enumerate(riddles_with_answers, 1):
        if "Answer:" in text:
            riddle_part, answer_part = text.split("Answer:", 1)
            # Extract first numeric answer
            numbers = re.findall(r'\d+(?:\.\d+)?', answer_part.split('.')[0])
            answer = numbers[0] if numbers else answer_part.split('.')[0].strip()
            print(f"Riddle {i}: {riddle_part.strip()}")
            print(f"Generated Answer: {answer}")
            # Basic validation
            try:
                x = float(answer)
                if "subtract" in riddle_part and "twice" in riddle_part:
                    num = float(re.search(r'twice (\d+)', riddle_part).group(1))
                    eq = float(re.search(r'get (\d+)', riddle_part).group(1))
                    sub = float(re.search(r'subtract (\d+)', riddle_part).group(1))
                    correct = x - sub == 2 * num
                    print(f"Correct? {correct} (Expected: {2 * num + sub})")
                elif "multiply" in riddle_part and "subtract" in riddle_part:
                    mult = float(re.search(r'multiply me by (\d+)', riddle_part).group(1))
                    sub = float(re.search(r'subtract (\d+)', riddle_part).group(1))
                    eq = float(re.search(r'get (\d+)', riddle_part).group(1))
                    correct = mult * x - sub == eq
                    print(f"Correct? {correct} (Expected: {(eq + sub) / mult})")
                elif "add" in riddle_part and "multiply" in riddle_part:
                    add = float(re.search(r'add (\d+)', riddle_part).group(1))
                    mult = float(re.search(r'multiply by (\d+)', riddle_part).group(1))  # Fixed syntax
                    eq = float(re.search(r'get (\d+)', riddle_part).group(1))
                    correct = mult * (x + add) == eq
                    print(f"Correct? {correct} (Expected: {(eq / mult) - add})")
                else:
                    print("Correct? Requires manual check")
            except:
                print("Correct? Requires manual check")
            print()
        else:
            print(f"Riddle {i}: {text.strip()}\nAnswer: Not generated.\n")

# Generate and display riddles with answers
generate_riddle()

SyntaxError: unmatched ')' (2001919508.py, line 58)






Step,Training Loss
5,2.8381
10,2.2242
15,1.6214


The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


Generated Riddles:
1. Riddle:  How do you get a new key to unlock the "Vampire's" key? Answer: The key is a number. How do I get the number?  Answer Answer : I add 12 to the key.  If I have 12, I am able to get my number by adding 6 to my key (or 9 if I do not add 6).  I then add my other number to it. Answer 2: I subtract the first number from me, and add
2. Riddle: ?"

Answer: "Yes."
.
-
Riddles: 1-2: What if I was a human?
Jade: Answer: 3. Answer? "What if you were a man?" Answer 3: I am a woman? Answer 4: No, I'm a Human? I can't be a Woman? (Answer 4)
Question: 4-9: When I die, am I a God? Answers: 5, 8,
3. Riddle:  How do you get from one point to another? Answer: Answer Number 1. _________________________________________________________________________
Question: How do I get to a point by one? ________Answer: I am now one; I will now get there by two.
Answer, number 3: Now, I'll get me to one by multiplying by 2; then I'm now two, and I can get back to number 1 by four?
________