In [34]:
### code to clean up data

file_path = 'OneDrive/Dokumente/UH/Fall 24/ICS 661 AI/assignment 3/data'
jokes = []

# Read the raw file and collect the joke text of every row after the first.
# The encoding is pinned so the result does not depend on the platform's
# default text encoding.
# NOTE(review): assumes the raw file is UTF-8 -- confirm against the data source.
with open(file_path, "r", encoding="utf-8") as file:
    for i, line in enumerate(file):
        if i == 0:
            continue  # first line is skipped (presumably a column header -- confirm)
        row = line.strip()  # strip once and reuse, instead of twice per row
        # Keep the text that starts two characters after the first comma and
        # ends one character before the end of the row (presumably this drops
        # an id column and the quotes wrapping the joke -- confirm against the
        # raw data). If a row has no comma, find() returns -1 and the slice
        # becomes row[1:-1].
        jokes.append(row[row.find(',') + 2:-1])

# Write one joke per line, again with an explicit encoding so the file that
# the fine-tuning step reads has a known encoding on every platform.
with open('joke_data.txt', "w", encoding="utf-8") as file:
    for joke in jokes:
        file.write(joke + "\n")

In [49]:
### GPT-2 fine tuning

from transformers import GPT2LMHeadModel, GPT2Tokenizer, Trainer, TrainingArguments, TextDataset, DataCollatorForLanguageModeling, StoppingCriteriaList, MaxLengthCriteria

import torch

# Run on CUDA when torch reports it as available, otherwise on the CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Load the pre-trained GPT-2 tokenizer and LM-head model from the same checkpoint name
model_name = "gpt2"
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)
model = model.to(device)  # move the weights to the selected device

# Use the end-of-sequence id as the padding id in the model config
model.config.pad_token_id = model.config.eos_token_id

# Load and preprocess the dataset
def load_dataset(filepath, tokenizer, block_size=128):
    """Build a ``TextDataset`` over the text file at ``filepath``.

    Args:
        filepath: Path of the text file, forwarded as ``file_path``.
        tokenizer: Tokenizer forwarded to ``TextDataset``.
        block_size: Value forwarded as ``block_size`` (default 128).

    Returns:
        The constructed ``TextDataset`` instance.
    """
    dataset_kwargs = {
        "tokenizer": tokenizer,
        "file_path": filepath,
        "block_size": block_size,
    }
    return TextDataset(**dataset_kwargs)

# Create data collator for language modeling (mlm=False: no masked-LM objective)
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=False
)

# Load dataset
train_dataset = load_dataset("joke_data.txt", tokenizer)

# One name for the checkpoint directory so the training arguments and the two
# save calls below cannot drift apart.
OUTPUT_DIR = "./gpt2-joke-finetune"

# Set training arguments
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    overwrite_output_dir=True,
    num_train_epochs=10,
    per_device_train_batch_size=4,
    save_steps=10_000,
    save_total_limit=2,
    prediction_loss_only=True,
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
)

# Fine-tune model
trainer.train()

# Save the fine-tuned model and its tokenizer side by side
model.save_pretrained(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)

# Training leaves the model in train mode; switch to eval mode so dropout is
# disabled for any generation that follows.
model.eval()

# Function to generate jokes from a prompt
def generate_joke(prompt, max_length=50):
    """Sample one continuation of ``prompt`` and return its first line.

    Uses the module-level ``model``, ``tokenizer`` and ``device``.

    Args:
        prompt: Text the generation starts from.
        max_length: Value forwarded to ``model.generate`` as ``max_length``
            (default 50).

    Returns:
        The decoded text up to, but not including, the first newline.
    """
    # Make sure dropout is off while sampling: the model may still be in
    # train mode if this is called right after fine-tuning.
    model.eval()

    input_ids = tokenizer.encode(prompt, return_tensors="pt").to(device)
    # A single unpadded prompt, so every position is attended to. ones_like
    # keeps the integer dtype and device of input_ids.
    attention_mask = torch.ones_like(input_ids)

    # Generate tokens
    gen_tokens = model.generate(
        input_ids,
        attention_mask=attention_mask,
        max_length=max_length,
        num_return_sequences=1,
        do_sample=True,
        temperature=0.7,
        eos_token_id=tokenizer.eos_token_id,  # Stop at end-of-sequence
        # Passed explicitly so generation does not depend on how the pad id
        # set on model.config is propagated.
        pad_token_id=tokenizer.eos_token_id,
    )
    gen_text = tokenizer.decode(gen_tokens[0], skip_special_tokens=True)
    return gen_text.split("\n")[0]  # Return only the first joke by splitting on newline




Step,Training Loss


Why did the cow suffer from? Because she was too dirty.


In [51]:
### code to generate jokes
input_strings = [
    'If you can',
    'What do you',
    'Why do you',
    'How do you',
    'What is a',
    'Do you want',
    'Why did the',
    'Have you heard',
    'How did the',
]

# Ten passes over the prompt list; each pass prints one generated joke per prompt
for _ in range(10):
    for prompt in input_strings:
        print(generate_joke(prompt))

If you can't get enough of the food you eat, you can't get enough of the wine you drink.
What do you see with the other two letters? With the letters
Why do you get a girl in a dress? Because she's a giraffe.
How do you get to know someone with a passion for math? You can!
What is a dog's favorite food? Dog food
Do you want? A*hole?
Why did the German do? He was in the bath.
Have you heard about the robot who took the most showers? It's just a robot.
How did the turtle eat? A chicken.
If you can't afford to get fired? You can't afford to get fired.
What do you think is going to make a sausage? It's made of meat, but it's kind of cheesy
Why do you call a frog who's a little more sweet? He's a frog.
How do you get a baby frog? A fish.
What is a dog's favorite restaurant? The Jokes
Do you want to know what type of car you are in? A truck.
Why did the cow say, Oh yuck, I'm so sorry? I'm so sorry.
Have you heard of this one? It's called a ghost scare.
How did the bartender say he was going 

In [50]:
### code for generating jokes and checking if they are in the dataset

input_strings = ['What do you','Why do you','How do you', 'What is a', 'Do you want','Why did the','Have you heard', 'How did the']
similar_jokes = []  # dataset jokes that contain a generated joke, each stored once
n = 100  # generations per prompt

total = n * len(input_strings)

for k, in_string in enumerate(input_strings):
    for i in range(n):
        gen_joke = generate_joke(in_string)

        # An empty or whitespace-only generation is a substring of (almost)
        # every joke and would flood the result, so it is not compared.
        if gen_joke.strip():
            for joke in jokes:
                # Deduplicate on the value that is actually stored (the dataset
                # joke), so one dataset joke is never counted more than once.
                if gen_joke in joke and joke not in similar_jokes:
                    similar_jokes.append(joke)

        # 1-based counter so the final line reads "total / total"
        print(f"Progress: {k*n+i+1} / {total}. Jokes found: {len(similar_jokes)}", end = '\r')

Progress: 799 / 800. Jokes found: 0