In [None]:
import pandas as pd
import torch
from transformers import (
    GPT2Tokenizer,
    GPT2LMHeadModel,
    Trainer,
    TrainingArguments
)
from datasets import Dataset

In [None]:
# Read the haiku CSV; columns '0', '1' and '2' are joined with newlines below to form
# one text per row.
data_path = "all_haiku.csv"
df = pd.read_csv(data_path)

# Drop rows where any of the three parts is missing: astype(str) would otherwise turn
# NaN into the literal text "nan", which would end up in the training data.
df = df.dropna(subset=['0', '1', '2']).reset_index(drop=True)
df['haiku'] = df['0'].astype(str) + "\n" + df['1'].astype(str) + "\n" + df['2'].astype(str)

print(df[['haiku']].head())

# preserve_index=False keeps the pandas index from becoming an extra dataset column,
# so "haiku" is the only column handed to the tokenization step.
haiku_dataset = Dataset.from_pandas(df[['haiku']], preserve_index=False)

In [None]:
# Checkpoint identifier shared by the tokenizer here and the model loaded later.
model_name = "gpt2"

# Load the matching tokenizer, then reuse the end-of-sequence token as the padding
# token so the max-length padding requested during tokenization has a token to use.
tokenizer = GPT2Tokenizer.from_pretrained(pretrained_model_name_or_path=model_name)
tokenizer.pad_token = tokenizer.eos_token

def tokenize_function(examples):
    """Tokenize a batch of haiku and build causal-LM labels for it.

    Args:
        examples: Batch mapping whose "haiku" entry is a list of strings
            (the shape `Dataset.map(..., batched=True)` passes in).

    Returns:
        The tokenizer output (every sequence padded/truncated to 128 tokens)
        with an added "labels" entry. Labels equal `input_ids` on real tokens
        and -100 on padding positions, so padding is excluded from the loss.
    """
    # The pad token is the EOS token, so once padding is masked out of the labels the
    # model would never see a "stop" target. Append one real EOS per poem instead.
    texts = [text + tokenizer.eos_token for text in examples["haiku"]]
    outputs = tokenizer(
        texts,
        truncation=True,
        padding="max_length",
        max_length=128
    )
    # attention_mask is 1 on real tokens (including the appended EOS) and 0 on padding;
    # -100 is the label value the Transformers loss ignores.
    outputs["labels"] = [
        [token_id if is_real else -100 for token_id, is_real in zip(ids, mask)]
        for ids, mask in zip(outputs["input_ids"], outputs["attention_mask"])
    ]
    return outputs

# Tokenize the corpus in batches and drop the raw "haiku" text column, then have the
# dataset return its columns as torch tensors.
tokenized_dataset = haiku_dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=["haiku"],
)
tokenized_dataset.set_format(type="torch")

In [None]:
# Load the pretrained causal-LM weights for the same checkpoint as the tokenizer.
model = GPT2LMHeadModel.from_pretrained(model_name)
# Keep the embedding matrix sized to the tokenizer's vocabulary.
model.resize_token_embeddings(len(tokenizer))

training_args = TrainingArguments(
    output_dir="./gpt2-haiku",
    overwrite_output_dir=True,
    num_train_epochs=1,
    per_device_train_batch_size=4,
    save_steps=100,
    logging_steps=10,
    # No evaluation is run. The strategy keyword is deliberately omitted: "no" is the
    # default, and the keyword was renamed from `evaluation_strategy` to
    # `eval_strategy`, so spelling either one out breaks on some Transformers versions.
    save_total_limit=2,  # keep only the two most recent checkpoints on disk
    fp16=torch.cuda.is_available(),  # mixed precision only when a GPU is present
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
)

trainer.train()

In [None]:
# google.colab is only importable inside a Colab runtime. Guard the import so the
# notebook still runs elsewhere instead of failing with ImportError.
try:
    from google.colab import drive
except ImportError:
    # Not on Colab: save next to the notebook instead of on Google Drive.
    save_path = './gpt2-haiku-final'
else:
    drive.mount('/content/drive')
    save_path = '/content/drive/MyDrive/gpt2-haiku'

# Save model and tokenizer to the same directory so both can be reloaded from it.
model.save_pretrained(save_path)
tokenizer.save_pretrained(save_path)

In [None]:
if torch.cuda.is_available():
    model = model.to("cuda")
# The model may still be in training mode after trainer.train(); switch to eval mode so
# dropout is disabled while sampling.
model.eval()

# Provide a prompt to guide the haiku generation. You can experiment with different prompts.
prompt = "In the silent night,"
encoded_prompt = tokenizer(prompt, return_tensors="pt")
# Put the inputs on whatever device the model lives on.
input_ids = encoded_prompt.input_ids.to(model.device)
attention_mask = encoded_prompt.attention_mask.to(model.device)

# Generate text. Adjust parameters like max_length, top_k, or do_sample to control creativity.
# attention_mask and pad_token_id are passed explicitly: the pad token is the EOS token,
# so generate() cannot reliably infer which positions are padding on its own.
output = model.generate(
    input_ids,
    attention_mask=attention_mask,
    pad_token_id=tokenizer.pad_token_id,
    max_length=50,
    num_return_sequences=1,
    no_repeat_ngram_size=2,
    do_sample=True,
    top_k=50,
    temperature=0.7
)

generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
print("\nGenerated Haiku:\n", generated_text)