# GPT Model w/o Descriptions

In [1]:
import pandas as pd
from datasets import Dataset
import ast
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from transformers import Trainer, TrainingArguments
import os
import csv

  from .autonotebook import tqdm as notebook_tqdm


In [ ]:
# Read the two playlist tables from the working directory. `df` feeds the
# training pipeline below; `df2` is (judging by its file name) the variant
# that also carries lyrics.
df2 = pd.read_csv("Playlist_data_with_lyrics.csv")
df = pd.read_csv("playlist_data.csv")

In [ ]:
import pandas as pd
from datasets import Dataset
import ast

# Load and parse the dataset

# The song and artist columns come back from the CSV as stringified Python
# lists. Turn every string cell back into a real list and leave any
# non-string cell (e.g. a missing value) exactly as it is.
for col in ('Playlist_Songs', 'Playlist_Artists'):
    df[col] = df[col].apply(
        lambda cell: cell if not isinstance(cell, str) else ast.literal_eval(cell)
    )

## Training Targets 1st Iteration, 3 epoch Loss ~ 3, 5 epoch Loss ~ 2.5
# Create prompt + output text for GPT-2 training
# def format_example(row):
#     song_lines = [f"{s} - {a}" for s, a in zip(row['Playlist_Songs'], row['Playlist_Artists'])]
#     return f"### Prompt: {row['Playlist_Name']}\n### Playlist:\n" + "\n".join(song_lines)

## Training Targets 2nd Iteration, 5 epoch Loss ~ 1.3
# def format_example(row):
#     lines = [f"[SONG] {s} [ARTIST] {a}" for s, a in zip(row["Playlist_Songs"], row["Playlist_Artists"])]
#     return f"### Prompt: {row['Playlist_Name']}\n### Playlist:\n" + "\n".join(lines)

## Training Targets 3rd Iteration (adds Playlist_Description to the prompt), 5 epoch Loss ~ (not recorded)
def format_example(row):
    """Render one playlist record as a single GPT-2 training string.

    Args:
        row: Mapping-like record (a DataFrame row or a plain dict) providing
            ``Playlist_Name``, ``Playlist_Description``, ``Playlist_Songs``
            and ``Playlist_Artists``. Songs and artists are paired in
            lockstep, so a surplus entry on either side is dropped by ``zip``.

    Returns:
        ``### Prompt: <name>``, then a ``### Description: <description>``
        line when a description is present, then ``### Playlist:`` followed
        by one ``[SONG] <song> [ARTIST] <artist>`` line per track.
    """
    lines = [
        f"[SONG] {song} [ARTIST] {artist}"
        for song, artist in zip(row["Playlist_Songs"], row["Playlist_Artists"])
    ]
    playlist_body = "\n".join(lines)

    description = row['Playlist_Description']
    # pandas reads an empty CSV cell as NaN (a float that is not equal to
    # itself). Formatting that value would put the literal text "nan" into
    # the training data, so treat None, NaN and blank text as "no description".
    description_missing = (
        description is None
        or (isinstance(description, float) and description != description)
        or not str(description).strip()
    )

    # Include Playlist_Description in the prompt for training only. When it
    # is missing the line is dropped entirely, which leaves the example in
    # the plain "Prompt / Playlist" layout.
    header = f"### Prompt: {row['Playlist_Name']}\n"
    if not description_missing:
        header += f"### Description: {description}\n"
    return f"{header}### Playlist:\n{playlist_body}"

# Render every row into its training string and keep it on the frame.
df['text'] = df.apply(format_example, axis=1)

# Wrap the rendered strings in a Hugging Face Dataset with a single
# "text" column.
dataset = Dataset.from_dict(
    {"text": df['text'].tolist()},
)

In [ ]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel

# Both pieces start from the pretrained "gpt2-medium" checkpoint.
tokenizer = GPT2Tokenizer.from_pretrained("gpt2-medium")
model = GPT2LMHeadModel.from_pretrained("gpt2-medium")

# GPT-2 has no dedicated padding token, so end-of-sequence doubles as padding.
tokenizer.pad_token = tokenizer.eos_token

# Keep the embedding matrix in step with the tokenizer's vocabulary size;
# this only matters if special tokens get added to the tokenizer.
model.resize_token_embeddings(len(tokenizer))

# Tokenize
def _labels_ignoring_padding(input_ids, attention_mask):
    """Build causal-LM labels for one sequence with padding masked out.

    A position keeps its token id when it is attended to, or when it directly
    follows an attended position. The latter is the first pad/eos after the
    text, kept so the model still learns where a playlist ends. Every other
    padding position becomes -100, which the loss ignores.
    """
    labels = []
    previous_attended = False
    for token_id, attended in zip(input_ids, attention_mask):
        labels.append(token_id if attended or previous_attended else -100)
        previous_attended = bool(attended)
    return labels


def tokenize(batch):
    """Tokenize training texts and attach labels for causal-LM fine-tuning.

    Args:
        batch: Mapping whose "text" entry is a list of strings (what
            ``Dataset.map(..., batched=True)`` passes in) or a single string.

    Returns:
        The tokenizer output, padded/truncated to 512 tokens, with an extra
        "labels" entry mirroring "input_ids" except that padding is -100.
    """
    encodings = tokenizer(batch["text"], padding="max_length", truncation=True, max_length=512)

    input_ids = encodings["input_ids"]
    attention_mask = encodings["attention_mask"]
    # The pad token is the eos token, so a plain copy of input_ids would make
    # every padding position count toward the loss and flatter the reported
    # loss. Mask those positions instead.
    if input_ids and isinstance(input_ids[0], list):
        encodings["labels"] = [
            _labels_ignoring_padding(ids, mask)
            for ids, mask in zip(input_ids, attention_mask)
        ]
    else:
        # Single un-batched example: input_ids is one flat list of ids.
        encodings["labels"] = _labels_ignoring_padding(input_ids, attention_mask)
    return encodings
    
# Run the tokenizer over the dataset in batches, then expose only the three
# model inputs, formatted as torch tensors.
tokenized_dataset = dataset.map(tokenize, batched=True)
tokenized_dataset.set_format(
    type="torch",
    columns=["input_ids", "attention_mask", "labels"],
)

In [ ]:
from transformers import Trainer, TrainingArguments
import torch

# Fine-tuning configuration for the playlist model.
training_args = TrainingArguments(
    output_dir="./gpt2_playlist_model_w_Descriptions",
    overwrite_output_dir=True,
    num_train_epochs=5,
    per_device_train_batch_size=2,
    # Checkpointing: save every 500 steps, capped by save_total_limit.
    save_steps=500,
    save_total_limit=2,
    # Logging: report every 50 steps into ./logs.
    logging_dir="./logs",
    logging_steps=50,
    # Use mixed precision only when a CUDA device is available.
    fp16=torch.cuda.is_available(),
)

# The tokenized dataset already carries input_ids, attention_mask and labels,
# so the Trainer needs no extra collator here.
trainer = Trainer(
    model=model,
    train_dataset=tokenized_dataset,
    args=training_args,
)

trainer.train()

In [ ]:
def generate_playlist(prompt, max_length=200):
    """Sample a playlist from the fine-tuned model for a free-text prompt.

    The prompt is wrapped in the "### Prompt / ### Playlist" layout; no
    description line is supplied at inference time.

    Args:
        prompt: Free-text playlist name or idea.
        max_length: Passed to ``model.generate`` as ``max_length``.

    Returns:
        The decoded text that follows the "### Playlist:" marker, with
        surrounding whitespace stripped.
    """
    input_text = f"### Prompt: {prompt}\n### Playlist:\n"
    encoded = tokenizer(input_text, return_tensors="pt")

    # Run on whatever device the model currently lives on. A hard-coded
    # 'cuda' fails on CPU-only machines and whenever the model is on CPU.
    device = model.device

    output = model.generate(
        input_ids=encoded["input_ids"].to(device),
        # pad == eos for this tokenizer, so the mask cannot be inferred
        # reliably from the ids; pass it explicitly.
        attention_mask=encoded["attention_mask"].to(device),
        max_length=max_length,
        temperature=0.9,
        top_p=0.95,
        do_sample=True,
        num_return_sequences=1,
        pad_token_id=tokenizer.eos_token_id
    )

    result = tokenizer.decode(output[0], skip_special_tokens=True)
    # Element [1] is the text after the prompt's own marker, up to any
    # further marker the model may have generated.
    return result.split("### Playlist:\n")[1].strip()

# Smoke test: sample one playlist for an arbitrary free-text prompt.
prompt = "James broke his computer"
print(generate_playlist(prompt=prompt))

In [ ]:
## Import Saved Model and use with generate_playlist function
from transformers import GPT2LMHeadModel, GPT2Tokenizer

import torch

checkpoint_dir = "./gpt2_playlist_model_w_Descriptions/checkpoint-5725"

model = GPT2LMHeadModel.from_pretrained(checkpoint_dir)
try:
    tokenizer = GPT2Tokenizer.from_pretrained(checkpoint_dir)
except OSError:
    # The Trainer in this notebook is created without a tokenizer, so its
    # checkpoints may hold no tokenizer files. No tokens were added during
    # fine-tuning, so the base tokenizer is an exact substitute.
    tokenizer = GPT2Tokenizer.from_pretrained("gpt2-medium")
    tokenizer.pad_token = tokenizer.eos_token  # GPT-2 doesn't have a pad token

# from_pretrained leaves the weights on CPU; move them to the GPU when one
# is available so generation does not mix devices.
model = model.to("cuda" if torch.cuda.is_available() else "cpu")

In [10]:
# Step up one directory and echo the resulting working directory.
os.chdir(os.pardir)
os.getcwd()

'/sfs/gpfs/tardis/home/nuf8ms/Documents/MSDS/LLM/DS6051-Project'

In [6]:
# NOTE(review): os.path.join() requires at least one path argument, so this
# call raises TypeError as written. The output recorded for this cell is a
# list of checkpoint directory names, which looks like the result of an
# os.listdir(...) call — confirm the intended call and its target directory.
os.path.join()

['checkpoint-5725', 'checkpoint-5500']

# GPT Model w/ Descriptions