In [None]:
!pip install datasets


Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.5.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.2/491.2 kB[0m [31m28.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m11.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.12.0-py3-none-any.

In [None]:
import os

# Set the WANDB_DISABLED flag in the process environment so that the
# Weights & Biases integration is switched off for the training run below.
os.environ.update({"WANDB_DISABLED": "true"})



In [None]:
import pandas as pd
from datasets import Dataset
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments, DataCollatorForLanguageModeling
from transformers import pipeline
import torch
import os


In [None]:
# Read the lyrics spreadsheet and drop rows whose LYRICS cell is missing
df = pd.read_excel("./TaylorSwiftSongs.xlsx").dropna(subset=["LYRICS"])

# Keep only rows whose LYRICS cell is a string with non-whitespace content
df = df.loc[
    df["LYRICS"].apply(lambda cell: isinstance(cell, str) and bool(cell.strip()))
]
lyrics_texts = df["LYRICS"].to_list()

# Wrap the lyrics in a single-column ("text") HuggingFace Dataset
dataset = Dataset.from_dict(dict(text=lyrics_texts))

In [None]:
# Pretrained GPT-2 checkpoint with its language-modeling head
model = GPT2LMHeadModel.from_pretrained("gpt2")

# Matching GPT-2 tokenizer. GPT-2 has no padding token by default, so the
# end-of-sequence token is reused as the padding token.
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token

In [None]:
# Tokenization
def tokenize(example, max_length=128):
    """Tokenize a batch of lyrics into fixed-length training windows.

    Each song is tokenized in full and then cut into consecutive windows of
    ``max_length`` tokens. Lyrics beyond the first window therefore become
    additional rows instead of being truncated away. The last (shorter)
    window of a song is right-padded with the tokenizer's pad token, and the
    attention mask marks the padded positions with 0.

    Args:
        example: A batch as passed by ``Dataset.map(..., batched=True)``;
            ``example["text"]`` is a list of lyric strings.
        max_length: Number of tokens in every returned row.

    Returns:
        A dict with ``"input_ids"`` and ``"attention_mask"``, each a list of
        lists of length ``max_length``. The number of rows may exceed the
        number of input songs.
    """
    # No truncation here: the full token sequence is needed for windowing.
    encoded = tokenizer(example["text"])
    pad_id = tokenizer.pad_token_id

    input_ids, attention_mask = [], []
    for ids in encoded["input_ids"]:
        for start in range(0, len(ids), max_length):
            window = ids[start:start + max_length]
            n_pad = max_length - len(window)
            input_ids.append(window + [pad_id] * n_pad)
            attention_mask.append([1] * len(window) + [0] * n_pad)
    return {"input_ids": input_ids, "attention_mask": attention_mask}

# One song can expand into several rows, so the original columns are removed
# to let the mapped dataset have a different number of rows than the input.
tokenized_dataset = dataset.map(
    tokenize,
    batched=True,
    remove_columns=dataset.column_names,
)

# Data collator for causal (non-masked) language modeling
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

Map:   0%|          | 0/125 [00:00<?, ? examples/s]

In [None]:
# Training arguments
training_args = TrainingArguments(
    # Directory that receives checkpoints; existing contents may be overwritten.
    output_dir="./gpt2-taylorswift",
    overwrite_output_dir=True,
    per_device_train_batch_size=2,
    num_train_epochs=3,
    # Report the training loss every 50 steps, checkpoint every 200 steps,
    # and keep at most one checkpoint on disk.
    logging_steps=50,
    save_steps=200,
    save_total_limit=1,
    prediction_loss_only=True,
    # Mixed-precision training is enabled only when a CUDA device is available.
    fp16=torch.cuda.is_available(),
    # Turn off all logging integrations explicitly. This is the supported
    # replacement for the deprecated WANDB_DISABLED environment variable.
    report_to="none",
)

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [None]:
# Trainer setup: fine-tune `model` on the tokenized lyrics using the
# training arguments and the language-modeling collator defined above.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    # `tokenizer=` is deprecated on Trainer; `processing_class` is the
    # replacement keyword and is passed the same tokenizer object.
    processing_class=tokenizer,
    data_collator=data_collator,
)

# Train model
trainer.train()

# Save the fine-tuned weights and the tokenizer files to the same directory
# so the checkpoint can be reloaded from disk by path.
trainer.save_model("./gpt2-taylorswift")
tokenizer.save_pretrained("./gpt2-taylorswift")


  trainer = Trainer(
`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Step,Training Loss
50,3.2049
100,2.9029
150,2.7369


('./gpt2-taylorswift/tokenizer_config.json',
 './gpt2-taylorswift/special_tokens_map.json',
 './gpt2-taylorswift/vocab.json',
 './gpt2-taylorswift/merges.txt',
 './gpt2-taylorswift/added_tokens.json')

In [None]:
# Sample predictions: build a text-generation pipeline from the fine-tuned
# checkpoint saved on disk, reusing the in-memory tokenizer.
generator = pipeline(
    'text-generation',
    model='./gpt2-taylorswift',
    tokenizer=tokenizer,
)

# Opening lines that the model is asked to continue
prompts = [
    "We were both young when I first saw you",
    "I remember tears streaming down your face",
    "Loving him is like driving a new Maserati",
    "we were both young when I first saw you",
]

for prompt in prompts:
    print(f"\nPrompt: {prompt}")
    # One continuation per prompt, capped at 100 tokens in total
    results = generator(prompt, max_length=100, num_return_sequences=1)
    output = results[0]["generated_text"]
    print(f"Generated Lyrics:\n{output}")

Device set to use cpu



Prompt: We were both young when I first saw you
Generated Lyrics:
We were both young when I first saw you
You were my beautiful, blue eyed boyfriend, who lived just minutes away from me
You were my only love, a small family with me, who would kiss, drink
Baby, come back again if you do
Baby, if you do it ever again
Baby, if you do it ever again
Baby-even if you don't, you'd take it back if nobody knew

When I didn't find you, my old secret

Prompt: I remember tears streaming down your face
Generated Lyrics:
I remember tears streaming down your face as you gave me the lead over the cliff
I remember feeling the first blood when you put the gun in my face
I remember your laugh and how you said sorry
And 'cause we've had that moment

We just got a little under our skin
It was a pretty sight
Takeaway, I got my first girlfriend tonight
We danced tonight on a rooftop
That kind of makes me wonder if I can be a better girlfriend than you


Prompt: Loving him is like driving a new Maserati
Gene

In [None]:
# Text-generation pipeline backed by the fine-tuned checkpoint on disk
generator = pipeline(
    'text-generation',
    model='./gpt2-taylorswift',
    tokenizer=tokenizer,
)

# A second set of opening lines to continue
prompts = [
    "Nice to meet you,where you been",
    "I'm pretty sure we broke up last night",
]

for prompt in prompts:
    print(f"\nPrompt: {prompt}")
    # One continuation per prompt, capped at 100 tokens in total
    results = generator(prompt, max_length=100, num_return_sequences=1)
    output = results[0]["generated_text"]
    print(f"Generated Lyrics:\n{output}")

Device set to use cpu



Prompt: Nice to meet you,where you been
Generated Lyrics:
Nice to meet you,where you been all day and how I feel?
I know I feel better now.
But I can't stop thinking about you now
Because it's been a long time since I've met you
I can see you in my past
I've been a pretty easy girl when I'm dating you,
But don't see me looking ahead anymore

[Chorus]
Oh, oh, oh
I can see you, you can see my eyes

Prompt: I'm pretty sure we broke up last night
Generated Lyrics:
I'm pretty sure we broke up last night."
And then the doorbell rings and the rest is history
In that picture there's something on your phone that's a big deal
But this time it's a little more strange, so I try not to
I've stopped thinking about it
And to think that this is the end
[Chorus]
It's pretty clear that nothing's changed
Your boyfriend's not my boyfriend anymore
She's my girlfriend (the last time
