In [1]:
import torch
import pandas as pd
from transformers import GPT2LMHeadModel, GPT2Tokenizer, TextDataset, DataCollatorForLanguageModeling, Trainer, TrainingArguments

## 1. Import the lyrics dataset

In [3]:
# Load the lyrics table; the code below reads its 'track_title' and 'lyric' columns.
d = pd.read_csv('taylor_swift_lyrics.csv', encoding='latin1')

# Build one training string per song in a single pass over the frame.
# - sort=False keeps songs in order of first appearance (same order as .unique()),
#   and groupby preserves row order inside each group.
# - Missing lyric cells are dropped so a NaN never ends up in the text;
#   rows without a title are skipped by groupby's default dropna behaviour.
# - Every lyric line is followed by ' \n ' and every song is terminated with
#   GPT-2's end-of-text marker.
text = [
    ''.join(f'{line} \n ' for line in lyrics.dropna()) + ' <|endoftext|>'
    for _, lyrics in d.groupby('track_title', sort=False)['lyric']
]

### Sample from the dataset

In [5]:
# Sanity check: show one assembled song string (index 5 of `text`).
print(text[5])

I didn't know what I would find 
 When I went looking for a reason, I know 
 I didn't read between the lines 
 And, baby, I've got nowhere to go 
 I tried to take the road less traveled by 
 But nothing seems to work the first few times 
 Am I right? 
 So how can I ever try to be better? 
 Nobody ever lets me in 
 I can still see you, this ain't the best view 
 On the outside looking in 
 I've been a lot of lonely places 
 I've never been on the outside 
 You saw me there, but never knew 
 I would give it all up to be 
 A part of this, a part of you 
 And now it's all too late so you see 
 You could've helped if you had wanted to 
 But no one notices until it's too 
 Late to do anything 
 So how can I ever try to be better? 
 Nobody ever lets me in 
 I can still see you, this ain't the best view 
 On the outside looking in 
 I've been a lot of lonely places 
 I've never been on the outside 
 So how can I ever try to be better? 
 Nobody ever lets me in 
 I can still see you, this ain't 

## 2. Fine-tuning the pretrained GPT-2 model

### Creating the dataset class

In [6]:
from torch.utils.data import Dataset
import torch.cuda
import torch

class CustomTextDataset(Dataset):
    """Fixed-length token blocks cut from a list of texts.

    Each text is tokenized in full and split into consecutive chunks of
    ``block_size`` tokens, so nothing past the first block is thrown away.
    The last chunk of a text is right-padded with the tokenizer's pad id and
    the padded positions are marked 0 in ``attention_mask``.

    Args:
        tokenizer: Object exposing ``encode(str) -> list[int]`` and
            ``pad_token_id``.
        text: Iterable of strings, one per document.
        block_size: Number of tokens in every example; must be positive.

    Raises:
        ValueError: If ``block_size`` is not positive or the tokenizer has no
            pad token id.
    """

    def __init__(self, tokenizer, text, block_size):
        if block_size <= 0:
            raise ValueError("block_size must be a positive integer")
        pad_id = tokenizer.pad_token_id
        if pad_id is None:
            raise ValueError("tokenizer.pad_token must be set before building the dataset")

        id_rows, mask_rows = [], []
        for document in text:
            token_ids = tokenizer.encode(document)
            # Walk the document block by block instead of truncating it.
            for start in range(0, len(token_ids), block_size):
                chunk = token_ids[start:start + block_size]
                n_pad = block_size - len(chunk)
                id_rows.append(chunk + [pad_id] * n_pad)
                mask_rows.append([1] * len(chunk) + [0] * n_pad)

        # reshape keeps a (num_examples, block_size) layout even with zero examples.
        self.examples = {
            "input_ids": torch.tensor(id_rows, dtype=torch.long).reshape(-1, block_size),
            "attention_mask": torch.tensor(mask_rows, dtype=torch.long).reshape(-1, block_size),
        }

    def __len__(self):
        """Return the number of fixed-length examples."""
        return len(self.examples["input_ids"])

    def __getitem__(self, idx):
        """Return the ``idx``-th example as a dict of 1-D tensors."""
        item = {key: val[idx] for key, val in self.examples.items()}
        return item

### Model training

In [7]:
def fine_tune_gpt2(output_dir, model_name="gpt2", epochs=1, batch_size=2, learning_rate=1e-4,
                   texts=None, block_size=128):
    """Fine-tune a pretrained GPT-2 checkpoint on a list of strings.

    Args:
        output_dir: Directory that receives checkpoints, the final model and
            the tokenizer files.
        model_name: Name or path of the pretrained checkpoint to start from.
        epochs: Number of training epochs.
        batch_size: Per-device training batch size.
        learning_rate: Learning rate passed to ``TrainingArguments``.
        texts: Training strings. When omitted, the notebook-level ``text``
            list is used, so existing calls keep working.
        block_size: Token length of each training example.

    Raises:
        ValueError: If there are no training texts.
    """
    if texts is None:
        texts = text  # fall back to the corpus built earlier in the notebook
    if len(texts) == 0:
        raise ValueError("no training texts were provided")

    # Load the pretrained GPT-2 model and tokenizer
    tokenizer = GPT2Tokenizer.from_pretrained(model_name)
    model = GPT2LMHeadModel.from_pretrained(model_name, pad_token_id=tokenizer.eos_token_id)
    # GPT-2 ships without a pad token; reuse EOS so padding is possible.
    tokenizer.pad_token = tokenizer.eos_token

    dataset = CustomTextDataset(tokenizer, texts, block_size=block_size)

    # Create a data collator for language modeling (mlm=False -> causal LM)
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=False
    )

    # Define the training arguments
    training_args = TrainingArguments(
        output_dir=output_dir,
        overwrite_output_dir=True,
        num_train_epochs=epochs,
        per_device_train_batch_size=batch_size,
        learning_rate=learning_rate,
        save_steps=500,  # Save checkpoints every 500 steps
        save_total_limit=2,  # Only keep the last 2 checkpoints
        # Mixed precision needs a CUDA device; fall back to fp32 elsewhere
        # instead of failing on CPU-only machines.
        fp16=torch.cuda.is_available()
    )

    # Create a Trainer instance
    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=dataset
    )

    # Fine-tune the model
    trainer.train()

    # Save the fine-tuned model together with its tokenizer so the output
    # directory can be loaded without knowing which checkpoint it came from.
    trainer.save_model(output_dir)
    tokenizer.save_pretrained(output_dir)

    print("Fine-tuning complete. Model saved to:", output_dir)


# Directory for checkpoints and the final model; reused below when loading it back.
output_dir = "../models/ts1"

# Run fine-tuning with the default hyperparameters.
fine_tune_gpt2(output_dir)



Step,Training Loss


Fine-tuning complete. Model saved to: ../models/ts1


## 3. Generating new lyrics

### Load the fine-tuned model

In [8]:
def load_fine_tuned_model(model_dir):
    """Load a fine-tuned GPT-2 model and a matching tokenizer.

    The tokenizer is read from ``model_dir`` when tokenizer files are stored
    there; otherwise the stock ``'gpt2'`` tokenizer is used.

    Args:
        model_dir: Directory containing the saved model.

    Returns:
        A ``(model, tokenizer)`` tuple; the tokenizer's pad token is set to
        its EOS token.
    """
    model = GPT2LMHeadModel.from_pretrained(model_dir)
    try:
        # Prefer the tokenizer saved next to the model so the two always match.
        tokenizer = GPT2Tokenizer.from_pretrained(model_dir)
    except OSError:
        # No tokenizer files in model_dir: fall back to the base tokenizer.
        tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
    tokenizer.pad_token = tokenizer.eos_token

    return model, tokenizer

# Reload the model saved in `output_dir` by the training step above.
model, tokenizer = load_fine_tuned_model(output_dir)

### Generate Lyrics

In [9]:
def generate_text(model, tokenizer, prompt_text, max_length=300):
    """Sample a continuation of ``prompt_text`` with nucleus (top-p) sampling.

    Args:
        model: Model exposing ``generate``.
        tokenizer: Tokenizer used to encode the prompt and decode the output.
        prompt_text: Text to continue. When empty, the tokenizer's BOS token
            is used as the prompt.
        max_length: Maximum total length in tokens, prompt included.

    Returns:
        The decoded text (prompt plus continuation) with special tokens removed.

    Raises:
        ValueError: If ``prompt_text`` is empty and the tokenizer has no BOS token.
    """
    if not prompt_text:
        if tokenizer.bos_token is None:
            raise ValueError("prompt_text is empty and the tokenizer defines no BOS token")
        prompt_text = tokenizer.bos_token

    encoded = tokenizer(prompt_text, return_tensors="pt")
    input_ids = encoded["input_ids"]
    attention_mask = encoded["attention_mask"]

    # Keep the inputs on the same device as the model.
    device = getattr(model, "device", None)
    if device is not None:
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)

    # Pad and EOS share an id in this notebook, so pass the mask and pad id
    # explicitly rather than letting generate() guess them.
    pad_id = tokenizer.pad_token_id
    if pad_id is None:
        pad_id = tokenizer.eos_token_id

    output = model.generate(
        input_ids,
        attention_mask=attention_mask,
        do_sample=True,
        max_length=max_length,
        top_p=0.92,
        top_k=0,  # disable top-k so only top-p filtering applies
        pad_token_id=pad_id,
    )
    generated_text = tokenizer.decode(output[0], skip_special_tokens=True)

    return generated_text

In [12]:
# Seed text for the sampler; the fine-tuned model continues from here.
prompt = "beautiful summer"
generated_text = generate_text(model, tokenizer, prompt)

# Header line followed by the sampled lyrics.
print("Generated Text:", generated_text, sep="\n")

Generated Text:
beautiful summer 

 And what a beautiful moment of arrival 
 When something terrible happened 
 My own pain was wanting to smile 
 When it came crashing down 
 And I stood alone, my heart pounding 
 Let alone you 
 But I did my best to convince you 
 But me like that 
 So that's the way it ended 
 And it was good 
 So don't look back, though 
 Because I didn't have the drive 
 Because I had to ask 
 'Cause you keep lying 
 And it'll go either way 
 And you keep going, and you keep going, and you get lost in the beat of eternity 
 And I 1910~05 
 I took my woman camping, and 
 And after a long ride home, we were still at the dock when we spotted a blue light 
 And what a love story 
 Don't believe me 
 Just like you always tell me 
 But just like you always tell me 
 So how long ago 
 Come home in the back yard and tell me the story 
 But you still lie 
 Just like you always tell me 
 You're not the best time 
 But just like me like you always tell me 
 And never you'll 