In [1]:
import torch
import pandas as pd
from transformers import GPT2LMHeadModel, GPT2Tokenizer, TextDataset, DataCollatorForLanguageModeling, Trainer, TrainingArguments

## 1. Import the lyrics dataset

In [2]:
# Read the lyrics table; the 'track_title' and 'lyric' columns are used below.
d = pd.read_csv('taylor_swift_lyrics.csv', encoding='latin1')
text = []

# Build one training string per distinct track title: every 'lyric' value of
# that track followed by ' \n ', then a closing ' <|endoftext|>' marker.
track_titles = d['track_title']
for title in track_titles.unique():
    track_lines = d.loc[track_titles == title, 'lyric']
    song = ''.join(line + ' \n ' for line in track_lines)
    text.append(song + ' <|endoftext|>')

### Sample from the dataset

In [3]:
# Spot check: show the training string stored at index 10 of `text`.
print(text[10])

I was riding shotgun with my hair undone 
 In the front seat of his car 
 He's got a one-hand feel on the steering wheel 
 The other on my heart 
 I look around, turn the radio down 
 He says, "Baby is something wrong?" 
 I say, "Nothing, I was just thinking 
 How we don't have a song" 
 And he says 
 Our song is the slamming screen door 
 Sneakin' out late, tapping on your window 
 When we're on the phone and you talk real slow 
 'Cause it's late and your mama don't know 
 Our song is the way you laugh 
 The first date: "Man, I didn't kiss her, and I should have" 
 And when I got home 'fore I said amen 
 Asking God if he could play it again 
 I was walking up the front porch steps 
 After everything that day 
 Had gone all wrong and been trampled on 
 And lost and thrown away 
 Got to the hallway, well on my way 
 To my lovin' bed 
 I almost didn't notice all the roses 
 And the note that said 
 Our song is the slamming screen door 
 Sneakin' out late, tapping on your window 
 When we

## 2. Fine-tuning the pretrained GPT-2 model

### Creating the dataset class

In [4]:
from torch.utils.data import Dataset
import torch.cuda
import torch

class CustomTextDataset(Dataset):
    """Fixed-length token blocks for causal language-model fine-tuning.

    Every input string is tokenized in full and cut into consecutive windows
    of ``block_size`` tokens. Nothing is discarded: a document longer than
    ``block_size`` yields several examples instead of being truncated to its
    first window (which would also drop any end-of-text marker at its tail).
    The final window of each document is right-padded with the tokenizer's
    pad token and masked out in ``attention_mask``.

    Args:
        tokenizer: Object exposing ``encode(text, add_special_tokens=...)``
            returning a sequence of token ids, and a ``pad_token_id`` attribute.
        text: Iterable of strings (a single string is treated as one document).
        block_size: Number of tokens per example; must be positive.

    Raises:
        ValueError: If ``block_size`` is not positive or the tokenizer has no
            pad token configured.
    """

    def __init__(self, tokenizer, text, block_size):
        if block_size <= 0:
            raise ValueError("block_size must be a positive integer")

        pad_id = tokenizer.pad_token_id
        if pad_id is None:
            raise ValueError(
                "tokenizer has no pad token; set tokenizer.pad_token before building the dataset"
            )

        # Guard against a bare string being iterated character by character.
        if isinstance(text, str):
            text = [text]

        input_rows = []
        mask_rows = []
        for document in text:
            # No truncation here: the whole document is encoded, then windowed.
            # (Tokenizers may log a length warning for very long documents;
            # that is harmless because each window is at most block_size long.)
            token_ids = list(tokenizer.encode(document, add_special_tokens=True))
            for start in range(0, len(token_ids), block_size):
                window = token_ids[start:start + block_size]
                n_pad = block_size - len(window)
                input_rows.append(window + [pad_id] * n_pad)
                mask_rows.append([1] * len(window) + [0] * n_pad)

        # reshape keeps a well-formed (0, block_size) shape when there is no data.
        self.examples = {
            "input_ids": torch.tensor(input_rows, dtype=torch.long).reshape(-1, block_size),
            "attention_mask": torch.tensor(mask_rows, dtype=torch.long).reshape(-1, block_size),
        }

    def __len__(self):
        """Return the number of fixed-length examples."""
        return len(self.examples["input_ids"])

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict with ``input_ids`` and ``attention_mask`` tensors."""
        return {key: val[idx] for key, val in self.examples.items()}

### Model training

In [5]:
def fine_tune_gpt2(output_dir, model_name="gpt2", epochs=10, batch_size=2, learning_rate=1e-4,
                   texts=None, block_size=128):
    """Fine-tune a pretrained GPT-2 checkpoint on a list of strings and save it.

    Args:
        output_dir: Directory for checkpoints, the final model and the tokenizer.
        model_name: Name or path of the pretrained GPT-2 checkpoint to start from.
        epochs: Number of training epochs.
        batch_size: Per-device training batch size.
        learning_rate: Optimizer learning rate.
        texts: Training strings. When None, the notebook-level ``text`` list is
            used so existing ``fine_tune_gpt2(output_dir)`` calls keep working.
        block_size: Token length of each training example passed to
            ``CustomTextDataset``.
    """
    if texts is None:
        texts = text  # notebook-level list built in the data-loading cell

    # Load the pretrained GPT-2 model and tokenizer. GPT-2 ships without a pad
    # token, so the end-of-text token doubles as padding.
    tokenizer = GPT2Tokenizer.from_pretrained(model_name)
    tokenizer.pad_token = tokenizer.eos_token
    model = GPT2LMHeadModel.from_pretrained(model_name, pad_token_id=tokenizer.eos_token_id)

    dataset = CustomTextDataset(tokenizer, texts, block_size=block_size)

    # Causal LM objective (mlm=False): labels are derived from the input ids.
    # NOTE(review): with pad == eos the collator presumably also ignores real
    # end-of-text tokens in the loss - confirm if learning song endings matters.
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=False
    )

    # Define the training arguments
    training_args = TrainingArguments(
        output_dir=output_dir,
        overwrite_output_dir=True,
        num_train_epochs=epochs,
        per_device_train_batch_size=batch_size,
        learning_rate=learning_rate,
        save_steps=500,  # Save checkpoints every 500 steps
        save_total_limit=2,  # Only keep the last 2 checkpoints
        # Mixed precision only when CUDA is present; an unconditional
        # fp16=True makes TrainingArguments raise on CPU-only machines.
        fp16=torch.cuda.is_available(),
    )

    # Create a Trainer instance
    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=dataset
    )

    # Fine-tune the model
    trainer.train()

    # Save the fine-tuned model, plus the tokenizer so output_dir is
    # self-contained (the Trainer was not given the tokenizer, so it would not
    # save it on its own).
    trainer.save_model(output_dir)
    tokenizer.save_pretrained(output_dir)

    print("Fine-tuning complete. Model saved to:", output_dir)


# NOTE: machine-specific absolute path; change it to a writable directory on
# your system before running. Later cells reuse `output_dir` to load the model.
output_dir = "/mnt/d/Projects/custom_gpt/models/ts1"

# Run fine-tuning with the function's default hyperparameters.
fine_tune_gpt2(output_dir)



Step,Training Loss


Fine-tuning complete. Model saved to: /mnt/d/Projects/custom_gpt/models/ts1


## 3. Generating new lyrics

### Load the fine-tuned model

In [6]:
def load_fine_tuned_model(model_dir, tokenizer_name="gpt2"):
    """Load a fine-tuned GPT-2 model and a matching tokenizer.

    Args:
        model_dir: Directory containing the saved fine-tuned model weights.
        tokenizer_name: Name or path to load the tokenizer from. Defaults to
            the stock ``"gpt2"`` tokenizer; pass ``model_dir`` if a tokenizer
            was saved alongside the model.

    Returns:
        A ``(model, tokenizer)`` tuple. The tokenizer's pad token is set to its
        end-of-text token because GPT-2 defines no pad token of its own.
    """
    model = GPT2LMHeadModel.from_pretrained(model_dir)
    tokenizer = GPT2Tokenizer.from_pretrained(tokenizer_name)
    tokenizer.pad_token = tokenizer.eos_token

    return model, tokenizer

# Load the model from the directory the training cell wrote to.
model, tokenizer = load_fine_tuned_model(output_dir)

### Generate Lyrics

In [17]:
def generate_text(model, tokenizer, prompt_text, max_length=300, top_p=0.92, top_k=0):
    """Sample a continuation of ``prompt_text`` from ``model``.

    Args:
        model: Causal language model exposing ``generate`` and ``device``.
        tokenizer: Tokenizer matching ``model``; called to encode the prompt
            and used to decode the sampled ids.
        prompt_text: Text the generation starts from.
        max_length: Maximum total length in tokens (prompt included).
        top_p: Nucleus-sampling probability mass.
        top_k: Top-k cutoff; 0 disables top-k filtering so only ``top_p`` applies.

    Returns:
        The decoded text (prompt plus continuation) with special tokens removed.
    """
    encoded = tokenizer(prompt_text, return_tensors="pt")

    # Keep inputs on the same device as the model so this also works after
    # the model has been moved to a GPU.
    device = model.device
    input_ids = encoded["input_ids"].to(device)
    attention_mask = encoded["attention_mask"].to(device)

    # Pass the pad id and attention mask explicitly: pad and end-of-text share
    # one id here, so generate() cannot infer the mask from the ids alone.
    pad_token_id = tokenizer.pad_token_id
    if pad_token_id is None:
        pad_token_id = tokenizer.eos_token_id

    output = model.generate(
        input_ids,
        attention_mask=attention_mask,
        pad_token_id=pad_token_id,
        do_sample=True,
        max_length=max_length,
        top_p=top_p,
        top_k=top_k,
    )

    # generate() returns a batch; the single prompt is row 0.
    return tokenizer.decode(output[0], skip_special_tokens=True)

In [18]:
# Seed text for sampling. The spelling is kept exactly as typed because the
# recorded output below was generated from this exact prompt.
prompt = "a beutiful magic love affair"
generated_text = generate_text(model, tokenizer, prompt_text=prompt)

# Header line, then the sampled lyrics on the following lines.
print("Generated Text:", generated_text, sep="\n")

Generated Text:
a beutiful magic love affair 
 That lasted a day and a half 
 That lasted for almost 2 hours 
 Forcing laughter and faking smiles 
 In the back of my mind saying 
 We've been getting closer 
 And closer 
 Always feeling strange 
 And strange bedfellows in our love 
 Dark clouds and lonely places 
 Are somehow holding on 
 Without us 
 Nothing is ever gonna be alright 
 And darling, I could dance to this beat 
 Forevermore 
 I can feel your arms twistin' round me 
 My arms are shaking with anticipation 
 And all I know is 
 Baby, I could dance to this beat 
 Forevermore 
 I can feel your arms twistin' round me 
 My arms are shaking with anticipation 
 And darling, I could dance to this beat 
 Forevermore 
 Holdin' on 
 Forevermore 
 I could dance to this beat 
 Forevermore 
 Touch me, you'll never forget me 
 Forevermore 
 Touch me, you'll never forget me 
 Forevermore 
 I could dance to this beat 
 Forevermore 
 Touch me, you'll never forget me 
 Forevermore 
 Touch me,