In [1]:
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [2]:
!pip install datasets



In [3]:
from datasets import load_dataset
dataset = load_dataset("haely/Taylor1D")

In [4]:
# Create test set
from datasets import DatasetDict
# Get the 'train' split from the DatasetDict
train_dataset = dataset["train"]


# Perform the train_test_split on the *train dataset*
train_test_split = train_dataset.train_test_split(test_size=0.2, seed=42)
train_dataset_split = train_test_split["train"]
test_val_dataset = train_test_split["test"]


# Split the test/validation set
test_val_split = test_val_dataset.train_test_split(test_size=0.5, seed=42)
test_dataset = test_val_split["train"]
val_dataset = test_val_split["test"]


# Create the new DatasetDict
final_dataset = DatasetDict({
   "train": train_dataset_split,  # Use the split train dataset
   "test": test_dataset,
   "validation": val_dataset,
})


print(final_dataset)

DatasetDict({
    train: Dataset({
        features: ['track_title', 'lyric'],
        num_rows: 6387
    })
    test: Dataset({
        features: ['track_title', 'lyric'],
        num_rows: 798
    })
    validation: Dataset({
        features: ['track_title', 'lyric'],
        num_rows: 799
    })
})


In [5]:
!pip install transformers accelerate bitsandbytes sentencepiece



In [6]:
!pip install -U transformers accelerate



In [7]:
!pip install -U bitsandbytes



In [8]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import torch

model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"

# Use BitsAndBytesConfig for 8-bit quantization
bnb_config = BitsAndBytesConfig(load_in_8bit=True)

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Load model with correct quantization and device placement
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,  # Use BitsAndBytesConfig
    device_map="auto"  # Automatically assigns model to GPU if available
)

# No need to manually call .to(device), it's handled automatically
print("Model loaded successfully on:", model.device)


Model loaded successfully on: cuda:0


In [9]:
print(final_dataset["train"][0])

{'track_title': 'Forever & Always (Piano Version) [Taylor’s Version]', 'lyric': "You didn't mean it baby, I don't think so"}


In [10]:
print(final_dataset["test"][0])

{'track_title': 'Girl at Home', 'lyric': "Don't look at me, you got a girl at home"}


In [11]:
!pip install -U transformers datasets peft trl accelerate bitsandbytes



In [12]:
from transformers import AutoTokenizer

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v1.0")  # Change to your model

# Define tokenization function
def tokenize_function(examples):  # Note: "examples" now (plural)
    # Convert track_title and lyric to string (handling potential lists)
    track_titles = [" ".join(title) if isinstance(title, list) else str(title) for title in examples["track_title"]]
    lyrics = [" ".join(lyric) if isinstance(lyric, list) else str(lyric) for lyric in examples["lyric"]]

    # Concatenate track title and lyric (using list comprehension for efficiency)
    texts = [title + " - " + lyric for title, lyric in zip(track_titles, lyrics)]

    # Tokenize, padding to max length, return as a dictionary
    return tokenizer(texts, truncation=True, padding="max_length", max_length=128, return_attention_mask=True) # crucial


# Apply tokenization to dataset (using batched=True is correct and efficient)
tokenized_dataset = final_dataset.map(tokenize_function, batched=True, remove_columns=final_dataset.column_names["train"]) #remove_columns speeds up things
tokenized_dataset.set_format("torch") #set to torch format

In [13]:
print(tokenized_dataset["validation"])

Dataset({
    features: ['input_ids', 'attention_mask'],
    num_rows: 799
})


In [15]:
from peft import LoraConfig, get_peft_model, TaskType, PeftModel
from transformers import TrainingArguments, Trainer
from transformers import AutoModelForCausalLM, BitsAndBytesConfig
from transformers import DataCollatorForLanguageModeling # Import data collator

# Load model with 4-bit quantization
bnb_config = BitsAndBytesConfig(load_in_4bit=True)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto"
)

# Apply LoRA
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["q_proj", "v_proj"],  # Layers to fine-tune
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.CAUSAL_LM
)

model = get_peft_model(model, lora_config)


# Data collator for language modeling (important for causal LM)
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)  # mlm=False for Causal LM


# Training Arguments
training_args = TrainingArguments(
    output_dir="./finetuned_lyrics",
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    num_train_epochs=3,  # Adjust if needed
    logging_dir="./logs",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    fp16=True,
    push_to_hub=False
)



# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    data_collator=data_collator # Add data collator
)

# Train
trainer.train()

# Save the PEFT model (more efficient)
model.save_pretrained("./finetuned_lyrics")
tokenizer.save_pretrained("./finetuned_lyrics")




Epoch,Training Loss,Validation Loss
1,1.9258,2.010336
2,1.8183,1.932009
3,1.7248,1.894891


('./finetuned_lyrics/tokenizer_config.json',
 './finetuned_lyrics/special_tokens_map.json',
 './finetuned_lyrics/tokenizer.model',
 './finetuned_lyrics/added_tokens.json',
 './finetuned_lyrics/tokenizer.json')

In [16]:
# that took 35 mins
import torch

def generate_song(prompt, model, tokenizer, max_length=200):
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    with torch.no_grad():
        outputs = model.generate(
            input_ids=inputs.input_ids,
            attention_mask=inputs.attention_mask,
            max_length=max_length,
            num_return_sequences=1,
            no_repeat_ngram_size=2,
            top_k=50,
            top_p=0.95,
            temperature=0.7,
            eos_token_id=tokenizer.eos_token_id,  # Essential
            pad_token_id=tokenizer.pad_token_id # Essential
        )

    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return generated_text

In [23]:
prompt = "love princess mine forever dress"
generated_song = generate_song(prompt, model, tokenizer, max_length=200)
print(generated_song)


love princess mine forever dressing up like a fairytale - I'm a love-princess, I know it, but I don't care, baby, 'cause I love you, love, oh, yeah, like that, you know, that I do, do love ya, ya (Yeah) love y'all, yall (Love y’all) yeah (Nah, nah, no, don’t, please, let me go) oh (Oh)

I've been thinking about you a lot lately, and I can' t stop thinking of you
(Yet) and it's hard to let go of the memories we're making
and I wanna hold on to the love we have, even though it hurts
yeah (Hurts) (But) I want to hold onto the memory of us
even though we’re broken
but I


In [35]:
prompt = "princess rain dance love real"
generated_song = generate_song(prompt, model, tokenizer, max_length=200)
#print(generated_song)


In [34]:
# Replace commas with commas + newline
formatted_song = generated_song.replace(",", ",\n")

print(formatted_song)

princess rain dance love real - I'm a princed out in the rain,
 dancing love,
 real (Dancing,
 love) (Real) real,
 (Love) love (real)
And I wanna be your princeland,
 princelland (Princelands) and you're my princes (Your prince)s
I wana be the princa (Crazy) of your life,
 your queen (Queen)es
You'll be my queen,
 my life (Life) is princing

Verse 2:
But I know you wan't be mine,
 you don'ts want to be
So I need to know,
 I want you to say
That you want me to stay,
 that you love me
To stay
(You love him) you know that,
 yeah,
 baby,

you know it,
 yes,
 oh,
 he loves you
yeah,

