# Prepare the dataset in Hugging Face format
Expected schema, as described in the ACE-Step README: https://github.com/ace-step/ACE-Step?tab=readme-ov-file

Required fields:
keys: Unique identifier for each audio sample
filename: Path to the audio file
tags: List of descriptive tags (e.g., ["pop", "rock"])
norm_lyrics: Normalized lyrics text
Optional fields:
speaker_emb_path: Path to speaker embedding file (use empty string if not available)
recaption: Additional tag descriptions in various formats


In [None]:
#!pip install datasets
#!pip install huggingface_hub[hf_transfer]
# Set HF_HUB_ENABLE_HF_TRANSFER=1 as an environment variable for faster downloads
import os
import re

# 1) (Optional) route *all* HF artifacts into the OneDrive project folder.
# The cache variables are exported BEFORE `datasets` / `huggingface_hub` are
# imported, because those libraries resolve their cache locations at import
# time; setting the variables afterwards leaves the default caches in use.
# The path must not contain literal quote characters: with them the string is a
# relative path beginning with "'" and everything lands under the working dir.
BASE = "/Users/USER/Library/CloudStorage/OneDrive-TheUniversityofChicago/School/Bayesian - Spring 2025/Final Project/hf_data"

os.makedirs(BASE + "/datasets", exist_ok=True)
os.makedirs(BASE + "/models", exist_ok=True)
os.makedirs(BASE + "/modules", exist_ok=True)

os.environ["HF_HOME"]            = BASE         # root for everything
os.environ["HF_DATASETS_CACHE"]  = BASE + "/datasets"
os.environ["TRANSFORMERS_CACHE"] = BASE + "/models"
os.environ["HF_MODULES_CACHE"]   = BASE + "/modules"

import nltk
import spacy
from datasets import load_dataset, Audio, DatasetDict
from huggingface_hub import snapshot_download

# English stopword set and spaCy pipeline used later for tag extraction.
nltk.download("stopwords")
from nltk.corpus import stopwords
stop_words = set(stopwords.words("english"))
nlp = spacy.load("en_core_web_sm")

# Mirror the raw dataset repository into the project folder.
# NOTE(review): recent huggingface_hub releases appear to deprecate
# `resume_download` (downloads resume by default) — confirm and drop if so.
snapshot_download(
    repo_id="vikhyatk/lofi",
    repo_type="dataset",
    local_dir=BASE + "/datasets/vikhyatk_lofi",
    resume_download=True,
)
from datasets import load_dataset, Audio

# Read the locally mirrored snapshot as its "train" split, then type the
# "audio" column with the default Audio() feature in one chained expression.
ds = load_dataset(
    "/Users/USER/Library/CloudStorage/OneDrive-TheUniversityofChicago/School/Bayesian - Spring 2025/Final Project/hf_data/datasets/vikhyatk_lofi",
    split="train",
).cast_column("audio", Audio())
# Download the ACE-Step tokenizer and model weights, caching both in the
# project's models folder instead of the default Hugging Face cache.
# NOTE(review): the "…" inside the cache path looks like an elided placeholder —
# confirm the real directory before running.
from transformers import AutoTokenizer, AutoModelForCausalLM

_hub_kwargs = {"cache_dir": "/Users/USER/…/hf_data/models"}

tokenizer = AutoTokenizer.from_pretrained("ACE-Step/ACE-Step-v1-3.5B", **_hub_kwargs)
model = AutoModelForCausalLM.from_pretrained("ACE-Step/ACE-Step-v1-3.5B", **_hub_kwargs)
# schema_builder

def preprocess(song):
    """Convert one raw dataset row into the record layout used for training.

    Reads ``song["prompt"]``, ``song["id"]`` and ``song["audio"]["path"]``.
    The prompt is lowercased and run through the module-level spaCy pipeline
    ``nlp``; the lemma of every token that is purely alphabetic and whose text
    is not in ``stop_words`` becomes a tag.

    Returns a dict with the keys ``keys``, ``filename``, ``tags``,
    ``speaker_emb_path`` and ``norm_lyrics``; the last two are always empty
    strings.
    """
    parsed = nlp(song["prompt"].lower())

    lemmas = []
    for token in parsed:
        # Skip punctuation/number tokens and English stopwords.
        if not token.is_alpha or token.text in stop_words:
            continue
        lemmas.append(token.lemma_)

    record = {
        "keys": song["id"],
        "filename": song["audio"]["path"],
        "tags": lemmas,
        "speaker_emb_path": "",
        "norm_lyrics":      "",
    }
    return record

# Apply `preprocess` to every row across 4 worker processes, discarding all
# original columns so only the fields returned by `preprocess` remain, then
# persist the result to disk.
cleaned = ds.map(
    preprocess,
    num_proc=4,
    remove_columns=ds.column_names,
)
cleaned.save_to_disk("/Users/USER/.../hf_data/datasets/cleaned_lofi_dataset")

from transformers import TrainingArguments, Trainer, AutoTokenizer, AutoModelForCausalLM
from peft import LoraConfig, get_peft_model
from datasets import load_from_disk

# 1) Reload the cleaned dataset from disk
dataset = load_from_disk("/Users/USER/.../hf_data/datasets/cleaned_lofi_dataset")

# 2) Tokenizer & base model, both read from the same local directory
_local_model_dir = "/Users/USER/.../hf_data/models/ACE-Step-v1-3.5B"

tokenizer = AutoTokenizer.from_pretrained(_local_model_dir, use_fast=True)
model = AutoModelForCausalLM.from_pretrained(
    _local_model_dir,
    device_map="auto",
    low_cpu_mem_usage=True,
    torch_dtype="auto",
)

# 3) Wrap the base model with a LoRA adapter (rank 8, alpha 16, dropout 0.05)
# NOTE(review): no target modules are specified here — confirm PEFT can infer
# them for this architecture.
lora_cfg = LoraConfig(
    r=8,
    lora_alpha=16,
    lora_dropout=0.05,
    task_type="CAUSAL_LM",
    inference_mode=False,
)
model = get_peft_model(model, lora_cfg)

# 4) Prepare features → tokens
def tokenize_tags(example):
    """Tokenize the tag list(s) of a dataset row or of a batch of rows.

    ``example["tags"]`` is either a list of strings (one row) or a list of
    such lists (what ``Dataset.map(..., batched=True)`` passes). Each tag list
    is joined into a prompt of the form ``"Tags: <tag> <tag> ..."`` and handed
    to the module-level ``tokenizer`` with truncation enabled.

    Returns whatever the tokenizer returns: one encoding for a single row, or
    a batch of encodings for batched input.
    """
    tags = example["tags"]
    # Batched input is a list of per-row tag lists; joining that outer list
    # directly would raise TypeError, so build one prompt per row instead.
    if tags and isinstance(tags[0], (list, tuple)):
        prompts = ["Tags: " + " ".join(row_tags) for row_tags in tags]
        return tokenizer(prompts, truncation=True)
    # Single row: a flat (possibly empty) list of tag strings.
    return tokenizer("Tags: " + " ".join(tags), truncation=True)
dataset = dataset.map(tokenize_tags, batched=True)

# 5) TrainingArguments

# NOTE(review): "PATH" for output_dir / logging_dir looks like a placeholder —
# replace with real directories before training.
training_args = TrainingArguments(
    # where checkpoints and logs go
    output_dir="PATH",
    logging_dir="PATH",
    # optimisation schedule
    num_train_epochs=3,
    learning_rate=1e-4,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    fp16=True,
    # logging / checkpoint cadence
    logging_steps=50,
    save_strategy="steps",
    save_steps=200,
    save_total_limit=3,
)

# 6) Collator for language-model batches, with masked-LM mode switched off
from transformers import DataCollatorForLanguageModeling
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# 7) Assemble the Trainer, run training, and write the final model to disk
trainer = Trainer(
    args=training_args,
    model=model,
    data_collator=data_collator,
    train_dataset=dataset,
)
trainer.train()
trainer.save_model("./lofi_lora_final")
