### 加载模型

In [1]:
from transformers import AutoTokenizer, MusicgenForConditionalGeneration, AutoProcessor, EncodecModel
import torch
import os

# Pick the GPU when one is available; every model below is moved to this device.
device = "cuda" if torch.cuda.is_available() else "cpu"
print("device:", device)
# Set the tokenizers parallelism flag to "false" for this process.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# This is the model we are going to train.
model_name = "/root/autodl-tmp/musicgen-large"  # options: small, medium, large
# On first use, remember to remove local_files_only=True.
tokenizer = AutoTokenizer.from_pretrained(model_name, local_files_only=True)
processor = AutoProcessor.from_pretrained(model_name, local_files_only=True)
model = MusicgenForConditionalGeneration.from_pretrained(model_name, local_files_only=True).half().to(device)
# model.half() works around a precision-related error.

# Copy the decoder start token id from the generation config into the decoder
# config so both configs agree on it.
model.config.decoder.decoder_start_token_id = model.generation_config.decoder_start_token_id
print("decoder_start_token_id:", model.config.decoder.decoder_start_token_id)
print(model.config)

# This model is used to convert audio into tokens.
encodec_model_name = "/root/autodl-tmp/encodec_32khz"
encodec_model = EncodecModel.from_pretrained(encodec_model_name, local_files_only=True).to(device)
# Inference only: the codec is put in eval mode and is never handed to the trainer.
encodec_model.eval()
print(encodec_model.config)

# Bare expression: displays the full module tree of the model as the cell output.
model

device: cuda


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

decoder_start_token_id: 2048
MusicgenConfig {
  "architectures": [
    "MusicgenForConditionalGeneration"
  ],
  "audio_encoder": {
    "_name_or_path": "facebook/encodec_32khz",
    "architectures": [
      "EncodecModel"
    ],
    "audio_channels": 1,
    "chunk_length_s": null,
    "codebook_dim": 128,
    "codebook_size": 2048,
    "compress": 2,
    "dilation_growth_rate": 2,
    "hidden_size": 128,
    "kernel_size": 7,
    "last_kernel_size": 7,
    "model_type": "encodec",
    "norm_type": "weight_norm",
    "normalize": false,
    "num_filters": 64,
    "num_lstm_layers": 2,
    "num_residual_layers": 1,
    "overlap": null,
    "pad_mode": "reflect",
    "residual_kernel_size": 3,
    "sampling_rate": 32000,
    "target_bandwidths": [
      2.2
    ],
    "torch_dtype": "float32",
    "trim_right_ratio": 1.0,
    "upsampling_ratios": [
      8,
      5,
      4,
      4
    ],
    "use_causal_conv": false,
    "use_conv_shortcut": false
  },
  "decoder": {
    "activation_dr

MusicgenForConditionalGeneration(
  (text_encoder): T5EncoderModel(
    (shared): Embedding(32128, 768)
    (encoder): T5Stack(
      (embed_tokens): Embedding(32128, 768)
      (block): ModuleList(
        (0): T5Block(
          (layer): ModuleList(
            (0): T5LayerSelfAttention(
              (SelfAttention): T5Attention(
                (q): Linear(in_features=768, out_features=768, bias=False)
                (k): Linear(in_features=768, out_features=768, bias=False)
                (v): Linear(in_features=768, out_features=768, bias=False)
                (o): Linear(in_features=768, out_features=768, bias=False)
                (relative_attention_bias): Embedding(32, 12)
              )
              (layer_norm): T5LayerNorm()
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (1): T5LayerFF(
              (DenseReluDense): T5DenseActDense(
                (wi): Linear(in_features=768, out_features=3072, bias=False)
                (wo): L

### 加载数据集

In [None]:
import librosa
from datasets import Dataset
import os
import numpy as np

def process_and_expand_dataset(dataset, slice_duration=30.0, target_sr=32000):
    """
    Expand each training record into fixed-length audio slices encoded as EnCodec tokens.

    Every record in ``dataset['train']`` is resampled to ``target_sr`` when its
    sampling rate differs, then cut into consecutive, non-overlapping slices of
    ``slice_duration`` seconds. A trailing remainder shorter than one slice is
    dropped, and records shorter than one slice are skipped entirely. Each slice
    is encoded with the module-level ``encodec_model`` and paired with the
    tokenized text prompt built from the record's ``pattern``, ``type`` and
    ``tonic`` labels (tokenized with the module-level ``tokenizer``).

    Args:
        dataset: Mapping with a ``'train'`` split whose records provide
            ``audio`` (with ``array`` and ``sampling_rate``) and the integer
            label fields ``tonic``, ``pattern`` and ``type``.
        slice_duration: Length of each slice in seconds (default 30.0).
        target_sr: Sampling rate in Hz that audio is resampled to before
            slicing (default 32000).

    Returns:
        A ``datasets.Dataset`` with one row per slice and the columns
        ``input_ids``, ``attention_mask``, ``labels``, ``original_index``,
        ``slice_index`` and ``total_slices``.

    Raises:
        ValueError: If ``slice_duration * target_sr`` is less than one sample.
    """
    # Number of samples per slice; 960,000 with the defaults (30 s at 32 kHz).
    slice_length = int(slice_duration * target_sr)
    if slice_length <= 0:
        raise ValueError(
            f"slice_duration * target_sr must be at least one sample, got {slice_length}"
        )

    # Label tables for the Chinese National Pentatonic Mode Dataset. They are
    # loop-invariant, so they are built once instead of once per record.
    tonic_labels = ['C', '#C/bD', 'D', '#D/bE', 'E', 'F', '#F/bG', 'G', '#G/bA', 'A', '#A/bB', 'B']
    pattern_labels = ['Gong', 'Shang', 'Jue', 'Zhi', 'Yu']
    type_labels = ['Pentatonic', 'Hexatonic_Qingjue', 'Hexatonic_Biangong', 'Heptatonic_Yayue', 'Heptatonic_Qingyue', 'Heptatonic_Yanyue']

    columns = ("input_ids", "attention_mask", "labels", "original_index", "slice_index", "total_slices")

    train_records = dataset['train']
    total_files = len(train_records)
    all_slices = []
    skipped_count = 0

    print("Processing and slicing audio files...")

    for i, record in enumerate(train_records):
        audio_array = record['audio']['array']
        sampling_rate = record['audio']['sampling_rate']

        # Resample to the target rate if needed
        if sampling_rate != target_sr:
            audio_array = librosa.resample(audio_array, orig_sr=sampling_rate, target_sr=target_sr)

        # Only complete slices are kept; the remainder at the end is discarded.
        num_slices = len(audio_array) // slice_length

        # Skip audio shorter than one slice (this also skips the progress print below).
        if num_slices == 0:
            skipped_count += 1
            continue

        # Extract musical attributes
        tonic = tonic_labels[record['tonic']]
        pattern = pattern_labels[record['pattern']]
        type_val = type_labels[record['type']]

        # Natural-language description used as the text condition; it is the
        # same for every slice of this record, so it is tokenized once here.
        text = f"Chinese traditional music in {pattern} pattern, {type_val.replace('_', ' ').lower()} scale, key of {tonic}"
        text_tokens = tokenizer(text, return_tensors="pt")

        # Process each complete slice
        for slice_idx in range(num_slices):
            start_idx = slice_idx * slice_length
            audio_slice = audio_array[start_idx:start_idx + slice_length]

            # Two leading singleton dimensions are added before encoding.
            audio_tensor = torch.from_numpy(audio_slice).float().unsqueeze(0).unsqueeze(0)

            with torch.no_grad():
                audio_tensor = audio_tensor.to(device)
                encoded_audio = encodec_model.encode(audio_tensor)
                # Drop the two leading singleton dims and swap the remaining two axes.
                # NOTE(review): presumably this yields (frames, codebooks) - confirm
                # against the EnCodec output layout.
                audio_tokens = encoded_audio.audio_codes.squeeze(0).squeeze(0).transpose(0, 1).cpu()

                # Drop the device-side references as soon as the codes are on the CPU
                del audio_tensor, encoded_audio

            all_slices.append({
                "input_ids": text_tokens["input_ids"].squeeze(0).clone(),
                "attention_mask": text_tokens["attention_mask"].squeeze(0).clone(),
                "labels": audio_tokens,
                "original_index": i,
                "slice_index": slice_idx,
                "total_slices": num_slices
            })

        # Progress tracking
        if (i + 1) % 10 == 0:
            print(f"Processed {i + 1}/{total_files} files, "
                  f"generated {len(all_slices)} slices so far")
            torch.cuda.empty_cache()  # Clean up periodically

    print("Processing complete!")
    print(f"Original files: {total_files}")
    print(f"Skipped files: {skipped_count}")
    print(f"Total slices generated: {len(all_slices)}")

    # Convert the row-wise records to the column-wise layout Dataset.from_dict expects.
    dataset_dict = {
        column: [slice_rec[column] for slice_rec in all_slices]
        for column in columns
    }

    return Dataset.from_dict(dataset_dict)

def my_collator(batch):
    """
    Collate a list of slice records into one batch of tensors.

    ``input_ids`` are right-padded with 0 to the longest sequence in the batch,
    and the attention mask is recomputed as "token id is not 0" (any stored
    ``attention_mask`` field on the items is not read). ``labels`` are stacked
    along a new leading batch dimension, so every item must carry labels of
    the same shape.

    Args:
        batch: List of dicts, each with ``"input_ids"`` and ``"labels"``.

    Returns:
        Dict with ``"input_ids"`` (long), ``"attention_mask"`` (bool) and
        ``"labels"`` (long) tensors.
    """
    text_ids = []
    audio_codes = []
    for example in batch:
        text_ids.append(torch.tensor(example["input_ids"], dtype=torch.long))
        audio_codes.append(torch.tensor(example["labels"], dtype=torch.long))

    padded_ids = torch.nn.utils.rnn.pad_sequence(text_ids, batch_first=True, padding_value=0)

    return {
        "input_ids": padded_ids,
        # Padding positions (id 0) are masked out; the comparison already yields bool.
        "attention_mask": padded_ids.ne(0),
        "labels": torch.stack(audio_codes, dim=0),
    }

In [None]:
from datasets import load_dataset
# Local path of the dataset passed to load_dataset.
dataset_name = "/root/autodl-tmp/CNPM"
dataset = load_dataset(dataset_name)

# Slice every training recording and encode the slices into audio tokens.
dataset_sliced = process_and_expand_dataset(dataset)

print(f"Final sliced dataset size: {len(dataset_sliced)}")
# Guard the average against division by zero on an empty train split.
if len(dataset['train']) > 0:
    print(f"Average slices per original audio: {len(dataset_sliced) / len(dataset['train']):.1f}")

# Wrap the flat Dataset in a DatasetDict with a single "train" split so it has
# the same structure as the originally loaded dataset.
from datasets import DatasetDict
dataset_sliced = DatasetDict({"train": dataset_sliced})

Processing and slicing audio files...
Processed 10/287 files, generated 153 slices so far
Processed 20/287 files, generated 297 slices so far
Processed 30/287 files, generated 495 slices so far
Processed 40/287 files, generated 645 slices so far
Processed 50/287 files, generated 784 slices so far
Processed 60/287 files, generated 1008 slices so far
Processed 70/287 files, generated 1169 slices so far
Processed 80/287 files, generated 1354 slices so far
Processed 90/287 files, generated 1545 slices so far
Processed 100/287 files, generated 1716 slices so far
Processed 110/287 files, generated 1869 slices so far
Processed 120/287 files, generated 2028 slices so far
Processed 130/287 files, generated 2175 slices so far
Processed 140/287 files, generated 2361 slices so far
Processed 150/287 files, generated 2552 slices so far
Processed 160/287 files, generated 2735 slices so far
Processed 170/287 files, generated 2863 slices so far
Processed 180/287 files, generated 3021 slices so far
Proc

### 设置lora参数

In [None]:
from peft import LoraConfig, get_peft_model
import gc

# LoRA Configuration
lora_config = LoraConfig(
    r=8,                          # LoRA rank
    lora_alpha=32,                # Scaling factor
    target_modules=["q_proj", "v_proj", "k_proj", "out_proj"],  # Target attention modules
    lora_dropout=0.05,            # Dropout rate
    bias="none",                  # Don't adjust bias
    task_type="CAUSAL_LM",        # Causal language modeling
)

# Remove adapters left over from a previous run of this cell so that re-running
# it does not stack a second LoRA wrapper on the same base model.
had_previous_adapter = 'lora_model' in globals() or hasattr(model, 'peft_config')
try:
    if 'lora_model' in globals():
        lora_model.unload()
        del lora_model
    if hasattr(model, 'peft_config'):
        delattr(model, 'peft_config')
except Exception as e:
    # Best effort: report the real failure and continue, rather than
    # mislabelling it as "nothing to clean up".
    print(f"Failed to clean up existing LoRA model: {e}")
else:
    if had_previous_adapter:
        print("Cleaned up existing LoRA model.")
    else:
        print("No existing LoRA model to clean up.")
finally:
    # Release cached GPU memory and collect garbage in every case.
    torch.cuda.empty_cache()
    gc.collect()

# Apply LoRA and explicitly move to GPU (safe regardless of base model device)
lora_model = get_peft_model(model, lora_config).to(device)
print(f"LoRA model device: {next(lora_model.parameters()).device}")

lora_model.print_trainable_parameters()

# Save initial LoRA weights
lora_model.save_pretrained("./outputs/musicgen-lora/initial_lora")

Cleaned up existing LoRA model.
LoRA model device: cuda:0
trainable params: 6,291,456 || all params: 3,429,761,602 || trainable%: 0.1834


In [None]:
from transformers import TrainingArguments, Trainer
import wandb
from datetime import datetime

# Read the WandB API key from the environment instead of hardcoding it, so the
# secret is never saved in (or shared with) the notebook file.
# When the variable is unset the key is None and wandb uses its own credential
# lookup (for example credentials stored by an earlier `wandb login`).
WANDB_API_KEY = os.environ.get("WANDB_API_KEY")

wandb.login(key=WANDB_API_KEY)

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mjelly-zhao-42[0m ([33mjelly-zhao-42-peking-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

### 设置训练参数

In [None]:
# Hyperparameters shared by the wandb run config and TrainingArguments. They
# are defined once so the values logged to wandb cannot drift from the values
# actually used for training.
PER_DEVICE_BATCH_SIZE = 8
GRADIENT_ACCUMULATION_STEPS = 4
LEARNING_RATE = 2e-4
NUM_TRAIN_EPOCHS = 5

# Initialize wandb tracking
wandb.init(
    project="musicgen-chinese-pentatonic",
    name=f"lora-finetune-{datetime.now().strftime('%Y%m%d_%H%M')}",
    config={
        "model": "musicgen-large",
        # LoRA values are read from the config object that was applied to the model.
        "lora_rank": lora_config.r,
        "lora_alpha": lora_config.lora_alpha,
        "batch_size": PER_DEVICE_BATCH_SIZE,  # per-device batch size
        "gradient_accumulation_steps": GRADIENT_ACCUMULATION_STEPS,
        "learning_rate": LEARNING_RATE,
        "num_epochs": NUM_TRAIN_EPOCHS,
        "dataset": "Chinese National Pentatonic Mode",
        "gpu": "A100-80GB"
    },
    tags=["musicgen", "lora", "chinese-music"],
)

# High-performance training arguments with wandb monitoring
training_args = TrainingArguments(
    output_dir="./outputs/musicgen-lora",
    per_device_train_batch_size=PER_DEVICE_BATCH_SIZE,
    per_device_eval_batch_size=PER_DEVICE_BATCH_SIZE,
    gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,
    num_train_epochs=NUM_TRAIN_EPOCHS,
    learning_rate=LEARNING_RATE,
    fp16=True,
    dataloader_pin_memory=True,
    dataloader_num_workers=2,
    logging_steps=10,                    # Frequent logging for good plots
    save_steps=500,                      # Kept a multiple of eval_steps
    eval_steps=250,
    eval_strategy="steps",
    save_total_limit=3,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,

    # WANDB Integration
    report_to="wandb",                   # Enable wandb logging
    logging_dir="./logs",

    # Keep every column so my_collator receives the raw dataset items.
    remove_unused_columns=False,
    warmup_steps=200,
    weight_decay=0.01,
    lr_scheduler_type="cosine",
    max_grad_norm=1.0,
    dataloader_prefetch_factor=2,
    group_by_length=False,
    label_names=["labels"],
)

# Split into train/test once. The guard makes the cell safe to re-run: without
# it, a second run would split the already-reduced "train" split again.
# NOTE(review): the split is made per slice, so slices of the same recording
# (same original_index) can land in both train and test - consider splitting
# by original_index if eval_loss should measure generalisation to new recordings.
if "test" not in dataset_sliced:
    dataset_sliced = dataset_sliced["train"].train_test_split(test_size=0.2, seed=42)

# Memory monitoring function with target usage
def print_memory_usage():
    """
    Print allocated, reserved and total GPU memory in GB, plus utilization.

    Utilization is allocated memory as a percentage of the total memory of
    device 0. Prints nothing and returns when CUDA is not available.
    """
    if not torch.cuda.is_available():
        return

    bytes_per_gb = 1024**3
    allocated = torch.cuda.memory_allocated() / bytes_per_gb
    reserved = torch.cuda.memory_reserved() / bytes_per_gb
    total = torch.cuda.get_device_properties(0).total_memory / bytes_per_gb

    print(f"GPU Memory - Allocated: {allocated:.2f}GB, Reserved: {reserved:.2f}GB, Total: {total:.2f}GB")
    print(f"Utilization: {allocated/total*100:.1f}% (Target: 60-70%)")

# Enhanced trainer with memory monitoring and aggressive optimization
class HighPerformanceTrainer(Trainer):
    """
    Trainer that periodically releases cached CUDA memory.

    The training and prediction steps themselves are the stock ``Trainer``
    implementations; this subclass only counts training steps and calls
    ``torch.cuda.empty_cache()`` at the configured points.
    """

    # Release the CUDA cache every this many calls to training_step.
    EMPTY_CACHE_EVERY_N_STEPS = 50
    # During evaluation, release the cache only above this many allocated bytes (50 GB).
    EVAL_EMPTY_CACHE_THRESHOLD_BYTES = 50 * 1024**3

    def __init__(self, *args, **kwargs):
        """Forward all arguments to ``Trainer`` and start the step counter at 0."""
        super().__init__(*args, **kwargs)
        self.step_count = 0

    def training_step(
        self, model, inputs, num_items_in_batch=None
    ):
        """
        Run one training step and occasionally free cached CUDA memory.

        The forward/backward pass is delegated to ``Trainer.training_step`` so
        that loss scaling, mixed precision and gradient-accumulation handling
        always match the installed transformers version instead of a copy of it.

        Args:
            model: Model being trained.
            inputs: Batch produced by the data collator.
            num_items_in_batch: Forwarded unchanged to the parent implementation.

        Returns:
            The detached loss returned by the parent implementation.
        """
        loss = super().training_step(model, inputs, num_items_in_batch)

        # One increment per call, i.e. per micro-batch rather than per optimizer step.
        self.step_count += 1

        # Less frequent memory cleanup for better performance
        if self.step_count % self.EMPTY_CACHE_EVERY_N_STEPS == 0:
            torch.cuda.empty_cache()

        return loss

    def prediction_step(self, model, inputs, prediction_loss_only, ignore_keys=None):
        """
        Run the parent prediction step, freeing cached CUDA memory only when
        allocated memory exceeds ``EVAL_EMPTY_CACHE_THRESHOLD_BYTES``.

        Returns:
            Whatever ``Trainer.prediction_step`` returns, unchanged.
        """
        result = super().prediction_step(model, inputs, prediction_loss_only, ignore_keys)

        # Minimal cleanup during evaluation
        if torch.cuda.memory_allocated() > self.EVAL_EMPTY_CACHE_THRESHOLD_BYTES:
            torch.cuda.empty_cache()

        return result

# Build the trainer: fine-tune the LoRA-wrapped model on the "train" split,
# evaluate on the "test" split, and batch examples with my_collator.
trainer = HighPerformanceTrainer(
    model=lora_model,
    args=training_args,
    train_dataset=dataset_sliced["train"],
    eval_dataset=dataset_sliced["test"],
    data_collator=my_collator,
)

Detected kernel version 4.19.90, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


### 训练

In [None]:
from transformers.models.musicgen.modeling_musicgen import MusicgenSinusoidalPositionalEmbedding

# Give the positional-embedding class a class-level `offset` attribute.
# NOTE(review): presumably something in the training path reads `.offset` on
# this module - confirm which code needs it and whether 0 or 2 is correct.
MusicgenSinusoidalPositionalEmbedding.offset = 0  # or 2, depending on your needs

# Train the model with high performance settings
print("Initial memory usage:")
print_memory_usage()

training_succeeded = False
try:
    trainer.train()
    print("High-performance training completed successfully!")

    # Save final model
    trainer.save_model("./outputs/musicgen-lora/final_model")
    training_succeeded = True
finally:
    # Always close the wandb run and release memory, even when training or
    # saving raised; a failed run is reported with a non-zero exit code.
    wandb.finish(exit_code=0 if training_succeeded else 1)

    # Final memory cleanup
    torch.cuda.empty_cache()
    gc.collect()

Initial memory usage:
GPU Memory - Allocated: 6.65GB, Reserved: 7.46GB, Total: 79.14GB
Utilization: 8.4% (Target: 60-70%)




Step,Training Loss,Validation Loss
250,6.357,6.331653


High-performance training completed successfully!
