In [16]:
import torch
import os
import json
import librosa
import numpy as np
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
import torch
from torch.optim import AdamW
from torch.cuda.amp import GradScaler
from torch.optim.lr_scheduler import CosineAnnealingLR
from torch.utils.data import DataLoader, Subset
import os

from audiocraft.models import MusicGen
from personal_musicgen.data.datasets import AudioDataset
from personal_musicgen.model_utils import train_step, eval_step


# 모델과 프로세서 로드
MODEL="small"
model = MusicGen.get_pretrained(MODEL)

# 데이터셋 준비 (폴더 내의 .wav 및 .json 쌍을 자동으로 찾기)
class MusicDataset(Dataset):
    def __init__(self, audio_dir, processor, max_length=500):
        self.audio_dir = audio_dir
        self.processor = processor
        self.max_length = max_length
        self.data = []

        # 폴더 내 .wav 파일과 동일한 이름을 가진 .json 파일을 찾기
        for file in os.listdir(audio_dir):
            if file.endswith(".wav"):
                audio_file = file
                json_file = audio_file.replace(".wav", ".json")
                
                # .json 파일이 존재하면 data에 추가
                if os.path.exists(os.path.join(audio_dir, json_file)):
                    self.data.append({
                        "audio_file": audio_file,
                        "json_file": json_file
                    })

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        item = self.data[idx]
        description_path = os.path.join(self.audio_dir, item['json_file'])
        
        # JSON 파일에서 description 읽기
        with open(description_path, 'r') as f:
            description = json.load(f)['description']
        
        audio_path = os.path.join(self.audio_dir, item['audio_file'])

        # .wav 파일 로딩
        audio, sr = librosa.load(audio_path, sr=None)
        audio = librosa.resample(audio, sr, 16000)  # 16kHz로 리샘플링

        # 텍스트를 모델 입력으로 변환
        input_ids = self.processor(description, return_tensors="pt").input_ids.squeeze(0)

        # .wav 파일을 텐서로 변환
        audio = torch.tensor(audio).float()

        return input_ids, audio


# 데이터셋과 데이터로더 설정
dataset = MusicDataset(audio_dir="C:/Users/a/Desktop/MusicGen/musicgen-model/dataset_wav")
dataloader = DataLoader(dataset, batch_size=4, shuffle=True)

# P-Tuning을 위한 하이퍼파라미터 설정
learning_rate = 5e-5
epochs = 3
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

# 모델 훈련
model.train()
for epoch in range(epochs):
    loop = tqdm(dataloader, desc=f"Epoch {epoch + 1}/{epochs}")
    for input_ids, audio in loop:
        # device.to() 제거
        input_ids = input_ids
        audio = audio

        optimizer.zero_grad()

        # 모델에 입력값 전달
        outputs = model(input_ids=input_ids, labels=input_ids, decoder_input_ids=audio)
        loss = outputs.loss

        loss.backward()
        optimizer.step()

        loop.set_postfix(loss=loss.item())


# 모델 저장
model_save_path = "C:/Users/a/Desktop/MusicGen/musicgen-model/musicgen_model"
processor_save_path = "C:/Users/a/Desktop/MusicGen/musicgen-model/musicgen_processor"

model.save_pretrained(model_save_path)

# 모델 평가 (학습 완료 후)
model.eval()




TypeError: MusicDataset.__init__() missing 1 required positional argument: 'processor'

In [18]:
pip install --upgrade datasets


Collecting datasetsNote: you may need to restart the kernel to use updated packages.

  Using cached datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-18.0.0-cp311-cp311-win_amd64.whl.metadata (3.4 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Using cached dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-win_amd64.whl.metadata (13 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting aiohttp (from datasets)
  Downloading aiohttp-3.11.1-cp311-cp311-win_amd64.whl.metadata (8.0 kB)
Collecting aiohappyeyeballs>=2.3.0 (from aiohttp->datasets)
  Using cached aiohappyeyeballs-2.4.3-py3-none-any.whl.metadata (6.1 kB)
Collecting aiosignal>=1.1.2 (from aiohttp->datasets)
  Using cached aiosignal-1.3.1-py3-none-any.whl.metadata (4.0 kB)
Collecting attrs>=17.3.0 (from aiohttp->data

In [2]:
from transformers import AutoTokenizer
from peft import get_peft_config, get_peft_model, PeftType, PromptEncoderConfig
from audiocraft.models import MusicGen
from datasets import load_dataset
import torch
from torch.utils.data import DataLoader
import json
import librosa
from torch.optim import AdamW

# 모델과 tokenizer 설정
MODEL_NAME = "facebook/musicgen-large"  # 사용할 MusicGen 모델 크기 (small, medium, large 등)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# MusicGen 모델 로드
model = MusicGen.get_pretrained(MODEL_NAME)

# P-Tuning 설정
peft_config = PromptEncoderConfig(
    prompt_length=32,  # 프롬프트 길이 설정 (길이에 따라 모델 성능에 차이 있음)
    peft_type=PeftType.PROMPT_TUNING,  # P-Tuning 유형 설정
)

# P-Tuning 모델 생성
peft_model = get_peft_model(model, peft_config)

# 데이터셋 준비
class MusicDataset(torch.utils.data.Dataset):
    def __init__(self, audio_dir, tokenizer, max_length=500):
        self.audio_dir = audio_dir
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.data = []

        # 폴더 내 .wav 파일과 동일한 이름을 가진 .json 파일을 찾기
        for file in os.listdir(audio_dir):
            if file.endswith(".wav"):
                audio_file = file
                json_file = audio_file.replace(".wav", ".json")
                
                if os.path.exists(os.path.join(audio_dir, json_file)):
                    self.data.append({
                        "audio_file": audio_file,
                        "json_file": json_file
                    })

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        item = self.data[idx]
        description_path = os.path.join(self.audio_dir, item['json_file'])
        
        # JSON 파일에서 description 읽기
        with open(description_path, 'r') as f:
            description = json.load(f)['description']
        
        audio_path = os.path.join(self.audio_dir, item['audio_file'])

        # .wav 파일 로딩
        audio, sr = librosa.load(audio_path, sr=None)
        audio = librosa.resample(audio, sr, 16000)  # 16kHz로 리샘플링

        # 텍스트를 모델 입력으로 변환
        input_ids = self.tokenizer(description, return_tensors="pt", padding=True, truncation=True, max_length=self.max_length).input_ids.squeeze(0)

        # .wav 파일을 텐서로 변환
        audio = torch.tensor(audio).float()

        return input_ids, audio


# 데이터셋과 데이터로더 설정
audio_dir = "C:/Users/a/Desktop/MusicGen/musicgen-model/dataset_wav"
dataset = MusicDataset(audio_dir, tokenizer)
dataloader = DataLoader(dataset, batch_size=4, shuffle=True)

# 하이퍼파라미터 설정
learning_rate = 5e-5
epochs = 3
optimizer = AdamW(peft_model.parameters(), lr=learning_rate)

# 모델 훈련
peft_model.train()
for epoch in range(epochs):
    loop = tqdm(dataloader, desc=f"Epoch {epoch + 1}/{epochs}")
    for input_ids, audio in loop:
        optimizer.zero_grad()

        # 모델에 입력값 전달 (P-Tuning을 통해 프롬프트 학습)
        outputs = peft_model(input_ids=input_ids, labels=input_ids, decoder_input_ids=audio)
        loss = outputs.loss

        loss.backward()
        optimizer.step()

        loop.set_postfix(loss=loss.item())

# 모델 저장
model_save_path = "C:/Users/a/Desktop/MusicGen/musicgen-model/musicgen_model_peft"
peft_model.save_pretrained(model_save_path)
tokenizer.save_pretrained(model_save_path)


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


TypeError: PromptEncoderConfig.__init__() got an unexpected keyword argument 'prompt_length'

In [None]:
##다시
# 
# 
from transformers import AutoModelForSequenceClassification, AutoTokenizer, DataCollatorWithPadding, TrainingArguments, Trainer
from peft import get_peft_config, get_peft_model, get_peft_model_state_dict, set_peft_model_state_dict, PeftType, PromptEncoderConfig
from datasets import load_dataset
import evaluate
import torch


#설정값 지정
model_name_or_path = "facebook/musicgen-small"
task = "mrpc"
num_epochs = 3
lr = 1e-3
batch_size = 32

# 데이터셋 준비
class MusicDataset(torch.utils.data.Dataset):
    def __init__(self, audio_dir, tokenizer, max_length=500):
        self.audio_dir = audio_dir
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.data = []

        # 폴더 내 .wav 파일과 동일한 이름을 가진 .json 파일을 찾기
        for file in os.listdir(audio_dir):
            if file.endswith(".wav"):
                audio_file = file
                json_file = audio_file.replace(".wav", ".json")
                
                if os.path.exists(os.path.join(audio_dir, json_file)):
                    self.data.append({
                        "audio_file": audio_file,
                        "json_file": json_file
                    })

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        item = self.data[idx]
        description_path = os.path.join(self.audio_dir, item['json_file'])
        
        # JSON 파일에서 description 읽기
        with open(description_path, 'r') as f:
            description = json.load(f)['description']
        
        audio_path = os.path.join(self.audio_dir, item['audio_file'])

        # .wav 파일 로딩
        audio, sr = librosa.load(audio_path, sr=None)
        audio = librosa.resample(audio, sr, 16000)  # 16kHz로 리샘플링

        # 텍스트를 모델 입력으로 변환
        input_ids = self.tokenizer(description, return_tensors="pt", padding=True, truncation=True, max_length=self.max_length).input_ids.squeeze(0)

        # .wav 파일을 텐서로 변환
        audio = torch.tensor(audio).float()

        return input_ids, audio
    

#평가 metric
import numpy as np

metric = evaluate.load("glue", task)

def compute_metrics(eval_pred) :
  preds, labels = eval_pred
  preds = np.argmax(preds, axis = 1)
  return metric.compute(predictions=preds, references=labels)



#tokenizer 생성
if any(k in model_name_or_path for k in ("gpt", "opt", "bloom")):
    padding_side = "left"
else:
    padding_side = "right"

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, padding_side=padding_side)
if getattr(tokenizer, "pad_token_id") is None:
    tokenizer.pad_token_id = tokenizer.eos_token_id


#데이터셋 토큰화
def tokenize_function(examples):
    # max_length=None => use the model max length (it's actually the default)
    outputs = tokenizer(examples["sentence1"], examples["sentence2"], truncation=True, max_length=None)
    return outputs

#데이터셋 패딩
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, padding="longest")




In [None]:
import torch
from transformers import MusicgenForConditionalGeneration, AutoTokenizer
from peft import get_peft_config, get_peft_model, PeftType, PromptEncoder, PromptEncoderConfig
from datasets import load_dataset
from transformers import TrainingArguments, Trainer, DataCollatorForLanguageModeling

# 사전 학습된 MusicGen 모델
modelpath = "facebook/musicgen-small"
model = MusicgenForConditionalGeneration.from_pretrained(modelpath)


# 데이터셋 로드
dataset = load_dataset("makeDataset.py")

# 전역 설정값 지정
task = "mrpc"
num_epochs = 3
lr = 1e-3
batch_size = 2


# P-tuning 설정
config = PromptEncoderConfig(
    peft_type="P_TUNING", 
    task_type="CAUSAL_LM",
    num_virtual_tokens=20,          # 가상 토큰의 수
    num_transformer_submodules=1,
    num_attention_heads=12,
    num_layers=12,
    encoder_reparameterization_type="MLP", 
    encoder_hidden_size=768,
)
prompt_encoder = PromptEncoder(config)

# P-tuning 모델 준비
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

# 훈련 인자 설정
training_args = TrainingArguments(
    output_dir="musicgen-ptuning",             # 모델 저장 위치
    per_device_train_batch_size=4,             # 학습 배치 크기
    gradient_accumulation_steps=4,             # 기울기 누적 단계
    per_device_eval_batch_size=4,              # 평가 배치 크기
    warmup_steps=500,                          # 워밍업 단계 수
    max_steps=5000,                            # 최대 학습 단계 수
    learning_rate=1e-3,                        # 학습률
    weight_decay=0.01,                         # 가중치 감쇠
    logging_dir="./logs",                      # 로그 저장 위치
    logging_steps=100,                         # 로그 기록 간격
    evaluation_strategy="steps",               # 평가 전략
    eval_steps=500,                           # 평가 간격
    save_strategy="steps",                    # 저장 전략
    save_steps=500,                          # 저장 간격
    load_best_model_at_end=True,             # 최적 모델 로드
    metric_for_best_model="eval_loss"        # 최적 모델 선택 기준
)

#데이터 프로세싱

tokenizer = AutoTokenizer.from_pretrained(modelpath)

# 데이터 수집기 설정
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,  # 마스크 언어 모델링 비활성화
)


# Trainer 인스턴스 생성
trainer = Trainer(
    model=model,
    train_dataset=dataset["train"],
    eval_dataset=dataset["validation"],
    args=training_args,
    data_collator=data_collator,
    tokenizer=tokenizer,
)


# 모델 학습
trainer.train()

# 튜닝된 모델 저장
model.save_pretrained("musicgen-ptuning-tuned")

  self.register_buffer("padding_total", torch.tensor(kernel_size - stride, dtype=torch.int64), persistent=False)
Config of the text_encoder: <class 'transformers.models.t5.modeling_t5.T5EncoderModel'> is overwritten by shared text_encoder config: T5Config {
  "_name_or_path": "t5-base",
  "architectures": [
    "T5ForConditionalGeneration"
  ],
  "classifier_dropout": 0.0,
  "d_ff": 3072,
  "d_kv": 64,
  "d_model": 768,
  "decoder_start_token_id": 0,
  "dense_act_fn": "relu",
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "relu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "is_gated_act": false,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 512,
  "num_decoder_layers": 12,
  "num_heads": 12,
  "num_layers": 12,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_max_distance": 128,
  "relative_attention_num_buckets": 32,
  "task_specific_params": {
    "summarization": {
      "early_stopping": true,
      "leng

ValueError: The repository for makeDataset contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/makeDataset.
Please pass the argument `trust_remote_code=True` to allow custom code to be run.

In [4]:
pip install --upgrade transformers


Collecting transformers
  Downloading transformers-4.46.2-py3-none-any.whl.metadata (44 kB)
Downloading transformers-4.46.2-py3-none-any.whl (10.0 MB)
   ---------------------------------------- 0.0/10.0 MB ? eta -:--:--
   ------------------------------ --------- 7.6/10.0 MB 39.0 MB/s eta 0:00:01
   ---------------------------------------- 10.0/10.0 MB 34.8 MB/s eta 0:00:00
Installing collected packages: transformers
  Attempting uninstall: transformers
    Found existing installation: transformers 4.45.1
    Uninstalling transformers-4.45.1:
      Successfully uninstalled transformers-4.45.1
Successfully installed transformers-4.46.2
Note: you may need to restart the kernel to use updated packages.
