# 목표: Hugging Face에서 DistilHuBERT 모델을 불러와 내 데이터셋으로 파인튜닝하여 나만의 분류 모델 만들기

In [1]:
# %pip install datasets evaluate git+https://github.com/huggingface/transformers

In [1]:
# Log in to the Hugging Face Hub from inside the notebook (renders a login
# widget). The training arguments later in this notebook set push_to_hub=True,
# which presumably needs this authentication -- confirm for your environment.
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

# 내 데이터셋 불러오기

In [2]:
from datasets import load_dataset, Audio

# Hub identifier of the custom music dataset used for fine-tuning.
dataset_id = 'Hoonvolution/hoons_music_data'

# Load every split from the Hub, then encode the "artist" column as class
# labels so each artist name is represented by an integer id.
raw_dataset = load_dataset(dataset_id, data_dir='/')
my_dataset = raw_dataset.class_encode_column("artist")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Resolving data files:   0%|          | 0/2380 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/480 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/480 [00:00<?, ?it/s]

In [3]:
# Helper that converts an integer class id back to the artist name, taken from
# the encoded "artist" feature of the training split.
id2label_fn = my_dataset['train'].features['artist'].int2str

In [4]:
from transformers import AutoFeatureExtractor

# Checkpoint that is fine-tuned in this notebook.
model_id = "ntu-spml/distilhubert"

# Feature extractor matching the checkpoint: it turns raw audio into the
# inputs the model consumes. Normalisation and attention masks are enabled.
feature_extractor = AutoFeatureExtractor.from_pretrained(
    model_id,
    do_normalize=True,
    return_attention_mask=True,
)

In [5]:
sampling_rate = feature_extractor.sampling_rate  # check the sampling rate configured on the feature extractor
sampling_rate

16000

In [6]:
from constants import Constants as C

In [7]:
def preprocess_function(examples):
    """Run the feature extractor over a batch of audio examples.

    Args:
        examples: Batch whose ``"audio"`` entry is a sequence of items, each
            exposing the raw waveform under the ``"array"`` key.

    Returns:
        The output of ``feature_extractor`` for the batch. Each clip is
        truncated to at most ``C.DURATION_SEC`` seconds worth of samples at
        the extractor's sampling rate, and an attention mask is requested.
    """
    waveforms = [clip["array"] for clip in examples["audio"]]
    extractor_rate = feature_extractor.sampling_rate
    # Longest clip length, in samples, kept after truncation.
    max_samples = int(feature_extractor.sampling_rate * C.DURATION_SEC)
    return feature_extractor(
        waveforms,
        sampling_rate=extractor_rate,
        max_length=max_samples,
        truncation=True,
        return_attention_mask=True,
    )

In [8]:
# Apply the feature extractor to every split.
# The raw audio and metadata columns are dropped once the model inputs have
# been computed, and "artist" is renamed to "label".
columns_to_drop = ["audio", "album", "title", "seg_id"]

my_dataset_encoded = my_dataset.map(
    preprocess_function,
    batched=True,
    batch_size=100,
    num_proc=1,
    remove_columns=columns_to_drop,
).rename_column("artist", "label")

In [9]:
# Build the id-to-label and label-to-id mappings that are stored in the model
# config.

# Keys are the stringified class indices; the label text comes from the
# dataset's class-label feature via `id2label_fn`.
id2label = {
    str(i): id2label_fn(i)
    for i in range(len(my_dataset_encoded["train"].features["label"].names))
}
# label2id is documented as a label -> int map, so store integer ids rather
# than reusing the string keys of id2label.
label2id = {label: int(idx) for idx, label in id2label.items()}

id2label

{'0': 'Epik High',
 '1': 'Fall Out Boy',
 '2': 'Madeon',
 '3': 'Mika',
 '4': 'Mr.Big',
 '5': 'Muse',
 '6': 'Rage Against the Machine',
 '7': 'Red Hot Chili Peppers',
 '8': 'Suede',
 '9': '브로콜리 너마저'}

# 사전 학습된(Pretrained) DistilHuBERT 모델 불러오기

In [10]:
from transformers import AutoModelForAudioClassification

# One output class per artist in the label mapping.
num_labels = len(id2label)

# Load the pretrained checkpoint with an audio-classification head sized for
# our labels; the label mappings are passed so they end up in the model config.
model = AutoModelForAudioClassification.from_pretrained(
    model_id,
    id2label=id2label,
    label2id=label2id,
    num_labels=num_labels,
)

Some weights of HubertForSequenceClassification were not initialized from the model checkpoint at ntu-spml/distilhubert and are newly initialized: ['classifier.bias', 'classifier.weight', 'encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'encoder.pos_conv_embed.conv.parametrizations.weight.original1', 'projector.bias', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:
from transformers import TrainingArguments
from hyperparameters import HyperParameters as H

# Last path component of the checkpoint id, e.g. "distilhubert".
model_name = model_id.rsplit("/", 1)[-1]

# Directory where checkpoints are written during training.
output_dir = f"{model_name}-finetuned-hoons_music"

# Hyperparameters come from the project's `HyperParameters` class; evaluation
# and checkpointing both happen once per epoch so the checkpoint with the best
# accuracy can be restored when training ends.
training_args = TrainingArguments(
    output_dir,
    # Schedule: evaluate and save at the end of every epoch.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    num_train_epochs=H.NUM_TRAIN_EPOCHS,
    # Optimisation.
    learning_rate=H.LEARNING_RATE,
    warmup_ratio=H.WARMUP_RATIO,
    per_device_train_batch_size=H.BATCH_SIZE,
    per_device_eval_batch_size=H.BATCH_SIZE,
    gradient_accumulation_steps=H.GRADIENT_ACCUMULATION_STEPS,
    fp16=True,
    # Logging and model selection.
    logging_steps=5,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    # Upload the result to the Hugging Face Hub.
    push_to_hub=True,
)



In [12]:
import evaluate
import numpy as np

# Accuracy metric from the `evaluate` library; used by compute_metrics below.
metric = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    """Return the accuracy for one set of evaluation predictions.

    Args:
        eval_pred: Object exposing ``predictions`` (per-class scores, with the
            classes along axis 1) and ``label_ids`` (the reference labels).

    Returns:
        The result of ``metric.compute`` for the arg-max class predictions
        against the reference labels.
    """
    # The predicted class is the index of the highest score in each row.
    predicted_ids = np.argmax(eval_pred.predictions, axis=1)
    reference_ids = eval_pred.label_ids
    return metric.compute(predictions=predicted_ids, references=reference_ids)

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

In [13]:
# Switch the encoded dataset's output format to 'torch' before training.
# NOTE: this call modifies my_dataset_encoded in place rather than returning a new object.
my_dataset_encoded.set_format(type='torch')

# Fine-tuning 시작

In [14]:
from transformers import Trainer

# Wire the model, training arguments, data splits and metric function together.
# The feature extractor is passed in the `tokenizer` slot.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=my_dataset_encoded["train"],
    eval_dataset=my_dataset_encoded["validation"],
    tokenizer=feature_extractor,
    compute_metrics=compute_metrics,
)

# Run fine-tuning; the returned training summary is shown as the cell output.
trainer.train()

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Epoch,Training Loss,Validation Loss,Accuracy
1,1.6265,1.765206,0.379167
2,0.9028,1.077231,0.647917
3,0.3958,0.785743,0.78125
4,0.2335,0.560094,0.852083
5,0.1384,0.804223,0.822917
6,0.0517,0.703061,0.85
7,0.0025,0.726057,0.847917
8,0.0018,0.710324,0.85
9,0.0015,0.732874,0.845833
10,0.0015,0.730689,0.84375


TrainOutput(global_step=2980, training_loss=0.406124652582127, metrics={'train_runtime': 982.5478, 'train_samples_per_second': 24.223, 'train_steps_per_second': 3.033, 'total_flos': 5.41301152896e+17, 'train_loss': 0.406124652582127, 'epoch': 10.0})

# Hugging Face Hub에 모델 정보 업로드

In [15]:
# Metadata passed along when pushing the trained model to the Hub.
kwargs = {
    "dataset_tags": dataset_id,
    "dataset": "Hoons music data",
    # Must match the training output directory / Hub repository name
    # ("...-finetuned-hoons_music"); the earlier "hoon_music" spelling was a typo.
    "model_name": f"{model_name}-finetuned-hoons_music",
    "finetuned_from": model_id,
    "tasks": "audio-classification",
}
trainer.push_to_hub(**kwargs)

model.safetensors:   0%|          | 0.00/94.8M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/Hoonvolution/distilhubert-finetuned-hoons_music/commit/713ebc221c9d08a02c029c9e3f2ae407e0f6c670', commit_message='End of training', commit_description='', oid='713ebc221c9d08a02c029c9e3f2ae407e0f6c670', pr_url=None, pr_revision=None, pr_num=None)