<a href="https://colab.research.google.com/github/jhyeon-kim/ai_study/blob/main/%EC%9D%8C%EC%95%85_%EA%B0%90%EC%A0%95_%EB%B6%84%EB%A5%98_CNN_(ds%EB%B3%91%ED%95%A9).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install torchaudio pandas



In [2]:
from google.colab import drive

# Google Drive 마운트
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import pandas as pd
import torchaudio
import os
from sklearn.model_selection import train_test_split


# 첫 번째 CSV 파일 경로 (MER_audio_taffc_dataset)
csv_file_path_1 = '/content/drive/MyDrive/sentiment_classification_projects_241014/music_sentiments_classification/MER_audio_taffc_dataset/songs_moods_labeled.csv'
# 두 번째 CSV 파일 경로 (OSF_IO)
csv_file_path_2 = '/content/drive/MyDrive/sentiment_classification_projects_241014/music_sentiments_classification/OSF_IO/set1_labeled.csv'

# 첫 번째 데이터셋 로드
data_1 = pd.read_csv(csv_file_path_1)
# 두 번째 데이터셋 로드
data_2 = pd.read_csv(csv_file_path_2)

# 첫 번째 데이터셋의 오디오 파일 경로 설정 함수 (MER_audio_taffc_dataset)
def get_audio_path_taffc(song, quadrant):
    return f'/content/drive/MyDrive/sentiment_classification_projects_241014/music_sentiments_classification/MER_audio_taffc_dataset/{quadrant}/{song}.mp3'

# 두 번째 데이터셋의 오디오 파일 경로 설정 함수 (OSF_IO)
def get_audio_path_osf(set, number):
    song_num_str = str(number).zfill(3)
    return f'/content/drive/MyDrive/sentiment_classification_projects_241014/music_sentiments_classification/OSF_IO/{set}/{song_num_str}.mp3'

# 첫 번째 데이터셋에 오디오 경로 추가
data_1['audio_path'] = data_1.apply(lambda row: get_audio_path_taffc(row['Song'], row['Quadrant']), axis=1)

# 두 번째 데이터셋에 오디오 경로 추가
data_2['audio_path'] = data_2.apply(lambda row: get_audio_path_osf("Set1", row['Nro']), axis=1)

# 두 데이터셋 병합
data_combined = pd.concat([data_1, data_2], ignore_index=True)


## 2. MelSpectrogram 전처리 및 데이터셋 준비

오디오 파일을 불러와서 CNN이 처리할 수 있도록 스펙트로그램으로 변환합니다.



In [34]:
import torchaudio.transforms as transforms
from torch.utils.data import Dataset, DataLoader
import torch

import random

class AudioSpectrogramDataset(Dataset):
    def __init__(self, dataframe, target_sample_rate=16000, segment_length=80000):  # 5초 = 80000 샘플
        self.data = dataframe
        self.labels = dataframe['label'].values
        self.audio_paths = dataframe['audio_path'].values
        self.mel_spectrogram = transforms.MelSpectrogram(sample_rate=target_sample_rate, n_mels=64)
        self.segment_length = segment_length  # 고정된 길이 5초
        self.segments = []
        self._prepare_segments()

    def _prepare_segments(self):
        # 각 파일을 5초 단위로 나누고 세그먼트 리스트를 만든다
        for idx, audio_path in enumerate(self.audio_paths):
            waveform, sample_rate = torchaudio.load(audio_path)
            label = self.labels[idx]
            num_segments = waveform.shape[1] // self.segment_length

            for segment_idx in range(num_segments):
                start = segment_idx * self.segment_length
                end = start + self.segment_length
                segment = waveform[:, start:end]

                # 스테레오 데이터를 모노로 변환
                if segment.shape[0] > 1:
                    segment = segment.mean(dim=0, keepdim=True)

                # 세그먼트와 레이블을 저장
                self.segments.append((segment, label))

    def __len__(self):
        return len(self.segments)

    def __getitem__(self, idx):
        segment, label = self.segments[idx]
        # MelSpectrogram 변환
        spec = self.mel_spectrogram(segment)
        return spec, label



AudioSpectrogramDataset은 각 오디오 파일을 불러와서 MelSpectrogram으로 변환한 후, CNN이 학습할 수 있는 형식으로 반환합니다.



In [35]:
import torch

def pad_spectrogram(spec, max_length):
    # spec: [1, n_mels, time]
    pad_size = max_length - spec.shape[-1]
    return torch.nn.functional.pad(spec, (0, pad_size), "constant", 0)  # 오른쪽으로 패딩

def collate_fn(batch):
    specs, labels = zip(*batch)

    # 각 배치의 가장 긴 스펙트로그램 길이 찾기
    max_length = max([spec.shape[-1] for spec in specs])

    # 패딩 적용하여 모든 스펙트로그램의 길이 맞추기
    padded_specs = [pad_spectrogram(spec, max_length) for spec in specs]

    # 텐서로 변환
    padded_specs = torch.stack(padded_specs)
    labels = torch.tensor(labels)

    return padded_specs, labels


In [36]:

# 병합된 데이터셋으로 AudioSpectrogramDataset 준비 (5초씩 조각내기)
dataset = AudioSpectrogramDataset(data_combined)

# 세그먼트를 섞기
random.shuffle(dataset.segments)

# Train/Validation Split (섞인 세그먼트로부터 분리)
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size

train_dataset, val_dataset = torch.utils.data.random_split(dataset, [train_size, val_size])

# DataLoader 설정
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False, collate_fn=collate_fn)


In [37]:
from collections import Counter

# Train/Validation Dataset 크기 출력
print(f"Train dataset size: {len(train_dataset)}")
print(f"Validation dataset size: {len(val_dataset)}")

# Train 데이터의 레이블 분포 출력
train_labels = [label for _, label in train_dataset]
train_label_count = Counter(train_labels)
print(f"Train label distribution: {train_label_count}")

# Validation 데이터의 레이블 분포 출력
val_labels = [label for _, label in val_dataset]
val_label_count = Counter(val_labels)
print(f"Validation label distribution: {val_label_count}")


Train dataset size: 8920
Validation dataset size: 2231
Train label distribution: Counter({2: 3382, 0: 2497, 1: 1143, 4: 967, 3: 931})
Validation label distribution: Counter({2: 879, 0: 593, 1: 284, 3: 241, 4: 234})


In [38]:
import torch.nn as nn
import torch.nn.functional as F
class CNNEmotionClassifier(nn.Module):
    def __init__(self, num_classes=5):  # 감정 분류 클래스 수
        super(CNNEmotionClassifier, self).__init__()

        # CNN Layer
        self.conv1 = nn.Conv2d(in_channels=1, out_channels=32, kernel_size=3, stride=1, padding=1)
        self.conv2 = nn.Conv2d(in_channels=32, out_channels=64, kernel_size=3, stride=1, padding=1)
        self.conv3 = nn.Conv2d(in_channels=64, out_channels=128, kernel_size=3, stride=1, padding=1)
        self.pool = nn.MaxPool2d(kernel_size=2, stride=2, padding=0)
        self.dropout = nn.Dropout(0.5)

        # Fully connected layer
        self.fc1 = nn.Linear(128 * 8 * 50, 256)  # 128 * 8 * 50 = 51200
        self.fc2 = nn.Linear(256, num_classes)  # 감정 분류 레이블 수

    def forward(self, x):
        # 입력 데이터 크기: [batch_size, 1, 64, time]
        x = self.pool(F.relu(self.conv1(x)))  # (N, 32, 64, time/2)
        x = self.pool(F.relu(self.conv2(x)))  # (N, 64, 32, time/4)
        x = self.pool(F.relu(self.conv3(x)))  # (N, 128, 16, time/8)

        # CNN 출력 후 크기 출력
        # print(f"After conv3: {x.shape}")

        # Flatten: Conv 레이어의 출력을 1D로 변환
        x = x.view(x.size(0), -1)  # Flatten (N, 128 * 8 * 50 = 51200)
        # print(f"After flatten: {x.shape}")

        # Fully connected layer
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        return x


In [41]:
import torch.optim as optim

# 모델 초기화
model = CNNEmotionClassifier(num_classes=5)  # 5개 감정 범주
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-4)


In [42]:
import torch
import os
# GPU가 사용 가능하면 GPU로 모델 이동
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# 모델을 저장할 디렉토리 설정
model_save_path = "/content/drive/MyDrive/sentiment_classification_projects_241014/music_sentiments_classification/sentiment_classification_cnn_model_4.pth"  # 원하는 경로로 수정

# 초기화
best_val_accuracy = 0.0  # 최고 검증 정확도를 추적

num_epochs = 10

for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    for batch in train_loader:
        inputs, labels = batch
        inputs, labels = inputs.to(device), labels.to(device)

        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    print(f'Epoch {epoch+1}, Loss: {running_loss / len(train_loader)}')

    # 검증 단계
    model.eval()
    val_loss = 0.0
    correct = 0
    total = 0
    with torch.no_grad():
        for batch in val_loader:
            inputs, labels = batch
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            val_loss += loss.item()
            _, predicted = torch.max(outputs, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    val_accuracy = 100 * correct / total
    print(f'Validation Loss: {val_loss / len(val_loader)}, Accuracy: {val_accuracy}')

    # 현재 에폭에서의 validation accuracy가 최고일 경우 모델 저장
    if val_accuracy > best_val_accuracy:
        best_val_accuracy = val_accuracy
        print(f'New best accuracy: {best_val_accuracy}. Saving model...')
        torch.save(model.state_dict(), model_save_path)

print(f"Training completed. Best validation accuracy: {best_val_accuracy}")


Epoch 1, Loss: 1.6767267875346659
Validation Loss: 1.3496027307850973, Accuracy: 47.24338861497087
New best accuracy: 47.24338861497087. Saving model...
Epoch 2, Loss: 1.2229695108629042
Validation Loss: 1.3458802159343446, Accuracy: 44.55401165396683
Epoch 3, Loss: 1.0215396370511756
Validation Loss: 1.3899811110326221, Accuracy: 46.43657552666966
Epoch 4, Loss: 0.8058919799797851
Validation Loss: 1.5638098912579672, Accuracy: 46.122814881219185
Epoch 5, Loss: 0.6239697207366267
Validation Loss: 1.7123533521379743, Accuracy: 47.51232631107127
New best accuracy: 47.51232631107127. Saving model...
Epoch 6, Loss: 0.48726363409991547
Validation Loss: 2.1730080710990087, Accuracy: 42.98520842671448
Epoch 7, Loss: 0.3825800475133683
Validation Loss: 2.2954503655433656, Accuracy: 46.03316898251905
Epoch 8, Loss: 0.35791355027653626
Validation Loss: 2.599057109441076, Accuracy: 45.27117884356791
Epoch 9, Loss: 0.26688980514110205
Validation Loss: 2.8748591410262243, Accuracy: 39.8924249215598

In [43]:
!pip install huggingface_hub



In [44]:
from huggingface_hub import notebook_login

notebook_login()


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [45]:
from huggingface_hub import HfApi, HfFolder, upload_file

# 모델 업로드 경로 설정
repository_id = "jeonghyeon97/music_emontion_classifier_4"

# Hugging Face Hub에 새 저장소 생성
api = HfApi()
api.create_repo(repo_id=repository_id)

# 모델 업로드
model_path = model_save_path  # 저장된 모델 경로
upload_file(
    path_or_fileobj=model_path,
    path_in_repo="pytorch_model.bin",  # 저장소에 저장될 파일명
    repo_id=repository_id
)

# 모델 설명을 추가하기 위해 README 파일 업로드
model_card_content = """
# CNN Emotion Classifier

This model was trained on music emotion classification using a CNN architecture.
"""
with open("README.md", "w") as f:
    f.write(model_card_content)

upload_file(
    path_or_fileobj="README.md",
    path_in_repo="README.md",  # README 파일 업로드
    repo_id=repository_id
)

print(f"Model uploaded to: https://huggingface.co/{repository_id}")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


sentiment_classification_cnn_model_4.pth:   0%|          | 0.00/52.8M [00:00<?, ?B/s]

- empty or missing yaml metadata in repo card


Model uploaded to: https://huggingface.co/jeonghyeon97/music_emontion_classifier_4
