# Import

In [11]:
import os
import random
import sys

# sys.path.append(
#     os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
# )

import pandas as pd
import numpy as np

from PIL import Image
from tqdm import tqdm 

from sklearn.model_selection import train_test_split

import torchvision.models as tv_models
import timm

import torch
from torch.utils.data import Dataset, DataLoader, Subset
import torchvision.transforms as transforms
import torch.nn.functional as F
from torch import nn, optim

from sklearn.metrics import log_loss
import unicodedata

# Run on the GPU when PyTorch can see one; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print("Using device:", device)

Using device: cuda


# Hyperparameter Setting

In [12]:
# Central hyperparameter table read by the transforms, data loaders and training loop.
CFG = dict(
    IMG_SIZE=224,        # side length (pixels) every image is resized to
    BATCH_SIZE=64,       # batch size handed to each DataLoader
    EPOCHS=20,           # upper bound on training epochs per backbone
    LEARNING_RATE=1e-4,  # initial learning rate for Adam
    SEED=42,             # value passed to seed_everything
)

# Fixed RandomSeed

In [13]:
def seed_everything(seed):
    """Seed every random number generator used in this notebook.

    Seeds Python's ``random``, NumPy's global generator and PyTorch (CPU and
    the current CUDA device), exports ``PYTHONHASHSEED``, and switches cuDNN
    to deterministic mode with benchmarking disabled.

    Args:
        seed: Integer seed applied to all generators.
    """
    # Exported as a string because environment variables only hold text.
    os.environ['PYTHONHASHSEED'] = str(seed)

    for apply_seed in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
        apply_seed(seed)

    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

seed_everything(CFG['SEED']) # Fix the seed for the whole notebook

RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


# CustomDataset

In [4]:
class CustomImageDataset(Dataset):
    """Image dataset read from a folder on disk.

    Two layouts are supported:

    * ``is_test=False``: ``root_dir`` contains one sub-directory per class and
      each sub-directory contains that class's images. Items are
      ``(image, label)`` where ``label`` is the class's position in the
      sorted list of class directory names.
    * ``is_test=True``: ``root_dir`` directly contains the images. Items are
      just ``image``.

    Args:
        root_dir: Directory to scan.
        transform: Optional callable applied to the RGB ``PIL.Image``.
        is_test: Selects the unlabelled layout described above.
        extensions: Lower-case file suffixes accepted as images. The default
            keeps the original behaviour of accepting only ``.jpg`` files
            (matched case-insensitively).

    Attributes:
        samples: List of ``(path,)`` tuples for test data or ``(path, label)``
            tuples for training data, in sorted (deterministic) order.
        classes / class_to_idx: Only set when ``is_test`` is false.
    """

    def __init__(self, root_dir, transform=None, is_test=False, extensions=('.jpg',)):
        self.root_dir = root_dir
        self.transform = transform
        self.is_test = is_test
        self.extensions = tuple(ext.lower() for ext in extensions)
        self.samples = []

        if is_test:
            # Test set: store image paths only, there are no labels.
            for fname in self._list_images(root_dir):
                self.samples.append((os.path.join(root_dir, fname),))
        else:
            # Training set: labels come from the class-per-folder structure.
            # Only real, non-hidden directories count as classes; a stray file
            # (or a hidden folder such as `.ipynb_checkpoints`) would otherwise
            # crash the scan or shift every class index.
            self.classes = sorted(
                entry for entry in os.listdir(root_dir)
                if not entry.startswith('.')
                and os.path.isdir(os.path.join(root_dir, entry))
            )
            self.class_to_idx = {cls_name: i for i, cls_name in enumerate(self.classes)}

            for cls_name in self.classes:
                cls_folder = os.path.join(root_dir, cls_name)
                label = self.class_to_idx[cls_name]
                for fname in self._list_images(cls_folder):
                    self.samples.append((os.path.join(cls_folder, fname), label))

    def _list_images(self, folder):
        """Return the image file names in ``folder``, sorted for reproducibility."""
        # os.listdir order is filesystem-dependent; sorting makes sample indices
        # (and therefore any index-based split) stable across machines and runs.
        return sorted(
            fname for fname in os.listdir(folder)
            if fname.lower().endswith(self.extensions)
        )

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        sample = self.samples[idx]
        # Close the file handle as soon as the pixels have been decoded.
        with Image.open(sample[0]) as img:
            image = img.convert('RGB')
        if self.transform:
            image = self.transform(image)
        if self.is_test:
            return image
        return image, sample[1]


# Data Load

In [15]:
# Dataset locations, given relative to the notebook's working directory.
train_root, test_root = '../data/train', '../data/test'

In [None]:
# Augmentation-free training pipeline: resize, convert to tensor, normalise.
# NOTE(review): `train_transform` is assigned again in the following cell, so
# this definition only takes effect if that cell is skipped.
# The mean/std values are presumably the usual ImageNet channel statistics.
train_transform = transforms.Compose([
    transforms.Resize((CFG['IMG_SIZE'],) * 2),
    transforms.ToTensor(),
    transforms.Normalize(
        mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225],
    ),
])

In [16]:
# Training pipeline: geometric and colour augmentation on the PIL image,
# followed by tensor conversion and per-channel normalisation.
train_transform = transforms.Compose([
    transforms.Resize((CFG['IMG_SIZE'],) * 2),
    transforms.RandomHorizontalFlip(p=0.5),
    transforms.RandomVerticalFlip(p=0.2),
    transforms.RandomRotation(15),
    transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.1),
    transforms.RandomResizedCrop(CFG['IMG_SIZE'], scale=(0.8, 1.0)),
    transforms.RandomAffine(degrees=10, translate=(0.05, 0.05), scale=(0.95, 1.05)),
    transforms.GaussianBlur(kernel_size=(5, 9), sigma=(0.1, 5)),  # applied to the PIL image
    transforms.ToTensor(),  # everything above operates on the PIL image
    # transforms.RandomErasing(p=0.5, scale=(0.02, 0.33), ratio=(0.3, 3.3)),  # applied to the tensor
    transforms.Normalize(
        mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225],
    ),
])

# Validation / inference pipeline: the same resize and normalisation, with no
# random augmentation.
val_transform = transforms.Compose([
    transforms.Resize((CFG['IMG_SIZE'],) * 2),
    transforms.ToTensor(),
    transforms.Normalize(
        mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225],
    ),
])

# 30개 클래스만 샘플링해서 실험

In [17]:
# 전체 데이터셋 로드
full_dataset = CustomImageDataset(train_root, transform=None)
print(f"원본 데이터셋 크기: {len(full_dataset)}")

# 1. 30개 클래스 무작위 선택
all_classes = full_dataset.classes
np.random.seed(42)  # 재현성 보장
selected_classes = np.random.choice(all_classes, size=30, replace=False)

# 2. 선택된 클래스의 인덱스 추출
selected_class_indices = [full_dataset.class_to_idx[cls] for cls in selected_classes]

# 3. 선택된 클래스에 해당하는 샘플 인덱스 추출
selected_sample_indices = [
    i for i, (_, label) in enumerate(full_dataset.samples) 
    if label in selected_class_indices
]

# 4. 선택된 샘플의 라벨 추출 (stratified split을 위해)
selected_labels = [full_dataset.samples[i][1] for i in selected_sample_indices]

# 5. Stratified Split 수행 (80:20 비율)
train_idx, val_idx = train_test_split(
    selected_sample_indices,
    test_size=0.2,
    stratify=selected_labels,
    random_state=42
)

# 6. 서브셋 생성 (트랜스포메이션 적용)
train_dataset = Subset(
    CustomImageDataset(train_root, transform=train_transform), 
    train_idx
)
val_dataset = Subset(
    CustomImageDataset(train_root, transform=val_transform), 
    val_idx
)

print(f"선택 클래스 수: {len(selected_classes)}")
print(f"선택 샘플 수: {len(selected_sample_indices)}")
print(f"학습 데이터 크기: {len(train_dataset)}, 검증 데이터 크기: {len(val_dataset)}")
class_names = selected_classes

# 7. DataLoader 생성
train_loader = DataLoader(train_dataset, batch_size=CFG['BATCH_SIZE'], shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=CFG['BATCH_SIZE'], shuffle=False)


원본 데이터셋 크기: 33137
선택 클래스 수: 30
선택 샘플 수: 2474
학습 데이터 크기: 1979, 검증 데이터 크기: 495


# 원본 - 시간이 너무 오래 걸려서 위의 30개 클래스 샘플로 실험할 것

In [27]:
# 전체 데이터셋 로드
full_dataset = CustomImageDataset(train_root, transform=None)
print(f"총 이미지 수: {len(full_dataset)}")

targets = [label for _, label in full_dataset.samples]
class_names = full_dataset.classes

# Stratified Split
train_idx, val_idx = train_test_split(
    range(len(targets)), test_size=0.2, stratify=targets, random_state=42
)

# Subset + transform 각각 적용
train_dataset = Subset(CustomImageDataset(train_root, transform=train_transform), train_idx)
val_dataset = Subset(CustomImageDataset(train_root, transform=val_transform), val_idx)
print(f'train 이미지 수: {len(train_dataset)}, valid 이미지 수: {len(val_dataset)}')


# DataLoader 정의
train_loader = DataLoader(train_dataset, batch_size=CFG['BATCH_SIZE'], shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=CFG['BATCH_SIZE'], shuffle=False)

총 이미지 수: 33137
train 이미지 수: 26509, valid 이미지 수: 6628


# baseModel Define : resnet18

In [38]:
class BaseModel(nn.Module):
    """Pretrained ResNet-18 feature extractor followed by one linear classifier.

    Args:
        num_classes: Number of output logits produced by the head.
    """

    def __init__(self, num_classes):
        super().__init__()
        resnet = tv_models.resnet18(pretrained=True)
        # Remember the width of the original classifier input, then replace
        # the classifier so the backbone returns raw features.
        self.feature_dim = resnet.fc.in_features
        resnet.fc = nn.Identity()
        self.backbone = resnet
        self.head = nn.Linear(self.feature_dim, num_classes)

    def forward(self, x):
        """Return class logits for the image batch ``x``."""
        return self.head(self.backbone(x))


model = BaseModel(num_classes=len(class_names)).to(device)




# base4model.py

In [18]:
class BaseModel(nn.Module):
    """Backbone plus linear head, selectable by backbone name.

    Names starting with ``resnet`` or ``densenet`` are looked up in
    ``torchvision.models``; names starting with ``efficientnet`` are created
    through ``timm``. In every case the backbone's own classifier layer is
    replaced with ``nn.Identity`` and a new ``nn.Linear`` head is attached.

    Args:
        num_classes: Number of output logits.
        backbone_name: Architecture name (case-insensitive).
        pretrained: Forwarded to the backbone constructor.

    Raises:
        ValueError: If ``backbone_name`` matches none of the supported families.
    """

    def __init__(self, num_classes, backbone_name='resnet18', pretrained=True):
        super().__init__()
        self.backbone_name = backbone_name.lower()
        name = self.backbone_name

        # Build the backbone and note which attribute holds its classifier.
        if name.startswith('resnet'):
            backbone = getattr(tv_models, name)(pretrained=pretrained)
            classifier_attr = 'fc'
        elif name.startswith('efficientnet'):
            backbone = timm.create_model(name, pretrained=pretrained)
            classifier_attr = 'classifier'
        elif name.startswith('densenet'):
            backbone = getattr(tv_models, name)(pretrained=pretrained)
            classifier_attr = 'classifier'
        else:
            raise ValueError(f"지원하지 않는 backbone: {backbone_name}")

        self.backbone = backbone
        self.feature_dim = getattr(backbone, classifier_attr).in_features
        # Strip the original classifier so the backbone outputs features only.
        setattr(backbone, classifier_attr, nn.Identity())
        self.head = nn.Linear(self.feature_dim, num_classes)

    def forward(self, x):
        """Return class logits for the image batch ``x``."""
        return self.head(self.backbone(x))
    
# Backbones that the training cell iterates over, in this order.
# Not enabled: 'vit_base_patch16_224' (kept as an idea for a newer-architecture
# experiment; BaseModel has no branch for it and would raise ValueError).
models = ['resnet18', 'densenet121', 'resnet101', 'efficientnet_b3', 'efficientnet_b4']


# Train/ Validation

In [19]:
# Train every backbone in `models` from scratch (fresh model, optimizer and
# scheduler each time), validating after every epoch, checkpointing whenever
# the validation log-loss improves, and stopping early after `patience`
# epochs without improvement.
# NOTE(review): the targets fed to CrossEntropyLoss must lie in
# [0, len(class_names)). The "device-side assert" recorded under this cell is
# consistent with out-of-range labels coming from the data cell — verify.
for backbone_name in models:
    model = BaseModel(num_classes=len(class_names), backbone_name = backbone_name).to(device)

    print(f"\n🚀 Training Model: {backbone_name} \n\n")
    # Best validation log-loss seen so far for this backbone.
    best_logloss = float('inf')

    # Early-stopping bookkeeping: epochs allowed without improvement / current streak.
    patience = 5
    no_improve = 0

    # Loss function
    criterion = nn.CrossEntropyLoss()

    # Optimizer
    optimizer = optim.Adam(model.parameters(), lr=CFG['LEARNING_RATE'])

    # Scheduler ('min' mode; stepped with the validation log-loss below)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', patience=3)

    # Training and validation loop
    for epoch in range(CFG['EPOCHS']):
        # Train
        model.train()
        train_loss = 0.0
        for images, labels in tqdm(train_loader, desc=f"[{backbone_name} Epoch {epoch+1}/Train]"):
            images, labels = images.to(device), labels.to(device)
            optimizer.zero_grad()
            outputs = model(images)  # logits
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            train_loss += loss.item()

        # Mean of the per-batch losses.
        avg_train_loss = train_loss / len(train_loader)

        # Validation
        model.eval()
        val_loss = 0.0
        correct = 0
        total = 0
        all_probs = []
        all_labels = []

        with torch.no_grad():
            for images, labels in tqdm(val_loader, desc=f"[{backbone_name} Epoch {epoch+1}/Val]"):
                images, labels = images.to(device), labels.to(device)
                outputs = model(images)
                loss = criterion(outputs, labels)
                val_loss += loss.item()

                # Accuracy
                _, preds = torch.max(outputs, 1)
                correct += (preds == labels).sum().item()
                total += labels.size(0)

                # LogLoss: collect per-sample probabilities and labels
                probs = F.softmax(outputs, dim=1)
                all_probs.extend(probs.cpu().numpy())
                all_labels.extend(labels.cpu().numpy())

        avg_val_loss = val_loss / len(val_loader)
        val_accuracy = 100 * correct / total
        # `labels=` pins the class set so classes absent from the validation
        # data do not change the metric's column layout.
        val_logloss = log_loss(all_labels, all_probs, labels=list(range(len(class_names))))

        # Report results
        print(f"[{backbone_name}] Epoch {epoch+1} | "
                f"Train Loss: {avg_train_loss:.4f} | "
                f"Val Loss: {avg_val_loss:.4f} | "
                f"Val LogLoss: {val_logloss:.4f} | "
                f"Valid Accuracy : {val_accuracy:.4f}%")

        scheduler.step(val_logloss)

        # Save the best model (one file per improving epoch; the epoch number is in the name)

        if val_logloss < best_logloss:
            best_logloss = val_logloss
            no_improve = 0
            torch.save(model.state_dict(), f"best_{backbone_name}_epoch{epoch+1}.pth")
            print(f"✅ Best model saved (LogLoss: {val_logloss:.4f})")
        else:
            no_improve += 1
            if no_improve >= patience:
                print(f"🛑 Early stopping at epoch {epoch+1}")
                break

    del model  # release the model after training this backbone
    torch.cuda.empty_cache()




RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


# 추가학습

In [None]:
## Work in progress: resume training from a saved checkpoint.


def validate(model, val_loader, criterion, device, desc="Val"):
    """Evaluate ``model`` on ``val_loader`` without tracking gradients.

    Args:
        model: Network producing class logits.
        val_loader: Iterable of ``(images, labels)`` batches.
        criterion: Loss applied to ``(logits, labels)``.
        device: Device the batches are moved to.
        desc: Progress-bar caption.

    Returns:
        Tuple ``(avg_val_loss, val_logloss, val_accuracy)``: the mean of the
        per-batch losses, the multi-class log-loss over all samples (class set
        fixed to ``range(len(class_names))``), and the accuracy in percent.
    """
    model.eval()
    val_loss = 0.0
    correct = 0
    total = 0
    all_probs = []
    all_labels = []

    with torch.no_grad():
        for images, labels in tqdm(val_loader, desc=desc):
            images, labels = images.to(device), labels.to(device)
            outputs = model(images)
            val_loss += criterion(outputs, labels).item()

            # Accuracy
            _, preds = torch.max(outputs, 1)
            correct += (preds == labels).sum().item()
            total += labels.size(0)

            # LogLoss inputs
            probs = F.softmax(outputs, dim=1)
            all_probs.extend(probs.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    avg_val_loss = val_loss / len(val_loader)
    val_accuracy = 100 * correct / total
    val_logloss = log_loss(all_labels, all_probs, labels=list(range(len(class_names))))
    return avg_val_loss, val_logloss, val_accuracy


# Load the checkpoint to continue from.
backbone_name = 'efficientnet_b4'
model = BaseModel(num_classes=len(class_names), backbone_name=backbone_name).to(device)
model.load_state_dict(torch.load('best_efficientnet_b4_epoch10.pth', map_location=device))

# Defined here so this cell does not depend on the training cell having run.
criterion = nn.CrossEntropyLoss()

# Fresh optimizer / scheduler.
optimizer = optim.Adam(model.parameters(), lr=CFG['LEARNING_RATE'])
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', patience=3)

# The baseline must be measured ONCE, before the loop. Re-initialising
# `best_logloss` / `no_improve` inside the loop would make every epoch compare
# against itself, so no checkpoint would ever be saved and early stopping
# could never trigger.
patience = 5
no_improve = 0
_, best_logloss, _ = validate(
    model, val_loader, criterion, device, desc=f"[{backbone_name} Resume/Val]"
)

print(f"\n🚀 Training Model: {backbone_name} \n\n")

# Continue training: epochs 11..20.
for epoch in range(10, 20):
    model.train()
    train_loss = 0.0
    for images, labels in tqdm(train_loader, desc=f"[{backbone_name} Epoch {epoch+1}/Train]"):
        images, labels = images.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = model(images)  # logits
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        train_loss += loss.item()

    avg_train_loss = train_loss / len(train_loader)

    # Validation
    avg_val_loss, val_logloss, val_accuracy = validate(
        model, val_loader, criterion, device, desc=f"[{backbone_name} Epoch {epoch+1}/Val]"
    )

    # Report results
    print(f"[{backbone_name}] Epoch {epoch+1} | "
            f"Train Loss: {avg_train_loss:.4f} | "
            f"Val Loss: {avg_val_loss:.4f} | "
            f"Val LogLoss: {val_logloss:.4f} | "
            f"Valid Accuracy : {val_accuracy:.4f}%")

    scheduler.step(val_logloss)

    # Save the best model
    if val_logloss < best_logloss:
        best_logloss = val_logloss
        no_improve = 0
        torch.save(model.state_dict(), f"best_{backbone_name}_epoch{epoch+1}.pth")
        print(f"✅ Best model saved (LogLoss: {val_logloss:.4f})")
    else:
        no_improve += 1
        if no_improve >= patience:
            print(f"🛑 Early stopping at epoch {epoch+1}")
            break

del model  # release the model once training is done
torch.cuda.empty_cache()

# Inference and submission

In [41]:
# Unlabelled test images, preprocessed with the validation pipeline and read
# in dataset order (no shuffling) so predictions line up with the file list.
test_dataset = CustomImageDataset(root_dir=test_root, transform=val_transform, is_test=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=CFG['BATCH_SIZE'], shuffle=False)

In [44]:

def normalize_cols(cols):
    """Return a list holding every name in ``cols`` converted to Unicode NFC form."""
    normalized = []
    for col in cols:
        normalized.append(unicodedata.normalize('NFC', col))
    return normalized


In [45]:

def normalize_cols(cols):
    """Convert each column name in ``cols`` to Unicode NFC and return them as a list."""
    return list(map(lambda name: unicodedata.normalize('NFC', name), cols))

# records = ['best_densenet121_epoch7.pth','best_efficientnet_b3_epoch10.pth',
        #    'best_efficientnet_b4_epoch10.pth','best_resnet101_epoch8.pth']
records = ['best_resnet18_epoch17.pth']
for record in records:
    # Recover the backbone name from 'best_<backbone>_epoch<N>.pth'. Splitting
    # on the last '_epoch' works for names with and without an underscore
    # (resnet18, densenet121, efficientnet_b3, ...), so no per-file whitelist
    # is needed.
    stem = record[len('best_'):] if record.startswith('best_') else record
    backbone_name = stem.rsplit('_epoch', 1)[0]

    # Load the saved model.
    model = BaseModel(num_classes=len(class_names), backbone_name=backbone_name)
    model.load_state_dict(torch.load(record, map_location=device))
    model.to(device)

    # Inference: gather one probability matrix per batch.
    model.eval()
    batch_probs = []

    with torch.no_grad():
        for images in test_loader:
            images = images.to(device)
            outputs = model(images)
            probs = F.softmax(outputs, dim=1)
            # float64 matches the Python floats the per-element version produced.
            batch_probs.append(probs.cpu().numpy().astype(np.float64))

    # One row per test image, one column per class (names in NFC form).
    pred = pd.DataFrame(
        np.concatenate(batch_probs, axis=0),
        columns=normalize_cols(class_names),
    )

    submission = pd.read_csv('../data/sample_submission.csv', encoding='utf-8-sig')

    # Class columns are everything except the leading 'ID' column. Keep the
    # file's ORIGINAL labels for the assignment and use NFC-normalised copies
    # only to align `pred`. Assigning with labels that differ from the existing
    # ones would append new columns instead of filling the sample columns.
    submission_columns = list(submission.columns[1:])
    class_columns = normalize_cols(submission_columns)

    missing = sorted(set(class_columns) - set(pred.columns))
    if missing:
        raise ValueError(f"예측 결과에 없는 클래스 컬럼: {missing}")
    if len(pred) != len(submission):
        raise ValueError(
            f"예측 행 개수({len(pred)})가 제출 파일 행 개수({len(submission)})와 일치하지 않습니다."
        )

    pred = pred[class_columns]

    submission[submission_columns] = pred.values
    submission.to_csv(f'{record}.csv', index=False, encoding='utf-8-sig')



In [18]:
# Show the first rows of the predictions and of the filled submission.
display(pred.head(), submission.head())

Unnamed: 0,1시리즈_F20_2013_2015,1시리즈_F20_2016_2019,1시리즈_F40_2020_2024,2008_2015_2017,2시리즈_그란쿠페_F44_2020_2024,2시리즈_액티브_투어러_F45_2019_2021,2시리즈_액티브_투어러_U06_2022_2024,3008_2세대_2018_2023,3시리즈_E90_2005_2012,3시리즈_F30_2013_2018,...,티볼리_에어_2021_2022,파나메라_2010_2016,파나메라_971_2017_2023,파사트_GT_B8_2018_2022,파일럿_3세대_2016_2018,팰리세이드_2019_2022,팰리세이드_LX3_2025,프리우스_4세대_2016_2018,프리우스_4세대_2019_2022,프리우스_C_2018_2020
0,7.511935e-07,1.445385e-06,1.592065e-07,4.357784e-06,3.298061e-07,2e-06,7.070755e-07,7.793965e-07,2.955839e-07,1e-06,...,5.70489e-07,2e-06,8.812796e-07,1e-06,2.839943e-06,3e-06,1.176699e-05,2e-06,1.2e-05,1.151721e-06
1,5.752513e-05,9.487465e-05,0.0001280151,3.506709e-05,3.918644e-05,3.8e-05,0.000632635,7.821716e-05,0.0001066425,3.1e-05,...,7.235167e-05,1.5e-05,1.742213e-05,1.5e-05,2.497383e-05,0.000106,5.077703e-05,1e-05,4.1e-05,1.293226e-05
2,5.629503e-05,1.849107e-05,0.0001507656,6.678894e-06,6.522421e-05,3e-06,0.0002378862,1.058777e-05,5.987513e-07,1e-05,...,1.142872e-05,8e-06,0.0001057245,1.4e-05,2.381332e-05,5e-06,1.378176e-05,3e-06,2.9e-05,2.411537e-06
3,3.519213e-06,1.676211e-07,1.582139e-06,5.28335e-07,2.171661e-06,2e-06,2.027208e-05,3.179571e-06,2.785918e-06,4e-06,...,3.961595e-07,4e-06,3.668167e-05,8e-06,8.54951e-07,1.9e-05,1.212071e-05,2e-06,6e-06,5.782074e-07
4,0.0001922888,0.005125251,4.367732e-05,5.357838e-06,1.428535e-05,9.8e-05,1.358307e-06,1.325173e-06,5.119073e-05,0.000526,...,1.372026e-06,6e-05,1.196302e-06,9e-06,3.170576e-07,4e-06,4.8737e-07,6e-06,2e-06,6.729611e-06


Unnamed: 0,ID,1시리즈_F20_2013_2015,1시리즈_F20_2016_2019,1시리즈_F40_2020_2024,2008_2015_2017,2시리즈_그란쿠페_F44_2020_2024,2시리즈_액티브_투어러_F45_2019_2021,2시리즈_액티브_투어러_U06_2022_2024,3008_2세대_2018_2023,3시리즈_E90_2005_2012,...,티볼리_에어_2021_2022,파나메라_2010_2016,파나메라_971_2017_2023,파사트_GT_B8_2018_2022,파일럿_3세대_2016_2018,팰리세이드_2019_2022,팰리세이드_LX3_2025,프리우스_4세대_2016_2018,프리우스_4세대_2019_2022,프리우스_C_2018_2020
0,TEST_00000,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,TEST_00001,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,TEST_00002,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,TEST_00003,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,TEST_00004,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Submission

In [15]:
# Every column after the first one is treated as a class column.
class_columns = submission.columns[1:]

# Diagnostic: inspect the class column labels.
print(class_columns)

Index(['1시리즈_F20_2013_2015', '1시리즈_F20_2016_2019', '1시리즈_F40_2020_2024',
       '2008_2015_2017', '2시리즈_그란쿠페_F44_2020_2024',
       '2시리즈_액티브_투어러_F45_2019_2021', '2시리즈_액티브_투어러_U06_2022_2024',
       '3008_2세대_2018_2023', '3시리즈_E90_2005_2012', '3시리즈_F30_2013_2018',
       ...
       '티볼리_에어_2021_2022', '파나메라_2010_2016', '파나메라_971_2017_2023',
       '파사트_GT_B8_2018_2022', '파일럿_3세대_2016_2018', '팰리세이드_2019_2022',
       '팰리세이드_LX3_2025', '프리우스_4세대_2016_2018', '프리우스_4세대_2019_2022',
       '프리우스_C_2018_2020'],
      dtype='object', length=396)


In [16]:
# Diagnostic: print both label sets to compare them by eye.
print(pred.columns)
print(class_columns)


Index(['1시리즈_F20_2013_2015', '1시리즈_F20_2016_2019',
       '1시리즈_F40_2020_2024', '2008_2015_2017',
       '2시리즈_그란쿠페_F44_2020_2024',
       '2시리즈_액티브_투어러_F45_2019_2021',
       '2시리즈_액티브_투어러_U06_2022_2024', '3008_2세대_2018_2023',
       '3시리즈_E90_2005_2012', '3시리즈_F30_2013_2018',
       ...
       '티볼리_에어_2021_2022', '파나메라_2010_2016',
       '파나메라_971_2017_2023', '파사트_GT_B8_2018_2022',
       '파일럿_3세대_2016_2018', '팰리세이드_2019_2022',
       '팰리세이드_LX3_2025', '프리우스_4세대_2016_2018',
       '프리우스_4세대_2019_2022', '프리우스_C_2018_2020'],
      dtype='object', length=396)
Index(['1시리즈_F20_2013_2015', '1시리즈_F20_2016_2019', '1시리즈_F40_2020_2024',
       '2008_2015_2017', '2시리즈_그란쿠페_F44_2020_2024',
       '2시리즈_액티브_투어러_F45_2019_2021', '2시리즈_액티브_투어러_U06_2022_2024',
       '3008_2세대_2018_2023', '3시리즈_E90_2005_2012', '3시리즈_F30_2013_2018',
       ...
       '티볼리_에어_2021_2022', '파나메라_2010_2016', '파나메라_971_2017_20

In [20]:
# Check that the prediction columns and the class columns agree in both
# content and order; otherwise list what each side is missing.
if list(pred.columns) != list(class_columns):
    print("컬럼명이 다릅니다!")
    print("pred에만 있는 컬럼:", set(pred.columns).difference(class_columns))
    print("class_columns에만 있는 컬럼:", set(class_columns).difference(pred.columns))
else:
    print("컬럼명이 완전히 일치합니다!")


컬럼명이 완전히 일치합니다!


In [None]:
import unicodedata


def normalize_cols(cols):
    """Return the names in ``cols`` as a list of Unicode NFC strings."""
    return [unicodedata.normalize('NFC', name) for name in cols]


# Put both label sets into the same Unicode form so they compare equal.
class_columns = normalize_cols(class_columns)
pred.columns = normalize_cols(pred.columns)


In [21]:
submission = pd.read_csv('../data/sample_submission.csv', encoding='utf-8-sig')

# Class columns are everything except the leading 'ID' column.
# Derive the normalised labels FROM the submission file. Normalising first and
# then overwriting `class_columns` with the raw labels would discard the
# normalisation. The original labels are kept for the assignment so the
# existing columns are filled rather than new ones appended.
submission_columns = list(submission.columns[1:])
pred.columns = normalize_cols(pred.columns)
class_columns = normalize_cols(submission_columns)
pred = pred[class_columns]

submission[submission_columns] = pred.values
submission.to_csv('baseline_submission.csv', index=False, encoding='utf-8-sig')

# Metric = LogLoss

In [None]:

import numpy as np
from sklearn.metrics import log_loss
import pandas as pd

def multiclass_log_loss(answer_df, submission_df):
    """Compute the multi-class log-loss of a submission against the answers.

    Args:
        answer_df: DataFrame with an ``ID`` column and a ``label`` column
            holding the true class name of each row.
        submission_df: DataFrame with an ``ID`` column and one probability
            column per class name that occurs in ``answer_df['label']``.

    Returns:
        The log-loss as returned by ``sklearn.metrics.log_loss``, computed on
        row-normalised probabilities clipped to ``[1e-15, 1 - 1e-15]``.

    Raises:
        ValueError: If the row counts differ, the IDs do not match after
            sorting, a class column is missing, a probability is NaN or
            outside ``[0, 1]``, or a row's probabilities sum to zero.
    """
    class_list = sorted(answer_df['label'].unique())

    if submission_df.shape[0] != answer_df.shape[0]:
        raise ValueError("submission_df 행 개수가 answer_df와 일치하지 않습니다.")

    # Align both frames on ID before comparing row by row.
    submission_df = submission_df.sort_values(by='ID').reset_index(drop=True)
    answer_df = answer_df.sort_values(by='ID').reset_index(drop=True)

    if not all(answer_df['ID'] == submission_df['ID']):
        raise ValueError("ID가 정렬되지 않았거나 불일치합니다.")

    missing_cols = [col for col in class_list if col not in submission_df.columns]
    if missing_cols:
        raise ValueError(f"클래스 컬럼 누락: {missing_cols}")

    if submission_df[class_list].isnull().any().any():
        raise ValueError("NaN 포함됨")
    for col in class_list:
        if not ((submission_df[col] >= 0) & (submission_df[col] <= 1)).all():
            raise ValueError(f"{col}의 확률값이 0~1 범위 초과")

    # Convert true labels to indices with a dict lookup instead of a linear
    # `list.index` scan per row.
    class_to_idx = {cls: idx for idx, cls in enumerate(class_list)}
    true_idx = [class_to_idx[lbl] for lbl in answer_df['label'].tolist()]

    # Normalise each row to sum to 1, then clip away exact 0/1.
    probs = submission_df[class_list].values
    row_sums = probs.sum(axis=1, keepdims=True)
    if (row_sums == 0).any():
        # Dividing by a zero row sum would silently turn the row into NaN.
        raise ValueError("확률 합이 0인 행이 있습니다.")
    probs = probs / row_sums
    y_pred = np.clip(probs, 1e-15, 1 - 1e-15)

    return log_loss(true_idx, y_pred, labels=list(range(len(class_list))))

# Example data with three classes: A, B and C.
answer_df = pd.DataFrame({'ID': [1, 2, 3], 'label': ['A', 'B', 'C']})

# One probability column per class; every row sums to 1.
submission_df = pd.DataFrame({
    'ID': [1, 2, 3],
    'A': [0.7, 0.1, 0.2],
    'B': [0.2, 0.8, 0.3],
    'C': [0.1, 0.1, 0.5],
})

# Run the metric on the example.
loss = multiclass_log_loss(answer_df, submission_df)
print(f"Log Loss: {loss:.6f}") # Log Loss: 0.424322