## 0) 필요한 라이브러리 설치 및 Import

In [None]:
!pip install -q torchvision sklearn

# import
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from PIL import Image
from sklearn.metrics import classification_report, roc_auc_score, roc_curve, confusion_matrix

import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, models

## 1) 캐글에서 데이터 다운로드

In [None]:
# Download the chest X-ray pneumonia dataset from Kaggle into /content.
from google.colab import files
files.upload()  # upload the Kaggle API token file (kaggle.json) into the working directory

# Copy the uploaded token into ~/.kaggle and restrict it to owner read/write (mode 600).
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

# Fetch the dataset archive and extract it quietly under /content.
!kaggle datasets download -d paultimothymooney/chest-xray-pneumonia
!unzip -q chest-xray-pneumonia.zip -d /content

# Remove the duplicated nested copy (only if it exists): move its contents up
# one level, then delete the nested directory. Errors from either command are
# discarded via 2>/dev/null.
!mv /content/chest_xray/chest_xray/* /content/chest_xray/ 2>/dev/null
!rm -r /content/chest_xray/chest_xray 2>/dev/null

## 2) EDA

In [None]:
# Split name -> CSV file for that split; each CSV is read below for its
# 'label' column.
csv_paths = {
    'train': '/content/DATA/csv/train_list.csv',
    'val': '/content/DATA/csv/val_list.csv',
    'test': '/content/DATA/csv/test_list.csv',
}
# Alias: the dict was originally named `csv_files` while the loops iterate
# `csv_paths`; keep both names bound so either spelling works.
csv_files = csv_paths

# Per-split bar chart of how many rows carry each label.

# One subplot row per split (keyword is `nrows`, not `nrow`).
fig, axes = plt.subplots(nrows=3, ncols=1, figsize=(8, 12))
# Figure-level title: Figure has `suptitle`, not `title`.
# NOTE(review): the title says "CT" — confirm the imaging modality of the
# files listed in these CSVs.
fig.suptitle('Histogram of CT image datasets', fontsize=16)
colors = ['skyblue', 'orange']

for idx, (split, path) in enumerate(csv_paths.items()):
    df = pd.read_csv(path)

    # Count rows per label; sort_index gives a stable label order so the
    # counts line up with the two tick labels passed to `bar`.
    label_counts = df['label'].value_counts().sort_index()

    # Draw the bar chart for this split.
    ax = axes[idx]
    ax.bar(['Normal (0)', 'Pneumonia (1)'], label_counts, color=colors)
    ax.set_title(f'{split} Set')
    ax.set_ylabel('Count')
    # 20% headroom above the tallest bar so the count annotation fits.
    ax.set_ylim(0, max(label_counts) * 1.2)

    # Annotate each bar with its count.
    for i, count in enumerate(label_counts):
        ax.text(i, count + 5, str(count), ha='center', va='bottom')

# Leave the top 5% of the figure free for the suptitle.
plt.tight_layout(rect=[0, 0, 1, 0.95])
plt.show()

- train 데이터의 클래스 불균형이 심하다. 따라서 augmentation을 통해 클래스별 데이터 개수의 균형을 맞추는 것이 중요하다.


In [None]:
# Preview: show one sample image per (split, label) pair on a 3x2 grid —
# one row per split, one column per label.

preview_labels = ["NORMAL", "PNEUMONIA"]

# Subplot grid: 3 rows x 2 columns.
fig, axes = plt.subplots(3, 2, figsize=(10, 10))
axes = axes.flatten()  # flatten to 1-D so each cell has a single index

# `csv_files` is the split -> CSV path dict defined in the EDA cell.
for row, (split, path) in enumerate(csv_files.items()):
    df = pd.read_csv(path)  # read once per split rather than once per label
    for col, lb in enumerate(preview_labels):
        # Explicit cell index; the loop must not rely on a stale `i` left
        # over from an earlier cell.
        ax = axes[row * len(preview_labels) + col]
        ax.axis('off')

        sample_df = df.loc[df.label == lb, ['path', 'label']]
        if sample_df.empty:
            # Nothing to preview for this label in this split.
            ax.set_title(f'Label: {lb} (no samples)')
            continue

        # Positional access: after filtering, index label 0 may not exist,
        # so `.loc[0, ...]` would raise KeyError.
        first = sample_df.iloc[0]
        img_path = first['path']
        label = first['label']

        # Load and draw the image; imshow copies the pixel data, so the
        # file can be closed right after.
        with Image.open(img_path) as image:
            ax.imshow(image, cmap='gray')
        ax.set_title(f'Label: {label}')

plt.tight_layout()
plt.show()

In [None]:
# dataloader

class CSVImageDataset(Dataset):
    """Image dataset driven by a CSV listing.

    The CSV is loaded with ``pd.read_csv``; each row supplies a ``path``
    column (image file) and a ``label`` column whose value is looked up in
    ``label_map`` (``'NORMAL'`` -> 0, ``'PNEUMONIA'`` -> 1).
    """

    def __init__(self, csv_path, transform=None):
        """Load the listing at ``csv_path`` and remember the optional transform."""
        # Mapping from the string label stored in the CSV to an integer class id.
        self.label_map = {'NORMAL': 0, 'PNEUMONIA': 1}
        self.transform = transform
        self.df = pd.read_csv(csv_path)

    def __len__(self):
        """Return the number of rows in the listing."""
        return self.df.shape[0]

    def __getitem__(self, idx):
        """Return ``(image, label)`` for the row at position ``idx``.

        The image is opened and converted to single-channel mode ``'L'``;
        ``transform`` is applied to it when one was given.
        """
        row = self.df.iloc[idx]
        image_path, label_str = row['path'], row['label']
        target = self.label_map[label_str]

        # Load as grayscale.
        picture = Image.open(image_path).convert('L')
        if not self.transform:
            return picture, target
        return self.transform(picture), target