In [None]:
import numpy as np
import pandas as pd
import cv2
import matplotlib.pyplot as plt
import os

## 데이터 로드

In [None]:
# https://drive.google.com/file/d/1-HdykExDqZrR5FO9a0vl2c0koPqJB-8F/view?usp=sharing

import gdown, os, zipfile

file_id = '1-HdykExDqZrR5FO9a0vl2c0koPqJB-8F'
archive_path = 'file.zip'

# Skip the download when the archive is already on disk so that re-running
# the notebook does not fetch the whole dataset again.
if not os.path.exists(archive_path):
    gdown.download(f'https://drive.google.com/uc?id={file_id}', archive_path, quiet=False)

# Extraction target; named `extract_dir` so the builtin `dir` is not shadowed.
extract_dir = 'hair-loss'
os.makedirs(extract_dir, exist_ok=True)

with zipfile.ZipFile(archive_path, 'r') as z:
    z.extractall(extract_dir)

Downloading...
From (original): http://drive.google.com/uc?id=1-HdykExDqZrR5FO9a0vl2c0koPqJB-8F
From (redirected): https://drive.google.com/uc?id=1-HdykExDqZrR5FO9a0vl2c0koPqJB-8F&confirm=t&uuid=2b93ae95-9807-408c-886c-06c218a0a00d
To: /content/file.zip
100%|██████████| 1.25G/1.25G [00:07<00:00, 172MB/s]


In [None]:
def load_data(base_dir = 'hair-loss/Training'):
    """
    Collect image file paths and integer class labels from a directory tree.

    Every sub-directory of `base_dir` whose name is a decimal integer is
    treated as one class; the directory name is used as the label of every
    file inside it.

    Args:
        base_dir: directory that contains one sub-directory per class.

    Returns:
        (images, labels): `images` is a list of file paths and `labels` is an
        integer NumPy array of the same length, aligned with `images`.

    Notes:
        - Class directories and the files inside them are visited in sorted
          order, because `os.listdir` returns entries in arbitrary order and
          a seeded split downstream is only reproducible when the input
          order is stable.
        - Entries of `base_dir` that are not directories, or whose name is
          not a decimal integer (e.g. `.ipynb_checkpoints`), are skipped.
        - Only regular files are collected from a class directory.
    """
    images = []
    labels = []

    # Numeric class directories only, ordered by their integer value.
    class_names = sorted(
        (
            name for name in os.listdir(base_dir)
            if name.isdecimal() and os.path.isdir(os.path.join(base_dir, name))
        ),
        key=int,
    )

    for class_name in class_names:
        class_name_path = os.path.join(base_dir, class_name)
        label = int(class_name)

        for image_name in sorted(os.listdir(class_name_path)):
            image_path = os.path.join(class_name_path, image_name)
            # Nested directories are not images; skip them.
            if not os.path.isfile(image_path):
                continue
            images.append(image_path)
            labels.append(label)

    # Explicit dtype keeps the label array integer even when nothing was found.
    return images, np.array(labels, dtype=int)

# Gather every training image path together with its class label.
images, labels = load_data(base_dir='hair-loss/Training')

# One label per collected path.
print(labels.shape)

(5402,)


## 클래스별 개수 확인

In [None]:
# Distinct class labels and how many samples carry each of them.
classes, counts = np.unique(labels, return_counts=True)

# Print one line per class.
for position in range(len(classes)):
    cls, count = classes[position], counts[position]
    print(f"Class {cls}: {count}개")

Class 0: 534개
Class 1: 235개
Class 2: 3797개
Class 3: 836개


## 데이터 resize

In [None]:
import tensorflow as tf
import cv2

# Target size applied to every image
IMAGE_SIZE = (224, 224)

# Decoded images, accumulated in the same order as the path list
processed_images = []

# Read each path, convert to RGB and resize
for image_path in images:
    image = cv2.imread(image_path)
    # cv2.imread signals an unreadable or non-image file by returning None.
    # Fail here with the offending path instead of letting cvtColor raise an
    # opaque error; skipping the file would leave `labels` misaligned.
    if image is None:
        raise ValueError(f"Could not read image file: {image_path}")
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    image = cv2.resize(image, IMAGE_SIZE)
    processed_images.append(image)

# Final image array. Note: this rebinds `images` from a list of paths to a
# pixel array, so this cell cannot be re-run without re-running load_data().
images = np.array(processed_images)

print(images.shape)
print(labels.shape)

(5402, 224, 224, 3)
(5402,)


## 데이터 분할

In [None]:
from sklearn.model_selection import train_test_split
import numpy as np

# 1. (train + val) : test = 90 : 10, stratified by class label
tr_images, test_images, tr_labels, test_labels = train_test_split(
    images, labels, test_size=0.1, random_state=42, stratify=labels
)

# 2. train : val = 80 : 10 of the full dataset, i.e. hold out about 1/9
#    (0.111) of the remaining 90% as validation data
tr_images, val_images, tr_labels, val_labels = train_test_split(
    tr_images, tr_labels, test_size=0.111, random_state=42, stratify=tr_labels
)

# The split results are used directly: re-wrapping them with np.array() only
# produced another full copy of the image data.

# Report the resulting shapes
print(f"Train data: {tr_images.shape}, {tr_labels.shape}")
print(f'Validation data: {val_images.shape}, {val_labels.shape}')
print(f'Test data: {test_images.shape}, {test_labels.shape}')
# print(tr_images[0]) # NumPy array of pixel values

# Shapes recorded from a previous run:
#   Train data: (4321, 224, 224, 3), (4321,)
#   Validation data: (540, 224, 224, 3), (540,)
#   Test data: (541, 224, 224, 3), (541,)

Train data: (4321, 224, 224, 3), (4321,)
Validation data: (540, 224, 224, 3), (540,)
Test data: (541, 224, 224, 3), (541,)


'\nTrain data: (4321, 224, 224, 3), (4321,)\nValidation data: (540, 224, 224, 3), (540,)\nTest data: (541, 224, 224, 3), (541,)\n'

## Train data 증강 준비
- 클래스 별 데이터 분리
- 증강 설정 (crop, rotation, flip)
- 최종 증강 이미지 개수 설정

In [None]:
import os
import numpy as np
import albumentations as A
from tqdm import tqdm
import cv2

# Split the training pixel data into one array per class label (0..3)
class0_images, class1_images, class2_images, class3_images = (
    tr_images[tr_labels == class_id] for class_id in range(4)
)

# Augmentation pipeline: optional crop, optional shift/scale/rotate, both
# flips, then resize back to 224x224.
# NOTE(review): both flips use p=1, so they are applied to every augmented
# image - confirm this is intended rather than a random flip (e.g. p=0.5).
_augmentation_steps = [
    A.RandomCrop(height=200, width=200, p=0.5),
    A.ShiftScaleRotate(shift_limit=0.1, scale_limit=0.2, rotate_limit=10, p=0.7),
    A.HorizontalFlip(p=1),
    A.VerticalFlip(p=1),
    A.Resize(height=224, width=224, p=1),
]
augmentor = A.Compose(_augmentation_steps)

# Per-class values later passed as `count` to images_augment_save
# (class 2 is not augmented)
aug_class0_count = 2 * len(class0_images)
aug_class1_count = 5 * len(class1_images)
aug_class3_count = len(class3_images)


# Image saving helper (processes the input in chunks)
def save_images_batch(images, labels, class_dirs, prefix, batch_size=100,
                      start_index=0, show_progress=True):
    """
    Write images into their class directories as JPEG files.

    Args:
        images: sequence of 3-channel images in RGB order (the notebook
            converts to RGB right after reading the files).
        labels: sequence of class labels aligned with `images`; each label
            selects the output directory via `class_dirs[label]`.
        class_dirs: mapping from class label to output directory.
        prefix: file name prefix; files are named
            '{prefix}_image_{index}.jpg'.
        batch_size: number of images handled per progress-bar step.
        start_index: offset added to the file index, so that successive calls
            with the same prefix do not overwrite each other's files.
        show_progress: set to False to suppress the tqdm progress bar.

    Raises:
        IOError: if OpenCV reports that a file could not be written.
    """
    for i in tqdm(range(0, len(images), batch_size), disable=not show_progress):
        batch_images = images[i:i+batch_size]
        batch_labels = labels[i:i+batch_size]

        for j, (image, label) in enumerate(zip(batch_images, batch_labels)):
            class_dir = class_dirs[label]
            file_path = os.path.join(class_dir, f'{prefix}_image_{start_index + i + j}.jpg')
            # cv2.imwrite expects BGR channel order, while the pixel data in
            # this notebook is RGB; convert so the saved colors are correct.
            bgr_image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)
            # imwrite returns False on failure instead of raising.
            if not cv2.imwrite(file_path, bgr_image):
                raise IOError(f'Failed to write image: {file_path}')


# Root output directory
save_dir = 'aug_training'
os.makedirs(save_dir, exist_ok=True)

# One output directory per class
class_dirs = {0: os.path.join(save_dir, '0'),
              1: os.path.join(save_dir, '1'),
              2: os.path.join(save_dir, '2'),
              3: os.path.join(save_dir, '3')}

for class_label, class_path in class_dirs.items():
    os.makedirs(class_path, exist_ok=True)


# Augmentation function
def images_augment_save(images, labels, count, class_dirs, prefix, batch_size=100):
    """
    Create `count` augmented images and write them to disk in batches.

    Source images are taken from `images` in round-robin order, so every
    source contributes roughly `count / len(images)` augmented variants.
    Each augmented image gets its own file index; flushing a batch therefore
    never overwrites files written by an earlier batch.

    Args:
        images: source images of a single class.
        labels: labels for `images`; only `labels[0]` is used, since all
            images are expected to belong to the same class.
        count: total number of augmented images to generate.
        class_dirs: mapping from class label to output directory.
        prefix: file name prefix forwarded to save_images_batch.
        batch_size: number of augmented images held in memory before they
            are flushed to disk.
    """
    # Nothing to do without source images or with a non-positive target.
    if len(images) == 0 or count <= 0:
        return

    label = labels[0]
    augmented_images = []
    saved = 0  # number of files already written; next free file index

    for n in tqdm(range(count)):
        source = images[n % len(images)]
        augmented_images.append(augmentor(image=source)['image'])

        if len(augmented_images) >= batch_size:
            save_images_batch(augmented_images, [label] * len(augmented_images),
                              class_dirs, prefix, batch_size,
                              start_index=saved, show_progress=False)
            saved += len(augmented_images)
            augmented_images = []  # release the flushed batch

    # Flush whatever is left after the last full batch
    if augmented_images:
        save_images_batch(augmented_images, [label] * len(augmented_images),
                          class_dirs, prefix, batch_size,
                          start_index=saved, show_progress=False)


# Save the original training images of every class
for cls_id, cls_images in (
    (0, class0_images),
    (1, class1_images),
    (2, class2_images),
    (3, class3_images),
):
    save_images_batch(cls_images, [cls_id] * len(cls_images), class_dirs, 'original', batch_size=50)

# Save augmented images for classes 0, 1 and 3 (class 2 is not augmented)
for cls_id, cls_images, cls_count in (
    (0, class0_images, aug_class0_count),
    (1, class1_images, aug_class1_count),
    (3, class3_images, aug_class3_count),
):
    images_augment_save(cls_images, [cls_id] * len(cls_images), cls_count, class_dirs, 'aug', batch_size=50)



[1;30;43mStreaming output truncated to the last 5000 lines.[0m
  0%|          | 0/9 [00:00<?, ?it/s][A
100%|██████████| 9/9 [00:00<00:00, 51.11it/s]
 83%|████████▎ | 710/856 [05:51<01:12,  2.01it/s]
  0%|          | 0/9 [00:00<?, ?it/s][A
100%|██████████| 9/9 [00:00<00:00, 56.03it/s]
 83%|████████▎ | 711/856 [05:51<01:11,  2.02it/s]
  0%|          | 0/9 [00:00<?, ?it/s][A
100%|██████████| 9/9 [00:00<00:00, 56.08it/s]
 83%|████████▎ | 712/856 [05:52<01:11,  2.00it/s]
  0%|          | 0/9 [00:00<?, ?it/s][A
100%|██████████| 9/9 [00:00<00:00, 56.36it/s]
 83%|████████▎ | 713/856 [05:52<01:11,  2.00it/s]
  0%|          | 0/9 [00:00<?, ?it/s][A
100%|██████████| 9/9 [00:00<00:00, 57.11it/s]
 83%|████████▎ | 714/856 [05:52<01:10,  2.01it/s]
  0%|          | 0/9 [00:00<?, ?it/s][A
100%|██████████| 9/9 [00:00<00:00, 55.86it/s]
 84%|████████▎ | 715/856 [05:53<01:10,  2.01it/s]
  0%|          | 0/9 [00:00<?, ?it/s][A
100%|██████████| 9/9 [00:00<00:00, 54.85it/s]
 84%|████████▎ | 716/856 [

In [None]:
for i in range(4):
    # Look in the directory the images were written to (relative `save_dir`)
    # rather than a hard-coded absolute path that is only valid when the
    # working directory is /content.
    directory_path = os.path.join(save_dir, str(i))
    # Count the JPEG files of this class
    image_count = sum(1 for file in os.listdir(directory_path) if file.lower().endswith('.jpg'))
    print(f"class{i}: {image_count}")

class0: 856
class1: 376
class2: 3037
class3: 1336


## zip 파일로 저장

In [None]:
import shutil

# Compress the 'aug_training' folder into 'aug_training.zip'
shutil.make_archive(base_name='aug_training', format='zip', root_dir='aug_training')

'/content/aug_training.zip'