# 이미지 식별 머신을 위한 데이터 준비

### 0. 준비

1. 라이브러리 import

In [2]:
import matplotlib.pyplot as plt
import numpy as np
import torch
from torch import nn, optim
import torchvision
from torchvision import datasets, transforms
from PIL import Image  # 이미지 처리 라이브러리(pillow)

%matplotlib inline
%config InlineBackend.figure_format = 'retina'

2. 데이터 디렉토리, 분할 비율, 데이터 변환 방법 설정

- ✔️ResNet : 224x224 크기의 이미지를 입력으로 사용하는 사전 학습(pretrained) CNN 모델 (아래 3번에서 ResNet50 사용)

- torchvision transforms 문서 : https://docs.pytorch.org/vision/stable/transforms.html

In [None]:
data_dir, valid_size = './data', 0.2

# Preprocessing pipeline applied to every image (t_transforms).
t_transforms = transforms.Compose(
    [
        # Random crop rescaled to 224x224, the input size used for ResNet here.
        transforms.RandomResizedCrop(size=224),
        # Smaller edge -> 224; the crop above is already 224x224, so the size is unchanged.
        transforms.Resize(size=224),
        # Convert the image to a tensor.
        transforms.ToTensor(),
    ]
)

torchvision.transforms.transforms.Compose

In [4]:
print(t_transforms)  # confirm size=(224, 224) and the ToTensor() step

Compose(
    RandomResizedCrop(size=(224, 224), scale=(0.08, 1.0), ratio=(0.75, 1.3333), interpolation=bilinear)
    Resize(size=224, interpolation=bilinear, max_size=None, antialias=None)
    ToTensor()
)


### 1. train/test 데이터 로더 생성

- 데이터셋 폴더 등 지정

In [5]:
# Build the train and test datasets from the same image folder with the same transform.

train_data = datasets.ImageFolder(root=data_dir, transform=t_transforms)
test_data = datasets.ImageFolder(root=data_dir, transform=t_transforms)

print(train_data)

# One length per line: train first, then test.
print(len(train_data), len(test_data), sep="\n")

Dataset ImageFolder
    Number of datapoints: 155
    Root location: ./data
    StandardTransform
Transform: Compose(
               RandomResizedCrop(size=(224, 224), scale=(0.08, 1.0), ratio=(0.75, 1.3333), interpolation=bilinear)
               Resize(size=224, interpolation=bilinear, max_size=None, antialias=None)
               ToTensor()
           )
155
155


- 인덱스 섞기

In [6]:
# Create one integer index per sample in train_data.

num_train = len(train_data)
indices = [*range(num_train)]
print(indices, len(indices), sep="\n")

# Shuffle the index list in place, then show it (and its length) again.
np.random.shuffle(indices)
print(indices, len(indices), sep="\n")


[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154]
155
[129, 80, 62, 14, 51, 60, 89, 77, 86, 79, 71, 72, 116, 85, 119, 76, 40, 42, 16, 67, 106, 148, 9, 34, 5, 78, 0, 131, 125, 12, 102, 55, 35, 61, 18, 150, 126, 41, 13, 15, 111, 115, 75, 84, 128, 58, 104, 93, 59, 141, 2, 7, 8, 120, 154, 137, 37, 98, 22, 95, 132, 48, 100, 143, 144, 20, 112, 70, 139, 121, 73, 46, 123, 17, 11, 24, 152, 

- 데이터 로더 함수 : 분할 지점 계산 (train과 test)

In [None]:
# Inputs to the split computation; expected to print 0.2 and then 155.
print(valid_size, num_train, sep="\n")

0.2
155


In [7]:
# Split point: the number of indices held out for the test set.
held_out = np.floor(valid_size * num_train)
split = int(held_out)
print(split)

31


In [8]:
# Cut the shuffled index list at `split`: the head goes to test, the tail to train.

test_idx = indices[:split]
train_idx = indices[split:]

# Each list is followed by its length, train before test.
print(train_idx, len(train_idx), test_idx, len(test_idx), sep="\n")

[55, 35, 61, 18, 150, 126, 41, 13, 15, 111, 115, 75, 84, 128, 58, 104, 93, 59, 141, 2, 7, 8, 120, 154, 137, 37, 98, 22, 95, 132, 48, 100, 143, 144, 20, 112, 70, 139, 121, 73, 46, 123, 17, 11, 24, 152, 81, 57, 134, 151, 105, 6, 44, 130, 1, 45, 135, 21, 19, 138, 108, 94, 39, 146, 23, 133, 27, 107, 36, 124, 90, 142, 3, 117, 26, 4, 147, 114, 30, 92, 96, 127, 47, 56, 101, 50, 32, 99, 136, 87, 63, 153, 29, 52, 43, 91, 66, 68, 28, 53, 82, 49, 145, 149, 113, 69, 88, 110, 74, 83, 64, 65, 31, 38, 54, 25, 109, 103, 140, 10, 33, 122, 97, 118]
124
[129, 80, 62, 14, 51, 60, 89, 77, 86, 79, 71, 72, 116, 85, 119, 76, 40, 42, 16, 67, 106, 148, 9, 34, 5, 78, 0, 131, 125, 12, 102]
31


- 데이터 로더 함수 : 샘플러 및 로더

In [11]:
# Sampling strategy: each loader draws only from its own subset of indices.
from torch.utils.data.sampler import SubsetRandomSampler

train_sampler, test_sampler = SubsetRandomSampler(train_idx), SubsetRandomSampler(test_idx)

# Both loaders use batches of 16 and read through their own sampler.
trainloader = torch.utils.data.DataLoader(dataset=train_data, batch_size=16, sampler=train_sampler)
testloader = torch.utils.data.DataLoader(dataset=test_data, batch_size=16, sampler=test_sampler)

print(trainloader.dataset.classes)

['Basalt', 'Highland']


In [None]:
# Bundle the cells above into load_split_train_test():
# data directory + split ratio in, (train loader, test loader) out.

def load_split_train_test(data_dir, valid_size, batch_size=16):
    """Split one image folder into a train loader and a test loader.

    Both loaders read the same ``ImageFolder`` rooted at ``data_dir`` with the
    same transform pipeline; they differ only in which sample indices their
    sampler is allowed to draw.

    Args:
        data_dir: Root directory passed to ``datasets.ImageFolder``.
        valid_size: Fraction of the samples (0 to 1 inclusive) assigned to the
            test loader; the rest go to the train loader.
        batch_size: Batch size used by both loaders. Defaults to 16, the value
            that used to be hard-coded.

    Returns:
        A ``(trainloader, testloader)`` tuple of ``torch.utils.data.DataLoader``.

    Raises:
        ValueError: If ``valid_size`` is not within [0, 1].
    """
    # Reject bad ratios before touching the disk: a negative or >1 value would
    # otherwise produce a silently wrong (or empty) split through slicing.
    if not 0.0 <= valid_size <= 1.0:
        raise ValueError(
            f"valid_size must be between 0 and 1 (inclusive), got {valid_size!r}"
        )

    t_transforms = transforms.Compose([
        # Random crop rescaled to 224x224.
        transforms.RandomResizedCrop(224),
        # Smaller edge -> 224; the crop above is already 224x224.
        transforms.Resize(224),
        transforms.ToTensor(),
    ])

    train_data = datasets.ImageFolder(data_dir, transform=t_transforms)
    test_data = datasets.ImageFolder(data_dir, transform=t_transforms)

    # Shuffle all sample indices once, then cut the list at the split point.
    num_train = len(train_data)
    indices = list(range(num_train))
    np.random.shuffle(indices)

    split = int(np.floor(num_train * valid_size))
    train_idx, test_idx = indices[split:], indices[:split]

    # Each sampler restricts its loader to its own, disjoint set of indices.
    train_sampler = torch.utils.data.SubsetRandomSampler(train_idx)
    test_sampler = torch.utils.data.SubsetRandomSampler(test_idx)

    trainloader = torch.utils.data.DataLoader(
        train_data, sampler=train_sampler, batch_size=batch_size
    )
    testloader = torch.utils.data.DataLoader(
        test_data, sampler=test_sampler, batch_size=batch_size
    )

    return trainloader, testloader


### 2. Compute device 설정

### 3. 모델 선정 : ResNet50 (pretrained)

### 4. 신경망 FCL 수정

#### (1) 신경망 입/출력층 설정

### 5. 모델 train/val/test

### 6. 암석 예측