In [1]:
import shutil
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import ADASYN

from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Dropout
from tensorflow.keras.layers import Conv2D
from tensorflow.keras.layers import MaxPool2D
from tensorflow.keras.layers import MaxPooling2D
from tensorflow.keras.layers import Flatten
from tensorflow.keras.callbacks import EarlyStopping, TensorBoard

from tensorflow.keras.layers.experimental.preprocessing import RandomFlip
from tensorflow.keras.layers.experimental.preprocessing import RandomRotation
from tensorflow.keras.layers.experimental.preprocessing import RandomZoom
from tensorflow.keras.preprocessing.image import ImageDataGenerator

from keras.utils import to_categorical

In [2]:
# Read the training features and labels; neither CSV has a header row.
X_df = pd.read_csv('data/X_train.csv', header=None)
Y_df = pd.read_csv('data/y_train.csv', header=None)

# Scale the features by 1/255 (maps them into [0, 1], assuming 8-bit pixel values — TODO confirm).
X = X_df.to_numpy() / 255
Y = Y_df.to_numpy()

In [3]:
### Model definitions
def get_simple_model():
    """Build the baseline CNN: a single conv/pool stage and a small dense head.

    Returns:
        An uncompiled ``Sequential`` model that accepts 28x28x4 inputs and
        ends in a 4-unit softmax layer.
    """
    model = Sequential()
    model.add(Conv2D(filters=32, kernel_size=(3, 3), padding='same',
                     activation='relu', input_shape=(28, 28, 4)))
    model.add(MaxPooling2D(pool_size=(2, 2)))
    model.add(Flatten())
    model.add(Dense(units=64, activation="relu"))
    # One output unit per class: the data has 4 categories.
    model.add(Dense(units=4, activation="softmax"))
    return model

def get_alexnet():
    """Build the three-convolution network selected by ``model_type='alexnet'``.

    The convolutions use the default (unpadded) mode, with a stride-2 max-pool
    after each of the first two.

    Returns:
        An uncompiled ``Sequential`` model that accepts 28x28x4 inputs and
        ends in a 4-unit softmax layer.
    """
    layer_stack = [
        Conv2D(32, (3, 3), activation='relu', input_shape=(28, 28, 4)),
        MaxPool2D(strides=(2, 2)),
        Conv2D(64, (3, 3), activation='relu'),
        MaxPool2D(strides=(2, 2)),
        Conv2D(128, (3, 3), activation='relu'),
        Flatten(),
        Dense(128, activation='relu'),
        Dense(4, activation='softmax'),
    ]
    return Sequential(layer_stack)

def get_vggnet():
    """Build the deeper network selected by ``model_type='vggnet'``.

    Two blocks of paired 3x3 convolutions, each followed by a stride-2
    max-pool and dropout, then two dropout-regularised dense layers.

    Returns:
        An uncompiled ``Sequential`` model that accepts 28x28x4 inputs and
        ends in a 4-unit softmax layer.
    """
    def conv3x3(filters, padding, **extra):
        # Every convolution in this network is 3x3 with ReLU; only the filter
        # count and padding mode vary between layers.
        return Conv2D(kernel_size=(3, 3), filters=filters, padding=padding,
                      activation='relu', **extra)

    drop_rate = 0.3
    layer_stack = [
        # Block 1
        conv3x3(32, 'same', input_shape=(28, 28, 4)),
        conv3x3(64, 'same'),
        MaxPool2D(strides=(2, 2)),
        Dropout(rate=drop_rate),
        # Block 2 (the second convolution is unpadded)
        conv3x3(128, 'same'),
        conv3x3(256, 'valid'),
        MaxPool2D(strides=(2, 2)),
        Dropout(rate=drop_rate),
        # Classifier head
        Flatten(),
        Dense(units=512, activation='relu'),
        Dropout(rate=drop_rate),
        Dense(units=256, activation='relu'),
        Dropout(rate=drop_rate),
        Dense(4, activation='softmax'),
    ]
    return tf.keras.Sequential(layer_stack)

In [4]:
class CNNSatelliteProject():
    '''
    Training pipeline for the 4-class image CNN defined in this notebook.

    Constructing an instance immediately:
      1. oversamples the notebook-level ``X`` / ``Y`` arrays,
      2. reshapes the features to 28x28x4 images,
      3. makes a stratified 80/20 train/validation split,
      4. fits an augmenting ``ImageDataGenerator`` on the training split,
      5. builds and compiles the requested model, and
      6. prints the data shapes and class ratios.
    Call ``fit()`` afterwards to train.

    Accepted option values:
        oversampling_type=['SMOTE', 'ADASYN']
        model_type=['simple', 'alexnet', 'vggnet']

    Parameters:
        n_splits: stored on the instance; no method of this class reads it.
        oversampling_type: 'SMOTE' selects SMOTE; any other value selects ADASYN.
        oversampling_n_neighbors: neighbour count handed to the oversampler.
        model_type: 'alexnet' or 'vggnet'; any other value selects the simple model.
        optimizer: forwarded to ``model.compile``.
        batch_size: mini-batch size; also determines the steps per epoch.
        epochs: number of training epochs.
        log: sub-directory of ``log/`` used for TensorBoard output. It is
            deleted at the start of every ``fit()`` call.
    '''
    def __init__(self,
                 n_splits=10,
                 oversampling_type='SMOTE',
                 oversampling_n_neighbors=3,
                 model_type='simple',
                 optimizer='adam',
                 batch_size=32,
                 epochs=50,
                 log="default"):
        self.n_splits=n_splits
        self.oversampling_type=oversampling_type
        self.oversampling_n_neighbors=oversampling_n_neighbors
        self.model_type=model_type
        self.optimizer=optimizer
        self.batch_size=batch_size
        self.epochs=epochs
        self.log=log

        self.prepare_data()
        self.prepare_model()
        self.print()

    def prepare_data(self):
        '''
        Oversample, reshape and split the data, then fit the augmenting generator.

        Reads the notebook-level ``X`` and ``Y``. Sets ``self.X``, ``self.Y``,
        ``self.x_train``, ``self.x_valid``, ``self.y_train``, ``self.y_valid``
        and ``self.generator``.
        '''
        # NOTE(review): oversampling happens BEFORE the train/validation split, so
        # synthetic samples interpolated from rows that land in the validation
        # split can end up in the training split. Validation scores may therefore
        # be optimistic; consider splitting first and oversampling only the
        # training part.
        if self.oversampling_type=='SMOTE':
            sampler = SMOTE(k_neighbors=self.oversampling_n_neighbors)
        else:
            sampler = ADASYN(n_neighbors=self.oversampling_n_neighbors)
        # fit_resample is the supported imbalanced-learn API; the older
        # fit_sample alias is deprecated and absent from newer releases.
        self.X, self.Y = sampler.fit_resample(X, Y)

        ### Reshape X
        # Before: n_samples * 3136
        # After:  n_samples * 28 * 28 * 4
        self.X = self.X.reshape([self.X.shape[0], 28, 28, 4])

        ### Stratified sampling
        self.x_train, self.x_valid, self.y_train, self.y_valid = train_test_split(
            self.X, self.Y, test_size=0.2, random_state=0, stratify=self.Y, shuffle=True)

        ### Image Augmentation
        self.generator = ImageDataGenerator(
            featurewise_center=True,
            featurewise_std_normalization=True,
            rotation_range=20,       # rotation
            # zoom_range=0.15,       # zoom
            width_shift_range=0.2,   # shift
            height_shift_range=0.2,
            # shear_range=0.15,      # shear
            horizontal_flip=True,    # horizontal flip
            fill_mode='nearest'
        )
        # Featurewise statistics are computed from the training split only.
        self.generator.fit(self.x_train)

    def prepare_model(self):
        '''
        Build the network chosen by ``self.model_type``, compile it with
        categorical cross-entropy and ``self.optimizer``, and print its summary.
        '''
        if self.model_type == 'alexnet':
            self.model = get_alexnet()
        elif self.model_type == 'vggnet':
            self.model = get_vggnet()
        else:
            # Any unrecognised model_type falls back to the simple model.
            self.model = get_simple_model()

        self.model.compile(loss="categorical_crossentropy", optimizer=self.optimizer, metrics=['accuracy'])
        self.model.summary()

    def fit(self):
        '''
        Train ``self.model`` on augmented training batches, logging to TensorBoard.

        Any existing ``log/<self.log>`` directory is removed first so each run
        starts from an empty log.
        '''
        log_dir = 'log/{}'.format(self.log)
        shutil.rmtree(log_dir, ignore_errors=True)
        tensorboard = TensorBoard(log_dir=log_dir)

        # The generator centres and scales every training batch with the
        # statistics fitted in prepare_data; the validation images must receive
        # the same transform or validation metrics are computed on differently
        # scaled inputs. standardize() works in place, so hand it a float copy
        # and leave self.x_valid untouched.
        x_valid = self.generator.standardize(self.x_valid.astype(float))

        # Derive the step count from the configured batch size (it used to be
        # hard-coded to 32); keep at least one step for very small datasets.
        steps_per_epoch = max(1, len(self.x_train) // self.batch_size)

        # Model.fit accepts generators directly; fit_generator is deprecated.
        self.model.fit(
            self.generator.flow(self.x_train, self.y_train, batch_size=self.batch_size),
            validation_data=(x_valid, self.y_valid),
            steps_per_epoch=steps_per_epoch,
            epochs=self.epochs,
            callbacks=[tensorboard])

    def print(self):
        '''
        Print array shapes at each preparation stage and the per-class share
        (in percent) of the oversampled labels.
        '''
        print('orignal', X.shape, Y.shape)
        print('oversampling', self.X.shape, self.Y.shape)
        print('Stratified sampling', self.x_train.shape, self.x_valid.shape, self.y_train.shape, self.y_valid.shape)

        # Column sums of the label matrix divided by the row count give each
        # class's share (assumes one-hot label rows — TODO confirm).
        ratios = np.sum(self.Y, axis=0) / self.Y.shape[0]
        labels = ['Barren Land', 'Trees', 'Grassland', 'Other']
        for l, v in zip(labels, ratios * 100):
            print('{}: {:.1f}%'.format(l, v))

In [5]:
# Basic usage: build the pipeline with its default settings, then train it.
runner = CNNSatelliteProject()
runner.fit()

# Alternative configuration (kept commented out):
# runner = CNNSatelliteProject(log='alexnet', model_type='alexnet', oversampling_n_neighbors=5)
# runner.fit()



Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d (Conv2D)              (None, 28, 28, 32)        1184      
_________________________________________________________________
max_pooling2d (MaxPooling2D) (None, 14, 14, 32)        0         
_________________________________________________________________
flatten (Flatten)            (None, 6272)              0         
_________________________________________________________________
dense (Dense)                (None, 64)                401472    
_________________________________________________________________
dense_1 (Dense)              (None, 4)                 260       
Total params: 402,916
Trainable params: 402,916
Non-trainable params: 0
_________________________________________________________________
orignal (2000, 3136) (2000, 4)
oversampling (4000, 28, 28, 4) (4000, 4)
Stratified sampling (3200, 28, 28, 4) (800, 



Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_1 (Conv2D)            (None, 26, 26, 32)        1184      
_________________________________________________________________
max_pooling2d_1 (MaxPooling2 (None, 13, 13, 32)        0         
_________________________________________________________________
conv2d_2 (Conv2D)            (None, 11, 11, 64)        18496     
_________________________________________________________________
max_pooling2d_2 (MaxPooling2 (None, 5, 5, 64)          0         
_________________________________________________________________
conv2d_3 (Conv2D)            (None, 3, 3, 128)         73856     
_________________________________________________________________
flatten_1 (Flatten)          (None, 1152)              0         
_________________________________________________________________
dense_2 (Dense)              (None, 128)              

In [6]:
# def to_class_str(labels, idx):
#     if labels[idx, 0] == 1:
#         return 'Barren Land'
#     elif labels[idx, 1] == 1:
#         return 'Trees'
#     elif labels[idx, 2] == 1:
#         return 'Grassland'
#     else:
#         return 'Other'

In [7]:
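### Commented out for now (uncomment to use when needed).
# NOTE: `model` is not defined at notebook level; the trained network is `runner.model`.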
# X_test_path = 'data/X_test.csv'

# print('Loading Test Data')
# X_test_df = pd.read_csv(X_test_path, header=None)
# X_test = X_test_df.values / 255
# print(X_test.shape)

# n_images = X_test.shape[0]
# shape = [n_images, 28, 28, 4]
# X_test_img = X_test.reshape(shape)
# print(X_test_img.shape)  

# preds = tf.argmax(model.predict(X_test_img), 1)
# preds = preds.numpy()

# with open('team 3.csv', 'w') as f:
#     f.write('ID' + ',' + 'CLASS' + '\n')
#     for i, label in enumerate(preds):         
#         f.write(str(i) + ',' + str(label) + '\n')

In [8]:
# from sklearn.utils.class_weight import compute_class_weight
# y_integers = np.argmax(Y_train, axis=1)
# class_weights = compute_class_weight('balanced', np.unique(y_integers), y_integers)
# class_weights = dict(enumerate(class_weights))
# print(class_weights)

In [9]:
### Evaluation
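# Stratified K-fold evaluation. It made no big difference in performance, so decide whether to keep it.
# NOTE: `n_splits`, `epochs`, `batch_size` and `model` are not defined at notebook level; define them before uncommenting.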
# skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=10)

# newY = np.argmax(Y, axis=1)
# all_scores = []

# for train_idx, valid_idx in skf.split(X, newY):
#     X_train, X_valid = X[train_idx], X[valid_idx]
#     y_train, y_valid = newY[train_idx], newY[valid_idx]
    
#     y_train = to_categorical(y_train, num_classes=4)
#     y_valid = to_categorical(y_valid, num_classes=4)
#     print(y_train.shape, y_valid.shape)
#     model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size)
#     scores = model.evaluate(X_valid, y_valid, batch_size=batch_size)
#     all_scores.append(scores)
# loss, accuracy = np.mean(all_scores, axis=0)
# print(accuracy)