# 모델 제작 및 predict 수행

### 모델에 필요한 패키지 호출

In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
import os
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout, BatchNormalization, GlobalAveragePooling2D
from tensorflow.keras.optimizers import Nadam
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint

### 전처리 과정에서 처리한 데이터들을 호출
#### 호출후 labeling을 추가하여 DataFrame을 생성

In [None]:
# Directories that hold the preprocessed images, one directory per class.
real_path = './real_MTCNN/'
fake_path = './fake_MTCNN/'


def _labeled_frame(directory, label):
    """Return a DataFrame listing every entry of *directory* with a fixed label.

    Args:
        directory: Folder whose entries become the ``file`` column.
        label: Value written to the ``label`` column of every row.

    Returns:
        A DataFrame with columns ``file`` (path built with ``os.path.join``)
        and ``label``.
    """
    # os.listdir returns entries in arbitrary order; sorting keeps the row
    # order (and therefore any seeded split made from it) stable across runs.
    names = sorted(os.listdir(directory))
    # os.path.join avoids the doubled separator that plain concatenation
    # produced because the directory constants already end with '/'.
    frame = pd.DataFrame({'file': [os.path.join(directory, name) for name in names]})
    frame['label'] = label
    return frame


# Label convention used by the rest of the script: 0 = real, 1 = fake.
real_df = _labeled_frame(real_path, 0)
print(real_df[:3])

fake_df = _labeled_frame(fake_path, 1)
print(fake_df[:3])

### 저장된 Real DataFrame과 Fake DataFrame을 결합하여 기본 DataSet을 구성
#### sklearn의 train_test_split 모듈을 활용하여 train DataSet과 valid DataSet으로 분리

In [None]:
# Stack the two per-class frames row-wise into a single dataset.
df = pd.concat([real_df, fake_df])

# Hold out 20% of the rows for validation; stratifying on the label keeps
# the class ratio the same in both parts, and the seed fixes the shuffle.
train_df, valid_df = train_test_split(
    df,
    test_size=0.2,
    stratify=df['label'],
    random_state=25,
)

# Show the per-class row counts of each split.
for split in (train_df, valid_df):
    print(split['label'].value_counts())

### 저장된 데이터셋을 활용하여 ImageDataGenerator를 제작
#### 이미지를 model에 입력할 수 있도록 제작

In [None]:
# Batch size and input resolution shared by the generators below.
batch_size = 10
image_size = (300, 300)

# Three independent generators; each one only rescales pixel values by 1/255.
train_datagen, validation_datagen, test_datagen = (
    ImageDataGenerator(rescale=1/255) for _ in range(3)
)

# Options common to the training and validation iterators. class_mode='raw'
# passes the values of the 'label' column through as the targets.
flow_options = dict(
    x_col='file',
    y_col='label',
    target_size=image_size,
    batch_size=batch_size,
    class_mode='raw',
)

# Training batches are shuffled; validation batches keep the frame order.
train_gen = train_datagen.flow_from_dataframe(train_df, shuffle=True, **flow_options)
val_gen = validation_datagen.flow_from_dataframe(valid_df, shuffle=False, **flow_options)

## callbacks 설정 
### EarlyStopping
#### monitor= 무엇을 관찰할 것인가, patience= 몇 번 이상 줄지 않으면 정지할 것인가
### ReduceLROnPlateau
#### monitor= 무엇을 관찰할 것인가, factor= 몇 배로 줄일 것인가, min_lr= 최저 학습률은?
### ModelCheckpoint
#### filepath= 저장하는 위치와 이름, save_best_only= 가장 좋은 모델만을 저장할것인가, save_weights_only= 모델의 가중치만을 저장할것인가

In [None]:
# Stop training once val_loss has gone 5 epochs without improving.
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=5,
    verbose=2,
)

# Halve the learning rate after 2 epochs without val_loss improvement,
# never going below 1e-7.
learning_rate_reduction = ReduceLROnPlateau(
    monitor="val_loss",
    factor=0.5,
    patience=2,
    min_lr=1e-7,
    verbose=1,
)

# Save the full model (not just weights) to ./NADAM.h5 whenever val_loss
# reaches a new best.
# NOTE(review): this one callback instance and file path are passed to every
# model.fit call in this script - confirm whether the checkpoint's
# best-so-far value is meant to carry over from one model to the next.
checkpointer = ModelCheckpoint(
    filepath='./NADAM.h5',
    monitor='val_loss',
    save_best_only=True,
    save_weights_only=False,
    verbose=1,
)

callbacks = [early_stopping, learning_rate_reduction, checkpointer]

### 1. 7C2L (Nadam)

In [None]:
# Model 1: seven 3x3 convolution layers followed by two hidden dense layers
# and a single sigmoid output unit.
model = Sequential()

model.add(Conv2D(32, (3, 3), activation="relu", input_shape=(300, 300, 3)))
model.add(MaxPooling2D(pool_size=(2, 2)))

model.add(Conv2D(64, (3, 3), activation="relu"))
model.add(MaxPooling2D(pool_size=(2, 2)))

model.add(Conv2D(128, (3, 3), activation="relu"))
model.add(MaxPooling2D(pool_size=(2, 2)))
model.add(Dropout(0.2))

model.add(Conv2D(256, (3, 3), activation="relu"))
model.add(MaxPooling2D(pool_size=(2, 2)))
model.add(Conv2D(256, (3, 3), activation="relu"))
model.add(MaxPooling2D(pool_size=(2, 2)))

model.add(Dropout(0.2))

# Two stacked 512-filter convolutions with no pooling in between.
model.add(Conv2D(512, (3, 3), activation="relu"))

# Fixed: the filter count was written as "51 2", which is a syntax error.
model.add(Conv2D(512, (3, 3), activation="relu"))
model.add(MaxPooling2D(pool_size=(2, 2)))

# Fully connected head
model.add(Flatten())
model.add(Dropout(0.5))
model.add(Dense(256, activation='relu'))
model.add(BatchNormalization())
model.add(Dense(128, activation='relu'))
model.add(Dense(1, activation="sigmoid"))

# Binary cross-entropy matches the single sigmoid output.
model.compile(optimizer=Nadam(learning_rate=1e-4),
              loss='binary_crossentropy',
              metrics=['accuracy'])

model.summary()
with tf.device('/device:GPU:0'):
    # Integer division: a trailing partial batch is not counted in the
    # per-epoch step totals.
    history = model.fit(train_gen,
                        steps_per_epoch=len(train_df)//batch_size,
                        epochs=10,
                        validation_data=val_gen,
                        validation_steps=len(valid_df)//batch_size,
                        verbose=1,
                        callbacks=callbacks)

### 2. 5C2L_FullDropout (Nadam)

In [None]:
# Model 2: five 3x3 convolution stages with dropout after the first four,
# followed by two hidden dense layers and a single sigmoid output unit.
model = Sequential([
    Conv2D(32, (3, 3), activation="relu", input_shape=(300, 300, 3)),
    MaxPooling2D(pool_size=(2, 2)),
    Dropout(0.2),

    Conv2D(64, (3, 3), activation="relu"),
    MaxPooling2D(pool_size=(2, 2)),
    Dropout(0.2),

    Conv2D(128, (3, 3), activation="relu"),
    MaxPooling2D(pool_size=(2, 2)),
    Dropout(0.2),

    Conv2D(256, (3, 3), activation="relu"),
    MaxPooling2D(pool_size=(2, 2)),
    Dropout(0.2),

    Conv2D(512, (3, 3), activation="relu"),
    MaxPooling2D(pool_size=(2, 2)),

    # Fully connected head
    Flatten(),
    Dropout(0.2),
    Dense(256, activation='relu'),
    Dropout(0.2),
    BatchNormalization(),
    Dense(128, activation='relu'),
    Dense(1, activation="sigmoid"),
])

model.compile(
    optimizer=Nadam(learning_rate=1e-4),
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

model.summary()
with tf.device('/device:GPU:0'):
    # Step counts use integer division of the frame sizes by the batch size.
    history = model.fit(
        train_gen,
        epochs=10,
        steps_per_epoch=len(train_df)//batch_size,
        validation_data=val_gen,
        validation_steps=len(valid_df)//batch_size,
        callbacks=callbacks,
        verbose=1,
    )

### 3. 4C1L (Nadam)

In [None]:
# Model 3: four conv + max-pool stages, then one hidden dense layer and a
# single sigmoid output unit.
model = Sequential()

for depth, filters in enumerate((32, 64, 128, 128)):
    # Only the first layer declares the input shape.
    conv_kwargs = {'input_shape': (300, 300, 3)} if depth == 0 else {}
    model.add(Conv2D(filters=filters,
                     kernel_size=(3, 3),
                     activation='relu',
                     **conv_kwargs))
    model.add(MaxPooling2D(pool_size=(2, 2)))

# Classifier head
model.add(Flatten())
model.add(Dropout(0.5))
model.add(Dense(units=512, activation='relu'))
model.add(Dense(units=1, activation='sigmoid'))

model.compile(
    optimizer=Nadam(learning_rate=1e-4),
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

model.summary()
with tf.device('/device:GPU:0'):
    # Step counts use integer division of the frame sizes by the batch size.
    history = model.fit(
        train_gen,
        epochs=10,
        steps_per_epoch=len(train_df)//batch_size,
        validation_data=val_gen,
        validation_steps=len(valid_df)//batch_size,
        callbacks=callbacks,
        verbose=1,
    )

### 교육시킨 모델에 대한 성능을 plot으로 확인
#### 교육을 진행할때 verbose를 1로 설정하여 획득한 accuracy와 val_accuracy, loss와 val_loss를 
#### 진행되는 epoch에 따라 그래프 생성하여 모델의 성능을 그림으로 확인

In [None]:
# Plot the per-epoch metrics recorded by the most recent model.fit call
# (whatever `history` currently refers to).
print(history.history.keys())
train_acc = history.history['accuracy']
val_acc = history.history['val_accuracy']
train_loss = history.history['loss']
val_loss = history.history['val_loss']

# The colour is specified once, in the format string. The original passed a
# colour in the format string and again via color=, which conflicted on the
# training curves ('bo' with color='r'). The rendered colours are unchanged:
# red markers for training, blue line for validation.
plt.plot(train_acc, 'ro', label='training accuracy')
plt.plot(val_acc, 'b', label='validation accuracy')
plt.title('Training and Validation Accuracy')
plt.legend()
plt.show()

plt.plot(train_loss, 'ro', label='training loss')
plt.plot(val_loss, 'b', label='validation loss')
plt.title('Training and Validation Loss')
plt.legend()
plt.show()

### 교육시킨 모델을 활용하여 실제 데이터를 예측

In [None]:
# Predict a label for every test image and write the submission file.
test_path = './leaderboard_MTCNN'

# The file stem is parsed with int(), so names must look like "<integer>.jpg".
# Sorting by that number puts the images in numeric order.
test_data = sorted(
    (f for f in os.listdir(test_path) if f.endswith('.jpg')),
    key=lambda name: int(name[:-4]),
)
if not test_data:
    # Fail early with a clear message instead of an obscure error from a
    # zero-sized batch further down.
    raise FileNotFoundError(f"no .jpg files found in {test_path!r}")

test_datagen = ImageDataGenerator(rescale=1./255)

# class_mode=None: the iterator yields images only. Batches are kept small so
# the whole test set never has to sit in memory as one array, which is what
# batch_size=len(test_data) used to force.
predict_batch_size = 32
test_gen = test_datagen.flow_from_dataframe(
    pd.DataFrame({'file': test_data}),
    test_path,
    x_col='file',
    class_mode=None,
    target_size=(300, 300),
    batch_size=predict_batch_size,
    shuffle=False,
)

# With shuffle=False the predictions come back in the iterator's file order.
# NOTE(review): relies on the iterator exposing `filenames` in yield order -
# confirm against the installed Keras version.
paths = test_gen.filenames
probabilities = model.predict(test_gen)

# Same rule as before: output < 0.5 -> label 0, otherwise label 1.
classes = np.where(probabilities < 0.5, 0, 1)

pred_df = pd.DataFrame({'path': [int(name[:-4]) for name in paths],
                        'y': classes.ravel().astype('int')})
test = pred_df.sort_values('path')

# Build the submission file.
# The labels are assigned positionally, so sample_submission.csv must list
# the same images in ascending numeric order.
sub = pd.read_csv('sample_submission.csv')
sub['y'] = test['y'].values
sub.to_csv('NADAM.csv', index=False)

# Number of test images predicted as label 1.
print(classes.sum())