In [None]:
import glob
import pandas as pd
import numpy as np
import matplotlib.pylab as plt
import seaborn as sns
from tqdm import tqdm
import matplotlib.pyplot as plt
import tensorflow as tf
import os
import random
from tensorflow.keras.layers import Dense, Conv2D, Dropout, Conv2DTranspose, MaxPooling2D, BatchNormalization, Activation, concatenate, Input, GlobalAveragePooling2D
from tensorflow.keras import Model
import warnings
 

In [None]:
# Silence every Python warning for the rest of the session.
warnings.filterwarnings("ignore")

# Pin the NumPy, stdlib and TensorFlow random seeds (in that order) so that
# repeated runs are reproducible.
for _set_seed in (np.random.seed, random.seed, tf.random.set_seed):
    _set_seed(7)

In [None]:
# Number of leading channels of each sample that are used as model features.
FEATURES = 14

# Minimum number of rainy pixels (out of the 40*40 = 1600 target pixels) a
# sample must contain to be used for training.
RAINS = 50

def trainGenerator():
    """Yield ``(feature, target)`` training pairs read from ``inputs/train``.

    Every path matched by ``inputs/train/*`` is loaded with ``np.load`` in
    sorted order. The last channel of the loaded array is the target and is
    reshaped to ``(40, 40, 1)``; the first ``FEATURES`` channels are the
    features.

    A sample is skipped when either of the following holds:

    * its target contains any negative value (the original notes name -9999
      as the outlier marker);
    * fewer than ``RAINS`` target pixels are strictly positive.

    Yields:
        tuple: ``(feature, target)`` where ``feature`` is
        ``dataset[:, :, :FEATURES]`` and ``target`` has shape ``(40, 40, 1)``
        and contains no negative values.
    """
    train_path = 'inputs/train'
    train_files = sorted(glob.glob(train_path + '/*'))

    for file in train_files:
        dataset = np.load(file)
        target = dataset[:, :, -1].reshape(40, 40, 1)

        # Skip samples containing outliers. The previous check compared a
        # count against `< 0`, which can never be true, so such samples were
        # silently kept (with their negative targets clipped to 0).
        if (target < 0).any():
            continue

        # Skip samples where fewer than RAINS pixels recorded rain.
        if (target > 0).sum() < RAINS:
            continue

        # No negative values remain at this point, so the target needs no
        # clipping before it is handed to the model.
        yield (dataset[:, :, :FEATURES], target)
        
# Stream (feature, target) pairs from the generator as float32 tensors,
# group them into batches of 512 and keep one batch prepared ahead.
train_dataset = (
    tf.data.Dataset.from_generator(
        trainGenerator,
        output_types=(tf.float32, tf.float32),
        output_shapes=(
            tf.TensorShape([40, 40, FEATURES]),
            tf.TensorShape([40, 40, 1]),
        ),
    )
    .batch(512)
    .prefetch(1)
)

# Sorted list of every path directly under the test directory.
test_path = 'inputs/test'
test_files = sorted(glob.glob(test_path + '/*'))

In [None]:
# Load every test file (with a progress bar), keep only the first FEATURES
# channels of each, and collect the samples into a single array.
X_test = np.array(
    [np.load(path)[:, :, :FEATURES] for path in tqdm(test_files, desc='test')]
)

In [None]:
import seaborn as sns
# Reversed Red-Blue colormap. `plt.get_cmap` is used because
# `matplotlib.cm.get_cmap` was deprecated in Matplotlib 3.7 and removed in
# 3.9, where `plt.cm.get_cmap(...)` raises AttributeError.
color_map = plt.get_cmap('RdBu').reversed()

# One training sample used for the visual check of the channels.
image_sample = np.load('inputs/train/subset_010462_02.npy')

plt.style.use('fivethirtyeight')
# NOTE(review): with the default inline notebook backend a figure created
# here is closed when this cell finishes, so plotting calls in a later cell
# start a new figure rather than reusing this one.
plt.figure(figsize=(30, 30))

In [None]:
# Size the figure inside the plotting cell. The inline notebook backend
# closes figures at the end of every cell, so a figure sized in an earlier
# cell is no longer the current one here. `plt.gcf()` reuses the current
# figure when one is still open and creates a new one otherwise.
plt.gcf().set_size_inches(30, 30)

# First nine channels of the sample, left to right, in a 1x10 grid.
for i in range(9):
    plt.subplot(1, 10, i + 1)
    plt.imshow(image_sample[:, :, i], cmap=color_map)

# Last channel of the sample, drawn in the final slot of a 1x20 grid
# (i.e. the right-most twentieth of the figure width).
plt.subplot(1, 20, 20)
plt.imshow(image_sample[:, :, -1], cmap=color_map)
plt.show()

In [None]:
def build_model(input_layer, start_neurons):
    """Wire a two-level encoder/decoder CNN with skip connections.

    The spatial sizes in the comments below assume the 40x40 input this file
    builds the model on.

    Args:
        input_layer: Keras tensor the network is built on top of.
        start_neurons: Filter count of the first level; the second level uses
            twice as many filters and the middle convolution four times.

    Returns:
        The output tensor of the final single-filter 1x1 ReLU convolution.
    """
    filters_l1 = start_neurons * 1
    filters_l2 = start_neurons * 2
    filters_mid = start_neurons * 4

    def conv_pair(tensor, filters):
        # Two stacked 3x3 same-padded ReLU convolutions.
        for _ in range(2):
            tensor = Conv2D(filters, (3, 3), activation="relu", padding="same")(tensor)
        return tensor

    def encode(tensor, filters):
        # conv pair -> batch norm -> 2x2 max-pool -> dropout. The conv output
        # (before normalisation) is returned as well for the skip connection.
        skip = conv_pair(tensor, filters)
        reduced = BatchNormalization()(skip)
        reduced = MaxPooling2D((2, 2))(reduced)
        reduced = Dropout(0.25)(reduced)
        return skip, reduced

    def decode(tensor, skip, filters):
        # stride-2 transposed conv -> concat with skip -> dropout -> conv pair
        # -> batch norm.
        grown = Conv2DTranspose(filters, (3, 3), strides=(2, 2), padding="same")(tensor)
        merged = concatenate([grown, skip])
        merged = Dropout(0.25)(merged)
        merged = conv_pair(merged, filters)
        return BatchNormalization()(merged)

    # Encoder: 40 x 40 -> 20 x 20 -> 10 x 10
    skip_l1, down_l1 = encode(input_layer, filters_l1)
    skip_l2, down_l2 = encode(down_l1, filters_l2)

    # Middle: a single convolution at 10 x 10
    middle = Conv2D(filters_mid, (3, 3), activation="relu", padding="same")(down_l2)

    # Decoder: 10 x 10 -> 20 x 20 -> 40 x 40
    up_l2 = decode(middle, skip_l2, filters_l2)
    up_l1 = decode(up_l2, skip_l1, filters_l1)

    # Final dropout, then a 1x1 ReLU convolution down to one output channel.
    up_l1 = Dropout(0.25)(up_l1)
    return Conv2D(1, (1, 1), padding="same", activation='relu')(up_l1)

# Build the network on a 40x40 input with FEATURES channels, using 32
# filters at the first level.
input_layer = Input(shape=(40, 40, FEATURES))
output_layer = build_model(input_layer, start_neurons=32)
model = Model(inputs=input_layer, outputs=output_layer)

In [None]:
from sklearn.metrics import f1_score

def mae(y_true, y_pred):
    """Mean absolute error over the entries whose true value is at least 0.1.

    Both inputs are converted to NumPy arrays and flattened; entries whose
    true value is below 0.1 are left out of the average.

    Args:
        y_true: Array-like of true values.
        y_pred: Array-like of predictions with the same number of elements.

    Returns:
        The mean of ``|y_true - y_pred|`` over the selected entries (NaN when
        no entry is selected).
    """
    truth = np.array(y_true).reshape(-1)
    guess = np.array(y_pred).reshape(-1)

    # Only entries with a true value of at least 0.1 take part in the error.
    selected = truth >= 0.1
    abs_error = np.abs(truth[selected] - guess[selected])

    return np.mean(abs_error)

def fscore(y_true, y_pred):
    """F1 score of the rain / no-rain decision at a 0.1 threshold.

    Both inputs are converted to NumPy arrays and flattened. Entries whose
    true value is negative are dropped; the remaining true values and
    predictions are binarised (1 when >= 0.1, else 0) and passed to
    ``f1_score``.

    Args:
        y_true: Array-like of true values.
        y_pred: Array-like of predictions with the same number of elements.

    Returns:
        The value returned by ``f1_score`` for the binarised labels.
    """
    truth = np.array(y_true).reshape(-1)
    guess = np.array(y_pred).reshape(-1)

    # Drop entries with a negative true value before binarising.
    valid = truth >= 0
    truth_is_rain = (truth[valid] >= 0.1).astype(int)
    guess_is_rain = (guess[valid] >= 0.1).astype(int)

    return f1_score(truth_is_rain, guess_is_rain)

def maeOverFscore(y_true, y_pred):
    """Return ``mae(y_true, y_pred)`` divided by ``fscore(y_true, y_pred)``.

    1e-07 is added to the F1 score so that a score of zero does not cause a
    division by zero.
    """
    error = mae(y_true, y_pred)
    f1 = fscore(y_true, y_pred)
    return error / (f1 + 1e-07)

def fscore_keras(y_true, y_pred):
    """Run ``fscore`` on tensors through ``tf.py_function``.

    Returns:
        The ``tf.float32`` result of the wrapped Python call.
    """
    return tf.py_function(
        func=fscore,
        inp=[y_true, y_pred],
        Tout=tf.float32,
        name='fscore_keras',
    )

def maeOverFscore_keras(y_true, y_pred):
    """Run ``maeOverFscore`` on tensors through ``tf.py_function``.

    Returns:
        The ``tf.float32`` result of the wrapped Python call.
    """
    # NOTE(review): the op is named 'custom_mse' although it wraps the
    # MAE-over-F1 score; the name is kept unchanged here.
    return tf.py_function(
        func=maeOverFscore,
        inp=[y_true, y_pred],
        Tout=tf.float32,
        name='custom_mse',
    )

In [None]:
# Variant that tracks the custom metrics instead, kept for reference:
# model.compile(loss="mae", optimizer="adam", metrics=[maeOverFscore_keras, fscore_keras])

# Adam optimiser, mean-absolute-error loss, and MAE reported as the metric.
model.compile(
    optimizer="adam",
    loss="mae",
    metrics=["mae"],
)


In [None]:
# Train for 10 epochs on the generator-backed dataset with progress output.
model_history = model.fit(
    x=train_dataset,
    epochs=10,
    verbose=1,
)


In [None]:
# Predict on the stacked test samples.
pred = model.predict(X_test)
# Start from the provided sample submission file.
submission = pd.read_csv('inputs/sample_submission.csv')
# Flatten every prediction to 1600 values and write them into all columns
# after the first one.
submission.iloc[:,1:] = pred.reshape(-1, 1600)
# Save the filled-in table without the DataFrame index.
submission.to_csv('Dacon_baseline.csv', index = False)

submission 정리
1. RAINS = 50, FEATURES = 9, epochs = 10, loss="mae" - 3.1437019608
2. RAINS = 50, FEATURES = 14, epochs = 10, loss="mae" - 2.4486110373
3. RAINS = 30, FEATURES = 14, epochs = 10, loss="mae" - 2.6535480502

해야 할 일
- scaling 처리 (robust, standard, log)
- pretrained 모델 활용
- 위도·경도 피쳐로 위도+경도, 위도-경도 파생 피쳐 만들기
- 현재 custom metric(maeOverFscore_keras, fscore_keras)이 작동하지 않으므로 이 또한 수정해야 함
- tree 모델을 만들어 앙상블하기