In [0]:
import os

In [0]:
# 디렉토리 변경
os.chdir('drive/My Drive/Colab Notebooks/DACON/AIFrenz_Season2/')

# 데이터 압축 풀기

In [0]:
import zipfile

In [0]:
# train 데이터 압축 해제
train_path = './Data/train'
if not os.path.exists(train_path):
    train_zip = zipfile.ZipFile('./Data/train.zip')
    train_zip.extractall(train_path)
    train_zip.close()

In [0]:
# test 데이터 압축 해제
test_path = './Data/test'
if not os.path.exists(test_path):
    test_zip = zipfile.ZipFile('./Data/test.zip')
    test_zip.extractall(test_path)
    test_zip.close()

---

# 데이터 나누기

In [0]:
from tqdm import tqdm

In [0]:
import numpy as np

In [0]:
# train 데이터 분리
if not os.path.exists('./Data/dacon_npy/train.npy'):
    train = []
    for fname in tqdm(os.listdir(train_path)):
        data = np.load(train_path + '/' + fname)
        train.append(data)
    np.save('./Data/dacon_npy/train', train)

In [0]:
# test 데이터 분리
if not os.path.exists('./Data/dacon_npy/test.npy'):
    test = []
    for fname in tqdm(os.listdir(test_path)):
        data = np.load(test_path + '/' + fname)
        test.append(data)
    np.save('./Data/dacon_npy/test', test)

---

# 데이터 읽어오기

In [0]:
train = np.load('./Data/dacon_npy/train.npy')

In [0]:
# 강수량 픽셀이 0인 값에 대한 데이터 삭제
train = np.delete(train, np.where(train[:, :, :, -1] < 0), axis=0)

In [0]:
train_images = train[:, :, :, :9]
train_surface = train[:, :, :, 9]
train_location = train[:, :, :, 10:14]
train_precipitation = train[:, :, :, 14]

In [0]:
del train

In [0]:
test = np.load('./Data/dacon_npy/test.npy')

In [0]:
test_images = test[:, :, :, :9]
test_surface = test[:, :, :, 9]
test_location = test[:, :, : , 10:14]

In [0]:
del test

---

# MinMax Scale 적용

## 밝기 온도 MinMaxScale 적용

In [0]:
Tb_max = train_images.max()
Tb_min = train_images.min()

In [0]:
train_images = (train_images - Tb_min) / (Tb_max - Tb_min)

In [0]:
test_images = (test_images - Tb_min) / (Tb_max - Tb_min)

---

## 지표 타입 값 변경
- 지표 타입 (앞자리 0: Ocean, 앞자리 1: Land, 앞자리 2: Coastal, 앞자리 3: Inland Water)에서
- 지표 타입 (앞자리 0: Ocean, 앞자리 1: Coastal, 앞자리 2: Inland Water, 앞자리 3: Land)로 변경하기

In [0]:
train_surface = np.where(train_surface >= 300, 2, np.where(train_surface >= 200, 1, np.where(train_surface >= 100, 3, train_surface)))

In [0]:
test_surface = np.where(test_surface >= 300, 2, np.where(test_surface >= 200, 1, np.where(test_surface >= 100, 3, test_surface)))

In [0]:
import pandas as pd
train_surface = np.where(train_surface >= 300, 3, 
                         np.where(train_surface >= 200, 2, 
                                  np.where(train_surface >= 100, 1, train_surface)))
pd.DataFrame([np.eye(4)[int(i)] for i in train_surface.reshape(-1, 1)]).corr()

## 지표 타입 MinMaxScale 적용

In [0]:
train_surface = train_surface / 3

In [0]:
test_surface = test_surface / 3

---

## 위경도 MinMaxScale 적용

In [0]:
lat_min = train_location[:, :, :, 1].min()
lat_max = train_location[:, :, :, 1].max()
lon_min = train_location[:, :, :, 2].min()
lon_max = test_location[:, :, :, 0].max()

In [0]:
min_arr = np.array([lon_min, lat_min, lon_min, lat_min])
max_arr = np.array([lon_max, lat_max, lon_max, lat_max])

In [0]:
train_location = (train_location - min_arr) / (max_arr - min_arr)

In [0]:
test_location = (test_location - min_arr) / (max_arr - min_arr)

---

# Standard Scale 적용

---

# 위에서 전처리한 데이터 모두 합치기

In [0]:
train = np.concatenate((train_images, train_surface.reshape(-1, 40, 40, 1), train_location, train_precipitation.reshape(-1, 40, 40, 1)), axis=3)

In [0]:
del train_images, train_surface, train_location, train_precipitation

In [0]:
test = np.concatenate((test_images, test_surface.reshape(-1, 40, 40, 1), test_location), axis=3)

In [0]:
del test_images, test_surface, test_location

---

# 전처리된 데이터 저장

In [0]:
np.save('./Data/dacon_npy/train_refined', train)

In [0]:
np.save('./Data/dacon_npy/test_refined', test)

---

# 전처리된 데이터 불러오기

In [0]:
train = np.load('./Data/dacon_npy/train_refined.npy')

In [0]:
train.shape

(75917, 40, 40, 15)

In [0]:
test = np.load('./Data/dacon_npy/test_refined.npy')

In [0]:
test.shape

(2416, 40, 40, 14)