In [1]:
import os
import warnings

import joblib
from scipy import stats
# Wildcard import is kept ahead of the explicit imports below so that any
# same-named symbol it exports is overridden by the explicit import.
from utils.scaler import *
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from IPython.core.interactiveshell import InteractiveShell

In [2]:
# Notebook-wide settings.
# Display the value of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"
# Suppress all Python warnings for the rest of the session.
warnings.simplefilter('ignore')
# Draw matplotlib text with the 'GULIM' font family
# (presumably chosen for Korean labels; the font must be installed locally).
plt.rc('font', family='GULIM')

In [3]:
# Load the raw tables. The train CSV's first column becomes the row index;
# the test CSV is read with the default integer index.
df = pd.read_csv('Database/rainfall_train.csv', index_col=0)
df_test = pd.read_csv('Database/rainfall_test.csv')

# Rename the test columns to the train column names, leaving out the
# second-to-last train column (assumes the test file lacks exactly that
# column and is otherwise in the same order; TODO confirm against the CSVs).
train_col = list(df.columns)
del train_col[-2]
df_test.columns = train_col

# Keep 10% of the labelled rows aside as an internal ("self") test set.
train, self_test_df = train_test_split(df, shuffle=True, random_state=42, test_size=0.1)

# Train General Preprocessing

In [4]:
# Columns removed before any feature engineering.
delete_list = [
    'rainfall_train.fc_year',
    'rainfall_train.fc_month',
    'rainfall_train.fc_day',
    'rainfall_train.fc_hour',
    'rainfall_train.ef_year',
    'rainfall_train.vv',
]

# Discard rows whose label equals -999 (presumably a missing-value marker),
# then drop the columns listed above.
mask = train['rainfall_train.class_interval'].eq(-999)
df_1 = train.loc[~mask].drop(columns=delete_list)

# Train 이산형변수 처리

In [5]:
# Start the discrete-variable stage from a copy so df_1 is left unchanged.
df_2 = df_1.copy()

# Lookup that mirrors the grid 3, 6, ..., 240 onto itself in reverse order
# (3 -> 240, 6 -> 237, ..., 240 -> 3).
original_values = [step for step in range(3, 241, 3)]
reversed_values = list(reversed(original_values))
mapping_table = {src: dst for src, dst in zip(original_values, reversed_values)}


def map_value(x):
    """Return the mirrored grid value for ``x``.

    Missing values (as judged by ``pd.isna``) and values that are not keys of
    ``mapping_table`` are returned unchanged.
    """
    return x if pd.isna(x) else mapping_table.get(x, x)


# Replace each 'dh' value with its mirrored counterpart from mapping_table;
# NaN and values outside the table pass through unchanged.
df_2['rainfall_train.dh'] = df_2['rainfall_train.dh'].apply(map_value)

In [6]:
# One fitted encoder per calendar column, keyed by the column's position in
# this list; the test-stage cell looks the encoders up by the same keys.
enc_dict = {}
encoded_frames = []
onehot_cols = ['rainfall_train.ef_month', 'rainfall_train.ef_day', 'rainfall_train.ef_hour']
for i, col in enumerate(onehot_cols):
    # handle_unknown='ignore': a category unseen at fit time encodes as all zeros.
    encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
    enc_dict[i] = encoder
    dummies = encoder.fit_transform(df_2[[col]])
    encoded_frames.append(
        pd.DataFrame(dummies, index=df_2.index, columns=encoder.get_feature_names_out([col])))

# Swap the raw calendar columns for their indicator columns (appended on the right).
onehot_df = pd.concat(encoded_frames, axis=1)
df_2 = pd.concat([df_2.drop(columns=onehot_cols), onehot_df], axis=1)

# Train 파생변수

In [7]:
df_3 = df_2.copy()

# The nine v01..v09 columns that both derived features are computed from.
continuous_list = [
    'rainfall_train.v01',
    'rainfall_train.v02',
    'rainfall_train.v03',
    'rainfall_train.v04',
    'rainfall_train.v05',
    'rainfall_train.v06',
    'rainfall_train.v07',
    'rainfall_train.v08',
    'rainfall_train.v09',
]
v_values = df_3[continuous_list]

# Row-wise total of v01..v09, inserted at column position 11.
df_3.insert(11, 'cum_prob', v_values.sum(axis=1))
# Number of v01..v09 entries that are exactly zero, inserted at column position 12.
df_3.insert(12, 'Zero_Count', v_values.eq(0).sum(axis=1))

# Train 정규화

In [8]:
# Fit a min-max scaler on 'dh' and 'Zero_Count'; the fitted scaler is reused
# on the test data in a later cell.
minmax_list = ['rainfall_train.dh', 'Zero_Count']
minmax_scaler = MinMaxScaler().fit(df_3[minmax_list])

df_4 = df_3.copy()
df_4[minmax_list] = minmax_scaler.transform(df_4[minmax_list])

# Train 연속형변수 분포저장

In [9]:
# Separate the label from the features, then carve out a 5% validation split.
target_col = 'rainfall_train.class_interval'
X, y = df_4.drop(columns=target_col), df_4[target_col]

X_train, X_val, y_train, y_val = train_test_split(X, y, shuffle=True, random_state=42, test_size=0.05)

In [10]:
# v01..v09 plus the derived total are the columns passed to the standard-scaling helpers.
continuous_list = [
    'rainfall_train.v01',
    'rainfall_train.v02',
    'rainfall_train.v03',
    'rainfall_train.v04',
    'rainfall_train.v05',
    'rainfall_train.v06',
    'rainfall_train.v07',
    'rainfall_train.v08',
    'rainfall_train.v09',
    'cum_prob',
]

# Copies are taken before the helpers run so the unscaled splits stay available.
X_train_norm, X_val_norm = X_train.copy(), X_val.copy()

# standard_scale_train / standard_scale_val come from utils.scaler; presumably
# the first fits a scaler on the training columns and the second applies it
# (TODO confirm against utils/scaler.py).
scaled_train, scaler = standard_scale_train(X_train, continuous_list)
X_train_norm.loc[:, continuous_list] = scaled_train

scaled_val = standard_scale_val(X_val, continuous_list, scaler)
X_val_norm.loc[:, continuous_list] = scaled_val

# 불균형 해소를 위한 학습파일 저장

In [11]:
# Re-attach the labels to the scaled features and discard the first column of each frame.
total_train = pd.concat([X_train_norm, y_train], axis=1).iloc[:, 1:]
total_val = pd.concat([X_val_norm, y_val], axis=1).iloc[:, 1:]

# Boolean flags marking the rows whose label is 0.
train_mask = total_train['rainfall_train.class_interval'].eq(0)
val_mask = total_val['rainfall_train.class_interval'].eq(0)

In [12]:
# total_train[~train_mask].to_csv('Database/total/total_train.csv')
# total_val[~val_mask].to_csv('Database/total/total_val.csv')

# test General Preprocessing

In [13]:
# Toggle for the whole test pipeline:
#   True  -> preprocess the internal hold-out split (labels available);
#   False -> preprocess the real test file.
self_test = True

df_1 = self_test_df.copy() if self_test else df_test.copy()

# Same row filter as the train stage: drop rows whose label equals -999.
# NOTE(review): when self_test is False this removes rows from the frame that
# is to be predicted; confirm that is intended.
mask = df_1['rainfall_train.class_interval'] == -999
df_1 = df_1[~mask]

delete_list = ['rainfall_train.fc_year', 'rainfall_train.fc_month', 'rainfall_train.fc_day', 'rainfall_train.fc_hour',
               'rainfall_train.ef_year', 'rainfall_train.vv']
# errors='ignore': the real test frame carries the train column names minus one
# column (see the loading cell), so not every name in delete_list is guaranteed
# to exist there. Without this the drop raises KeyError when self_test is False.
df_1 = df_1.drop(columns=delete_list, errors='ignore')

# test 이산형변수 처리

In [14]:
df_2 = df_1.copy()

# Apply the same 'dh' mirroring as the train stage. `map_value` and the
# `mapping_table` it reads are defined once in the train-stage cell and reused
# here; redefining them in this cell duplicated the logic and allowed the two
# stages to drift apart silently.
df_2['rainfall_train.dh'] = df_2['rainfall_train.dh'].apply(map_value)

In [15]:
# Encode the calendar columns with the encoders fitted on the training data
# (enc_dict is keyed by the column's position in this list).
encoded_frames = []
onehot_cols = ['rainfall_train.ef_month', 'rainfall_train.ef_day', 'rainfall_train.ef_hour']
for i, col in enumerate(onehot_cols):
    encoder = enc_dict[i]
    dummies = encoder.transform(df_2[[col]])
    encoded_frames.append(
        pd.DataFrame(dummies, index=df_2.index, columns=encoder.get_feature_names_out([col])))

# Swap the raw calendar columns for their indicator columns (appended on the right).
onehot_df = pd.concat(encoded_frames, axis=1)
df_2 = pd.concat([df_2.drop(columns=onehot_cols), onehot_df], axis=1)

# test 파생변수

In [16]:
df_3 = df_2.copy()

# The nine v01..v09 columns that both derived features are computed from.
continuous_list = [
    'rainfall_train.v01',
    'rainfall_train.v02',
    'rainfall_train.v03',
    'rainfall_train.v04',
    'rainfall_train.v05',
    'rainfall_train.v06',
    'rainfall_train.v07',
    'rainfall_train.v08',
    'rainfall_train.v09',
]
v_values = df_3[continuous_list]

# Row-wise total of v01..v09, inserted at column position 11.
df_3.insert(11, 'cum_prob', v_values.sum(axis=1))
# Number of v01..v09 entries that are exactly zero, inserted at column position 12.
df_3.insert(12, 'Zero_Count', v_values.eq(0).sum(axis=1))

# test 정규화

In [17]:
minmax_list = ['rainfall_train.dh', 'Zero_Count']

# Apply the scaler that was fitted in the train stage; nothing is refitted here.
scaled_cols = minmax_scaler.transform(df_3[minmax_list])
df_4 = df_3.copy()
df_4[minmax_list] = scaled_cols

# test 이산형변수 칼럼통일

In [18]:
# Labels are only available for the internal hold-out split.
if self_test:
    y2 = df_4['rainfall_train.class_interval']

# Align the test features with the validation feature layout in one step:
#   * columns present in X_val_norm but missing here are added, filled with 0;
#   * columns present here but absent from X_val_norm (the label included)
#     are dropped;
#   * the result is ordered exactly like X_val_norm.
# reindex returns a new frame, so df_4 is left untouched in both modes. The
# previous `X2 = df_4` alias let the column additions leak back into df_4.
X2 = df_4.reindex(columns=X_val_norm.columns, fill_value=0)

# test 연속형변수 분포통일

In [19]:
# v01..v09 plus the derived total are the columns passed to the standard-scaling helper.
continuous_list = [
    'rainfall_train.v01',
    'rainfall_train.v02',
    'rainfall_train.v03',
    'rainfall_train.v04',
    'rainfall_train.v05',
    'rainfall_train.v06',
    'rainfall_train.v07',
    'rainfall_train.v08',
    'rainfall_train.v09',
    'cum_prob',
]

# Reuse the `scaler` object returned by standard_scale_train in the train stage.
scaled_test = standard_scale_val(X2, continuous_list, scaler)
X2.loc[:, continuous_list] = scaled_test

# 불균형 해소

In [90]:
import torch
from Module.model import MLP, VAE

# Constructor arguments are kept exactly as before. 57 presumably matches the
# number of feature columns and 10000 the number of synthetic rows produced;
# TODO confirm against Module/model.py.
backbone = MLP(57, 10)
# map_location='cpu' lets checkpoints that were saved from a GPU session load
# on a CPU-only machine; load_state_dict then copies the values into the
# model's existing parameters.
backbone.load_state_dict(torch.load('Weight/MLP.pth', map_location='cpu'))
model = VAE(32, backbone, 10000)
model.load_state_dict(torch.load('Weight/VAE.pth', map_location='cpu'))

<All keys matched successfully>

<All keys matched successfully>

In [91]:
# Call the VAE with no inputs; the next cell treats the result as a 2-D batch
# of synthetic rows (presumably sampled from the latent prior; confirm in Module/model.py).
syn_data = model()

In [92]:
# Column labels: every feature column except the first, followed by the label column.
syn_columns = [*X_train_norm.columns[1:], 'rainfall_train.class_interval']
# Move the tensor to host memory, detach it from autograd and wrap it in a DataFrame.
syn_df = pd.DataFrame(syn_data.cpu().detach().numpy(), columns=syn_columns)

In [93]:
# Display the synthetic sample table.
syn_df

Unnamed: 0,rainfall_train.dh,rainfall_train.v01,rainfall_train.v02,rainfall_train.v03,rainfall_train.v04,rainfall_train.v05,rainfall_train.v06,rainfall_train.v07,rainfall_train.v08,rainfall_train.v09,...,rainfall_train.ef_day_31,rainfall_train.ef_hour_0,rainfall_train.ef_hour_3,rainfall_train.ef_hour_6,rainfall_train.ef_hour_9,rainfall_train.ef_hour_12,rainfall_train.ef_hour_15,rainfall_train.ef_hour_18,rainfall_train.ef_hour_21,rainfall_train.class_interval
0,0.376365,-0.013162,-0.113957,-0.166673,-0.188229,-0.178822,-0.198383,-0.233038,-0.202920,-0.267656,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0
1,0.487413,1.354501,1.453131,1.596100,1.695975,1.742457,1.808003,1.905329,2.082436,2.451569,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0
2,0.438954,1.485934,1.554065,1.631917,1.533264,1.336128,0.973985,0.578394,0.076985,-0.031197,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,5.0
3,0.341036,1.196080,1.241774,1.296054,1.240824,1.154350,1.121007,1.060655,0.432339,0.062682,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,2.0
4,0.382730,-0.222958,-0.306842,-0.333544,-0.313706,-0.284878,-0.350062,-0.390659,-0.278607,-0.173864,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,0.794983,2.208244,2.450306,2.787531,2.750167,2.270176,0.918296,-0.046919,-0.137167,0.019627,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,4.0
9996,0.502323,-0.523167,-0.603606,-0.568130,-0.541640,-0.529569,-0.498500,-0.457386,-0.248479,-0.093787,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,2.0
9997,0.501277,1.826962,1.929247,2.005712,1.894628,1.721744,1.285312,0.657939,0.098284,-0.033035,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0
9998,0.855952,2.454066,2.772650,3.370296,4.035474,4.722421,4.174794,1.785704,0.096058,0.053224,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,5.0


# 지역별 분리

In [94]:
# X_train_norm_dict = {}
# y_train_norm_dict = {}
# 
# for i, STN in enumerate(df['rainfall_train.stn4contest'].unique().tolist()):
#     train_mask = (X_train_norm['rainfall_train.stn4contest'] == STN)
#     X_train_norm_dict[STN] = X_train_norm[train_mask]
#     y_train_norm_dict[STN] = pd.DataFrame(y_train_norm)[train_mask]
# 
# for i, (key, value) in enumerate(X_train_norm_dict.items()):
#     X_train_norm_dict[key] = value.iloc[:, 1:]
# 
# X_val_norm = X_val_norm.iloc[:, 1:]
# X2 = X2.iloc[:, 1:]

# GMM분리

In [21]:
# Remove the leading column from every feature frame before clustering.
# NOTE: this reassigns the frames in place, so re-running the cell strips one
# more column each time.
X_train_norm, X_val_norm, X2 = (frame.iloc[:, 1:] for frame in (X_train_norm, X_val_norm, X2))

# NOTE(review): joblib.load unpickles the file; only load model files from a trusted source.
GMM = joblib.load('Database/gmm_model.pkl')

# Append the predicted cluster id as an extra column; it ends up labelled 0.
cluster_col = pd.DataFrame(GMM.predict(X_train_norm), index=X_train_norm.index)
X_train_norm = pd.concat([X_train_norm, cluster_col], axis=1)

# Per-cluster views of the training features and labels, keyed by cluster id
# in order of first appearance.
X_train_norm_dict = {}
y_train_norm_dict = {}
y_train_frame = pd.DataFrame(y_train)

for cluster_num in X_train_norm[0].unique().tolist():
    train_mask = X_train_norm[0].eq(cluster_num)
    # The helper cluster column is removed again from the per-cluster features.
    X_train_norm_dict[cluster_num] = X_train_norm.loc[train_mask].drop(columns=0)
    y_train_norm_dict[cluster_num] = y_train_frame[train_mask]

In [22]:
# Print the shape of every per-cluster training set, then of the validation and test sets.
for key in X_train_norm_dict:
    print(f'{key} Train set')
    print(X_train_norm_dict[key].shape)
    print(y_train_norm_dict[key].shape)

print('Validation set', X_val_norm.shape, y_val.shape, sep='\n')

print('Test set', X2.shape, sep='\n')

# Labels exist only for the internal hold-out split.
if self_test:
    print(y2.shape)

10 Train set
(123072, 57)
(123072, 1)
4 Train set
(97820, 57)
(97820, 1)
1 Train set
(90284, 57)
(90284, 1)
6 Train set
(125411, 57)
(125411, 1)
8 Train set
(150807, 57)
(150807, 1)
9 Train set
(115253, 57)
(115253, 1)
5 Train set
(20187, 57)
(20187, 1)
3 Train set
(54302, 57)
(54302, 1)
7 Train set
(18608, 57)
(18608, 1)
14 Train set
(74820, 57)
(74820, 1)
12 Train set
(93290, 57)
(93290, 1)
0 Train set
(175835, 57)
(175835, 1)
13 Train set
(80389, 57)
(80389, 1)
2 Train set
(5023, 57)
(5023, 1)
11 Train set
(13570, 57)
(13570, 1)
Validation set
(65194, 57)
(65194,)
Test set
(144897, 57)
(144897,)


# 전처리 파일저장

In [23]:
# Create the output folders up front so a fresh checkout does not fail on the first write.
for out_dir in ('Database/train', 'Database/val', 'Database/test'):
    os.makedirs(out_dir, exist_ok=True)

# One feature file and one label file per GMM cluster.
for key, value in X_train_norm_dict.items():
    value.to_csv(f'Database/train/GMM{key}_X_train_norm.csv')
    y_train_norm_dict[key].to_csv(f'Database/train/GMM{key}_y_train.csv')
    print(f'GMM{key} Train set saved!')

X_val_norm.to_csv('Database/val/X_val_norm.csv')
y_val.to_csv('Database/val/y_val.csv')
print('Validation set saved!')

# The hold-out split has labels to save; the real test set has features only.
if self_test:
    X2.to_csv('Database/test/X_self_test_norm.csv')
    y2.to_csv('Database/test/y_self_test.csv')
else:
    X2.to_csv('Database/test/X_real_test_norm.csv')
print('Test set saved!')

GMM10 Train set saved!
GMM4 Train set saved!
GMM1 Train set saved!
GMM6 Train set saved!
GMM8 Train set saved!
GMM9 Train set saved!
GMM5 Train set saved!
GMM3 Train set saved!
GMM7 Train set saved!
GMM14 Train set saved!
GMM12 Train set saved!
GMM0 Train set saved!
GMM13 Train set saved!
GMM2 Train set saved!
GMM11 Train set saved!
Validation set saved!
Test set saved!
