In [24]:
import os
import warnings
from joblib import dump
from scipy import stats
import pandas as pd  # explicit: the cells below use `pd` directly
from utils.scaler import *
from matplotlib import pyplot as plt
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from IPython.core.interactiveshell import InteractiveShell

In [2]:
# Notebook-wide settings.
# Use the GULIM font family for matplotlib text
# (presumably so Korean labels render correctly - TODO confirm).
plt.rcParams['font.family'] = 'GULIM'
# Silence every Python warning for the rest of the session.
warnings.filterwarnings('ignore')
# Display the value of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"

In [3]:
# Read both CSV files, using their first column as the index.
df = pd.read_csv('Database/rainfall_train.csv', index_col=0)
df_test = pd.read_csv('Database/rainfall_test.csv', index_col=0)

# Give the test frame the training column names, minus the second-to-last
# training column (assumes the test file lacks exactly that column - TODO confirm).
train_col = list(df.columns)
del train_col[-2]
df_test.columns = train_col

# Hold out 10% of the training file as a self-made test split.
train, self_test = train_test_split(
    df,
    test_size=0.1,
    shuffle=True,
    random_state=42,
)

# Train General Preprocessing

In [4]:
# Start from a copy so the `train` split itself is left untouched.
df_1 = train.copy()

# Discard rows whose class interval equals -999
# (presumably a missing-value marker - TODO confirm).
mask = df_1['rainfall_train.class_interval'].eq(-999)
df_1 = df_1.loc[~mask]

# Columns removed at this stage: the fc_* date/time parts, ef_year and the
# class interval that was only needed for the filter above.
delete_list = [
    'rainfall_train.fc_year',
    'rainfall_train.fc_month',
    'rainfall_train.fc_day',
    'rainfall_train.fc_hour',
    'rainfall_train.ef_year',
    'rainfall_train.class_interval',
]
df_1 = df_1.drop(delete_list, axis=1)

# Train 이산형변수 처리

In [5]:
# Copy the filtered frame so the edits in this stage do not modify df_1.
df_2 = df_1.copy()

# Lookup table that mirrors the values 3, 6, ..., 240 around their midpoint:
# 3 <-> 240, 6 <-> 237, and so on.
original_values = list(range(3, 241, 3))
reversed_values = list(reversed(original_values))
mapping_table = {source: target for source, target in zip(original_values, reversed_values)}


def map_value(x):
    """Return the mirrored value for ``x`` according to ``mapping_table``.

    Missing values (as detected by ``pd.isna``) and values that are not keys of
    the table are returned unchanged.
    """
    return x if pd.isna(x) else mapping_table.get(x, x)


# Replace every dh value by its mirrored counterpart from map_value.
dh_col = 'rainfall_train.dh'
df_2[dh_col] = df_2[dh_col].apply(map_value)

In [6]:
# One-hot encode the ef_* calendar columns. One encoder is fitted per column
# and stored in enc_dict (keyed by the column's position in onehot_cols) so
# the same fitted encoders can be applied to other splits later.
onehot_cols = ['rainfall_train.ef_month', 'rainfall_train.ef_day', 'rainfall_train.ef_hour']

enc_dict = {}
encoded_frames = []
for i, col in enumerate(onehot_cols):
    encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
    one_hot_encoded = encoder.fit_transform(df_2[[col]])
    enc_dict[i] = encoder

    encoded_df = pd.DataFrame(
        one_hot_encoded,
        index=df_2.index,
        columns=encoder.get_feature_names_out([col]),
    )
    encoded_frames.append(encoded_df)

# Swap the raw calendar columns for their indicator columns in a single step.
onehot_df = pd.concat(encoded_frames, axis=1)
df_2 = pd.concat([df_2.drop(columns=onehot_cols), onehot_df], axis=1)

# Train BoxCox변환

In [7]:
df_3 = df_2.copy()

# Columns that receive a Box-Cox transform (v01..v09 plus the vv column).
continuous_list = [
    'rainfall_train.v01', 'rainfall_train.v02', 'rainfall_train.v03',
    'rainfall_train.v04', 'rainfall_train.v05', 'rainfall_train.v06',
    'rainfall_train.v07', 'rainfall_train.v08', 'rainfall_train.v09',
    'rainfall_train.vv',
]

# Fit one lambda per column and remember it in boxcox_dict for later reuse.
boxcox_dict = {}
for feature in continuous_list:
    # Small offset added before the transform (presumably to keep zeros strictly
    # positive, which Box-Cox requires - TODO confirm the data has no negatives).
    shifted = df_3[feature] + 0.000001
    transformed, fitted_lambda = stats.boxcox(shifted)
    boxcox_dict[feature] = fitted_lambda
    df_3[feature] = transformed

In [8]:
# Disabled: optional export of the fitted Box-Cox lambdas (boxcox_dict) to CSV.
# pd.DataFrame(boxcox_dict.items()).to_csv('Database/boxcox_lmbda.csv')

# Train 연속형변수 분포저장

In [9]:
# Separate the vv column (the target) from the remaining feature columns.
X = df_3.drop(columns='rainfall_train.vv')
y = df_3['rainfall_train.vv']

# Hold out 5% of the rows for validation; the fixed seed makes the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    X, y,
    test_size=0.05,
    shuffle=True,
    random_state=42,
)

In [10]:
# Feature columns to standardise (the target vv is handled separately).
continuous_list = [
    'rainfall_train.v01', 'rainfall_train.v02', 'rainfall_train.v03',
    'rainfall_train.v04', 'rainfall_train.v05', 'rainfall_train.v06',
    'rainfall_train.v07', 'rainfall_train.v08', 'rainfall_train.v09',
]

X_train_norm = X_train.copy()
X_val_norm = X_val.copy()

# standard_scale_train / standard_scale_val come from utils.scaler.
# NOTE(review): presumably the first fits a scaler on the listed columns and
# returns (scaled values, fitted scaler), and the second applies that scaler
# without refitting - confirm against utils/scaler.py.
train_scaled, scaler = standard_scale_train(X_train, continuous_list)
X_train_norm.loc[:, continuous_list] = train_scaled

val_scaled = standard_scale_val(X_val, continuous_list, scaler)
X_val_norm.loc[:, continuous_list] = val_scaled

In [11]:
# Scale the target with its own scaler, kept separate from the feature scaler.
# The helpers are called with a one-column DataFrame rather than a Series.
target_cols = ['rainfall_train.vv']

y_train_frame = pd.DataFrame(y_train)
y_train_norm, target_scaler = standard_scale_train(y_train_frame, target_cols)

y_val_frame = pd.DataFrame(y_val)
y_val_norm = standard_scale_val(y_val_frame, target_cols, target_scaler)

In [25]:
# Write the fitted target scaler to disk with joblib
# (presumably so predictions can be un-scaled outside this notebook - TODO confirm).
# The bare call is kept as an expression, so its return value is displayed as cell output.
dump(target_scaler, 'Database/target_scaler.joblib')
print("Scaler object saved successfully!")

['Database/target_scaler.joblib']

Scaler object saved successfully!


# test General Preprocessing

In [12]:
# Same filtering as the training section, applied to the held-out self-test split.
# NOTE: this rebinds df_1, so the training version of df_1 is no longer available.
df_1 = self_test.copy()
# df_1 = df_test.copy()

# Discard rows whose class interval equals -999
# (presumably a missing-value marker - TODO confirm).
mask = df_1['rainfall_train.class_interval'].eq(-999)
df_1 = df_1.loc[~mask]

# Columns removed at this stage: the fc_* date/time parts, ef_year and the
# class interval that was only needed for the filter above.
delete_list = [
    'rainfall_train.fc_year',
    'rainfall_train.fc_month',
    'rainfall_train.fc_day',
    'rainfall_train.fc_hour',
    'rainfall_train.ef_year',
    'rainfall_train.class_interval',
]
df_1 = df_1.drop(delete_list, axis=1)

# test 이산형변수 처리

In [13]:
# Copy the filtered test frame so this stage does not modify df_1.
df_2 = df_1.copy()

# Reuse mapping_table / map_value from the training section instead of
# re-defining them here. A second, copy-pasted definition would silently shadow
# the first and could drift out of sync, so both splits are now guaranteed to
# receive the identical dh transformation.
df_2['rainfall_train.dh'] = df_2['rainfall_train.dh'].apply(map_value)

In [14]:
# Encode the ef_* calendar columns with the encoders already fitted on the
# training data (enc_dict, keyed by column position); nothing is refitted here.
onehot_cols = ['rainfall_train.ef_month', 'rainfall_train.ef_day', 'rainfall_train.ef_hour']

encoded_frames = []
for i, col in enumerate(onehot_cols):
    encoder = enc_dict[i]
    one_hot_encoded = encoder.transform(df_2[[col]])

    encoded_df = pd.DataFrame(
        one_hot_encoded,
        index=df_2.index,
        columns=encoder.get_feature_names_out([col]),
    )
    encoded_frames.append(encoded_df)

# Swap the raw calendar columns for their indicator columns in a single step.
onehot_df = pd.concat(encoded_frames, axis=1)
df_2 = pd.concat([df_2.drop(columns=onehot_cols), onehot_df], axis=1)

# test BoxCox변환

In [15]:
df_3 = df_2.copy()

# Columns that receive a Box-Cox transform (v01..v09 plus the vv column).
continuous_list = [
    'rainfall_train.v01', 'rainfall_train.v02', 'rainfall_train.v03',
    'rainfall_train.v04', 'rainfall_train.v05', 'rainfall_train.v06',
    'rainfall_train.v07', 'rainfall_train.v08', 'rainfall_train.v09',
    'rainfall_train.vv',
]

# Apply the lambdas stored in boxcox_dict; passing lmbda means nothing is refitted.
for feature in continuous_list:
    # Same small offset as used when the lambdas were fitted.
    shifted = df_3[feature] + 0.000001
    df_3[feature] = stats.boxcox(shifted, lmbda=boxcox_dict[feature])

# test 이산형변수 칼럼통일

In [16]:
# Separate the vv column (the target) from the remaining feature columns.
y2 = df_3['rainfall_train.vv']
X2 = df_3.drop(columns='rainfall_train.vv')

# Align the test features with the validation feature layout in one step:
#   * columns present in X_val_norm but missing from X2 are added, filled with 0;
#   * columns present only in X2 are dropped;
#   * the resulting column order matches X_val_norm exactly.
# This replaces the earlier add / drop / reorder sequence, which passed whole
# DataFrames to Index.isin and relied on frame iteration yielding column names.
# NOTE: X_val_norm is trimmed in a later cell, so run this cell before that one.
X2 = X2.reindex(columns=X_val_norm.columns, fill_value=0)

# test 연속형변수 분포통일

In [17]:
# Feature columns to standardise (the target vv is handled separately).
continuous_list = [
    'rainfall_train.v01', 'rainfall_train.v02', 'rainfall_train.v03',
    'rainfall_train.v04', 'rainfall_train.v05', 'rainfall_train.v06',
    'rainfall_train.v07', 'rainfall_train.v08', 'rainfall_train.v09',
]

# Scale the test features with the scaler obtained from the training features.
test_scaled = standard_scale_val(X2, continuous_list, scaler)
X2.loc[:, continuous_list] = test_scaled

# Scale the test target with the target scaler; the helper takes a one-column DataFrame.
y2_frame = pd.DataFrame(y2)
y2 = standard_scale_val(y2_frame, ['rainfall_train.vv'], target_scaler)

# 지역별 분리

In [19]:
# Split the scaled training data into one feature/target pair per station.
X_train_norm_dict = {}
y_train_norm_dict = {}

# Station ids are taken from the full, unfiltered `df`, not from the training split.
station_ids = df['rainfall_train.stn4contest'].unique().tolist()
for STN in station_ids:
    train_mask = X_train_norm['rainfall_train.stn4contest'].eq(STN)
    X_train_norm_dict[STN] = X_train_norm.loc[train_mask]
    # Same boolean mask selects the matching target rows
    # (assumes y_train_norm is row-aligned with X_train_norm - TODO confirm).
    y_train_norm_dict[STN] = y_train_norm[train_mask]

In [20]:
# Remove the leading column from every feature frame now that the data has been
# split per station (assumes the station id is the first column - TODO confirm).
station_col = 'rainfall_train.stn4contest'


def _drop_leading_column(frame):
    """Return ``frame`` without its first column.

    Frames that no longer contain the station column are returned unchanged.
    This makes the cell safe to re-run: without the guard each additional run
    would strip one more feature column.
    """
    if station_col not in frame.columns:
        return frame
    return frame.iloc[:, 1:]


for key, value in X_train_norm_dict.items():
    X_train_norm_dict[key] = _drop_leading_column(value)

X_val_norm = _drop_leading_column(X_val_norm)
X2 = _drop_leading_column(X2)

In [21]:
# Sanity check: print the feature and target shapes of every split.
for key in X_train_norm_dict:
    print(f'{key} Train set')
    print(X_train_norm_dict[key].shape)
    print(y_train_norm_dict[key].shape)

for label, features, target in (
    ('Validation set', X_val_norm, y_val_norm),
    ('Test set', X2, y2),
):
    print(label)
    print(features.shape)
    print(target.shape)

STN001 Train set
(62266, 55)
(62266, 1)
STN002 Train set
(62189, 55)
(62189, 1)
STN003 Train set
(62327, 55)
(62327, 1)
STN004 Train set
(61883, 55)
(61883, 1)
STN005 Train set
(61509, 55)
(61509, 1)
STN006 Train set
(61729, 55)
(61729, 1)
STN007 Train set
(61972, 55)
(61972, 1)
STN008 Train set
(62150, 55)
(62150, 1)
STN009 Train set
(62117, 55)
(62117, 1)
STN010 Train set
(61638, 55)
(61638, 1)
STN011 Train set
(62047, 55)
(62047, 1)
STN012 Train set
(61924, 55)
(61924, 1)
STN013 Train set
(61962, 55)
(61962, 1)
STN014 Train set
(61837, 55)
(61837, 1)
STN015 Train set
(61883, 55)
(61883, 1)
STN016 Train set
(61762, 55)
(61762, 1)
STN017 Train set
(61646, 55)
(61646, 1)
STN018 Train set
(62005, 55)
(62005, 1)
STN019 Train set
(62184, 55)
(62184, 1)
STN020 Train set
(61641, 55)
(61641, 1)
Validation set
(65194, 55)
(65194, 1)
Test set
(144897, 55)
(144897, 1)


# 전처리 파일저장

In [23]:
# Create the output folders up front: to_csv raises an error when the target
# directory does not exist, which would otherwise surface only after all the
# preprocessing above has already run.
for out_dir in ('Database/train', 'Database/val', 'Database/test'):
    os.makedirs(out_dir, exist_ok=True)

# One feature file and one target file per station.
for key, value in X_train_norm_dict.items():
    value.to_csv(f'Database/train/{key}_X_train_norm.csv')
    y_train_norm_dict[key].to_csv(f'Database/train/{key}_y_train_norm.csv')
    print(f'{key} Train set saved!')

# The validation split is stored as a single pair of files (not split per station).
X_val_norm.to_csv('Database/val/X_val_norm.csv')
y_val_norm.to_csv('Database/val/y_val_norm.csv')
print('Validation set saved!')

# The self-made test split is stored the same way.
X2.to_csv('Database/test/X_self_test_norm.csv')
y2.to_csv('Database/test/y_self_test_norm.csv')
print('Test set saved!')

STN001 Train set saved!
STN002 Train set saved!
STN003 Train set saved!
STN004 Train set saved!
STN005 Train set saved!
STN006 Train set saved!
STN007 Train set saved!
STN008 Train set saved!
STN009 Train set saved!
STN010 Train set saved!
STN011 Train set saved!
STN012 Train set saved!
STN013 Train set saved!
STN014 Train set saved!
STN015 Train set saved!
STN016 Train set saved!
STN017 Train set saved!
STN018 Train set saved!
STN019 Train set saved!
STN020 Train set saved!
Validation set saved!
Test set saved!
