# Import libraries

In [None]:
import pandas as pd
import numpy as np
import missingno as msno
import seaborn as sns
import matplotlib as mpl

from glob import glob
from tqdm import tqdm
from matplotlib import pyplot as plt

# Configuration

In [None]:
# Korean font
# Try to make the Windows "malgun" font the default matplotlib font family.
from matplotlib import font_manager, rc
try:
    font_path = "C:/Windows/Fonts/malgun.TTF"
    Kfont = font_manager.FontProperties(fname=font_path).get_name()
    rc('font', family=Kfont)
except Exception:
    # Best effort: when the font file cannot be loaded (e.g. on a machine
    # without this Windows path) keep matplotlib's default font. Catching
    # Exception instead of using a bare ``except`` lets KeyboardInterrupt
    # and SystemExit propagate.
    pass

# Fix minus presentation: draw negative numbers with the ASCII hyphen instead
# of the Unicode minus sign
mpl.rcParams['axes.unicode_minus'] = False

# Custom functions

In [None]:
def to_datetime_format(df:pd.DataFrame, col_nm:str='ymdhm', as_index=False) -> pd.DataFrame:
    """Parse a timestamp column and return the frame sorted by it.

    The input frame is left untouched; all work happens on a copy.

    Args:
        df: Frame containing the timestamp column.
        col_nm: Name of the column to convert with ``pd.to_datetime``.
        as_index: When true, the parsed column becomes the index of the
            returned frame instead of staying a regular column.

    Returns:
        A new DataFrame sorted in ascending order of ``col_nm``.

    Raises:
        KeyError: If ``col_nm`` is not a column of ``df``.
    """
    # Work on a copy so the caller's frame is not modified as a side effect.
    result = df.copy()
    # ``infer_datetime_format`` is deprecated in pandas 2.x (format inference
    # is the default behaviour there), so it is no longer passed.
    result[col_nm] = pd.to_datetime(result[col_nm])
    result = result.sort_values(by=col_nm, ascending=True)
    if as_index:
        result = result.set_index(col_nm)

    return result

# Load data

### Data description
Data  

├ Water Data

│ ├ data_2012.csv

│   ├ ymdhm : 년월일시분

│   ├ swl : 팔당댐 현재수위 (단위: El.m)

│   ├ inf : 팔당댐 유입량 (단위: m^3/s)

│   ├ sfw : 팔당댐 저수량 (단위: 만m^3)

│   ├ ecpc : 팔당댐 공용량 (단위: 백만m^3)

│   ├ tototf : 총 방류량 (단위: m^3/s)

│   ├ tide_level : 강화대교 조위 (단위: cm)

│   ├ wl_1018662 : 청담대교 수위 (단위: cm)

│   ├ fw_1018662 : 청담대교 유량 (단위: m^3/s)

│   ├ wl_1018680 : 잠수교 수위 (단위: cm)

│   ├ fw_1018680 : 잠수교 유량 (단위: m^3/s)

│   ├ wl_1018683 : 한강대교 수위 (단위: cm)

│   ├ fw_1018683 : 한강대교 유량 (단위: m^3/s)

│   ├ wl_1019630 : 행주대교 수위 (단위: cm)

│   └ fw_1019630 : 행주대교 유량 (단위: m^3/s)

│ ├ data_2013.csv

…

│ └ data_2022.csv

└ RainFall Data

│ ├ rf_2012.csv

│   ├ YMDHM : 년월일시분

│   ├ rf_10184100 : 대곡교 강수량

│   ├ rf_10184110 : 진관교 강수량

│   └ rf_10184140 : 송정동 강수량

│ ├ rf_2013.csv

…

│ └ rf_2022.csv

### Load sample submission data

In [None]:
# Load the sample submission file into a DataFrame
df_smp_subm = pd.read_csv('data/sample_submission.csv')
# NOTE: with Jupyter's default settings only the last expression of a cell is
# displayed, so this bare name shows nothing while it sits mid-cell.
df_smp_subm

# Get target columns: every column of the sample submission except the first
# (presumably the first column is the timestamp — confirm against the file)
tgt_col = df_smp_subm.columns[1:]

### Load water data

In [None]:
# Collect every CSV path under data/water_data in ascending path order,
# then show the list as the cell output
wl_data_list = glob("data/water_data/*.csv")
wl_data_list.sort()
wl_data_list

In [None]:
# Print the column/dtype/non-null summary of the first water level file
# (first in sorted path order)
pd.read_csv(wl_data_list[0]).info()

In [None]:
# Display the last water level file in sorted path order
# (presumably the most recent year, given the data_<year>.csv naming — confirm)
pd.read_csv(wl_data_list[-1])

In [None]:
# Read each water level file, convert it to a time-sorted, time-indexed frame,
# and stack all of the frames row-wise into one DataFrame
wl_frames = [
    to_datetime_format(pd.read_csv(csv_path), as_index=True)
    for csv_path in wl_data_list
]
df_wl_all = pd.concat(wl_frames, axis=0)

### Load rainfall data

In [None]:
# Collect every CSV path under data/rf_data in ascending path order,
# then show the list as the cell output
rf_data_list = glob("data/rf_data/*.csv")
rf_data_list.sort()
rf_data_list

In [None]:
# Print the column/dtype/non-null summary of the first rainfall file
# (first in sorted path order)
pd.read_csv(rf_data_list[0]).info()

In [None]:
# Display the last rainfall file in sorted path order
# (presumably the most recent year, given the rf_<year>.csv naming — confirm)
pd.read_csv(rf_data_list[-1])

In [None]:
# Read each rainfall file, convert it to a time-sorted, time-indexed frame,
# and stack all of the frames row-wise into one DataFrame
rf_frames = [
    to_datetime_format(pd.read_csv(csv_path), as_index=True)
    for csv_path in rf_data_list
]
df_rf_all = pd.concat(rf_frames, axis=0)

# Preprocessing & Visualization

### Simple missing check for water level data

In [None]:
# Treat 0 in the target columns as a missing reading.
# np.nan is used instead of None: replacing with None can turn numeric columns
# into object dtype, while np.nan keeps them floating point.
df_wl_all[tgt_col] = df_wl_all[tgt_col].replace({0: np.nan})

# Divide water data by date: rows before 2022-06-01 vs. rows from that date on
df_wl = df_wl_all[df_wl_all.index < pd.to_datetime('2022-06-01')]
df_wl_fut = df_wl_all[df_wl_all.index >= pd.to_datetime('2022-06-01')]

# Get target columns in the train time line (the part before 2022-06-01)
df_tgt_past = df_wl[tgt_col]

In [None]:
# Missing-value check for the water level rows before 2022-06-01:
# draw the missingno matrix plot, then print the per-column non-null summary
msno.matrix(df_wl)
df_wl.info()

In [None]:
# Missing-value check for the water level rows from 2022-06-01 onward:
# draw the missingno matrix plot, then print the per-column non-null summary
msno.matrix(df_wl_fut)
df_wl_fut.info()

### Simple missing check for rainfall data

In [None]:
# Divide rainfall data by date: rows before 2022-06-01 vs. rows from that date on.
# Both halves are sliced from df_rf_all; the future half was previously taken
# from the water level frame by mistake.
df_rf = df_rf_all[df_rf_all.index < pd.to_datetime('2022-06-01')]
df_rf_fut = df_rf_all[df_rf_all.index >= pd.to_datetime('2022-06-01')]

In [None]:
# Missing-value check for the rainfall rows before 2022-06-01:
# draw the missingno matrix plot, then print the per-column non-null summary
msno.matrix(df_rf)
df_rf.info()