---
# Setting
---

<br>

## Import Modules

In [1]:
import gc
gc.collect()

0

In [2]:
# !pip install pytimekr
# !pip install optuna

In [3]:
# d MyPython/2_Dacon_JEJU/DAT/
# unzip open.zip

In [4]:
import matplotlib as mpl
import matplotlib.font_manager as fm

# Register the NanumGothic TTF (path relative to the notebook) with matplotlib's
# font manager and make it the default font family.
# NOTE(review): presumably needed so Korean labels render in plots - confirm.
fe = fm.FontEntry(fname='../NanumFont/NanumGothic.ttf',name='NanumGothic')
fm.fontManager.ttflist.insert(0, fe)  # or append is fine
mpl.rcParams['font.family'] = fe.name # = 'your custom ttf font name'

In [5]:
import os
import sys
import pandas as pd
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)

import numpy as np
from tqdm import tqdm, trange
tqdm.pandas()

import warnings
warnings.filterwarnings(action='ignore')
warnings.simplefilter(action='ignore', category=FutureWarning) # FutureWarning 제거
os.environ['PYTHONWARNINGS']='ignore::FutureWarning'

import itertools
import datetime
from pytimekr import pytimekr
import matplotlib.pyplot as plt
import seaborn as sns

import multiprocessing as mp
from joblib import Parallel, delayed

# import datatable as dt

In [6]:
# https://stackoverflow.com/questions/24983493/tracking-progress-of-joblib-parallel-execution

import contextlib
import joblib
from tqdm import tqdm

@contextlib.contextmanager
def tqdm_joblib(tqdm_object):
    """Context manager to patch joblib to report into tqdm progress bar given as argument

    While the ``with`` block is active, ``joblib.parallel.BatchCompletionCallBack``
    is replaced by a subclass that advances ``tqdm_object`` by the batch size each
    time it is called. On exit the original class is restored and the progress
    bar is closed, even if the body raised.

    Yields:
        The ``tqdm_object`` that was passed in.
    """
    class TqdmBatchCompletionCallback(joblib.parallel.BatchCompletionCallBack):
        def __call__(self, *args, **kwargs):
            # Advance the bar by the size of this batch, then run joblib's own callback.
            tqdm_object.update(n=self.batch_size)
            return super().__call__(*args, **kwargs)

    # Keep a reference to the original class so it can be put back afterwards.
    old_batch_callback = joblib.parallel.BatchCompletionCallBack
    joblib.parallel.BatchCompletionCallBack = TqdmBatchCompletionCallback
    try:
        yield tqdm_object
    finally:
        # Always undo the monkey-patch and close the bar.
        joblib.parallel.BatchCompletionCallBack = old_batch_callback
        tqdm_object.close()

In [7]:
def abline(slope, intercept, color):
    """Draw the dashed line ``y = intercept + slope * x`` on the current axes.

    The line is drawn between the current x-axis limits of ``plt.gca()``.

    Args:
        slope: Slope of the line.
        intercept: Value of the line at ``x = 0``.
        color: Colour passed through to ``plt.plot``.
    """
    x_limits = np.array(plt.gca().get_xlim())
    plt.plot(x_limits, intercept + slope * x_limits, '--', color=color)
    
def createFolder(directory):
    """Create ``directory`` (and any missing parents) if it does not exist yet.

    This is best-effort: an ``OSError`` raised while creating the directory is
    reported with ``print`` and not re-raised.

    Args:
        directory: Path of the directory to create.
    """
    try:
        # exist_ok=True replaces the separate os.path.exists() check, which was
        # racy (another process could create the directory between the check and
        # makedirs). It also surfaces the case where the path exists but is a
        # regular file, instead of silently accepting it.
        os.makedirs(directory, exist_ok=True)
    except OSError:
        print('Error: Creating directory. ' + directory)
        
def cnt(x):
    """Build a frequency table for a pandas Series.

    Args:
        x: Series whose values are counted (via ``value_counts``).

    Returns:
        DataFrame with one row per distinct value, ordered by value, and the
        columns ``index`` (the value), ``freq`` (its count) and ``rate`` (its
        share of the total count, in percent).
    """
    counts = x.value_counts().sort_index()
    table = pd.DataFrame({'index': counts.index, 'freq': counts.values})
    return table.assign(rate=lambda t: 100 * t['freq'] / t['freq'].sum())

In [8]:
from sklearn.metrics import mean_absolute_error

# Helper that silences output (the equivalent of running with verbose=0)
# (Reference) https://stackoverflow.com/questions/11130156/suppress-stdout-stderr-print-from-python-functions
class suppress_stdout_stderr(object):
    '''
    A context manager for doing a "deep suppression" of stdout and stderr in
    Python, i.e. will suppress all print, even if the print originates in a
    compiled C/Fortran sub-function.
       This will not suppress raised exceptions, since exceptions are printed
    to stderr just before a script exits, and after the context manager has
    exited (at least, I think that is why it lets exceptions through).

    Each instance is single-use: every file descriptor opened in ``__init__``
    is closed again in ``__exit__``.
    '''
    def __init__(self):
        # Open a pair of null files
        self.null_fds = [os.open(os.devnull, os.O_RDWR) for x in range(2)]
        # Save the actual stdout (1) and stderr (2) file descriptors.
        self.save_fds = (os.dup(1), os.dup(2))

    def __enter__(self):
        # Point stdout and stderr at the null files.
        os.dup2(self.null_fds[0], 1)
        os.dup2(self.null_fds[1], 2)
        # Returning self makes ``with suppress_stdout_stderr() as s`` usable.
        return self

    def __exit__(self, *_):
        # Re-assign the real stdout/stderr back to (1) and (2)
        os.dup2(self.save_fds[0], 1)
        os.dup2(self.save_fds[1], 2)
        # Close the null files AND the saved duplicates. Leaving the duplicates
        # open leaked two descriptors per use, which adds up when the context
        # manager is entered repeatedly (e.g. once per model fit).
        for fd in list(self.null_fds) + list(self.save_fds):
            os.close(fd)

<br>

User Functions

In [9]:
# from lib.MyModel import *

<br>

## Initial Values

In [10]:
# Relative path of the raw-data directory.
# NOTE(review): DAT_PATH is not referenced by any other cell visible here - confirm it is still needed.
DAT_PATH = "../DAT/"

# Wall-clock start of the run; the last cell subtracts it from the end time.
start_time = datetime.datetime.now()
print(start_time)

2022-11-12 08:09:31.269485


<br></br>

---
# Data Load
---

<br>

## Preprocessed Data

In [11]:
%%time

# Load the preprocessed train / test frames from ../OUT.
# NOTE(review): presumably written by an earlier preprocessing notebook - confirm.
train_df7 = pd.read_parquet('../OUT/train_df7.parquet.gz')
test_df7  = pd.read_parquet('../OUT/test_df7.parquet.gz')

CPU times: user 12.1 s, sys: 11 s, total: 23 s
Wall time: 2min 35s


<br>

## Whether to Use All Covid Features

<br>

(1) If used

In [12]:
# "gubun" labels used to pick out the Covid columns
gubuns = [
    '강원','검역','경기','경남','경북','광주','대구','대전','부산','서울',
    '세종','울산','인천','전남','전북','제주','충남','충북','합계'
]

# A column is treated as a Covid feature when its name contains a gubun label.
# Order: by gubun first, then by column position in train_df7.
covid_features = [
    name
    for gubun in gubuns
    for name in train_df7.columns
    if gubun in name
]

# Covid columns whose name does not contain '제주'
not_jeju_col = [name for name in covid_features if '제주' not in name]

In [13]:
# The correlations are high, but no special handling is applied for them.
# Displays the pairwise correlation matrix of the columns whose name contains '_제주'.
train_df7[[col for col in train_df7.columns if col.find('_제주')>=0]].corr()

Unnamed: 0,deathCnt_제주,defCnt_제주,incDec_제주,isolClearCnt_제주,isolIngCnt_제주,localOccCnt_제주,overFlowCnt_제주,qurRate_제주
deathCnt_제주,1.0,0.991191,0.079869,-0.08031,-0.160987,0.078447,0.513374,0.99119
defCnt_제주,0.991191,1.0,0.151195,0.008329,-0.154311,0.149801,0.498721,0.999998
incDec_제주,0.079869,0.151195,1.0,0.762447,-0.106272,0.999996,-0.052035,0.151319
isolClearCnt_제주,-0.08031,0.008329,0.762447,1.0,-0.070244,0.762797,-0.167131,0.00839
isolIngCnt_제주,-0.160987,-0.154311,-0.106272,-0.070244,1.0,-0.106046,-0.077065,-0.154325
localOccCnt_제주,0.078447,0.149801,0.999996,0.762797,-0.106046,1.0,-0.054802,0.149927
overFlowCnt_제주,0.513374,0.498721,-0.052035,-0.167131,-0.077065,-0.054802,1.0,0.49869
qurRate_제주,0.99119,0.999998,0.151319,0.00839,-0.154325,0.149927,0.49869,1.0


<br>

(2) If not used

In [14]:
# not_jeju_col=[]

<br>

## Categorical & Numerical Features

In [15]:
# Candidate feature columns: everything except the id, the target, the date
# and the Covid columns listed in not_jeju_col.
feature_frame = train_df7.drop(['id','target','base_date']+not_jeju_col,axis=1)

# Object-dtype columns are categorical; every remaining candidate is numerical.
cat_features = list(feature_frame.select_dtypes(include=[object]).columns)
num_features = [name for name in feature_frame.columns if name not in cat_features]

# Release the temporary copy so it does not stay resident in the kernel.
del feature_frame

In [16]:
# Column that identifies the segment; it is kept out of both feature lists.
segment_features = 'segment'

# Columns removed from the feature lists
del_features = ['day','start_node_name','end_node_name','road_name'] # 'wd','clfmAbbrCd'

# Names that must not appear in either final feature list
excluded_features = [segment_features] + del_features

new_cat_features = [name for name in cat_features if name not in excluded_features]
new_num_features = [name for name in num_features if name not in excluded_features]

In [17]:
# Report how many numerical / categorical features remain after filtering.
print(f'numerical   features : {len(new_num_features)}')
print(f'categorical features : {len(new_cat_features)}')

numerical   features : 32
categorical features : 15


In [18]:
# Number of distinct values of each categorical feature
train_df7[new_cat_features].nunique()

weight_restricted         4
road_type                 2
start_turn_restricted     2
end_turn_restricted       2
weekday                   7
weekend                   2
holiday                   2
address                   2
stnId                     2
rnQcflg                   2
wd                       17
ssQcflg                   3
clfmAbbrCd               26
tsQcflg                   2
base_hour_3_interval      8
dtype: int64

<br>

## Make Modeling Dataset

In [19]:
# Column layout shared by train and test: segment first, then categorical, then numerical.
model_columns = [segment_features] + new_cat_features + new_num_features

X_train = train_df7[model_columns]
y_train = train_df7['target']

X_test = test_df7[model_columns]

In [20]:
# del train_df7, test_df7

<br>

One-Hot Encoding

In [21]:
X_train_oh = X_train.copy()
X_test_oh = X_test.copy()

# Encode one categorical column at a time.
for col in tqdm(new_cat_features):

    # Stack train and test so both receive a dummy column for every category
    # seen in either set; 'group' remembers which rows came from where.
    stacked = pd.concat(
        [
            X_train_oh[[col]].assign(group='train'),
            X_test_oh[[col]].assign(group='test'),
        ],
        axis=0,
    )

    # One-hot encode the stacked column
    dummies = pd.get_dummies(stacked, columns=[col])

    # Split back into train / test rows and discard the helper column
    train_dummies = dummies[dummies['group'] == 'train'].drop(columns='group')
    test_dummies = dummies[dummies['group'] == 'test'].drop(columns='group')

    # Drop the last dummy column: one level is redundant (multicollinearity)
    train_dummies = train_dummies.iloc[:, :-1]
    test_dummies = test_dummies.iloc[:, :-1]

    # Swap the original column for its dummy columns
    X_train_oh = pd.concat([X_train_oh.drop(columns=col), train_dummies], axis=1)
    X_test_oh = pd.concat([X_test_oh.drop(columns=col), test_dummies], axis=1)

100%|██████████| 15/15 [00:50<00:00,  3.35s/it]


In [22]:
# Shapes of the one-hot encoded train / test frames.
X_train_oh.shape, X_test_oh.shape

((4686836, 107), (291241, 107))

In [23]:
# Persist the encoded frames and the target so later cells can reload them.
# The target is written without its index (index=False).
# NOTE(review): reading this CSV back with pd.read_csv gives a one-column
# DataFrame with a fresh RangeIndex rather than the original Series - confirm
# that downstream alignment with X_train_oh still holds.
X_train_oh.to_parquet('../OUT/X_train_oh.parquet.gz')
X_test_oh .to_parquet('../OUT/X_test_oh.parquet.gz')
y_train   .to_csv('../OUT/y_train.csv',index=False)

In [24]:
# # Cannot be run: this all-at-once version kills the kernel

# print('(1) data concat')
# tmp = pd.concat([
#     X_train.assign(group='train'),
#     X_test .assign(group='test'),
# ],axis=0)

# print('(2) get dummies')
# tmp = pd.get_dummies(tmp, columns=new_cat_features)

# print('(3) data split')
# X_train_oh = tmp[tmp['group']=='train'].drop('group',axis=1)
# X_test_oh  = tmp[tmp['group']=='test' ].drop('group',axis=1)

# print('(4) save data')
# X_train_oh.to_parquet('../OUT/X_train_oh.parquet.gz')
# X_test_oh .to_parquet('../OUT/X_test_oh.parquet.gz')

<br></br>

---
# Transformation & Scaling
---

In [25]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler

In [26]:
def identity(x):
    """Return ``x`` unchanged."""
    return x

def transformation(x,offset=1e-4,method=['log','sqrt','identity']):
    """Apply the selected transformation to ``x``.

    Args:
        x: Value(s) to transform; anything ``np.log`` / ``np.sqrt`` accept.
        offset: Constant added to ``x`` before ``log`` / ``sqrt``. It is not
            used by ``'identity'``.
        method: One of ``'log'``, ``'sqrt'`` or ``'identity'``. The default
            list only documents the choices; it is not itself a valid method,
            so a method must be passed explicitly.

    Returns:
        ``np.log(x + offset)``, ``np.sqrt(x + offset)`` or ``x`` unchanged.

    Raises:
        ValueError: If ``method`` is not one of the three supported names.
    """
    #x = pd.Series(x)

    if method=='log':
        res = np.log(x+offset)
    elif method=='sqrt':
        res = np.sqrt(x+offset)
    elif method=='identity':
        res = identity(x)
    else:
        # Raising a bare string is itself a TypeError in Python 3 and hides
        # the real problem; raise a proper exception that names the bad value.
        raise ValueError(f'Unknown Transformation Method: {method!r}')

    return res

def inverse_transformation(x,offset,method=['log','sqrt','identity']):
    """Undo ``transformation`` for the same ``offset`` and ``method``.

    Args:
        x: Transformed value(s).
        offset: The offset that was added before the forward transformation.
            It is not used by ``'identity'``.
        method: One of ``'log'``, ``'sqrt'`` or ``'identity'``. The default
            list only documents the choices; it is not itself a valid method,
            so a method must be passed explicitly.

    Returns:
        ``np.exp(x) - offset``, ``x**2 - offset`` or ``x`` unchanged.

    Raises:
        ValueError: If ``method`` is not one of the three supported names.
    """
    #x = pd.Series(x)

    if method=='log':
        res = np.exp(x)-offset
    elif method=='sqrt':
        res = (x**2)-offset
    elif method=='identity':
        res = identity(x)
    else:
        # Raising a bare string is itself a TypeError in Python 3 and hides
        # the real problem; raise a proper exception that names the bad value.
        raise ValueError(f'Unknown Transformation Method: {method!r}')

    return res

<br>

Setting Methods

In [27]:
# Target transformation: change the trailing index to choose among the listed
# names (0 selects 'identity').
transform_method = ['identity','log','sqrt'][0]
# Offset handed to transformation() together with transform_method.
offset = 1e-4
# Scaler class (not an instance); it is instantiated where the scaling is done.
scaling_method = MinMaxScaler

print(f'(1) Transformation method : {transform_method}(offset={offset})')
print(f'(2) Scaling method : {scaling_method.__name__}')

(1) Transformation method : identity(offset=0.0001)
(2) Scaling method : MinMaxScaler


<br></br>

---
# Save Segment Data
---

In [None]:
# Reload the encoded data from disk. This cell has no execution count in the
# saved notebook, so the recorded outputs below were produced without running it.
# NOTE(review): after this cell y_train is a DataFrame (pd.read_csv) instead of
# the Series taken from train_df7, and its index is a fresh RangeIndex - confirm
# the boolean masks built from X_train_oh still align with it.
X_train_oh = pd.read_parquet('../OUT/X_train_oh.parquet.gz')
X_test_oh  = pd.read_parquet('../OUT/X_test_oh.parquet.gz')
y_train    = pd.read_csv('../OUT/y_train.csv')

In [28]:
segment_features = 'segment'

# Train segments ordered from most to least frequent
tr_segment = (
    X_train_oh[segment_features]
    .value_counts()
    .sort_values(ascending=False)
    .index
)

# Segments that occur in the test set
te_segment = X_test_oh[segment_features].value_counts().index

# Train segments, in train order, that also occur in the test set
segment = list(filter(lambda seg: seg in te_segment, tr_segment))

In [37]:
# For every segment present in both train and test: select its rows, transform
# the target, scale the features with a scaler fitted on the train rows only,
# and write the result to ../OUT/segment/<seg>/.
for seg in tqdm(segment):

    # Row masks for this segment, computed once and reused
    tr_mask = X_train_oh[segment_features]==seg
    te_mask = X_test_oh [segment_features]==seg

    X_tr = X_train_oh[tr_mask].drop(segment_features,axis=1).astype(float)
    X_te = X_test_oh [te_mask].drop(segment_features,axis=1).astype(float)

    y_tr = y_train[tr_mask].astype(float)

    # transformation
    y_tr = transformation(y_tr,offset=offset,method=transform_method)

    # scaling
    # The scalers imported in this notebook (MinMaxScaler, StandardScaler,
    # RobustScaler) work feature by feature, so one fit on the whole frame
    # replaces the previous per-column fit/reshape loop. For MinMaxScaler the
    # values are the same as before.
    # NOTE(review): do not switch scaling_method to a row-wise transformer
    # (e.g. Normalizer); that would change the meaning of this step.
    scaler = scaling_method()
    X_tr = pd.DataFrame(
        scaler.fit_transform(X_tr.to_numpy()),
        index=X_tr.index,
        columns=X_tr.columns,
    )
    X_te = pd.DataFrame(
        scaler.transform(X_te.to_numpy()),
        index=X_te.index,
        columns=X_te.columns,
    )

    # Append the (transformed) target as the last column(s) of the train frame
    X_tr = pd.concat([X_tr,y_tr],axis=1)

    createFolder(f'../OUT/segment/{seg}')

    X_tr.to_csv(f'../OUT/segment/{seg}/train_df.csv',index=False)
    X_te.to_csv(f'../OUT/segment/{seg}/test_df.csv',index=False)

100%|██████████| 25/25 [06:31<00:00, 15.66s/it]


<br></br>

In [38]:
# Measure the total wall-clock time since start_time was recorded.
end_time = datetime.datetime.now()
run_time = end_time - start_time

# Print the summary framed by two separator lines.
separator = '='*45
print(separator)
print(f'> start time : {start_time}')
print(f'>   end time : {end_time}')
print(f'>   run time : {run_time:}')
print(separator)

> start time : 2022-11-12 08:09:31.269485
>   end time : 2022-11-12 08:21:50.915741
>   run time : 0:12:19.646256
