In [11]:
import numpy as np
import pandas as pd
from datetime import datetime, timedelta


In [12]:
MODEL_NAME = "cat"

# Load and preprocess

In [13]:
df_train = pd.read_csv("./data/train.csv",dtype={'일시':int})

In [14]:
df_train.head()

Unnamed: 0,일시,광진구,동대문구,성동구,중랑구
0,20180101,0.592,0.368,0.58,0.162
1,20180102,0.84,0.614,1.034,0.26
2,20180103,0.828,0.576,0.952,0.288
3,20180104,0.792,0.542,0.914,0.292
4,20180105,0.818,0.602,0.994,0.308


In [15]:
def dfByDateRange(dt_start:datetime,dt_end:datetime):
    dts=[]
    dt_cur = dt_start
    while dt_cur <= dt_end:
        dts.append(dt_cur.year*10000 + dt_cur.month*100 + dt_cur.day)
        dt_cur += timedelta(days=1)

    return pd.DataFrame({'일시': dts})

In [16]:
def preprocess(df,holiday_path,makeY=False):
    df_holiday =  pd.read_csv(holiday_path,dtype={'locdate':int})

    holidays = df_holiday['locdate'].apply(lambda x : datetime(year=int(x//1e4), month = int((x%1e4)//100), day=int(x%100)))
    
    df_features = pd.DataFrame()
    df_features['date'] = df['일시'].apply(lambda x : datetime(year=int(x//1e4), month = int((x%1e4)//100), day=int(x%100)))
    df_features['weekday'] = df_features['date'].apply(lambda date : date.weekday()) / 7.0
    df_features['weekend'] =  df_features['date'].apply(lambda date : date.weekday() == 5 or date.weekday() == 6)
    df_features['month'] = df_features['date'].apply(lambda date : date.month) / 12.0
    #df_features['day_of_month']= df_features['date'].apply(lambda date : date.day) 
    df_features['day_of_year'] = df_features['date'].apply(lambda date : date.timetuple().tm_yday) / 365.0 
    df_features['red'] = df_features['date'].isin(holidays) | df_features['weekday'].isin([5,6]) #saturday sunday
    df_features['year'] = df_features['date'].apply(lambda date : date.year) / 2018.0
    df_features['week_of_year']=df_features['date'].apply(lambda date : date.isocalendar()[1]) / 53.0

    df_features['vacation'] = False
    df_features['rainy_season'] = df_features['date'].apply(lambda date : 726 >= date.month*100 + date.day >= 625)

    for i in range(1,df_features.shape[0]-1):
        if df_features["red"][i] & df_features["red"][i-1] & df_features["red"][i+1]:
            df_features.at[i-1,"vacation"] = True
            df_features.at[i,"vacation"] = True
            df_features.at[i+1,"vacation"] = True


    cols_location = ['광진구','동대문구','성동구','중랑구']
    list_df_features_by_loc = []
    for idx, loc in enumerate(cols_location):
        df_new = df_features.copy()
        df_new['loc'] = idx
        if makeY:
            df_new['y']=df[loc]
        list_df_features_by_loc.append(df_new)

    df_ret=pd.concat(list_df_features_by_loc)
    df_ret = df_ret.reset_index(drop=True)

    df_ret = df_ret.drop('date',axis=1)

    return df_ret



In [17]:
df_processed = preprocess(df_train,'./data/holiday_train.csv',makeY=True)

df_trainX = df_processed.drop(['y'],axis=1)
df_trainY = df_processed['y']

In [18]:
df_trainX.tail()

Unnamed: 0,weekday,weekend,month,day_of_year,red,year,week_of_year,vacation,rainy_season,loc
5839,0.0,False,1.0,0.989041,False,1.001487,0.981132,False,False,3
5840,0.142857,False,1.0,0.991781,False,1.001487,0.981132,False,False,3
5841,0.285714,False,1.0,0.994521,False,1.001487,0.981132,False,False,3
5842,0.428571,False,1.0,0.99726,False,1.001487,0.981132,False,False,3
5843,0.571429,False,1.0,1.0,False,1.001487,0.981132,False,False,3


# Train

In [19]:
from catboost import CatBoostRegressor

DEV="cpu"

params = {
    'learning_rate': 0.01, 
    #'max_depth':8,
    'objective': 'MAE',
    'early_stopping_rounds' : 10000,
    'eval_metric':'MAE',
    'iterations':700,
    'random_seed':42,
    'metric_period':1000,
    'random_seed':42,
    #'boosting':'dart',
}



cat = CatBoostRegressor(**params)
cat.fit(X=df_trainX, y = df_trainY)

0:	learn: 2.5177188	total: 2.3ms	remaining: 1.61s
699:	learn: 0.9205025	total: 1.26s	remaining: 0us


<catboost.core.CatBoostRegressor at 0x2037b9669a0>

# Prediction and make Submission file

In [20]:
df_test = dfByDateRange(datetime(year=2022,month=1,day=1),datetime(year=2022,month=11,day=30))
df_testX = preprocess(df_test,'./data/holiday_test.csv')

prediction = cat.predict(df_testX)
prediction[prediction<0.01] = 0.01

n_dates = len(prediction)//4
df_submission = pd.DataFrame({
    '일시':df_test['일시'].values,
    '광진구':prediction[:n_dates],
    '동대문구':prediction[n_dates:2*n_dates],
    '성동구':prediction[2*n_dates:3*n_dates],
    '중랑구':prediction[3*n_dates:]
})

filename = f"sub_{MODEL_NAME}_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv"
print(filename)
df_submission.to_csv(f"./{filename}", index=False)

sub_cat_20221214_222648.csv


In [21]:
df_submission.tail()

Unnamed: 0,일시,광진구,동대문구,성동구,중랑구
329,20221126,6.861944,5.570131,5.603426,4.105433
330,20221127,6.310531,4.994095,5.08448,3.671906
331,20221128,7.190272,6.119164,6.022596,4.331204
332,20221129,7.212938,6.130321,6.037548,4.354235
333,20221130,7.260728,6.167976,6.077848,4.387612
