In [1]:
import os
import pandas as pd
from tqdm import tqdm

tqdm.pandas()

dayofweek_dict = {"월": 1, "화": 2, "수": 3, "목": 4, "금": 5, "토": 6, "일":7}

def prepare_dataset(fname):
    def generate_medium_point(start, end):
        return (end - start) + start

    df = pd.read_csv(os.path.join("data", fname))
    df.base_date = pd.to_datetime(df.base_date.map(lambda x: str(int(x))))

#     df.loc[:, "year"] = df.base_date.dt.year
    df.loc[:, "month"] = df.base_date.dt.month
#     df.loc[:, "day"] = df.base_date.dt.day

    df.day_of_week = df.day_of_week.map(lambda x: dayofweek_dict[x])
    df.loc[:, "is_weekend"] = df.day_of_week.map(lambda x: 1 if x >= 6 else 0)

    df.start_turn_restricted = df.start_turn_restricted.fillna(0)
    df.start_turn_restricted = df.start_turn_restricted.map(lambda x: 1 if x == "있음" else 0)
    
    df.end_turn_restricted = df.end_turn_restricted.fillna(0)
    df.end_turn_restricted = df.end_turn_restricted.map(lambda x: 1 if x == "있음" else 0)

    df.loc[:, "lat_mid"] = df.apply(lambda x: generate_medium_point(x['start_latitude'], x['end_latitude']), axis=1)
    df.loc[:, "long_mid"] = df.apply(lambda x: generate_medium_point(x['start_longitude'], x['end_longitude']), axis=1)

    df = df.drop(["id", "base_date", "road_name", "start_node_name", "end_node_name",
                  'start_latitude', 'end_latitude', 'start_longitude', 'end_longitude', 
                  'height_restricted', 'weight_restricted'], axis=1)
    
    return df

train_df = prepare_dataset("train.csv")
test_df = prepare_dataset("test.csv")
train_df.head()

Unnamed: 0,day_of_week,base_hour,lane_count,road_rating,multi_linked,connect_code,maximum_speed_limit,vehicle_restricted,road_type,start_turn_restricted,end_turn_restricted,target,month,is_weekend,lat_mid,long_mid
0,4,17,1,106,0,0,60.0,0.0,3,0,0,52.0,6,0,33.427749,126.662335
1,4,21,2,103,0,0,60.0,0.0,0,1,0,30.0,7,0,33.504811,126.52624
2,7,7,2,103,0,0,80.0,0.0,0,0,0,61.0,10,1,33.280072,126.362147
3,5,13,2,107,0,0,50.0,0.0,0,0,0,20.0,3,0,33.245565,126.566228
4,2,8,2,103,0,0,80.0,0.0,0,0,0,38.0,10,0,33.462677,126.330152


In [2]:
sample_df = train_df.loc[train_df.month.isin([7, 9])].reset_index(drop=True)
sample_df.head()

Unnamed: 0,day_of_week,base_hour,lane_count,road_rating,multi_linked,connect_code,maximum_speed_limit,vehicle_restricted,road_type,start_turn_restricted,end_turn_restricted,target,month,is_weekend,lat_mid,long_mid
0,4,21,2,103,0,0,60.0,0.0,0,1,0,30.0,7,0,33.504811,126.52624
1,1,7,2,107,0,0,60.0,0.0,0,0,0,28.0,9,0,33.252183,126.506069
2,7,2,1,107,0,0,50.0,0.0,0,0,0,40.0,7,1,33.248633,126.567766
3,5,22,2,107,0,0,50.0,0.0,0,0,0,35.0,7,0,33.485975,126.486409
4,5,21,1,103,0,0,70.0,0.0,0,1,0,21.0,7,0,33.500132,126.512046


In [8]:
from pycaret.regression import *

setup_clf = setup(data=train_df, target='target', 
                  fold=5,
                  feature_selection=True)

Unnamed: 0,Description,Value
0,session_id,6628
1,Target,target
2,Original Data,"(4701217, 16)"
3,Missing Values,False
4,Numeric Features,5
5,Categorical Features,10
6,Ordinal Features,False
7,High Cardinality Features,False
8,High Cardinality Method,
9,Transformed Train Set,"(3290851, 32)"


In [9]:
initial_model = create_model("catboost")

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,4.1253,31.7068,5.6309,0.8755,0.1726,0.1279
1,4.1656,32.3867,5.6909,0.8729,0.1741,0.1292
2,4.1256,31.9605,5.6534,0.8744,0.1728,0.1277
3,4.1803,32.6958,5.718,0.8716,0.1747,0.1295
4,4.1439,32.0058,5.6574,0.8741,0.1731,0.1285
Mean,4.1481,32.1511,5.6701,0.8737,0.1735,0.1286
Std,0.0218,0.3484,0.0307,0.0014,0.0008,0.0007


In [10]:
tuned_model = tune_model(initial_model, n_iter=10, optimize="MAE")

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,3.8497,28.5302,5.3414,0.888,0.1634,0.1187
1,3.8931,29.1845,5.4023,0.8854,0.1643,0.1196
2,3.8641,28.9106,5.3769,0.8864,0.1636,0.1188
3,3.8938,29.3309,5.4158,0.8848,0.165,0.1199
4,3.8657,28.7815,5.3648,0.8868,0.1634,0.1189
Mean,3.8733,28.9476,5.3802,0.8863,0.1639,0.1192
Std,0.0174,0.285,0.0265,0.0011,0.0006,0.0005


In [12]:
bagging_model = ensemble_model(tuned_model, n_estimators=10)

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,3.8163,28.128,5.3036,0.8896,0.1628,0.118
1,3.8184,28.2351,5.3137,0.8892,0.1629,0.118
2,3.8212,28.2417,5.3143,0.889,0.1623,0.1177
3,3.8149,28.215,5.3118,0.8892,0.163,0.118
4,3.8132,28.0484,5.2961,0.8897,0.1619,0.1176
Mean,3.8168,28.1736,5.3079,0.8893,0.1626,0.1179
Std,0.0028,0.0747,0.007,0.0003,0.0004,0.0002


In [13]:
final_model = finalize_model(bagging_model)

In [14]:
pred = predict_model(final_model, data=test_df)
pred

Unnamed: 0,day_of_week,base_hour,lane_count,road_rating,multi_linked,connect_code,maximum_speed_limit,vehicle_restricted,road_type,start_turn_restricted,end_turn_restricted,month,is_weekend,lat_mid,long_mid,Label
0,4,17,3,107,0,0,70.0,0.0,0,0,1,8,0,33.500772,126.543837,23.468863
1,2,12,2,103,0,0,70.0,0.0,3,0,0,8,0,33.258119,126.415840,45.321267
2,5,2,1,103,0,0,60.0,0.0,0,0,0,8,0,33.259206,126.474687,65.183384
3,4,23,3,103,0,0,70.0,0.0,0,0,0,8,0,33.471061,126.545467,36.211974
4,3,17,3,106,0,0,70.0,0.0,0,0,0,8,0,33.496863,126.581230,46.141443
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
291236,6,5,1,103,0,0,80.0,0.0,0,0,0,8,1,33.470483,126.460846,49.455448
291237,5,20,2,103,0,0,60.0,0.0,0,0,0,8,0,33.300796,126.600332,54.082407
291238,5,11,1,107,0,0,30.0,0.0,0,1,1,8,0,33.497500,126.496946,22.588583
291239,5,7,2,107,0,0,60.0,0.0,0,0,0,8,0,33.255659,126.507333,26.308431


In [15]:
submission_df = pd.read_csv("data/sample_submission.csv")
submission_df['target'] = pred.Label
submission_df.to_csv("results/catboost_subsampled.csv", index=False)