In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import * 

from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

# Load Data

In [3]:
train_df = pd.read_csv('../data/train.csv')
train_df.head()

Unnamed: 0,id,base_date,day_of_week,base_hour,road_in_use,lane_count,road_rating,road_name,multi_linked,connect_code,...,road_type,start_node_name,start_latitude,start_longitude,start_turn_restricted,end_node_name,end_latitude,end_longitude,end_turn_restricted,target
0,TRAIN_0000000,20220623,목,17,0,1,106,지방도1112호선,0,0,...,3,제3교래교,33.427747,126.662612,없음,제3교래교,33.427749,126.662335,없음,52.0
1,TRAIN_0000001,20220728,목,21,0,2,103,일반국도11호선,0,0,...,0,광양사거리,33.50073,126.529107,있음,KAL사거리,33.504811,126.52624,없음,30.0
2,TRAIN_0000002,20211010,일,7,0,2,103,일반국도16호선,0,0,...,0,창고천교,33.279145,126.368598,없음,상창육교,33.280072,126.362147,없음,61.0
3,TRAIN_0000003,20220311,금,13,0,2,107,태평로,0,0,...,0,남양리조트,33.246081,126.567204,없음,서현주택,33.245565,126.566228,없음,20.0
4,TRAIN_0000004,20211005,화,8,0,2,103,일반국도12호선,0,0,...,0,애월샷시,33.462214,126.326551,없음,애월입구,33.462677,126.330152,없음,38.0


In [4]:
train_df.columns

Index(['id', 'base_date', 'day_of_week', 'base_hour', 'road_in_use',
       'lane_count', 'road_rating', 'road_name', 'multi_linked',
       'connect_code', 'maximum_speed_limit', 'vehicle_restricted',
       'weight_restricted', 'height_restricted', 'road_type',
       'start_node_name', 'start_latitude', 'start_longitude',
       'start_turn_restricted', 'end_node_name', 'end_latitude',
       'end_longitude', 'end_turn_restricted', 'target'],
      dtype='object')

In [5]:
test_df = pd.read_csv('../data/test.csv')
test_df.head()

Unnamed: 0,id,base_date,day_of_week,base_hour,road_in_use,lane_count,road_rating,road_name,multi_linked,connect_code,...,height_restricted,road_type,start_node_name,start_latitude,start_longitude,start_turn_restricted,end_node_name,end_latitude,end_longitude,end_turn_restricted
0,TEST_000000,20220825,목,17,0,3,107,연삼로,0,0,...,0.0,0,산지2교,33.499427,126.541298,없음,제주은행사거리,33.500772,126.543837,있음
1,TEST_000001,20220809,화,12,0,2,103,일반국도12호선,0,0,...,0.0,3,중문입구,33.258507,126.427003,없음,관광단지입구,33.258119,126.41584,없음
2,TEST_000002,20220805,금,2,0,1,103,일반국도16호선,0,0,...,0.0,0,도순3교,33.25896,126.476508,없음,도순2교,33.259206,126.474687,없음
3,TEST_000003,20220818,목,23,0,3,103,일반국도11호선,0,0,...,0.0,0,아라주공아파트,33.473494,126.545647,없음,인다마을,33.471061,126.545467,없음
4,TEST_000004,20220810,수,17,0,3,106,번영로,0,0,...,0.0,0,부록교 시종점,33.501477,126.569223,없음,봉개교 시종점,33.496863,126.58123,없음


# Sort Data

In [6]:
# datetime 컬럼 생성
train_df['base_date'] = train_df['base_date'].astype(str)
train_df['base_hour'] = train_df['base_hour'].astype(str)
train_df['datetime'] = pd.to_datetime(train_df['base_date'] + ' ' + train_df['base_hour'].str.zfill(2))

test_df['base_date'] = test_df['base_date'].astype(str)
test_df['base_hour'] = test_df['base_hour'].astype(str)
test_df['datetime'] = pd.to_datetime(test_df['base_date'] + ' ' + train_df['base_hour'].str.zfill(2))

In [7]:
sorted_train_df = train_df.sort_values(['datetime', 'road_name', 'start_node_name', 'end_node_name'])
sorted_test_df = test_df.sort_values(['datetime', 'road_name', 'start_node_name', 'end_node_name'])

In [21]:
len(sorted_train_df['datetime'].unique())

6085

In [20]:
sorted_train_df.groupby('datetime')['target'].mean()

datetime
2021-09-01 00:00:00    50.721925
2021-09-01 01:00:00    49.880478
2021-09-01 02:00:00    51.052709
2021-09-01 03:00:00    50.870588
2021-09-01 04:00:00    50.399445
                         ...    
2022-07-31 18:00:00    32.360294
2022-07-31 19:00:00    32.269608
2022-07-31 20:00:00    32.552826
2022-07-31 21:00:00    32.992574
2022-07-31 22:00:00    36.285360
Name: target, Length: 6085, dtype: float64

# 평균 이동량 컬럼 추가

In [22]:
new_train_df = sorted_train_df.copy()
new_train_df['mean_move'] = np.nan

In [8]:
total_df = pd.concat([sorted_train_df, sorted_test_df])

In [9]:
total_df.describe()

Unnamed: 0,road_in_use,lane_count,road_rating,multi_linked,connect_code,maximum_speed_limit,vehicle_restricted,weight_restricted,height_restricted,road_type,start_latitude,start_longitude,end_latitude,end_longitude,target
count,4992458.0,4992458.0,4992458.0,4992458.0,4992458.0,4992458.0,4992458.0,4992458.0,4992458.0,4992458.0,4992458.0,4992458.0,4992458.0,4992458.0,4701217.0
mean,0.001426552,1.847427,104.9753,0.0004484765,0.2505031,61.10851,0.0,5504.004,0.0,0.6163683,33.38299,126.5212,33.383,126.5212,42.78844
std,0.03774277,0.6929471,1.842662,0.02117252,5.073369,12.10544,0.0,13821.66,0.0,1.212104,0.1022663,0.1524749,0.10226,0.1524585,15.95443
min,0.0,1.0,103.0,0.0,0.0,30.0,0.0,0.0,0.0,0.0,33.24343,126.1826,33.24343,126.1826,1.0
25%,0.0,1.0,103.0,0.0,0.0,50.0,0.0,0.0,0.0,0.0,33.26269,126.4244,33.26269,126.4244,30.0
50%,0.0,2.0,106.0,0.0,0.0,60.0,0.0,0.0,0.0,0.0,33.41209,126.5112,33.41209,126.5112,43.0
75%,0.0,2.0,107.0,0.0,0.0,70.0,0.0,0.0,0.0,0.0,33.4786,126.578,33.4786,126.578,54.0
max,1.0,3.0,107.0,1.0,103.0,80.0,0.0,50000.0,0.0,3.0,33.55608,126.9309,33.55608,126.9309,113.0


In [10]:
total_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4992458 entries, 3517001 to 253016
Data columns (total 25 columns):
 #   Column                 Dtype         
---  ------                 -----         
 0   id                     object        
 1   base_date              object        
 2   day_of_week            object        
 3   base_hour              object        
 4   road_in_use            int64         
 5   lane_count             int64         
 6   road_rating            int64         
 7   road_name              object        
 8   multi_linked           int64         
 9   connect_code           int64         
 10  maximum_speed_limit    float64       
 11  vehicle_restricted     float64       
 12  weight_restricted      float64       
 13  height_restricted      float64       
 14  road_type              int64         
 15  start_node_name        object        
 16  start_latitude         float64       
 17  start_longitude        float64       
 18  start_turn_restri

In [11]:
drop_col = ['id', 'vehicle_restricted', 'height_restricted', 'start_latitude', 'start_longitude', #'base_date'
            'end_latitude', 'end_longitude', 'road_in_use']
x = sorted_train_df.drop(drop_col, axis = 1)
x = x.drop('target', axis = 1)
y = sorted_train_df.loc[:, 'target']

test_data = sorted_test_df.drop(drop_col, axis = 1)

In [12]:
dumm_cols = ['day_of_week', 'base_hour', 'road_rating', 'road_name', 
             'multi_linked', 'connect_code', 'road_type', 
             'start_turn_restricted', 'end_turn_restricted', 'start_node_name', 'end_node_name']

In [13]:
for col in dumm_cols:
    le = LabelEncoder()
    x[col]= le.fit_transform(x[col])
    test_data[col]= le.transform(test_data[col])

In [14]:
x.shape

(4701217, 16)

In [15]:
x

Unnamed: 0,base_date,day_of_week,base_hour,lane_count,road_rating,road_name,multi_linked,connect_code,maximum_speed_limit,weight_restricted,road_type,start_node_name,start_turn_restricted,end_node_name,end_turn_restricted,datetime
3517001,20210901,2,0,1,2,0,0,0,60.0,0.0,0,3,0,325,0,2021-09-01 00:00:00
2734799,20210901,2,0,2,2,0,0,0,60.0,0.0,0,4,0,49,0,2021-09-01 00:00:00
4126826,20210901,2,0,1,2,0,0,0,50.0,0.0,0,4,0,241,0,2021-09-01 00:00:00
3647101,20210901,2,0,1,2,0,0,0,50.0,0.0,0,8,0,216,0,2021-09-01 00:00:00
3547151,20210901,2,0,1,2,0,0,0,50.0,0.0,0,8,0,243,0,2021-09-01 00:00:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3806835,20220731,4,15,2,2,57,0,0,50.0,0.0,0,435,0,238,0,2022-07-31 22:00:00
2609777,20220731,4,15,2,2,57,0,0,50.0,0.0,0,442,0,153,0,2022-07-31 22:00:00
1618853,20220731,4,15,2,2,57,0,0,50.0,0.0,0,442,0,203,0,2022-07-31 22:00:00
2751062,20220731,4,15,1,2,58,0,0,60.0,0.0,0,296,1,387,0,2022-07-31 22:00:00


In [16]:
test_data.shape

(291241, 16)

In [17]:
x_train, x_val, y_train, y_val = train_test_split(x, y, test_size=.2, random_state = 2022, shuffle=False)

In [18]:
# x_train.to_csv('../data/train1_origin/x_train.csv', index=False)
# x_val.to_csv('../data/train1_origin/x_val.csv', index=False)
# y_train.to_csv('../data/train1_origin/y_train.csv', index=False)
# y_val.to_csv('../data/train1_origin/y_val.csv', index=False)
# test_data.to_csv('../data/train1_origin/x_test.csv', index=False)

# Catboost

In [19]:
from catboost import CatBoostRegressor

cat = CatBoostRegressor(loss_function='MAE', verbose=1)
cat.fit(x, y)

0:	learn: 13.0206749	total: 336ms	remaining: 5m 36s
1:	learn: 12.8253926	total: 610ms	remaining: 5m 4s
2:	learn: 12.6395943	total: 876ms	remaining: 4m 51s
3:	learn: 12.4635012	total: 1.14s	remaining: 4m 44s
4:	learn: 12.2988336	total: 1.41s	remaining: 4m 39s
5:	learn: 12.1413274	total: 1.66s	remaining: 4m 34s
6:	learn: 11.9879094	total: 1.92s	remaining: 4m 32s
7:	learn: 11.8356581	total: 2.18s	remaining: 4m 30s
8:	learn: 11.6937457	total: 2.44s	remaining: 4m 28s
9:	learn: 11.5574938	total: 2.69s	remaining: 4m 26s
10:	learn: 11.4252464	total: 2.95s	remaining: 4m 25s
11:	learn: 11.3031097	total: 3.21s	remaining: 4m 24s
12:	learn: 11.1834486	total: 3.47s	remaining: 4m 23s
13:	learn: 11.0655580	total: 3.74s	remaining: 4m 23s
14:	learn: 10.9539050	total: 4s	remaining: 4m 22s
15:	learn: 10.8469841	total: 4.26s	remaining: 4m 22s
16:	learn: 10.7448601	total: 4.52s	remaining: 4m 21s
17:	learn: 10.6472299	total: 4.78s	remaining: 4m 20s
18:	learn: 10.5528201	total: 5.04s	remaining: 4m 20s
19:	lea

<catboost.core.CatBoostRegressor at 0x7fc6549a12e0>

In [20]:
from catboost import CatBoostRegressor

model = CatBoostRegressor(verbose=1)
model.fit(x, y)

Learning rate set to 0.155736
0:	learn: 14.8124221	total: 127ms	remaining: 2m 6s
1:	learn: 13.9334885	total: 244ms	remaining: 2m 1s
2:	learn: 13.2574233	total: 365ms	remaining: 2m 1s
3:	learn: 12.7213896	total: 478ms	remaining: 1m 59s
4:	learn: 12.2417245	total: 611ms	remaining: 2m 1s
5:	learn: 11.8927467	total: 730ms	remaining: 2m 1s
6:	learn: 11.6156169	total: 846ms	remaining: 1m 59s
7:	learn: 11.3899662	total: 961ms	remaining: 1m 59s
8:	learn: 11.1730557	total: 1.09s	remaining: 1m 59s
9:	learn: 10.9278323	total: 1.22s	remaining: 2m
10:	learn: 10.7695939	total: 1.34s	remaining: 2m
11:	learn: 10.6482550	total: 1.46s	remaining: 1m 59s
12:	learn: 10.5164187	total: 1.57s	remaining: 1m 59s
13:	learn: 10.4308524	total: 1.67s	remaining: 1m 57s
14:	learn: 10.2815927	total: 1.8s	remaining: 1m 58s
15:	learn: 10.1669688	total: 1.92s	remaining: 1m 57s
16:	learn: 10.1030445	total: 2.03s	remaining: 1m 57s
17:	learn: 9.9875512	total: 2.15s	remaining: 1m 57s
18:	learn: 9.9134938	total: 2.27s	remaini

<catboost.core.CatBoostRegressor at 0x7fc74c70f1f0>

In [47]:
y_pred = model.predict(test_data)
len(y_pred)

291241

In [48]:
temp = sorted_test_df[['id']].copy()
temp['target'] = y_pred
result = temp.sort_values('id')
result

Unnamed: 0,id,target
0,TEST_000000,25.317371
1,TEST_000001,41.693166
2,TEST_000002,62.575574
3,TEST_000003,35.261777
4,TEST_000004,38.602273
...,...,...
291236,TEST_291236,47.674337
291237,TEST_291237,50.884468
291238,TEST_291238,22.388354
291239,TEST_291239,23.417625


In [49]:
result.to_csv('../data/submission_v10.csv', index=False)

In [95]:
model_cat = CatBoostRegressor()
model_cat.fit(x_train, y_train)

Learning rate set to 0.150341
0:	learn: 14.8144593	total: 104ms	remaining: 1m 44s
1:	learn: 13.9175486	total: 201ms	remaining: 1m 40s
2:	learn: 13.2211945	total: 296ms	remaining: 1m 38s
3:	learn: 12.6856126	total: 392ms	remaining: 1m 37s
4:	learn: 12.2529243	total: 488ms	remaining: 1m 37s
5:	learn: 11.8605600	total: 583ms	remaining: 1m 36s
6:	learn: 11.5675107	total: 678ms	remaining: 1m 36s
7:	learn: 11.3337530	total: 770ms	remaining: 1m 35s
8:	learn: 11.1417422	total: 867ms	remaining: 1m 35s
9:	learn: 10.9477010	total: 962ms	remaining: 1m 35s
10:	learn: 10.7983845	total: 1.05s	remaining: 1m 34s
11:	learn: 10.6195481	total: 1.16s	remaining: 1m 35s
12:	learn: 10.4834155	total: 1.25s	remaining: 1m 35s
13:	learn: 10.3686107	total: 1.35s	remaining: 1m 34s
14:	learn: 10.2670049	total: 1.45s	remaining: 1m 34s
15:	learn: 10.1939180	total: 1.53s	remaining: 1m 34s
16:	learn: 10.0745507	total: 1.64s	remaining: 1m 34s
17:	learn: 9.9988338	total: 1.74s	remaining: 1m 35s
18:	learn: 9.9095501	total:

<catboost.core.CatBoostRegressor at 0x7f8b03c88d60>

In [96]:
pred = model_cat.predict(x_val)
pred
print(mean_absolute_error(y_val, pred))

3.984917124333543


In [116]:
joblib.dump(model_cat, 'models/model_cat.pkl')

['models/model_cat.pkl']

# RF

In [101]:
model_rf = RandomForestRegressor()

model_rf.fit(x_train, y_train)

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=100, n_jobs=None, oob_score=False,
                      random_state=None, verbose=0, warm_start=False)

In [102]:
pred = model_rf.predict(x_val)
pred
print(mean_absolute_error(y_val, pred))

3.589763242950743


In [114]:
joblib.dump(model_rf, 'models/model_rf.pkl')

['models/model_rf.pkl']

In [121]:
y_pred = model_rf.predict(test_data)
result = pd.DataFrame({'id':test_df['id'], 'target':y_pred})
result.head()

Unnamed: 0,id,target
0,TEST_000000,25.66
1,TEST_000001,41.02
2,TEST_000002,67.08
3,TEST_000003,38.71
4,TEST_000004,43.136667


In [124]:
result.to_csv('../data/submission_v7.csv', index=False)