In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import * 

from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

from pycaret.regression import *
import joblib

# Load Data

In [2]:
train_df = pd.read_csv('../data/train.csv')
train_df.head()

Unnamed: 0,id,base_date,day_of_week,base_hour,road_in_use,lane_count,road_rating,road_name,multi_linked,connect_code,maximum_speed_limit,vehicle_restricted,weight_restricted,height_restricted,road_type,start_node_name,start_latitude,start_longitude,start_turn_restricted,end_node_name,end_latitude,end_longitude,end_turn_restricted,target
0,TRAIN_0000000,20220623,목,17,0,1,106,지방도1112호선,0,0,60.0,0.0,32400.0,0.0,3,제3교래교,33.427747,126.662612,없음,제3교래교,33.427749,126.662335,없음,52.0
1,TRAIN_0000001,20220728,목,21,0,2,103,일반국도11호선,0,0,60.0,0.0,0.0,0.0,0,광양사거리,33.50073,126.529107,있음,KAL사거리,33.504811,126.52624,없음,30.0
2,TRAIN_0000002,20211010,일,7,0,2,103,일반국도16호선,0,0,80.0,0.0,0.0,0.0,0,창고천교,33.279145,126.368598,없음,상창육교,33.280072,126.362147,없음,61.0
3,TRAIN_0000003,20220311,금,13,0,2,107,태평로,0,0,50.0,0.0,0.0,0.0,0,남양리조트,33.246081,126.567204,없음,서현주택,33.245565,126.566228,없음,20.0
4,TRAIN_0000004,20211005,화,8,0,2,103,일반국도12호선,0,0,80.0,0.0,0.0,0.0,0,애월샷시,33.462214,126.326551,없음,애월입구,33.462677,126.330152,없음,38.0


In [3]:
train_df.columns

Index(['id', 'base_date', 'day_of_week', 'base_hour', 'road_in_use',
       'lane_count', 'road_rating', 'road_name', 'multi_linked',
       'connect_code', 'maximum_speed_limit', 'vehicle_restricted',
       'weight_restricted', 'height_restricted', 'road_type',
       'start_node_name', 'start_latitude', 'start_longitude',
       'start_turn_restricted', 'end_node_name', 'end_latitude',
       'end_longitude', 'end_turn_restricted', 'target'],
      dtype='object')

In [4]:
test_df = pd.read_csv('../data/test.csv')
test_df.head()

Unnamed: 0,id,base_date,day_of_week,base_hour,road_in_use,lane_count,road_rating,road_name,multi_linked,connect_code,maximum_speed_limit,vehicle_restricted,weight_restricted,height_restricted,road_type,start_node_name,start_latitude,start_longitude,start_turn_restricted,end_node_name,end_latitude,end_longitude,end_turn_restricted
0,TEST_000000,20220825,목,17,0,3,107,연삼로,0,0,70.0,0.0,0.0,0.0,0,산지2교,33.499427,126.541298,없음,제주은행사거리,33.500772,126.543837,있음
1,TEST_000001,20220809,화,12,0,2,103,일반국도12호선,0,0,70.0,0.0,0.0,0.0,3,중문입구,33.258507,126.427003,없음,관광단지입구,33.258119,126.41584,없음
2,TEST_000002,20220805,금,2,0,1,103,일반국도16호선,0,0,60.0,0.0,0.0,0.0,0,도순3교,33.25896,126.476508,없음,도순2교,33.259206,126.474687,없음
3,TEST_000003,20220818,목,23,0,3,103,일반국도11호선,0,0,70.0,0.0,0.0,0.0,0,아라주공아파트,33.473494,126.545647,없음,인다마을,33.471061,126.545467,없음
4,TEST_000004,20220810,수,17,0,3,106,번영로,0,0,70.0,0.0,0.0,0.0,0,부록교 시종점,33.501477,126.569223,없음,봉개교 시종점,33.496863,126.58123,없음


# Sort Data

In [5]:
# datetime 컬럼 생성
train_df['base_date'] = train_df['base_date'].astype(str)
train_df['base_hour'] = train_df['base_hour'].astype(str)
train_df['datetime'] = pd.to_datetime(train_df['base_date'] + ' ' + train_df['base_hour'].str.zfill(2))

test_df['base_date'] = test_df['base_date'].astype(str)
test_df['base_hour'] = test_df['base_hour'].astype(str)
test_df['datetime'] = pd.to_datetime(test_df['base_date'] + ' ' + train_df['base_hour'].str.zfill(2))

In [6]:
sorted_train_df = train_df.sort_values(['datetime', 'road_name', 'start_node_name', 'end_node_name'])
sorted_test_df = test_df.sort_values(['datetime', 'road_name', 'start_node_name', 'end_node_name'])

In [7]:
len(sorted_train_df[sorted_train_df['base_date']=='20210904'])

17998

In [8]:
total_df = pd.concat([sorted_train_df, sorted_test_df])

In [9]:
total_df.describe()

Unnamed: 0,road_in_use,lane_count,road_rating,multi_linked,connect_code,maximum_speed_limit,vehicle_restricted,weight_restricted,height_restricted,road_type,start_latitude,start_longitude,end_latitude,end_longitude,target
count,4992458.0,4992458.0,4992458.0,4992458.0,4992458.0,4992458.0,4992458.0,4992458.0,4992458.0,4992458.0,4992458.0,4992458.0,4992458.0,4992458.0,4701217.0
mean,0.001426552,1.847427,104.9753,0.0004484765,0.2505031,61.10851,0.0,5504.004,0.0,0.6163683,33.38299,126.5212,33.383,126.5212,42.78844
std,0.03774277,0.6929471,1.842662,0.02117252,5.073369,12.10544,0.0,13821.66,0.0,1.212104,0.1022663,0.1524749,0.10226,0.1524585,15.95443
min,0.0,1.0,103.0,0.0,0.0,30.0,0.0,0.0,0.0,0.0,33.24343,126.1826,33.24343,126.1826,1.0
25%,0.0,1.0,103.0,0.0,0.0,50.0,0.0,0.0,0.0,0.0,33.26269,126.4244,33.26269,126.4244,30.0
50%,0.0,2.0,106.0,0.0,0.0,60.0,0.0,0.0,0.0,0.0,33.41209,126.5112,33.41209,126.5112,43.0
75%,0.0,2.0,107.0,0.0,0.0,70.0,0.0,0.0,0.0,0.0,33.4786,126.578,33.4786,126.578,54.0
max,1.0,3.0,107.0,1.0,103.0,80.0,0.0,50000.0,0.0,3.0,33.55608,126.9309,33.55608,126.9309,113.0


In [10]:
total_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4992458 entries, 3517001 to 253016
Data columns (total 25 columns):
 #   Column                 Dtype         
---  ------                 -----         
 0   id                     object        
 1   base_date              object        
 2   day_of_week            object        
 3   base_hour              object        
 4   road_in_use            int64         
 5   lane_count             int64         
 6   road_rating            int64         
 7   road_name              object        
 8   multi_linked           int64         
 9   connect_code           int64         
 10  maximum_speed_limit    float64       
 11  vehicle_restricted     float64       
 12  weight_restricted      float64       
 13  height_restricted      float64       
 14  road_type              int64         
 15  start_node_name        object        
 16  start_latitude         float64       
 17  start_longitude        float64       
 18  start_turn_restri

In [11]:
drop_col = ['id', 'vehicle_restricted', 'height_restricted', 'start_latitude', 'start_longitude', #'base_date'
            'end_latitude', 'end_longitude', 'road_in_use']
x = sorted_train_df.drop(drop_col, axis = 1)
x = x.drop('target', axis = 1)
y = sorted_train_df.loc[:, 'target']

test_data = sorted_test_df.drop(drop_col, axis = 1)

In [12]:
dumm_cols = ['day_of_week', 'base_hour', 'road_rating', 'road_name', 
             'multi_linked', 'connect_code', 'road_type', 
             'start_turn_restricted', 'end_turn_restricted', 'start_node_name', 'end_node_name']

In [13]:
for col in dumm_cols:
    le = LabelEncoder()
    x[col]= le.fit_transform(x[col])
    test_data[col]= le.transform(test_data[col])

In [14]:
x.shape

(4701217, 16)

In [15]:
x

Unnamed: 0,base_date,day_of_week,base_hour,lane_count,road_rating,road_name,multi_linked,connect_code,maximum_speed_limit,weight_restricted,road_type,start_node_name,start_turn_restricted,end_node_name,end_turn_restricted,datetime
3517001,20210901,2,0,1,2,0,0,0,60.0,0.0,0,3,0,325,0,2021-09-01 00:00:00
2734799,20210901,2,0,2,2,0,0,0,60.0,0.0,0,4,0,49,0,2021-09-01 00:00:00
4126826,20210901,2,0,1,2,0,0,0,50.0,0.0,0,4,0,241,0,2021-09-01 00:00:00
3647101,20210901,2,0,1,2,0,0,0,50.0,0.0,0,8,0,216,0,2021-09-01 00:00:00
3547151,20210901,2,0,1,2,0,0,0,50.0,0.0,0,8,0,243,0,2021-09-01 00:00:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3806835,20220731,4,15,2,2,57,0,0,50.0,0.0,0,435,0,238,0,2022-07-31 22:00:00
2609777,20220731,4,15,2,2,57,0,0,50.0,0.0,0,442,0,153,0,2022-07-31 22:00:00
1618853,20220731,4,15,2,2,57,0,0,50.0,0.0,0,442,0,203,0,2022-07-31 22:00:00
2751062,20220731,4,15,1,2,58,0,0,60.0,0.0,0,296,1,387,0,2022-07-31 22:00:00


In [16]:
test_data.shape

(291241, 16)

In [17]:
x_train, x_val, y_train, y_val = train_test_split(x, y, test_size=.2, random_state = 2022, shuffle=False)

In [18]:
# x_train.to_csv('../data/train1_origin/x_train.csv', index=False)
# x_val.to_csv('../data/train1_origin/x_val.csv', index=False)
# y_train.to_csv('../data/train1_origin/y_train.csv', index=False)
# y_val.to_csv('../data/train1_origin/y_val.csv', index=False)
# test_data.to_csv('../data/train1_origin/x_test.csv', index=False)

# Catboost

In [19]:
from catboost import CatBoostRegressor

cat = CatBoostRegressor(loss_function='MAE', verbose=1)
cat.fit(x, y)

0:	learn: 13.0206749	total: 336ms	remaining: 5m 36s
1:	learn: 12.8253926	total: 610ms	remaining: 5m 4s
2:	learn: 12.6395943	total: 876ms	remaining: 4m 51s
3:	learn: 12.4635012	total: 1.14s	remaining: 4m 44s
4:	learn: 12.2988336	total: 1.41s	remaining: 4m 39s
5:	learn: 12.1413274	total: 1.66s	remaining: 4m 34s
6:	learn: 11.9879094	total: 1.92s	remaining: 4m 32s
7:	learn: 11.8356581	total: 2.18s	remaining: 4m 30s
8:	learn: 11.6937457	total: 2.44s	remaining: 4m 28s
9:	learn: 11.5574938	total: 2.69s	remaining: 4m 26s
10:	learn: 11.4252464	total: 2.95s	remaining: 4m 25s
11:	learn: 11.3031097	total: 3.21s	remaining: 4m 24s
12:	learn: 11.1834486	total: 3.47s	remaining: 4m 23s
13:	learn: 11.0655580	total: 3.74s	remaining: 4m 23s
14:	learn: 10.9539050	total: 4s	remaining: 4m 22s
15:	learn: 10.8469841	total: 4.26s	remaining: 4m 22s
16:	learn: 10.7448601	total: 4.52s	remaining: 4m 21s
17:	learn: 10.6472299	total: 4.78s	remaining: 4m 20s
18:	learn: 10.5528201	total: 5.04s	remaining: 4m 20s
19:	lea

<catboost.core.CatBoostRegressor at 0x7fc6549a12e0>

In [20]:
from catboost import CatBoostRegressor

model = CatBoostRegressor(verbose=1)
model.fit(x, y)

Learning rate set to 0.155736
0:	learn: 14.8124221	total: 127ms	remaining: 2m 6s
1:	learn: 13.9334885	total: 244ms	remaining: 2m 1s
2:	learn: 13.2574233	total: 365ms	remaining: 2m 1s
3:	learn: 12.7213896	total: 478ms	remaining: 1m 59s
4:	learn: 12.2417245	total: 611ms	remaining: 2m 1s
5:	learn: 11.8927467	total: 730ms	remaining: 2m 1s
6:	learn: 11.6156169	total: 846ms	remaining: 1m 59s
7:	learn: 11.3899662	total: 961ms	remaining: 1m 59s
8:	learn: 11.1730557	total: 1.09s	remaining: 1m 59s
9:	learn: 10.9278323	total: 1.22s	remaining: 2m
10:	learn: 10.7695939	total: 1.34s	remaining: 2m
11:	learn: 10.6482550	total: 1.46s	remaining: 1m 59s
12:	learn: 10.5164187	total: 1.57s	remaining: 1m 59s
13:	learn: 10.4308524	total: 1.67s	remaining: 1m 57s
14:	learn: 10.2815927	total: 1.8s	remaining: 1m 58s
15:	learn: 10.1669688	total: 1.92s	remaining: 1m 57s
16:	learn: 10.1030445	total: 2.03s	remaining: 1m 57s
17:	learn: 9.9875512	total: 2.15s	remaining: 1m 57s
18:	learn: 9.9134938	total: 2.27s	remaini

<catboost.core.CatBoostRegressor at 0x7fc74c70f1f0>

In [47]:
y_pred = model.predict(test_data)
len(y_pred)

291241

In [48]:
temp = sorted_test_df[['id']].copy()
temp['target'] = y_pred
result = temp.sort_values('id')
result

Unnamed: 0,id,target
0,TEST_000000,25.317371
1,TEST_000001,41.693166
2,TEST_000002,62.575574
3,TEST_000003,35.261777
4,TEST_000004,38.602273
...,...,...
291236,TEST_291236,47.674337
291237,TEST_291237,50.884468
291238,TEST_291238,22.388354
291239,TEST_291239,23.417625


In [49]:
result.to_csv('../data/submission_v10.csv', index=False)

In [95]:
model_cat = CatBoostRegressor()
model_cat.fit(x_train, y_train)

Learning rate set to 0.150341
0:	learn: 14.8144593	total: 104ms	remaining: 1m 44s
1:	learn: 13.9175486	total: 201ms	remaining: 1m 40s
2:	learn: 13.2211945	total: 296ms	remaining: 1m 38s
3:	learn: 12.6856126	total: 392ms	remaining: 1m 37s
4:	learn: 12.2529243	total: 488ms	remaining: 1m 37s
5:	learn: 11.8605600	total: 583ms	remaining: 1m 36s
6:	learn: 11.5675107	total: 678ms	remaining: 1m 36s
7:	learn: 11.3337530	total: 770ms	remaining: 1m 35s
8:	learn: 11.1417422	total: 867ms	remaining: 1m 35s
9:	learn: 10.9477010	total: 962ms	remaining: 1m 35s
10:	learn: 10.7983845	total: 1.05s	remaining: 1m 34s
11:	learn: 10.6195481	total: 1.16s	remaining: 1m 35s
12:	learn: 10.4834155	total: 1.25s	remaining: 1m 35s
13:	learn: 10.3686107	total: 1.35s	remaining: 1m 34s
14:	learn: 10.2670049	total: 1.45s	remaining: 1m 34s
15:	learn: 10.1939180	total: 1.53s	remaining: 1m 34s
16:	learn: 10.0745507	total: 1.64s	remaining: 1m 34s
17:	learn: 9.9988338	total: 1.74s	remaining: 1m 35s
18:	learn: 9.9095501	total:

<catboost.core.CatBoostRegressor at 0x7f8b03c88d60>

In [96]:
pred = model_cat.predict(x_val)
pred
print(mean_absolute_error(y_val, pred))

3.984917124333543


In [116]:
joblib.dump(model_cat, 'models/model_cat.pkl')

['models/model_cat.pkl']

# RF

In [101]:
model_rf = RandomForestRegressor()

model_rf.fit(x_train, y_train)

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=100, n_jobs=None, oob_score=False,
                      random_state=None, verbose=0, warm_start=False)

In [102]:
pred = model_rf.predict(x_val)
pred
print(mean_absolute_error(y_val, pred))

3.589763242950743


In [114]:
joblib.dump(model_rf, 'models/model_rf.pkl')

['models/model_rf.pkl']

In [121]:
y_pred = model_rf.predict(test_data)
result = pd.DataFrame({'id':test_df['id'], 'target':y_pred})
result.head()

Unnamed: 0,id,target
0,TEST_000000,25.66
1,TEST_000001,41.02
2,TEST_000002,67.08
3,TEST_000003,38.71
4,TEST_000004,43.136667


In [124]:
result.to_csv('../data/submission_v7.csv', index=False)

# XGB

In [97]:
from xgboost import XGBRegressor

model_xgb = XGBRegressor(n_estimators=100,gamma=1,eta=0.1,max_depth=5,reg_lambda=5,reg_alpha=5,random_state=2022)

model_xgb.fit(x_train, y_train)



XGBRegressor(base_score=0.5, booster='gbtree', callbacks=None,
             colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
             early_stopping_rounds=None, enable_categorical=False, eta=0.1,
             eval_metric=None, gamma=1, gpu_id=-1, grow_policy='depthwise',
             importance_type=None, interaction_constraints='',
             learning_rate=0.100000001, max_bin=256, max_cat_to_onehot=4,
             max_delta_step=0, max_depth=5, max_leaves=0, min_child_weight=1,
             missing=nan, monotone_constraints='()', n_estimators=100, n_jobs=0,
             num_parallel_tree=1, objective='reg:squarederror',
             predictor='auto', random_state=2022, ...)

In [98]:
from xgboost import XGBRegressor

model_xgb_v2 = XGBRegressor()

model_xgb_v2.fit(x_train, y_train)



XGBRegressor(base_score=0.5, booster='gbtree', callbacks=None,
             colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
             early_stopping_rounds=None, enable_categorical=False,
             eval_metric=None, gamma=0, gpu_id=-1, grow_policy='depthwise',
             importance_type=None, interaction_constraints='',
             learning_rate=0.300000012, max_bin=256, max_cat_to_onehot=4,
             max_delta_step=0, max_depth=6, max_leaves=0, min_child_weight=1,
             missing=nan, monotone_constraints='()', n_estimators=100, n_jobs=0,
             num_parallel_tree=1, objective='reg:squarederror',
             predictor='auto', random_state=0, reg_alpha=0, ...)

In [99]:
pred = model_xgb.predict(x_val)
pred
print(mean_absolute_error(y_val, pred))

5.875256047947337


In [100]:
pred = model_xgb_v2.predict(x_val)
pred
print(mean_absolute_error(y_val, pred))

4.14413145262452


In [115]:
joblib.dump(model_xgb_v2, 'models/model_xgb_v2.pkl')

['models/model_xgb_v2.pkl']

In [16]:
y_pred = model_xgb.predict(test_data)

In [18]:
result = pd.DataFrame({'id':test_df['id'], 'target':y_pred})
result.head()

Unnamed: 0,id,target
0,TEST_000000,28.699699
1,TEST_000001,45.003902
2,TEST_000002,55.764816
3,TEST_000003,31.750568
4,TEST_000004,43.394955


# Stacking

In [20]:
from sklearn.ensemble import StackingRegressor
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor
from catboost import CatBoostRegressor

rf = RandomForestRegressor()
xgb  = XGBRegressor()
cat = CatBoostRegressor()
rf_meta = RandomForestRegressor()

estimators = [('rf', rf), ('xgb', xgb), ('catboost', cat)]

model = StackingRegressor(estimators=estimators,
               final_estimator=rf_meta,
               verbose=1,)

model.fit(x_train, y_train).score(x_train, y_train) # score: R2 score 반환

Learning rate set to 0.150341
0:	learn: 14.8144593	total: 154ms	remaining: 2m 33s
1:	learn: 13.9175486	total: 251ms	remaining: 2m 5s
2:	learn: 13.2211945	total: 346ms	remaining: 1m 54s
3:	learn: 12.6856126	total: 441ms	remaining: 1m 49s
4:	learn: 12.2529243	total: 536ms	remaining: 1m 46s
5:	learn: 11.8605600	total: 630ms	remaining: 1m 44s
6:	learn: 11.5675107	total: 725ms	remaining: 1m 42s
7:	learn: 11.3337530	total: 815ms	remaining: 1m 41s
8:	learn: 11.1417422	total: 912ms	remaining: 1m 40s
9:	learn: 10.9477010	total: 1.01s	remaining: 1m 39s
10:	learn: 10.7983845	total: 1.1s	remaining: 1m 38s
11:	learn: 10.6195481	total: 1.2s	remaining: 1m 38s
12:	learn: 10.4834155	total: 1.3s	remaining: 1m 38s
13:	learn: 10.3686107	total: 1.39s	remaining: 1m 37s
14:	learn: 10.2670049	total: 1.49s	remaining: 1m 38s
15:	learn: 10.1939180	total: 1.58s	remaining: 1m 37s
16:	learn: 10.0745507	total: 1.69s	remaining: 1m 37s
17:	learn: 9.9988338	total: 1.79s	remaining: 1m 37s
18:	learn: 9.9095501	total: 1.8

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed: 86.8min finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  7.1min finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Learning rate set to 0.145133
0:	learn: 14.8543144	total: 82.2ms	remaining: 1m 22s
1:	learn: 13.9775320	total: 158ms	remaining: 1m 19s
2:	learn: 13.2888768	total: 232ms	remaining: 1m 17s
3:	learn: 12.7576019	total: 308ms	remaining: 1m 16s
4:	learn: 12.3228435	total: 382ms	remaining: 1m 15s
5:	learn: 11.9216996	total: 457ms	remaining: 1m 15s
6:	learn: 11.6440705	total: 529ms	remaining: 1m 15s
7:	learn: 11.4127841	total: 605ms	remaining: 1m 15s
8:	learn: 11.1892866	total: 680ms	remaining: 1m 14s
9:	learn: 11.0281585	total: 757ms	remaining: 1m 14s
10:	learn: 10.9115158	total: 830ms	remaining: 1m 14s
11:	learn: 10.7010578	total: 911ms	remaining: 1m 14s
12:	learn: 10.5459103	total: 995ms	remaining: 1m 15s
13:	learn: 10.4236362	total: 1.07s	remaining: 1m 15s
14:	learn: 10.2693115	total: 1.15s	remaining: 1m 15s
15:	learn: 10.1691641	total: 1.22s	remaining: 1m 15s
16:	learn: 10.0392480	total: 1.3s	remaining: 1m 15s
17:	learn: 9.9597136	total: 1.38s	remaining: 1m 15s
18:	learn: 9.8907575	total:

[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  6.4min finished


0.939675704151639

In [21]:
pred = model.predict(x_val)
pred
print(mean_absolute_error(y_val, pred))

3.686703022883399


In [23]:
y_pred = model.predict(test_data)
result = pd.DataFrame({'id':test_df['id'], 'target':y_pred})
result.head()

Unnamed: 0,id,target
0,TEST_000000,24.68
1,TEST_000001,43.93
2,TEST_000002,63.9225
3,TEST_000003,36.97
4,TEST_000004,42.354333


In [24]:
result.to_csv('../data/submission_v8.csv', index=False)

# Jae Kyeong's

In [21]:
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import VotingRegressor
from sklearn.ensemble import StackingRegressor
from xgboost import XGBRFRegressor
from lightgbm import LGBMRegressor
# from lightgbm import DaskLGBMClassifier

In [24]:
rf_model = RandomForestRegressor(n_estimators=300,
              max_depth=30,
              max_leaf_nodes=32,
              random_state=42,
              n_jobs=-1
              )

xgb_model = XGBRFRegressor(max_depth=30,
                n_estimators=300,
                learning_rate=0.01,
                gamma=0, subsample=0.75,
                use_label_encoder=False,
                colsample_bytree=1,
                eval_metric='logloss',
                n_jobs=-1
                # subsample=0.7
                )

xgb_meta = XGBRFRegressor(max_depth=27,
                n_estimators=500,
                learning_rate=0.005,
                gamma=0, subsample=0.75,
                use_label_encoder=False,
                colsample_bytree=1,
                eval_metric='logloss',
                n_jobs=-1
                # subsample=0.7
                )

lgb_model = LGBMRegressor(
                max_depth= 30,
                n_estimators= 300,
                learning_rate= 0.001,
                num_leaves = 40,
                n_jobs=-1)

# Second Layer
lgb_meta = LGBMRegressor(
                max_depth= 25,
                n_estimators= 600,
                learning_rate= 0.001,
                num_leaves = 32,
                n_jobs=-1
                )

mlp_model = MLPRegressor(hidden_layer_sizes=(32,), random_state=1, max_iter=10, validation_fraction=0.15, warm_start=False)



In [25]:
vr = VotingRegressor([('lgb', lgb_meta), ('mlp', mlp_model)], n_jobs=-1)

In [None]:
estimators = [('rf', rf_model), ('xgb', xgb_model), ('lgb_1', lgb_model)]

model = StackingRegressor(estimators=estimators,
               final_estimator=vr,
               verbose=1,
               n_jobs=-1)

model.fit(x_train, y_train).score(x_train, y_train) # score: R2 score 반환

# Pycaret

In [38]:
jeju_model = setup(session_id = 2022, data = pd.concat([x_train,y_train],axis=1), target = 'target',
                   test_data = pd.concat([x_val,y_val],axis=1),
                   normalize = True, normalize_method = 'zscore',
                   transformation=True, 
                   #fold_strategy='stratifiedkfold', 
                   use_gpu = True)

Unnamed: 0,Description,Value
0,session_id,2022
1,Target,target
2,Original Data,"(376096, 16)"
3,Missing Values,False
4,Numeric Features,7
5,Categorical Features,8
6,Ordinal Features,False
7,High Cardinality Features,False
8,High Cardinality Method,
9,Transformed Train Set,"(376096, 25)"


In [23]:
pycaret_regression_models = compare_models(n_select=25, sort='MAE')

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
rf,Random Forest Regressor,3.6629,27.7698,5.2696,0.8908,0.158,0.1102,14.578
et,Extra Trees Regressor,4.0217,34.2808,5.8548,0.8652,0.1739,0.1199,19.427
xgboost,Extreme Gradient Boosting,4.1937,32.214,5.6756,0.8733,0.1752,0.1306,0.519
dt,Decision Tree Regressor,4.7436,48.1181,6.9367,0.8108,0.21,0.1397,1.67
knn,K Neighbors Regressor,4.7455,43.7661,6.6154,0.8279,0.1933,0.1427,14.112
lightgbm,Light Gradient Boosting Machine,5.1922,46.0946,6.7892,0.8188,0.2072,0.1639,119.286
gbr,Gradient Boosting Regressor,7.1768,84.4137,9.1876,0.6681,0.2696,0.2295,28.455
huber,Huber Regressor,9.5564,143.0902,11.9619,0.4374,0.3369,0.3005,6.502
ridge,Ridge Regression,9.5802,142.6963,11.9454,0.4389,0.3354,0.3009,0.06
lar,Least Angle Regression,9.5802,142.6963,11.9454,0.4389,0.3354,0.3009,0.073


In [39]:
pycaret_regression_models2 = compare_models(n_select=25, sort='MAE')

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
rf,Random Forest Regressor,3.6629,27.7698,5.2696,0.8908,0.158,0.1102,14.616
et,Extra Trees Regressor,4.0217,34.2808,5.8548,0.8652,0.1739,0.1199,19.603
xgboost,Extreme Gradient Boosting,4.1937,32.214,5.6756,0.8733,0.1752,0.1306,0.495
dt,Decision Tree Regressor,4.7436,48.1181,6.9367,0.8108,0.21,0.1397,1.645
knn,K Neighbors Regressor,4.7455,43.7661,6.6154,0.8279,0.1933,0.1427,14.837
lightgbm,Light Gradient Boosting Machine,5.1922,46.0946,6.7892,0.8188,0.2072,0.1639,119.457
gbr,Gradient Boosting Regressor,7.1768,84.4137,9.1876,0.6681,0.2696,0.2295,28.673
huber,Huber Regressor,9.5564,143.0902,11.9619,0.4374,0.3369,0.3005,5.975
ridge,Ridge Regression,9.5802,142.6963,11.9454,0.4389,0.3354,0.3009,0.062
lar,Least Angle Regression,9.5802,142.6963,11.9454,0.4389,0.3354,0.3009,0.072
