In [205]:
import pandas as pd
import numpy as np

import warnings
warnings.filterwarnings('ignore')

import gc

from sklearn.preprocessing import LabelEncoder
from haversine import haversine
from sklearn.cluster import KMeans

import math

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import mean_absolute_error
from catboost import CatBoostRegressor, Pool
from xgboost import XGBRegressor

In [206]:
# Load the train and test sets from parquet files in the working directory.
train = pd.read_parquet('./train.parquet')
test = pd.read_parquet('./test.parquet')

- 불필요한 컬럼 삭제

In [207]:
# Remove the same three columns from both frames, in place.
unused_cols = ['vehicle_restricted', 'id', 'height_restricted']
for frame in (train, test):
    frame.drop(columns = unused_cols, inplace = True)

## Feature Engineering


### 1. 도로 주변 시설 및 구역 수(train 기준)

#### 공공 데이터 포털에서 2022년 8월 이전 아래 5가지 표준 데이터를 사용하였습니다.
- 무인교통단속카메라
- 전국초중등학교기본정보
- 어린이보호구역
- 제주시 주차장 정보
- 서귀포시 주차장 정보

train의 start_node, end_node의 위경도 좌표의 unique 값만을 활용

In [208]:
# Unique (start, end) coordinate combinations found in train; the facility
# counts are computed once per combination and merged back afterwards.
gps_comb = train[['start_latitude', 'start_longitude', 'end_latitude', 'end_longitude']].drop_duplicates()

- 무인교통단속카메라

In [209]:
# Unmanned traffic-enforcement camera list (CP949-encoded CSV).
cctv = pd.read_csv('경찰청_제주특별자치도경찰청_무인교통단속카메라_20220616.csv', encoding = 'cp949')

In [210]:
# Keep the positional column slice [3:-7], then drop the two address columns
# (road-name address and lot-number address).
# NOTE(review): the positional slice depends on the CSV's column order — confirm
# it still selects the intended columns if the file is re-downloaded.
cctv = cctv.iloc[:, 3:-7].drop(['소재지도로명주소', '소재지지번주소'], axis = 1)

- 초중등학교

In [211]:
# Elementary/secondary school information (CP949-encoded CSV).
school = pd.read_csv('초중등학교.csv', encoding = 'cp949')

In [212]:
# Keep rows whose reference date is on or before 2022-07-31 and whose
# education-office name contains '제주' (Jeju).
# NOTE(review): the date filter is a plain string comparison; presumably the
# column holds ISO 'YYYY-MM-DD' strings — confirm.
school = school[(school['데이터기준일자'] <= '2022-07-31') & (school['시도교육청명'].str.contains('제주'))]

- 어린이 보호 구역

In [213]:
# Child protection zone list (CP949-encoded CSV).
child = pd.read_csv('제주특별자치도_어린이보호구역_20220513.csv', encoding = 'cp949')

- 제주시 주차장

In [214]:
# Jeju-si parking lot list (CP949-encoded CSV).
parking1 = pd.read_csv("제주특별자치도_제주시_주차장정보_20210818_1630391997093_77385.csv", encoding = 'cp949')

In [215]:
# Discard parking lots that have no latitude or longitude.
parking1.dropna(subset = ['위도', '경도'], inplace = True)

- 서귀포시 주차장

In [216]:
# Seogwipo-si parking lot list (CP949-encoded CSV).
# NOTE(review): unlike parking1, rows with missing coordinates are not dropped here.
parking2 = pd.read_csv("제주특별자치도_서귀포시_주차장정보_20220425_1650966840250_33855.csv", encoding = 'cp949')

직선과 점 사이의 거리 방정식 활용
- 위의 4가지 데이터에 존재하는 시설 및 구역의 위경도 좌표(점)와, train 데이터에 존재하는 각 도로의 위경도 좌표(start_node, end_node)를 잇는 직선 사이의 거리를 구하는 함수

In [217]:
def cal_dist(x1, y1, x2, y2, a, b):
    """Return the distance from point (a, b) to the line through (x1, y1) and (x2, y2).

    The result is the perpendicular distance to the infinite line, computed as
    ``|cross product| / |distance between the two line points|`` in the same
    planar units as the inputs.

    When the two line points coincide the line is undefined; instead of
    dividing by zero, the plain point-to-point distance from (x1, y1) to
    (a, b) is returned.
    """
    AB = ((x1 - x2) ** 2 + (y1 - y2) ** 2) ** 0.5
    if AB == 0:
        # Degenerate "line": fall back to the distance to the single point.
        return ((x1 - a) ** 2 + (y1 - b) ** 2) ** 0.5

    # Twice the area of the triangle (x1, y1), (x2, y2), (a, b).
    area = abs((x1 - a) * (y2 - b) - (y1 - b) * (x2 - a))
    return area / AB

train 데이터의 위경도 쌍을 이은 직선과 각 시설 및 구역의 위경도 좌표 사이의 거리(위경도 좌표상 거리)가 0.0005 미만일 경우 해당 도로 주변에 있다고 간주하여 count + 1

In [218]:
def get_node_cnt(gps_values, infra_values) :
    """Count, for each road, the facilities whose distance to the road's line is below 0.0005.

    Parameters
    ----------
    gps_values : DataFrame
        One row per road; each row must unpack into exactly four values, read
        as (start y, start x, end y, end x). The notebook passes
        (start_latitude, start_longitude, end_latitude, end_longitude).
    infra_values : DataFrame
        One row per facility; the first column is used as x and the second as
        y. The notebook passes (longitude, latitude).

    Returns
    -------
    list of int
        One count per row of ``gps_values``, in row order.
    """
    infra = infra_values.values
    # Whole coordinate columns, so every road is scored against all
    # facilities in a single vectorized cal_dist call.
    a, b = infra[:, 0], infra[:, 1]

    cnt = []
    for y1, x1, y2, x2 in gps_values.values :
        dist = cal_dist(x1, y1, x2, y2, a, b)
        # NaN distances compare False and are therefore not counted.
        cnt.append(int((dist < 0.0005).sum()))
    return cnt

In [219]:
# Cameras near each unique road; facility columns are passed as (longitude, latitude).
cctv_cnt = get_node_cnt(gps_comb, cctv[['경도', '위도']])

In [220]:
# Schools near each unique road; facility columns are passed as (longitude, latitude).
school_cnt = get_node_cnt(gps_comb, school[['경도', '위도']])

In [221]:
# Child protection zones near each unique road; columns are passed as (longitude, latitude).
child_cnt = get_node_cnt(gps_comb, child[['경도', '위도']])

In [222]:
# Jeju-si parking lots near each unique road; columns are passed as (longitude, latitude).
parking1_cnt = get_node_cnt(gps_comb, parking1[['경도', '위도']])

In [223]:
# Seogwipo-si parking lots near each unique road; columns are passed as (longitude, latitude).
parking2_cnt = get_node_cnt(gps_comb, parking2[['경도', '위도']])

In [224]:
# Element-wise sum of the two parking counts, giving one total per unique road.
parking_cnt = list(np.array(parking1_cnt) + np.array(parking2_cnt))

In [225]:
# Attach the four facility counts to the unique-road table as new columns.
for col_name, counts in (('CCTV_cnt', cctv_cnt),
                         ('school_cnt', school_cnt),
                         ('child_cnt', child_cnt),
                         ('parking_cnt', parking_cnt)):
    gps_comb[col_name] = counts

#### 각 데이터에 merge 후 fillna

In [226]:
# Left-join the counts onto train. No key is given, so pandas joins on the
# columns the two frames have in common.
train = pd.merge(train, gps_comb, how = 'left')

In [227]:
# Left-join the counts onto test. Coordinate combinations that never occur in
# train have no match, so only the four count columns are filled with 0; a
# frame-wide fillna(0) would also overwrite missing values in unrelated columns.
test = pd.merge(test, gps_comb, how = 'left')
count_cols = ['CCTV_cnt', 'school_cnt', 'child_cnt', 'parking_cnt']
test[count_cols] = test[count_cols].fillna(0)

### 2. 제주 공항까지 거리(km)

- train과 test의 시작 위경도 좌표와 제주 공항 위경도 좌표까지의 거리(km)

In [228]:
# (latitude, longitude) reference point used for the airport-distance feature.
jeju = 33.506683, 126.493177

In [229]:
# Haversine distance in km from each row's start coordinates to the `jeju` point.
train['j_a_dist'] = [haversine((lat, lon), jeju, unit = 'km')
                     for lat, lon in train[['start_latitude', 'start_longitude']].values]
test['j_a_dist'] = [haversine((lat, lon), jeju, unit = 'km')
                    for lat, lon in test[['start_latitude', 'start_longitude']].values]

### 3. 한라산까지 거리(km)

- train과 test의 시작 위경도 좌표와 한라산 위경도 좌표까지의 거리(km)

In [230]:
# (latitude, longitude) reference point used for the Hallasan-distance feature.
hanla = 33.36168194, 126.5291548

In [231]:
# Haversine distance in km from each row's start coordinates to the `hanla` point.
train['h_a_dist'] = [haversine((lat, lon), hanla, unit = 'km')
                     for lat, lon in train[['start_latitude', 'start_longitude']].values]
test['h_a_dist'] = [haversine((lat, lon), hanla, unit = 'km')
                    for lat, lon in test[['start_latitude', 'start_longitude']].values]

### 4. start_node_name과 end_node_name을 key값으로 만들어 LabelEncoding

In [232]:
# One LabelEncoder instance, re-fitted for every column encoded below.
le = LabelEncoder()

In [234]:
# String key '<start_node_name>_<end_node_name>' identifying each node pair.
train['node_combination'] = train['start_node_name'] + '_' + train['end_node_name']
test['node_combination'] = test['start_node_name'] + '_' + test['end_node_name']

In [235]:
# Fit the encoder on train's keys and replace them with their integer codes.
train['node_combination'] = le.fit_transform(train['node_combination'])

In [236]:
# Register keys that occur only in test at the end of the encoder's classes
# so that transform() does not reject them as unseen labels.
for category in np.unique(test['node_combination']) :
    if category not in le.classes_ :
        # Append the unseen category itself (the name `label` was never defined).
        le.classes_ = np.append(le.classes_, category)
test['node_combination'] = le.transform(test['node_combination'])

### 5. 위경도 좌표만으로 Clustering(KMeans)
- Clustering Plotting 결과 군집 수가 6일 때 각 좌표가 명확히 구분되어 6으로 설정

In [237]:
# Distinct (node names, coordinates) combinations of each frame, used as the
# clustering input.
segment_cols = ['start_node_name', 'start_latitude', 'start_longitude',
                'end_node_name', 'end_latitude', 'end_longitude']
dup_train = train[segment_cols].drop_duplicates()
dup_test = test[segment_cols].drop_duplicates()

In [238]:
# K-means with 6 clusters, a fixed seed and 15 initialisations.
km = KMeans(n_clusters = 6, max_iter = 1000, random_state = 42, n_init = 15)

In [239]:
# Positions 1, 2, 4, 5 are the four coordinate columns (the node-name columns
# at 0 and 3 are skipped). The model is fitted on train only and then applied
# to test.
dup_train['gps_cls'] = km.fit_predict(dup_train.iloc[:, [1, 2, 4, 5]])
dup_test['gps_cls'] = km.predict(dup_test.iloc[:, [1, 2, 4, 5]])

In [240]:
# Bring the cluster label back to every row; with no explicit key the join
# uses the columns shared by both frames.
train = pd.merge(train, dup_train, how = 'left')
test = pd.merge(test, dup_test, how = 'left')

### 6. 공휴일 전후 1 ~ 2일 여부

- 일반적인 공휴일을 기준으로 전후 1 ~ 2일의 기간을 더 두어 binary화

In [241]:
# Cast base_date to string so it can be sliced by character position.
train['base_date'] = train['base_date'].astype(str)
test['base_date'] = test['base_date'].astype(str)

In [242]:
# Everything after the first four characters of base_date.
# NOTE(review): presumably base_date is 'YYYYMMDD', making this 'MMDD' — confirm.
train['date'] = train['base_date'].str[4:]
test['date'] = test['base_date'].str[4:]

In [243]:
# Four-character date strings flagged as "in or around a public holiday".
# The impossible date '0230' (February has no 30th) was removed; it could
# never match a real date, so the resulting flag is unchanged.
# NOTE(review): the list is year-independent — confirm that is intended for
# holidays whose calendar date differs from year to year.
h_days = ['0129', '0130', '0131', '0201', '0202', '1231', '0101', '0102', '0228', '0301', '0302', '0504',
          '0505', '0506', '0507', '0508', '0605', '0607', '0606', '0920', '0921', '0922', '0814', '0815', '0816',
          '1002', '1003', '1004', '1008', '1009', '1010', '1224', '1225', '1226']

In [244]:
# Boolean flag: True when the row's date string is one of the h_days entries.
train['in_h_days'] = train['date'].isin(h_days)
test['in_h_days'] = test['date'].isin(h_days)

### 7. 년도

In [245]:
# Parse base_date into datetimes so the .dt accessors below can be used.
train['base_date'] = pd.to_datetime(train['base_date'])
test['base_date'] = pd.to_datetime(test['base_date'])

In [246]:
# Calendar year of base_date.
train['year'] = train['base_date'].dt.year
test['year'] = test['base_date'].dt.year

### 8. 월

In [247]:
# Calendar month (1-12) of base_date.
train['month'] = train['base_date'].dt.month
test['month'] = test['base_date'].dt.month

### 9. 최고 제한 속도로 도로 주행시 소요 시간

In [248]:
# Haversine length in km between the start and end coordinates of each train row.
dist = [haversine((s_lat, s_lon), (e_lat, e_lon), unit = 'km')
        for s_lat, e_lat, s_lon, e_lon
        in train[['start_latitude', 'end_latitude', 'start_longitude', 'end_longitude']].values]

In [249]:
# 60 * distance / speed limit: traversal time at the limit, in minutes if the
# limit is in km/h (presumably — confirm the unit of maximum_speed_limit).
# NOTE(review): pd.Series(dist) carries a default 0..n-1 index, so the division
# aligns correctly only while train has that same index — confirm.
train['at_time'] = 60 * pd.Series(dist) / train['maximum_speed_limit']

In [250]:
# Haversine length in km between the start and end coordinates of each test row.
dist = [haversine((s_lat, s_lon), (e_lat, e_lon), unit = 'km')
        for s_lat, e_lat, s_lon, e_lon
        in test[['start_latitude', 'end_latitude', 'start_longitude', 'end_longitude']].values]

In [251]:
# Same feature as for train: 60 * distance / speed limit.
# NOTE(review): relies on test having a default 0..n-1 index so that
# pd.Series(dist) aligns row by row — confirm.
test['at_time'] = 60 * pd.Series(dist) / test['maximum_speed_limit']

In [252]:
# Run a garbage-collection pass to release intermediates that are no longer referenced.
gc.collect()

2185

### 9. 방위각
- 각 도로의 start, end node의 위경도 좌표로 해당 도로의 방위각 계산

In [253]:
def Azimuth(lat1, lng1, lat2, lng2):
    """Return the bearing in degrees from point 1 towards point 2.

    All four arguments are latitude/longitude values in degrees. The angle
    comes from ``atan2`` and is wrapped so that the result is never negative.
    """
    phi1, phi2 = math.radians(lat1), math.radians(lat2)
    d_lng = math.radians(lng2) - math.radians(lng1)

    y = math.sin(d_lng) * math.cos(phi2)
    x = (math.cos(phi1) * math.sin(phi2)
         - math.sin(phi1) * math.cos(phi2) * math.cos(d_lng))

    bearing = np.rad2deg(math.atan2(y, x))

    # atan2 yields (-180, 180]; negative angles are shifted up by a full turn.
    return 180 + (180 + bearing) if bearing < 0 else bearing

In [254]:
# Bearing of every row, computed from its start and end coordinates.
coord_cols = ['start_latitude', 'start_longitude', 'end_latitude', 'end_longitude']
train['degree'] = [Azimuth(*row) for row in train[coord_cols].values]
test['degree'] = [Azimuth(*row) for row in test[coord_cols].values]

### 10. 계절

In [255]:
def get_season(x) :
    """Map a month number to the notebook's season code.

    9-11 -> 3, 12/1/2 -> 2, 3-6 -> 1, and any other value -> 0.
    """
    season_months = (
        (3, (9, 10, 11)),
        (2, (12, 1, 2)),
        (1, (3, 4, 5, 6)),
    )
    for code, months in season_months :
        if x in months :
            return code
    return 0

In [256]:
# Season code derived from the month column.
train['season'] = train['month'].apply(get_season)
test['season'] = test['month'].apply(get_season)

### 11. 요일
- 일반적인 요일 순서대로가 아닌 LabelEncoding으로 진행

In [257]:
# Re-fit the shared encoder on train's day_of_week values and store the codes.
train['day_of_week'] = le.fit_transform(train['day_of_week'])

In [258]:
# Register values that occur only in test at the end of the encoder's classes
# so that transform() does not reject them as unseen labels.
for category in np.unique(test['day_of_week']) :
    if category not in le.classes_ :
        # Append the unseen category itself (the name `label` was never defined).
        le.classes_ = np.append(le.classes_, category)
test['day_of_week'] = le.transform(test['day_of_week'])

### 12. 도로명
- 도로명 LabelEncoding

In [259]:
# Re-fit the shared encoder on train's road names and store the codes.
train['road_name'] = le.fit_transform(train['road_name'])

In [260]:
# Register road names that occur only in test at the end of the encoder's
# classes so that transform() does not reject them as unseen labels.
for category in np.unique(test['road_name']) :
    if category not in le.classes_ :
        # Append the unseen category itself (the name `label` was never defined).
        le.classes_ = np.append(le.classes_, category)
test['road_name'] = le.transform(test['road_name'])

### 13. 시작 노드 == 종료 노드 여부

In [261]:
# Boolean flag: True when the start and end node names are identical.
train['node_same'] = train['start_node_name'] == train['end_node_name']
test['node_same'] = test['start_node_name'] == test['end_node_name']

### 14. 기타 컬럼 LabelEncoding

In [262]:
# Re-fit the shared encoder on train's start_turn_restricted values and store the codes.
train['start_turn_restricted'] = le.fit_transform(train['start_turn_restricted'])

In [263]:
# Register values that occur only in test at the end of the encoder's classes
# so that transform() does not reject them as unseen labels.
for category in np.unique(test['start_turn_restricted']) :
    if category not in le.classes_ :
        # Append the unseen category itself (the name `label` was never defined).
        le.classes_ = np.append(le.classes_, category)
test['start_turn_restricted'] = le.transform(test['start_turn_restricted'])

In [264]:
# Re-fit the shared encoder on train's end_turn_restricted values and store the codes.
train['end_turn_restricted'] = le.fit_transform(train['end_turn_restricted'])

In [265]:
# Register values that occur only in test at the end of the encoder's classes
# so that transform() does not reject them as unseen labels.
for category in np.unique(test['end_turn_restricted']) :
    if category not in le.classes_ :
        # Append the unseen category itself (the name `label` was never defined).
        le.classes_ = np.append(le.classes_, category)
test['end_turn_restricted'] = le.transform(test['end_turn_restricted'])

#### 모델링 사용 제외 컬럼 삭제

In [266]:
# Columns that only served as inputs to engineered features are removed
# from both frames, in place, before modelling.
excluded_cols = ['start_node_name', 'end_node_name', 'date', 'base_date']
for frame in (train, test):
    frame.drop(columns = excluded_cols, inplace = True)

***

## Modeling

- lane_count를 1, 2, 3으로 나누어 모델링
- LGBM, XGBoost는 optuna로 파라미터 튜닝 -> pkl로 save, load

In [126]:
# Feature matrix: every train column except the target.
X = train.drop(['target'], axis = 1)

In [127]:
# Regression target.
y = train.target

In [128]:
# Test features restricted to, and ordered like, the training columns.
target = test[X.columns]

In [129]:
# Shuffled 6-fold stratified splitter with a fixed seed, shared by every model below.
skf = StratifiedKFold(n_splits = 6, random_state = 42, shuffle = True)

In [130]:
# One feature matrix per lane_count value (1, 2, 3); lane_count itself is
# dropped because it is constant within each subset.
X1 = X[X['lane_count'] == 1].drop(['lane_count'], axis = 1)
X2 = X[X['lane_count'] == 2].drop(['lane_count'], axis = 1)
X3 = X[X['lane_count'] == 3].drop(['lane_count'], axis = 1)

In [131]:
# Targets for the same rows as each subset, selected by index label.
y1 = y[X1.index]
y2 = y[X2.index]
y3 = y[X3.index]

In [132]:
# day_of_week of each subset, passed to skf.split as the stratification labels.
standard1 = X1['day_of_week']
standard2 = X2['day_of_week']
standard3 = X3['day_of_week']

In [133]:
# Test features restricted to the training columns (repeats the assignment made a few cells earlier).
target = test[X.columns]

In [134]:
# Test rows of each lane_count value, with the same columns as the matching
# training subset; the original test index labels are kept.
target1 = target.loc[target['lane_count'] == 1, X1.columns]
target2 = target.loc[target['lane_count'] == 2, X2.columns]
target3 = target.loc[target['lane_count'] == 3, X3.columns]

In [None]:
# Six random integers in [0, 2022), drawn without a fixed seed.
# NOTE(review): `seeds` is not referenced anywhere in the visible notebook.
seeds = np.random.randint(0, 2022, 6)

### 1) CatBoost

In [None]:
# Cross-validated CatBoost for the lane_count == 1 subset.
# Each fold's model predicts the whole test subset and the whole training
# subset; both are averaged over the folds by adding prediction / n_splits.
cb_pred1 = np.zeros(target1.shape[0])
train_cb_pred1 = np.zeros(X1.shape[0])

cb_mae = []

for fold, (tr_idx, val_idx) in enumerate(skf.split(X1, standard1), start = 1):
    tr_x, tr_y = X1.iloc[tr_idx], y1.iloc[tr_idx]
    val_x, val_y = X1.iloc[val_idx], y1.iloc[val_idx]

    cb = CatBoostRegressor(max_depth = 8, learning_rate = 0.033,
                           use_best_model = True, iterations = 10000, eval_metric = 'MAE')

    cb.fit(tr_x, tr_y, eval_set=[(tr_x, tr_y), (val_x, val_y)], early_stopping_rounds=5, verbose=2500)

    # Validation predictions are truncated to int before the fold MAE is computed.
    val_pred = cb.predict(val_x).astype(int)
    fold_mae = mean_absolute_error(val_y, val_pred)
    cb_mae.append(fold_mae)
    print(f"{fold} Fold MAE = {fold_mae}")

    cb_pred1 += cb.predict(target1) / skf.n_splits
    train_cb_pred1 += cb.predict(X1) / skf.n_splits

print(f"AVG of MAE = {np.mean(cb_mae)}")

In [None]:
# Persist the fold-averaged test and train predictions as .npy files.
np.save('cb_pred1', cb_pred1)
np.save('train_cb_pred1', train_cb_pred1)

In [None]:
# Cross-validated CatBoost for the lane_count == 2 subset.
# Each fold's model predicts the whole test subset and the whole training
# subset; both are averaged over the folds by adding prediction / n_splits.
cb_pred2 = np.zeros(target2.shape[0])
train_cb_pred2 = np.zeros(X2.shape[0])

cb_mae = []

for fold, (tr_idx, val_idx) in enumerate(skf.split(X2, standard2), start = 1):
    tr_x, tr_y = X2.iloc[tr_idx], y2.iloc[tr_idx]
    val_x, val_y = X2.iloc[val_idx], y2.iloc[val_idx]

    cb = CatBoostRegressor(max_depth = 8, learning_rate = 0.033,
                           use_best_model = True, iterations = 10000, eval_metric = 'MAE')

    cb.fit(tr_x, tr_y, eval_set=[(tr_x, tr_y), (val_x, val_y)], early_stopping_rounds=5, verbose=2500)

    # Validation predictions are truncated to int before the fold MAE is computed.
    val_pred = cb.predict(val_x).astype(int)
    fold_mae = mean_absolute_error(val_y, val_pred)
    cb_mae.append(fold_mae)
    print(f"{fold} Fold MAE = {fold_mae}\n")

    cb_pred2 += cb.predict(target2) / skf.n_splits
    train_cb_pred2 += cb.predict(X2) / skf.n_splits

print(f"AVG of MAE = {np.mean(cb_mae)}")

In [None]:
# Persist the fold-averaged test and train predictions as .npy files.
np.save('cb_pred2', cb_pred2)
np.save('train_cb_pred2', train_cb_pred2)

In [None]:
# Cross-validated CatBoost for the lane_count == 3 subset.
# Each fold's model predicts the whole test subset and the whole training
# subset; both are averaged over the folds by adding prediction / n_splits.
# This subset uses early_stopping_rounds = 8 (the other two use 5).
cb_pred3 = np.zeros(target3.shape[0])
train_cb_pred3 = np.zeros(X3.shape[0])

cb_mae = []

for fold, (tr_idx, val_idx) in enumerate(skf.split(X3, standard3), start = 1):
    tr_x, tr_y = X3.iloc[tr_idx], y3.iloc[tr_idx]
    val_x, val_y = X3.iloc[val_idx], y3.iloc[val_idx]

    cb = CatBoostRegressor(max_depth = 8, learning_rate = 0.033,
                           use_best_model = True, iterations = 10000, eval_metric = 'MAE')

    cb.fit(tr_x, tr_y, eval_set = [(tr_x, tr_y), (val_x, val_y)], early_stopping_rounds = 8, verbose = 2500)

    # Validation predictions are truncated to int before the fold MAE is computed.
    val_pred = cb.predict(val_x).astype(int)
    fold_mae = mean_absolute_error(val_y, val_pred)
    cb_mae.append(fold_mae)
    print(f"{fold} Fold MAE = {fold_mae}\n")

    cb_pred3 += cb.predict(target3) / skf.n_splits
    train_cb_pred3 += cb.predict(X3) / skf.n_splits

print(f"AVG of MAE = {np.mean(cb_mae)}")

In [None]:
# Persist the fold-averaged test and train predictions as .npy files.
np.save('cb_pred3', cb_pred3)
np.save('train_cb_pred3', train_cb_pred3)

### 2) LGBMRegressor

In [None]:
# Cross-validated LightGBM for the lane_count == 1 subset, using the best
# parameters of a previously saved tuning study.
# NOTE(review): `joblib` and `LGBMRegressor` are not imported in the visible
# import cell — confirm they are in scope before running.
# NOTE(review): early_stopping_rounds / verbose as fit() keywords depend on the
# installed lightgbm version — confirm.
lgbm_pred1 = np.zeros(target1.shape[0])
train_lgbm_pred1 = np.zeros(X1.shape[0])

lgbm_mae = []

# The study does not change between folds, so it is loaded once.
study = joblib.load('./LGBMRegressor_tune/tune_0.pkl')
print('Study loaded')
print(study.best_trial.params)

for fold, (tr_idx, val_idx) in enumerate(skf.split(X1, standard1), start = 1):
    tr_x, tr_y = X1.iloc[tr_idx], y1.iloc[tr_idx]
    val_x, val_y = X1.iloc[val_idx], y1.iloc[val_idx]

    lgbm = LGBMRegressor(**study.best_trial.params)
    lgbm.fit(tr_x, tr_y, eval_set = [(tr_x, tr_y), (val_x, val_y)], early_stopping_rounds = 5, verbose = 2500)

    # Validation predictions are truncated to int before the fold MAE is computed.
    val_pred = lgbm.predict(val_x).astype(int)
    fold_mae = mean_absolute_error(val_y, val_pred)
    lgbm_mae.append(fold_mae)
    print(f"{fold} Fold MAE = {fold_mae}")

    # Fold-averaged predictions for the test subset and the full training subset.
    lgbm_pred1 += lgbm.predict(target1) / skf.n_splits
    train_lgbm_pred1 += lgbm.predict(X1) / skf.n_splits

print(f"AVG of MAE = {np.mean(lgbm_mae)}")

In [None]:
# Persist the fold-averaged test and train predictions as .npy files.
np.save('lgbm_pred1', lgbm_pred1)
np.save('train_lgbm_pred1', train_lgbm_pred1)

In [None]:
# Cross-validated LightGBM for the lane_count == 2 subset, using the best
# parameters of a previously saved tuning study.
# NOTE(review): `joblib` and `LGBMRegressor` are not imported in the visible
# import cell — confirm they are in scope before running.
# NOTE(review): early_stopping_rounds / verbose as fit() keywords depend on the
# installed lightgbm version — confirm.
lgbm_pred2 = np.zeros(target2.shape[0])
train_lgbm_pred2 = np.zeros(X2.shape[0])

lgbm_mae = []

# The study does not change between folds, so it is loaded once.
study = joblib.load('./LGBMRegressor_tune/tune_1.pkl')
print('Study loaded')
print(study.best_trial.params)

for fold, (tr_idx, val_idx) in enumerate(skf.split(X2, standard2), start = 1):
    tr_x, tr_y = X2.iloc[tr_idx], y2.iloc[tr_idx]
    val_x, val_y = X2.iloc[val_idx], y2.iloc[val_idx]

    lgbm = LGBMRegressor(**study.best_trial.params)
    lgbm.fit(tr_x, tr_y, eval_set = [(tr_x, tr_y), (val_x, val_y)], early_stopping_rounds=5, verbose=2500)

    # Validation predictions are truncated to int before the fold MAE is computed.
    val_pred = lgbm.predict(val_x).astype(int)
    fold_mae = mean_absolute_error(val_y, val_pred)
    lgbm_mae.append(fold_mae)
    print(f"{fold} Fold MAE = {fold_mae}\n")

    # Fold-averaged predictions for the test subset and the full training subset.
    lgbm_pred2 += lgbm.predict(target2) / skf.n_splits
    train_lgbm_pred2 += lgbm.predict(X2) / skf.n_splits

print(f"AVG of MAE = {np.mean(lgbm_mae)}")

In [None]:
# Persist the fold-averaged test and train predictions as .npy files.
np.save('lgbm_pred2', lgbm_pred2)
np.save('train_lgbm_pred2', train_lgbm_pred2)

In [None]:
# Cross-validated LightGBM for the lane_count == 3 subset, using the best
# parameters of a previously saved tuning study.
# NOTE(review): `joblib` and `LGBMRegressor` are not imported in the visible
# import cell — confirm they are in scope before running.
# NOTE(review): early_stopping_rounds / verbose as fit() keywords depend on the
# installed lightgbm version — confirm.
lgbm_pred3 = np.zeros(target3.shape[0])
train_lgbm_pred3 = np.zeros(X3.shape[0])

lgbm_mae = []

# The study does not change between folds, so it is loaded once.
study = joblib.load('./LGBMRegressor_tune/tune_2.pkl')
print('Study loaded')
print(study.best_trial.params)

for fold, (tr_idx, val_idx) in enumerate(skf.split(X3, standard3), start = 1):
    tr_x, tr_y = X3.iloc[tr_idx], y3.iloc[tr_idx]
    val_x, val_y = X3.iloc[val_idx], y3.iloc[val_idx]

    lgbm = LGBMRegressor(**study.best_trial.params)
    lgbm.fit(tr_x, tr_y, eval_set=[(tr_x, tr_y), (val_x, val_y)], early_stopping_rounds=8, verbose=2500)

    # Validation predictions are truncated to int before the fold MAE is computed.
    val_pred = lgbm.predict(val_x).astype(int)
    fold_mae = mean_absolute_error(val_y, val_pred)
    lgbm_mae.append(fold_mae)
    print(f"{fold} Fold MAE = {fold_mae}\n")

    # Fold-averaged predictions for the test subset and the full training subset.
    lgbm_pred3 += lgbm.predict(target3) / skf.n_splits
    train_lgbm_pred3 += lgbm.predict(X3) / skf.n_splits

print(f"AVG of MAE = {np.mean(lgbm_mae)}")

In [None]:
# Persist the fold-averaged test and train predictions as .npy files.
np.save('lgbm_pred3', lgbm_pred3)
np.save('train_lgbm_pred3', train_lgbm_pred3)

### 3) XGBoost

In [None]:
# Cross-validated XGBoost for the lane_count == 1 subset, using the best
# parameters of a previously saved tuning study.
# NOTE(review): `joblib` is not imported in the visible import cell — confirm
# it is in scope before running.
# NOTE(review): early_stopping_rounds / eval_metric as fit() keywords depend on
# the installed xgboost version — confirm.
xgb_pred1 = np.zeros(target1.shape[0])
train_xgb_pred1 = np.zeros(X1.shape[0])

xgb_mae = []

# The study does not change between folds, so it is loaded once.
study = joblib.load('./XGBRegressor_tune/tune_0.pkl')
print('Study loaded')
print(study.best_trial.params)

for fold, (tr_idx, val_idx) in enumerate(skf.split(X1, standard1), start = 1):
    tr_x, tr_y = X1.iloc[tr_idx], y1.iloc[tr_idx]
    val_x, val_y = X1.iloc[val_idx], y1.iloc[val_idx]

    xgb = XGBRegressor(**study.best_trial.params)
    xgb.fit(tr_x, tr_y, eval_set = [(tr_x, tr_y), (val_x, val_y)], early_stopping_rounds=8, verbose=1000,
            eval_metric='mae')

    # Validation predictions are truncated to int before the fold MAE is computed.
    val_pred = xgb.predict(val_x).astype(int)
    fold_mae = mean_absolute_error(val_y, val_pred)
    xgb_mae.append(fold_mae)
    print(f"{fold} Fold MAE = {fold_mae}")

    # Fold-averaged predictions for the test subset and the full training subset.
    xgb_pred1 += xgb.predict(target1) / skf.n_splits
    train_xgb_pred1 += xgb.predict(X1) / skf.n_splits

print(f"\nAVG of MAE = {np.mean(xgb_mae)}")

In [None]:
# Persist the fold-averaged test and train predictions as .npy files.
np.save('xgb_pred1', xgb_pred1)
np.save('train_xgb_pred1', train_xgb_pred1)

In [None]:
# Cross-validated XGBoost for the lane_count == 2 subset, using the best
# parameters of a previously saved tuning study.
# NOTE(review): `joblib` is not imported in the visible import cell — confirm
# it is in scope before running.
# NOTE(review): early_stopping_rounds / eval_metric as fit() keywords depend on
# the installed xgboost version — confirm.
xgb_pred2 = np.zeros(target2.shape[0])
train_xgb_pred2 = np.zeros(X2.shape[0])

xgb_mae = []

# The study does not change between folds, so it is loaded once.
study = joblib.load('./XGBRegressor_tune/tune_1.pkl')
print('Study loaded')
print(study.best_trial.params)

for fold, (tr_idx, val_idx) in enumerate(skf.split(X2, standard2), start = 1):
    tr_x, tr_y = X2.iloc[tr_idx], y2.iloc[tr_idx]
    val_x, val_y = X2.iloc[val_idx], y2.iloc[val_idx]

    xgb = XGBRegressor(**study.best_trial.params)
    xgb.fit(tr_x, tr_y, eval_set=[(tr_x, tr_y), (val_x, val_y)], early_stopping_rounds=8, verbose=1000,
            eval_metric='mae')

    # Validation predictions are truncated to int before the fold MAE is computed.
    val_pred = xgb.predict(val_x).astype(int)
    fold_mae = mean_absolute_error(val_y, val_pred)
    xgb_mae.append(fold_mae)
    print(f"{fold} Fold MAE = {fold_mae}")

    # Fold-averaged predictions for the test subset and the full training subset.
    xgb_pred2 += xgb.predict(target2) / skf.n_splits
    train_xgb_pred2 += xgb.predict(X2) / skf.n_splits

print(f"\nAVG of MAE = {np.mean(xgb_mae)}")

In [None]:
# Persist the fold-averaged test and train predictions as .npy files.
np.save('xgb_pred2', xgb_pred2)
np.save('train_xgb_pred2', train_xgb_pred2)

In [None]:
# Cross-validated XGBoost for the lane_count == 3 subset, using the best
# parameters of a previously saved tuning study.
# NOTE(review): `joblib` is not imported in the visible import cell — confirm
# it is in scope before running.
# NOTE(review): early_stopping_rounds / eval_metric as fit() keywords depend on
# the installed xgboost version — confirm.
xgb_pred3 = np.zeros(target3.shape[0])
train_xgb_pred3 = np.zeros(X3.shape[0])

xgb_mae = []

# The study does not change between folds, so it is loaded once.
study = joblib.load('./XGBRegressor_tune/tune_2.pkl')
print('Study loaded')
print(study.best_trial.params)

for fold, (tr_idx, val_idx) in enumerate(skf.split(X3, standard3), start = 1):
    tr_x, tr_y = X3.iloc[tr_idx], y3.iloc[tr_idx]
    val_x, val_y = X3.iloc[val_idx], y3.iloc[val_idx]

    xgb = XGBRegressor(**study.best_trial.params)
    xgb.fit(tr_x, tr_y, eval_set=[(tr_x, tr_y), (val_x, val_y)], early_stopping_rounds = 8, verbose = 1000,
            eval_metric = 'mae')

    # Validation predictions are truncated to int before the fold MAE is computed.
    val_pred = xgb.predict(val_x).astype(int)
    fold_mae = mean_absolute_error(val_y, val_pred)
    xgb_mae.append(fold_mae)
    print(f"{fold} Fold MAE = {fold_mae}")

    # Fold-averaged predictions for the test subset and the full training subset.
    xgb_pred3 += xgb.predict(target3) / skf.n_splits
    train_xgb_pred3 += xgb.predict(X3) / skf.n_splits

print(f"\nAVG of MAE = {np.mean(xgb_mae)}")

In [None]:
# Persist the fold-averaged test and train predictions as .npy files.
np.save('xgb_pred3', xgb_pred3)
np.save('train_xgb_pred3', train_xgb_pred3)

## Submission

In [None]:
# Submission template whose 'target' column is filled in below.
submission = pd.read_csv('sample_submission.csv')

### Ensemble

In [None]:
# Reload the saved fold-averaged test predictions, one array per model and
# lane_count subset (1, 2, 3).

### XGBoost
xgb_pred1 = np.load('./xgb_pred1.npy')
xgb_pred2 = np.load('./xgb_pred2.npy')
xgb_pred3 = np.load('./xgb_pred3.npy')

### LGBM
lgbm_pred1 = np.load('./lgbm_pred1.npy')
lgbm_pred2 = np.load('./lgbm_pred2.npy')
lgbm_pred3 = np.load('./lgbm_pred3.npy')

### CatBoost
cb_pred1 = np.load('./cb_pred1.npy')
cb_pred2 = np.load('./cb_pred2.npy')
cb_pred3 = np.load('./cb_pred3.npy')

In [None]:
# Ensemble - LGBM : XGBoost : CatBoost = 0.65 : 0.25 : 0.1
# Each lane_count subset is blended from its own three model predictions and
# written to the submission rows carrying that subset's test index labels.
# (The LGBM arrays are named lgbm_pred1..3; the undefined `lg_pred1` was also
# being reused for all three subsets.)
submission.loc[target1.index, 'target'] = lgbm_pred1 * 0.65 + xgb_pred1 * 0.25 + cb_pred1 * 0.1
submission.loc[target2.index, 'target'] = lgbm_pred2 * 0.65 + xgb_pred2 * 0.25 + cb_pred2 * 0.1
submission.loc[target3.index, 'target'] = lgbm_pred3 * 0.65 + xgb_pred3 * 0.25 + cb_pred3 * 0.1

In [None]:
# Round the blended predictions to whole numbers and write the submission file.
submission['target'] = submission['target'].round(0)
submission.to_csv('lgbm_xgb_cb.csv', index = False)