# 개요

- [기존 1등의 모델](https://www.kaggle.com/code/bcruise/starting-strong-xgboost-lightgbm-catboost?scriptVersionId=116642571)에 우리 팀의 모델을 블렌딩하였다
- 1위(XGBoost + LightGBM + CatBoost) + 생성한 모델(CatBoost)

# 라이브러리 로드

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import os
from pathlib import Path
import xgboost as xgb
import lightgbm as lgbm
import catboost
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import roc_auc_score
from IPython.display import display
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, OrdinalEncoder
import optuna
from sklearn.preprocessing import StandardScaler

# 데이터 로드

In [None]:
# Competition training data; its "id" column is dropped immediately.
train = pd.read_csv("train.csv")
train = train.drop(columns="id")

# Competition test data; the ids are kept aside in test_idx (reused when the
# submission frame is built) before the column is dropped from the features.
test = pd.read_csv("test.csv")
test_idx = test["id"]
test = test.drop(columns="id")

# External dataset that is appended to the training rows further down.
original = pd.read_csv('WA_Fn-UseC_-HR-Employee-Attrition.csv')

train.head()

Unnamed: 0,Age,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EnvironmentSatisfaction,Gender,...,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager,Attrition
0,36,Travel_Frequently,599,Research & Development,24,3,Medical,1,4,Male,...,80,1,10,2,3,10,0,7,8,0
1,35,Travel_Rarely,921,Sales,8,3,Other,1,1,Male,...,80,1,4,3,3,4,2,0,3,0
2,32,Travel_Rarely,718,Sales,26,3,Marketing,1,3,Male,...,80,2,4,3,3,3,2,1,2,0
3,38,Travel_Rarely,1488,Research & Development,2,3,Medical,1,3,Female,...,80,0,15,1,1,6,0,0,2,0
4,50,Travel_Rarely,1017,Research & Development,5,4,Medical,1,2,Female,...,80,0,31,0,3,31,14,4,10,1


In [None]:
# Preview the first rows of the external dataset to compare its layout with train.
original.head()

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,1102,Sales,1,2,Life Sciences,1,1,...,1,80,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,279,Research & Development,8,1,Life Sciences,1,2,...,4,80,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,1373,Research & Development,2,2,Other,1,4,...,2,80,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,1,5,...,3,80,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,591,Research & Development,2,1,Medical,1,7,...,4,80,1,6,3,3,2,2,2,2


# 데이터 분석 및 EDA

- 기존 내부 데이터에는 타겟값이 참인 데이터가 적으므로 외부 데이터(original) 전체를 학습 데이터에 추가

In [None]:
# Recode the external target as integers: 1 where the value is 'Yes', 0 otherwise.
original['Attrition'] = original['Attrition'].eq('Yes').astype(np.int64)

# EmployeeNumber is removed from the external data before it is merged with train.
original.drop(columns=["EmployeeNumber"], inplace=True)

In [None]:
# Keep only train's columns, in train's column order, so both frames stack cleanly.
original = original[train.columns.tolist()]

In [None]:
# Stack the competition rows and the external rows under a fresh 0..n-1 index.
train_extended = pd.concat([train, original], ignore_index=True)
train_extended.shape[0]

3147

## 결측치 확인

In [None]:
# Per-column missing-value counts for the extended train set and the test set,
# placed side by side and ordered by the train counts.
pd.concat(
    [
        train_extended.isna().sum().rename("Missing in Train"),
        test.isna().sum().rename("Missing in Test"),
    ],
    axis="columns",
).sort_values(by="Missing in Train")

Unnamed: 0,Missing in Train,Missing in Test
Age,0,0.0
YearsSinceLastPromotion,0,0.0
YearsInCurrentRole,0,0.0
YearsAtCompany,0,0.0
WorkLifeBalance,0,0.0
TrainingTimesLastYear,0,0.0
TotalWorkingYears,0,0.0
StockOptionLevel,0,0.0
StandardHours,0,0.0
RelationshipSatisfaction,0,0.0


In [None]:
# Target vector, taken from the extended training frame.
y = train_extended["Attrition"]

# Feature frame: training rows (without the target) followed by the test rows,
# so that later preprocessing is applied to both at once.
train_features = train_extended.drop(columns=["Attrition"])
df = pd.concat([train_features, test], axis=0)

## 데이터 타입 확인

In [None]:
# List each column's dtype, sorted, to see which columns are numeric and which are object.
df.dtypes.sort_values()

Age                          int64
YearsInCurrentRole           int64
YearsAtCompany               int64
WorkLifeBalance              int64
TrainingTimesLastYear        int64
TotalWorkingYears            int64
StockOptionLevel             int64
StandardHours                int64
RelationshipSatisfaction     int64
PerformanceRating            int64
PercentSalaryHike            int64
NumCompaniesWorked           int64
MonthlyRate                  int64
YearsSinceLastPromotion      int64
MonthlyIncome                int64
EnvironmentSatisfaction      int64
DailyRate                    int64
DistanceFromHome             int64
Education                    int64
EmployeeCount                int64
JobSatisfaction              int64
YearsWithCurrManager         int64
JobInvolvement               int64
JobLevel                     int64
HourlyRate                   int64
Gender                      object
OverTime                    object
Over18                      object
EducationField      

## 고유값 개수 확인

In [None]:
# Number of distinct values per column, ascending; used below to pick constant
# columns and low-cardinality columns.
df.nunique().sort_values()

Over18                         1
EmployeeCount                  1
StandardHours                  1
OverTime                       2
PerformanceRating              2
Gender                         2
Department                     3
MaritalStatus                  3
BusinessTravel                 3
WorkLifeBalance                4
RelationshipSatisfaction       4
JobSatisfaction                4
EnvironmentSatisfaction        5
JobInvolvement                 5
StockOptionLevel               5
EducationField                 6
JobLevel                       6
Education                      6
TrainingTimesLastYear          7
JobRole                        9
NumCompaniesWorked            11
PercentSalaryHike             15
YearsSinceLastPromotion       16
YearsWithCurrManager          18
YearsInCurrentRole            19
DistanceFromHome              29
YearsAtCompany                38
TotalWorkingYears             41
Age                           43
HourlyRate                    71
DailyRate 

# 피처 엔지니어링

- 고유값이 1인 특성 제거
- 고유값이 2이상 20이하인 특성은 카테고리 특성으로 취급

In [None]:
# Columns with a single distinct value carry no information and are dropped later.
feats_to_drop = [name for name, n_distinct in df.nunique().items() if n_distinct == 1]
# Columns with 2 to 20 distinct values are treated as categorical.
cat_features = [name for name, n_distinct in df.nunique().items() if 1 < n_distinct <= 20]

## 특성 제거

In [None]:
# Remove the constant columns collected in feats_to_drop.
df = df.drop(columns=feats_to_drop)

## ordinal encoding

In [None]:
# Fit the encoder on the categorical columns of the stacked train+test frame
# and overwrite those columns with their encoded values in one step.
ord_enc = OrdinalEncoder()
df[cat_features] = ord_enc.fit_transform(df[cat_features])

df.head()

Unnamed: 0,Age,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EnvironmentSatisfaction,Gender,HourlyRate,...,PerformanceRating,RelationshipSatisfaction,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,36,1.0,599,1.0,24,2.0,3.0,4.0,1.0,42,...,0.0,1.0,1.0,10,2.0,2.0,10,0.0,7.0,8.0
1,35,2.0,921,2.0,8,2.0,4.0,1.0,1.0,46,...,0.0,3.0,1.0,4,3.0,2.0,4,2.0,0.0,3.0
2,32,2.0,718,2.0,26,2.0,2.0,3.0,1.0,80,...,0.0,3.0,2.0,4,3.0,2.0,3,2.0,1.0,2.0
3,38,2.0,1488,1.0,2,2.0,3.0,3.0,0.0,40,...,0.0,2.0,0.0,15,1.0,0.0,6,0.0,0.0,2.0
4,50,2.0,1017,1.0,5,3.0,3.0,2.0,0.0,37,...,0.0,2.0,0.0,31,0.0,2.0,31,14.0,4.0,10.0


## 파생변수 추가

In [None]:
# Income relative to age.
df['MonthlyIncome/Age'] = df['MonthlyIncome'].div(df['Age'])

# Binary risk flags, each 1 when the threshold condition holds and 0 otherwise.
df["Age_risk"] = df["Age"].lt(34).astype(int)
df["HourlyRate_risk"] = df["HourlyRate"].lt(60).astype(int)
df["Distance_risk"] = df["DistanceFromHome"].ge(20).astype(int)
df["YearsAtCo_risk"] = df["YearsAtCompany"].lt(4).astype(int)

# Zero is replaced by one so the division below never divides by zero.
df['NumCompaniesWorked'] = df['NumCompaniesWorked'].replace(0, 1)
df['AverageTenure'] = df["TotalWorkingYears"].div(df["NumCompaniesWorked"])

# Flag rows with more than two companies and under two years of average tenure.
many_companies = df["NumCompaniesWorked"].gt(2)
short_tenure = df["AverageTenure"].lt(2.0)
df['JobHopper'] = (many_companies & short_tenure).astype(int)

# Aggregate score: how many of the five flags are set for the row.
df["AttritionRisk"] = (
    df["Age_risk"]
    + df["HourlyRate_risk"]
    + df["Distance_risk"]
    + df["YearsAtCo_risk"]
    + df['JobHopper']
)

## 스케일링

In [None]:
# Standardize every column of the stacked train+test features. From here on df
# is indexed like a 2-D array (see the row/column slicing in the split step).
sc = StandardScaler()
sc.fit(df)
df = sc.transform(df)

## 데이터 나누기

In [None]:
# df was stacked as [training rows, test rows], so the split point is the total
# row count minus the number of test rows. A positive split index is used
# instead of negative slicing because `df[:-0]` would be empty and `df[-0:]`
# would return every row if the test set ever had zero rows.
n_train_rows = df.shape[0] - len(test)
X_train = df[:n_train_rows, :]
X_test = df[n_train_rows:, :]

In [None]:
# Display the scaled training feature matrix as a sanity check.
X_train

array([[-0.04218709, -1.00694123, -0.67630387, ...,  0.69365254,
        -0.40690757,  0.60559872],
       [-0.15573291,  0.5562361 ,  0.16052252, ..., -0.38509144,
        -0.40690757, -0.37634128],
       [-0.49637038,  0.5562361 , -0.36704194, ..., -0.38509144,
        -0.40690757,  1.58753872],
       ...,
       [-1.06409949,  0.5562361 , -1.8301887 , ..., -0.02551011,
        -0.40690757, -0.37634128],
       [ 1.4339086 , -1.00694123,  0.42560417, ...,  0.42396654,
        -0.40690757, -1.35828128],
       [-0.26927874,  0.5562361 , -0.60093752, ..., -0.5648821 ,
        -0.40690757, -1.35828128]])

In [None]:
# Display the target vector as a sanity check.
y

0       0
1       0
2       0
3       0
4       1
       ..
3142    0
3143    0
3144    0
3145    0
3146    0
Name: Attrition, Length: 3147, dtype: int64

# 모델 학습

## 교차 검증 함수 선언

In [None]:
def cross_validate(X, y, model, n_splits=10, random_state=1337):
    """Run stratified K-fold cross-validation and report ROC AUC per fold.

    The model is fitted in place on each training fold, so after the call it
    holds the fit from the last fold only; refit it on the full data before
    using it for final predictions.

    Args:
        X: Feature matrix, either a NumPy array or a pandas DataFrame.
        y: Binary target, either a NumPy array or a pandas Series.
        model: Estimator exposing ``fit(X, y)`` and ``predict_proba(X)``.
        n_splits: Number of stratified folds.
        random_state: Seed for the shuffled fold assignment.

    Returns:
        The mean ROC AUC over all folds (also printed).
    """

    def _take_rows(data, positions):
        # Select rows by position. Plain ``data[positions]`` is label-based on
        # a pandas Series and column-based on a DataFrame, which silently
        # picks the wrong rows (or raises) unless the index is exactly 0..n-1.
        if hasattr(data, "iloc"):
            return data.iloc[positions]
        return data[positions]

    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    all_scores = []

    for fold_id, (train_idx, val_idx) in enumerate(skf.split(X, y)):
        X_tr, X_val = _take_rows(X, train_idx), _take_rows(X, val_idx)
        y_tr, y_val = _take_rows(y, train_idx), _take_rows(y, val_idx)

        model.fit(X_tr, y_tr)

        # Column 1 of predict_proba is the probability of the positive class.
        y_pred = model.predict_proba(X_val)[:, 1]

        auc = roc_auc_score(y_val, y_pred)

        print(f"Fold {fold_id} \t auc: {auc}")

        all_scores.append(auc)

    avg_auc = np.mean(all_scores)

    print(f"Avg AUC: {avg_auc}")

    return avg_auc

## XGBoost

In [None]:
# Hyperparameters that are unpacked into xgb.XGBClassifier.
xgb_params = {
    'n_estimators': 150,
    'max_depth': 3,
    'learning_rate': 0.1,
    'min_child_weight': 4,
    'subsample': 0.7,
    'colsample_bytree': 0.3,
}


# Build the XGBoost classifier from the hyperparameters above.
xgb_clf = xgb.XGBClassifier(**xgb_params)

# Estimate out-of-fold ROC AUC; this fits xgb_clf on each training fold in turn.
cross_validate(X_train, y, xgb_clf)

# Refit on the full training set, since cross-validation left the model fitted
# on the last fold only.
xgb_clf.fit(X_train, y, verbose=0)

Fold 0 	 auc: 0.8857765850385777
Fold 1 	 auc: 0.8170077155317008
Fold 2 	 auc: 0.8305098960080509
Fold 3 	 auc: 0.9150452868165047
Fold 4 	 auc: 0.8222073129822208
Fold 5 	 auc: 0.8423347869842335
Fold 6 	 auc: 0.8477021133847702
Fold 7 	 auc: 0.847678709345233
Fold 8 	 auc: 0.8572041534368832
Fold 9 	 auc: 0.8666437827168969
Avg AUC: 0.8532110342245071


XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
              colsample_bylevel=1, colsample_bynode=1, colsample_bytree=0.3,
              early_stopping_rounds=None, enable_categorical=False,
              eval_metric=None, gamma=0, gpu_id=-1, grow_policy='depthwise',
              importance_type=None, interaction_constraints='',
              learning_rate=0.1, max_bin=256, max_cat_to_onehot=4,
              max_delta_step=0, max_depth=3, max_leaves=0, min_child_weight=4,
              missing=nan, monotone_constraints='()', n_estimators=150,
              n_jobs=0, num_parallel_tree=1, predictor='auto', random_state=0,
              reg_alpha=0, reg_lambda=1, ...)

## LightGBM

In [None]:
# Hyperparameters that are unpacked into lgbm.LGBMClassifier.
# NOTE(review): 'n_estimators' and 'num_rounds' both look like aliases of
# LightGBM's num_iterations, so presumably only one of the two values (407 or
# 274) is actually honored - confirm against the LightGBM parameter docs and
# drop the redundant key.
lgbm_params = {
    'n_estimators': 407,
    'num_rounds': 274,
    'learning_rate': 0.1,
    'num_leaves': 195,
    'max_depth': 9,
    'min_data_in_leaf': 46,
    'lambda_l1': 0.01,
    'lambda_l2': 0.6,
    'min_gain_to_split': 1.42,
    'bagging_fraction': 0.45,
    'feature_fraction': 0.3,
}

In [None]:
# Build the LightGBM classifier from the hyperparameters above.
lgbm_clf = lgbm.LGBMClassifier(**lgbm_params)

# Estimate out-of-fold ROC AUC; this fits lgbm_clf on each training fold in turn.
cross_validate(X_train, y, lgbm_clf)

# Refit on the full training set, since cross-validation left the model fitted
# on the last fold only.
lgbm_clf.fit(X_train, y, verbose=False)

Fold 0 	 auc: 0.9118584367661857
Fold 1 	 auc: 0.8071955719557196
Fold 2 	 auc: 0.8411606843341161
Fold 3 	 auc: 0.9116068433411607
Fold 4 	 auc: 0.8034216705803421
Fold 5 	 auc: 0.8241361959074135
Fold 6 	 auc: 0.8239684669573969
Fold 7 	 auc: 0.8463914871706857
Fold 8 	 auc: 0.8574615978717927
Fold 9 	 auc: 0.8521410795503304
Avg AUC: 0.8479342034435143


LGBMClassifier(bagging_fraction=0.45, feature_fraction=0.3, lambda_l1=0.01,
               lambda_l2=0.6, max_depth=9, min_data_in_leaf=46,
               min_gain_to_split=1.42, n_estimators=407, num_leaves=195,
               num_rounds=274)

## CatBoost

In [None]:
# Hyperparameters that are unpacked into catboost.CatBoostClassifier.
catboost_params = {
    'loss_function': 'CrossEntropy',
    'learning_rate': 0.76,
    'l2_leaf_reg': 0.014,
    'colsample_bylevel': 0.06,
    'depth': 1,
    'boosting_type': 'Plain',
    'bootstrap_type': 'Bernoulli',
    'min_data_in_leaf': 18,
    'one_hot_max_size': 14,
    'subsample': 0.99,
}

# Build the CatBoost classifier from the hyperparameters above.
catboost_clf = catboost.CatBoostClassifier(**catboost_params)

# Estimate out-of-fold ROC AUC; this fits catboost_clf on each training fold in turn.
# NOTE(review): cross_validate calls fit() without a verbose argument, so the
# per-iteration "learn:" lines in this cell's output presumably come from these
# fold fits and bury the fold AUC lines - confirm, and consider silencing them.
cross_validate(X_train, y, catboost_clf)

# Refit on the full training set, since cross-validation left the model fitted
# on the last fold only.
catboost_clf.fit(X_train, y, verbose=False)

0:	learn: 0.3899362	total: 55.3ms	remaining: 55.3s
1:	learn: 0.3792823	total: 56.5ms	remaining: 28.2s
2:	learn: 0.3609155	total: 57.5ms	remaining: 19.1s
3:	learn: 0.3603260	total: 58.4ms	remaining: 14.5s
4:	learn: 0.3592114	total: 59.3ms	remaining: 11.8s
5:	learn: 0.3427839	total: 59.9ms	remaining: 9.92s
6:	learn: 0.3427666	total: 60.3ms	remaining: 8.56s
7:	learn: 0.3384105	total: 60.9ms	remaining: 7.55s
8:	learn: 0.3317356	total: 61.5ms	remaining: 6.77s
9:	learn: 0.3314480	total: 62ms	remaining: 6.14s
10:	learn: 0.3272721	total: 62.6ms	remaining: 5.63s
11:	learn: 0.3268577	total: 63.1ms	remaining: 5.2s
12:	learn: 0.3267670	total: 63.9ms	remaining: 4.85s
13:	learn: 0.3267668	total: 64.3ms	remaining: 4.53s
14:	learn: 0.3267618	total: 64.9ms	remaining: 4.26s
15:	learn: 0.3228453	total: 66.2ms	remaining: 4.07s
16:	learn: 0.3209077	total: 68.4ms	remaining: 3.95s
17:	learn: 0.3202561	total: 71.1ms	remaining: 3.88s
18:	learn: 0.3202558	total: 71.6ms	remaining: 3.69s
19:	learn: 0.3184713	tota

<catboost.core.CatBoostClassifier at 0x7f5db9b16810>

# 모델 예측 및 블렌딩

- 모델 예측값들은 1위가 만든 모델들의 예측값으로 블렌딩 시 `score : 0.90185`
- 기존 예측값은 우리팀이 만든 CatBoost 모델의 예측값으로 `score : 0.90037`

## 모델 예측

In [None]:
# Positive-class probability (column 1 of predict_proba) on the test rows,
# computed the same way for each of the three fitted models.
xgb_preds, lgbm_preds, cat_preds = (
    fitted_model.predict_proba(X_test)[:, 1]
    for fitted_model in (xgb_clf, lgbm_clf, catboost_clf)
)

## 기존 예측값 로드

- Project3_최종.ipynb 최종모델 예측 결과

In [None]:
# Load the earlier submission file; its 'Attrition' column is used in the blend below.
local_best = pd.read_csv('submission_ver_catboost_best(0.90037).csv')

In [None]:
# Display the loaded predictions as a plain array.
local_best['Attrition'].values

array([0.13844965, 0.15573318, 0.05971285, ..., 0.02647163, 0.04714299,
       0.01000006])

## 블렌딩

- 1위의 모델(XGBoost, LightGBM, CatBoost)과 우리 팀의 CatBoost 모델이 낸 예측값들을 블렌딩
- 총 네 개의 모델(XGBoost, LightGBM, CatBoost 2개)을 블렌딩한 예측값 `score : 0.90407` (1등)

In [None]:
# Equal-weight blend: one column per model, averaged row-wise.
# The original stacked xgb_preds twice and never used lgbm_preds, so LightGBM
# was left out of the blend and XGBoost was counted double.
# NOTE(review): assumes the rows of local_best are in the same order as the
# test rows behind the other three prediction vectors - confirm.
final_preds = np.column_stack([
    xgb_preds,
    lgbm_preds,
    cat_preds,
    local_best['Attrition'].values,
]).mean(axis=1)

In [None]:
# Pair each saved test id with its blended probability.
submission = pd.DataFrame({"id": test_idx})
submission["Attrition"] = final_preds
submission.head()

Unnamed: 0,id,Attrition
0,1677,0.141695
1,1678,0.147479
2,1679,0.039118
3,1680,0.055234
4,1681,0.50084


In [None]:
# Write the submission file without the DataFrame index column.
submission.to_csv("submission.csv", index=False)