# XGBoost

In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import xgboost as xgb
from sklearn.model_selection import train_test_split

In [2]:
train = pd.read_csv('dataset/train.csv', index_col=0)
test = pd.read_csv('dataset/test.csv', index_col=0)
print(train['Attrition'].value_counts())

# 处理 Attrition 字段
train['Attrition']=train['Attrition'].map(lambda x:1 if x=='Yes' else 0)

# 查看数据是否有空值
#print(train.isna().sum())

# 去掉没用的列 员工号码，标准工时（=80）
train = train.drop(['EmployeeNumber', 'StandardHours'], axis=1)
test = test.drop(['EmployeeNumber', 'StandardHours'], axis=1)

# 对于分类特征进行特征值编码
attr=['Age','BusinessTravel','Department','Education','EducationField','Gender','JobRole','MaritalStatus','Over18','OverTime']
lbe_list=[]
for feature in attr:
    lbe=LabelEncoder()
    train[feature]=lbe.fit_transform(train[feature])
    test[feature]=lbe.transform(test[feature])
    lbe_list.append(lbe)
#train.to_csv('temp.csv')
#print(train)

param = {
    'boosting_type': 'gbdt',
    'objective': 'binary:logistic', #
    'eval_metric': 'auc',
    'eta': 0.1,
    'max_depth': 1,
    'colsample_bytree': 0.8,
    'subsample': 0.9,
    'subsample_freq': 7,
    'alpha': 0,
    'lambda': 0,
}

X_train, X_valid, y_train, y_valid = train_test_split(train.drop('Attrition',axis=1), train['Attrition'], test_size=0.2, random_state=2023)

train_data = xgb.DMatrix(X_train, label=y_train)
valid_data = xgb.DMatrix(X_valid, label=y_valid)
test_data = xgb.DMatrix(test)

model = xgb.train(param, train_data, evals=[(train_data, 'train'), (valid_data, 'valid')], num_boost_round = 100000, early_stopping_rounds=100, verbose_eval=5)
predict = model.predict(test_data)
test['Attrition']=predict
print(predict)

Attrition
No     988
Yes    188
Name: count, dtype: int64
[0]	train-auc:0.59042	valid-auc:0.61549
[5]	train-auc:0.73945	valid-auc:0.74122
[10]	train-auc:0.77306	valid-auc:0.75096
[15]	train-auc:0.78399	valid-auc:0.77669
[20]	train-auc:0.79002	valid-auc:0.78792
[25]	train-auc:0.79629	valid-auc:0.79602
[30]	train-auc:0.80597	valid-auc:0.80114
[35]	train-auc:0.81931	valid-auc:0.80910
[40]	train-auc:0.82886	valid-auc:0.80498
[45]	train-auc:0.83329	valid-auc:0.79844
[50]	train-auc:0.84048	valid-auc:0.81016
[55]	train-auc:0.84293	valid-auc:0.81201
[60]	train-auc:0.84814	valid-auc:0.81912
[65]	train-auc:0.84923	valid-auc:0.82800
[70]	train-auc:0.85439	valid-auc:0.82353
[75]	train-auc:0.86077	valid-auc:0.82516
[80]	train-auc:0.86389	valid-auc:0.82317
[85]	train-auc:0.86727	valid-auc:0.82530
[90]	train-auc:0.86878	valid-auc:0.82445
[95]	train-auc:0.86994	valid-auc:0.82658
[100]	train-auc:0.87323	valid-auc:0.82914
[105]	train-auc:0.87430	valid-auc:0.82644
[110]	train-auc:0.87725	valid-auc:0.8260

Parameters: { "boosting_type", "subsample_freq" } are not used.



[160]	train-auc:0.89134	valid-auc:0.83611
[165]	train-auc:0.89351	valid-auc:0.83440
[170]	train-auc:0.89339	valid-auc:0.83042
[175]	train-auc:0.89585	valid-auc:0.82871
[180]	train-auc:0.89827	valid-auc:0.82900
[185]	train-auc:0.89910	valid-auc:0.82772
[190]	train-auc:0.89827	valid-auc:0.83099
[195]	train-auc:0.90022	valid-auc:0.83156
[200]	train-auc:0.90156	valid-auc:0.82900
[205]	train-auc:0.90323	valid-auc:0.82857
[210]	train-auc:0.90549	valid-auc:0.83213
[215]	train-auc:0.90679	valid-auc:0.82928
[220]	train-auc:0.90749	valid-auc:0.83056
[225]	train-auc:0.90939	valid-auc:0.83255
[230]	train-auc:0.90998	valid-auc:0.83170
[235]	train-auc:0.91115	valid-auc:0.83483
[240]	train-auc:0.91129	valid-auc:0.83340
[245]	train-auc:0.91174	valid-auc:0.83454
[250]	train-auc:0.91233	valid-auc:0.83525
[255]	train-auc:0.91352	valid-auc:0.83582
[256]	train-auc:0.91361	valid-auc:0.83682
[0.06316353 0.07983631 0.12851222 0.07761735 0.72005045 0.1850563
 0.2662958  0.05720792 0.0103399  0.2573669  0.08311