# XGBoost

In [101]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import xgboost as xgb
from sklearn.model_selection import train_test_split

In [179]:
train = pd.read_csv('dataset/train.csv', index_col=0)
test = pd.read_csv('dataset/test.csv', index_col=0)
print(train['Attrition'].value_counts())

# 处理 Attrition 字段
train['Attrition']=train['Attrition'].map(lambda x:1 if x=='Yes' else 0)

# 查看数据是否有空值
#print(train.isna().sum())

# 去掉没用的列 员工号码，标准工时（=80）
train = train.drop(['EmployeeNumber', 'StandardHours'], axis=1)
test = test.drop(['EmployeeNumber', 'StandardHours'], axis=1)

# 对于分类特征进行特征值编码
attr=['Age','BusinessTravel','Department','Education','EducationField','Gender','JobRole','MaritalStatus','Over18','OverTime']
lbe_list=[]
for feature in attr:
    lbe=LabelEncoder()
    train[feature]=lbe.fit_transform(train[feature])
    test[feature]=lbe.transform(test[feature])
    lbe_list.append(lbe)
#train.to_csv('temp.csv')
#print(train)

param = {
    'boosting_type': 'gbdt',
    'objective': 'binary:logistic', #
    'eval_metric': 'auc',
    'eta': 0.1,
    'max_depth': 1,
    'colsample_bytree': 0.8,
    'subsample': 0.9,
    'subsample_freq': 7,
    'alpha': 0,
    'lambda': 0,
}

X_train, X_valid, y_train, y_valid = train_test_split(train.drop('Attrition',axis=1), train['Attrition'], test_size=0.2, random_state=2023)

train_data = xgb.DMatrix(X_train, label=y_train)
valid_data = xgb.DMatrix(X_valid, label=y_valid)
test_data = xgb.DMatrix(test)

model = xgb.train(param, train_data, evals=[(train_data, 'train'), (valid_data, 'valid')], num_boost_round = 100000, early_stopping_rounds=100, verbose_eval=5)
predict = model.predict(test_data)
test['Attrition']=predict
print(predict)

Attrition
No     988
Yes    188
Name: count, dtype: int64
Parameters: { "boosting_type", "subsample_freq" } are not used.

[0]	train-auc:0.59995	valid-auc:0.60057
[5]	train-auc:0.73517	valid-auc:0.71713


[10]	train-auc:0.76513	valid-auc:0.75984
[15]	train-auc:0.77539	valid-auc:0.77832
[20]	train-auc:0.78225	valid-auc:0.77719
[25]	train-auc:0.79339	valid-auc:0.77555
[30]	train-auc:0.80060	valid-auc:0.78785
[35]	train-auc:0.81294	valid-auc:0.79055
[40]	train-auc:0.82055	valid-auc:0.80505
[45]	train-auc:0.83123	valid-auc:0.81357
[50]	train-auc:0.83365	valid-auc:0.81151
[55]	train-auc:0.83604	valid-auc:0.81457
[60]	train-auc:0.84121	valid-auc:0.81528
[65]	train-auc:0.84831	valid-auc:0.81883
[70]	train-auc:0.85293	valid-auc:0.81684
[75]	train-auc:0.85901	valid-auc:0.82253
[80]	train-auc:0.86369	valid-auc:0.82452
[85]	train-auc:0.86541	valid-auc:0.82722
[90]	train-auc:0.86669	valid-auc:0.81748
[95]	train-auc:0.87085	valid-auc:0.81763
[100]	train-auc:0.87045	valid-auc:0.81862
[105]	train-auc:0.87328	valid-auc:0.81919
[110]	train-auc:0.87645	valid-auc:0.82189
[115]	train-auc:0.87615	valid-auc:0.82203
[120]	train-auc:0.87760	valid-auc:0.82061
[125]	train-auc:0.88070	valid-auc:0.82146
[130]	trai

  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_categorical_dtype(dtype)
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_sparse(data):
  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_categorical_dtype(dtype)
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_sparse(data):
  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_categorical_dtype(dtype)
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)


[170]	train-auc:0.89316	valid-auc:0.83397
[175]	train-auc:0.89530	valid-auc:0.83141
[180]	train-auc:0.89508	valid-auc:0.83511
[185]	train-auc:0.89622	valid-auc:0.83255
[190]	train-auc:0.89846	valid-auc:0.83312
[195]	train-auc:0.90012	valid-auc:0.83653
[200]	train-auc:0.90148	valid-auc:0.83710
[205]	train-auc:0.90292	valid-auc:0.83810
[210]	train-auc:0.90315	valid-auc:0.83355
[215]	train-auc:0.90380	valid-auc:0.83539
[220]	train-auc:0.90412	valid-auc:0.83426
[225]	train-auc:0.90527	valid-auc:0.83625
[230]	train-auc:0.90642	valid-auc:0.83767
[235]	train-auc:0.90713	valid-auc:0.83881
[240]	train-auc:0.90761	valid-auc:0.83838
[245]	train-auc:0.90883	valid-auc:0.84037
[250]	train-auc:0.91027	valid-auc:0.83994
[255]	train-auc:0.91115	valid-auc:0.84037
[260]	train-auc:0.91195	valid-auc:0.84065
[265]	train-auc:0.91322	valid-auc:0.84250
[270]	train-auc:0.91434	valid-auc:0.84193
[275]	train-auc:0.91487	valid-auc:0.84065
[280]	train-auc:0.91552	valid-auc:0.83639
[285]	train-auc:0.91674	valid-auc: