# CatBoost

In [1]:
import pandas as pd
import numpy as np
import catboost as cb
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [2]:
# Load the labelled and unlabelled splits; the first CSV column becomes the index.
train = pd.read_csv('dataset/train.csv', index_col=0)
test = pd.read_csv('dataset/test.csv', index_col=0)
# Optional: inspect the class balance of the target.
# print(train['Attrition'].value_counts())

# Binarise the target column: 'Yes' -> 1, every other value -> 0.
train['Attrition'] = (train['Attrition'] == 'Yes').astype('int64')

# Optional: count missing values per column.
# print(train.isna().sum())

In [3]:
# Remove columns that are not used as features: the employee number and the
# standard working hours (per the original author's note the latter is
# always 80).
train, test = (
    frame.drop(columns=['EmployeeNumber', 'StandardHours'])
    for frame in (train, test)
)


In [4]:
# Label-encode the columns that are treated as categorical features.
#
# Each encoder is fitted on the union of the train and test values. An encoder
# fitted on train alone makes transform() raise ValueError as soon as test
# contains a value that never occurs in train (plausible for a column with
# many distinct values such as 'Age').
attr=['Age','BusinessTravel','Department','Education','EducationField','Gender','JobRole','MaritalStatus','Over18','OverTime']
# Fitted encoders, kept in the same order as `attr`.
lbe_list=[]
for feature in attr:
    lbe=LabelEncoder()
    lbe.fit(pd.concat([train[feature], test[feature]], ignore_index=True))
    train[feature]=lbe.transform(train[feature])
    test[feature]=lbe.transform(test[feature])
    lbe_list.append(lbe)

In [5]:
# Hold out 20% of the labelled rows for validation; the fixed random_state
# keeps the split the same between runs.
X_train, X_valid, y_train, y_valid = train_test_split(
    train.drop(columns='Attrition'),
    train['Attrition'],
    test_size=0.2,
    random_state=42,
)

In [6]:
# CatBoost binary classifier configuration.
model = cb.CatBoostClassifier(
    # Objective being optimised and the metric reported on the eval set.
    loss_function='Logloss',
    eval_metric='AUC',
    # Size and shape of the tree ensemble.
    iterations=1000,
    depth=7,
    learning_rate=0.01,
    # Progress logging: one line every 50 iterations.
    logging_level='Verbose',
    metric_period=50,
)

In [7]:
# Positional indices (within X_train) of the columns listed in `attr`.
categorical_features_indices = [
    position
    for position, column in enumerate(X_train.columns.values)
    if column in attr
]
print(categorical_features_indices)

[0, 1, 3, 5, 6, 9, 13, 15, 19, 20]


In [8]:
# Train on the training split, evaluating on the held-out validation split;
# the categorical columns are identified by their positional indices.
model.fit(
    X_train,
    y_train,
    cat_features=categorical_features_indices,
    eval_set=(X_valid, y_valid),
)

0:	test: 0.6390374	best: 0.6390374 (0)	total: 58.3ms	remaining: 58.2s
50:	test: 0.7895886	best: 0.7895886 (50)	total: 160ms	remaining: 2.98s
100:	test: 0.8008294	best: 0.8008294 (100)	total: 270ms	remaining: 2.41s
150:	test: 0.8057405	best: 0.8057405 (150)	total: 398ms	remaining: 2.24s
200:	test: 0.8041035	best: 0.8057405 (150)	total: 530ms	remaining: 2.1s
250:	test: 0.8037761	best: 0.8057405 (150)	total: 667ms	remaining: 1.99s
300:	test: 0.8060679	best: 0.8060679 (300)	total: 810ms	remaining: 1.88s
350:	test: 0.8037761	best: 0.8060679 (300)	total: 1.01s	remaining: 1.87s
400:	test: 0.8019208	best: 0.8060679 (300)	total: 1.18s	remaining: 1.77s
450:	test: 0.8061770	best: 0.8061770 (450)	total: 1.32s	remaining: 1.61s
500:	test: 0.8080323	best: 0.8080323 (500)	total: 1.46s	remaining: 1.45s
550:	test: 0.8079232	best: 0.8080323 (500)	total: 1.59s	remaining: 1.3s
600:	test: 0.8103241	best: 0.8103241 (600)	total: 1.73s	remaining: 1.15s
650:	test: 0.8099967	best: 0.8103241 (600)	total: 1.86s	re

<catboost.core.CatBoostClassifier at 0x15a3019f0>

In [10]:
# Predict the class label for every test row and write the submission file.
#
# This cell stores its predictions in test['Attrition']. On a re-run that
# column would be handed back to the model as an extra, unseen feature, so it
# is dropped before predicting (errors='ignore' makes this a no-op the first
# time, when the column does not exist yet).
predict = model.predict(test.drop(columns='Attrition', errors='ignore'))
test['Attrition']=predict
# Only the index and the predicted label go into the submission CSV.
test[['Attrition']].to_csv('dataset/submit_cb.csv')