# CatBoost

In [1]:
import pandas as pd
import numpy as np
import catboost as cb
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import os
# Root location of the datasets, taken from the DATA_PATH environment variable.
# NOTE: this is None when DATA_PATH is unset. The value is later combined with
# file names by plain string concatenation, so it must end with a path separator.
data_path = os.getenv('DATA_PATH')

In [2]:
# Read the train and test CSV files; the first column of each file becomes the index.
train = pd.read_csv(
    data_path + 'AI_Cheats/employee_turnover/train.csv',
    index_col=0,
)
test = pd.read_csv(
    data_path + 'AI_Cheats/employee_turnover/test.csv',
    index_col=0,
)

# Binarise the target: 'Yes' becomes 1, every other value (missing ones included) becomes 0.
train['Attrition'] = train['Attrition'].eq('Yes').astype('int64')

# Optional sanity check: number of missing values per column
# print(train.isna().sum())

In [3]:
# Remove the columns that are not used as model inputs, identically for both frames.
# (The original note describes StandardHours as constant; EmployeeNumber is
# presumably an identifier rather than a constant — verify against the data.)
unused_columns = ['EmployeeNumber', 'StandardHours']
train = train.drop(columns=unused_columns)
test = test.drop(columns=unused_columns)

In [4]:
# Features that are label-encoded here and later handed to CatBoost as categorical.
# NOTE(review): 'Age' is in this list, so it is treated as a category rather than
# as a number downstream — confirm that this is intended.
attr = ['Age', 'BusinessTravel', 'Department', 'Education', 'EducationField', 
        'Gender', 'JobRole', 'MaritalStatus', 'Over18', 'OverTime']

# Encode every listed feature as integer codes, keeping one fitted encoder per
# feature (same order as `attr`) in `lbe_list`.
# The encoder is fitted on the values of train AND test: LabelEncoder.transform
# raises ValueError on a value it has not seen, so a value that only occurs in the
# test set would otherwise abort the cell. LabelEncoder sorts its classes, hence
# whenever the test values are a subset of the train values the resulting codes
# are identical to those of a train-only fit.
lbe_list = []
for feature in attr:
    lbe = LabelEncoder()
    lbe.fit(pd.concat([train[feature], test[feature]]))
    train[feature] = lbe.transform(train[feature])
    test[feature] = lbe.transform(test[feature])
    lbe_list.append(lbe)
# Print the processed training data to check encoding
# print(train)

In [5]:
# Hold out 20% of the labelled rows for validation; the fixed random_state makes
# the split repeatable. The target column is removed from the feature frame.
X_train, X_valid, y_train, y_valid = train_test_split(
    train.drop(columns='Attrition'),
    train['Attrition'],
    test_size=0.2,
    random_state=42,
)

In [6]:
# Settings for the CatBoost classifier, collected in one place so they are easy to tune.
model_params = {
    'iterations': 1000,
    'depth': 7,
    'learning_rate': 0.01,
    'loss_function': 'Logloss',   # objective optimised during training
    'eval_metric': 'AUC',         # metric reported on the evaluation set
    'logging_level': 'Verbose',
    'metric_period': 50,          # report the metric every 50 iterations
}
model = cb.CatBoostClassifier(**model_params)

In [7]:
# Positions (within X_train's columns) of the features listed in `attr`;
# these are passed to CatBoost as the categorical feature indices.
categorical_features_indices = [
    position
    for position, column_name in enumerate(X_train.columns.values)
    if column_name in attr
]

print(categorical_features_indices)

[0, 1, 3, 5, 6, 9, 13, 15, 19, 20]


In [8]:
# Train on the training split; the validation split is supplied as the evaluation
# set, and the previously computed column positions mark the categorical features.
model.fit(
    X_train,
    y_train,
    cat_features=categorical_features_indices,
    eval_set=(X_valid, y_valid),
)

0:	test: 0.6390374	best: 0.6390374 (0)	total: 60ms	remaining: 59.9s
50:	test: 0.7895886	best: 0.7895886 (50)	total: 173ms	remaining: 3.22s
100:	test: 0.8008294	best: 0.8008294 (100)	total: 269ms	remaining: 2.39s
150:	test: 0.8057405	best: 0.8057405 (150)	total: 383ms	remaining: 2.15s
200:	test: 0.8041035	best: 0.8057405 (150)	total: 500ms	remaining: 1.99s
250:	test: 0.8037761	best: 0.8057405 (150)	total: 625ms	remaining: 1.86s
300:	test: 0.8060679	best: 0.8060679 (300)	total: 749ms	remaining: 1.74s
350:	test: 0.8037761	best: 0.8060679 (300)	total: 864ms	remaining: 1.6s
400:	test: 0.8019208	best: 0.8060679 (300)	total: 985ms	remaining: 1.47s
450:	test: 0.8061770	best: 0.8061770 (450)	total: 1.1s	remaining: 1.33s
500:	test: 0.8080323	best: 0.8080323 (500)	total: 1.21s	remaining: 1.21s
550:	test: 0.8079232	best: 0.8080323 (500)	total: 1.32s	remaining: 1.08s
600:	test: 0.8103241	best: 0.8103241 (600)	total: 1.43s	remaining: 948ms
650:	test: 0.8099967	best: 0.8103241 (600)	total: 1.55s	rema

<catboost.core.CatBoostClassifier at 0x107a62f10>

In [9]:
# Predict on the feature columns only. This cell adds an 'Attrition' column to
# `test`, so on a re-run that column is already present; dropping it first
# (errors='ignore' makes this a no-op on the first run) keeps the cell idempotent
# instead of feeding an extra, non-feature column to the model.
test_features = test.drop(columns=['Attrition'], errors='ignore')
predict = model.predict(test_features)

# Store the predictions next to the test rows and write the submission file,
# which contains the index and the predicted 'Attrition' column.
test['Attrition'] = predict
test[['Attrition']].to_csv(data_path + 'AI_Cheats/employee_turnover/submit_cb.csv')