# Catboost Classifier

In [1]:
import catboost
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
data = pd.read_csv("data/base.csv")
drop = ['타겟', '가해운전자 차종']

X = data.drop(drop, axis=1)
y = data['타겟']

# Identify object columns for label encoding
object_columns = X.select_dtypes(include=['object']).columns

# Label encode object columns
label_encoder = LabelEncoder()
for column in object_columns:
    X[column] = label_encoder.fit_transform(X[column])

#print(X.isnull().sum(), y.isnull().sum())

In [3]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a CatBoost classifier
catboost_model = CatBoostClassifier(iterations=500, depth=10, learning_rate=0.05, loss_function='MultiClass', verbose=False)


# Fit the model to the training data
catboost_model.fit(X_train, y_train)
feature_importance = catboost_model.get_feature_importance(prettified=True)
# Predict on the test set
y_pred = catboost_model.predict(X_test)

# Decode predicted values if you want to get them back to original class labels
y_pred_original = label_encoder.inverse_transform(y_pred)

# Evaluate the performance
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

# Confusion Matrix
conf_matrix = confusion_matrix(y_test, y_pred)

# Print the results
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")

print("Confusion Matrix:")
print(conf_matrix)

print("Feature Importance:")
print(feature_importance)


Accuracy: 0.9852
Precision: 0.9854
Recall: 0.9852
F1 Score: 0.9851
Confusion Matrix:
[[1681   63   18]
 [   2 3227    4]
 [   2    1 1077]]
Feature Importance:
    Feature Id  Importances
0   피해운전자 상해정도    24.009542
1         사고내용    13.808677
2         경상자수     9.768279
3     피해운전자 차종     6.868876
4       부상신고자수     5.849228
5     가해운전자 연령     5.450001
6         사고유형     4.742704
7     피해운전자 연령     4.495854
8         중상자수     3.995344
9           요일     3.625214
10           월     3.539871
11        도광역시     3.369803
12      하루시간구분     2.905957
13        법규위반     2.463361
14        도로형태     2.139609
15    피해운전자 성별     1.413194
16    가해운전자 성별     0.984423
17        기상상태     0.226292
18        사망자수     0.222248
19        노면상태     0.121522


  y = column_or_1d(y, warn=True)


In [4]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

param_grid = {
    'iterations': [500, 1000],
    'depth': [6, 8, 10],
    'learning_rate': [0.01, 0.05, 0.1]
    # Add more hyperparameters to the grid
}

grid_search = GridSearchCV(CatBoostClassifier(loss_function='MultiClass', verbose=False), param_grid, cv=3)
grid_search.fit(X_train, y_train)

# Extract the best parameters and best model
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

# Evaluate on the test set
y_pred = best_model.predict(X_test)

# Decode predicted values if you want to get them back to original class labels
y_pred_original = label_encoder.inverse_transform(y_pred)

# Evaluate the performance on the test set
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

# Print the results
print("Best Parameters:", best_params)
print(f"Accuracy on Test Set: {accuracy:.4f}")
print(f"Precision on Test Set: {precision:.4f}")
print(f"Recall on Test Set: {recall:.4f}")
print(f"F1 Score on Test Set: {f1:.4f}")

Best Parameters: {'depth': 6, 'iterations': 500, 'learning_rate': 0.05}
Accuracy on Test Set: 0.9849
Precision on Test Set: 0.9851
Recall on Test Set: 0.9849
F1 Score on Test Set: 0.9848


  y = column_or_1d(y, warn=True)
