In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Select the model inputs and the binary target from the prepared frame.
# NOTE(review): 'OSF', 'PWF', 'HDF' and 'TWF' look like per-failure-mode
# indicator flags; if 'Machine failure' is derived from them they leak the
# target into the features -- confirm before trusting any score below.
feature_columns = [
    'Power',
    'OSF',
    'PWF',
    'HDF',
    'TWF',
    'Torque [Nm]',
    'Rotational speed [rpm]',
    'Temp_Difference',
]
X = df[feature_columns]
y = df['Machine failure']

# 80/20 hold-out split with a fixed seed; stratifying on y keeps the class
# proportions the same in the training and test parts.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=42,
    stratify=y,
)

print("Data splitting complete.")
for label, part in (
    ("Shape of x_train:", x_train),
    ("Shape of x_test:", x_test),
    ("Shape of y_train:", y_train),
    ("Shape of y_test:", y_test),
):
    print(label, part.shape)

Data splitting complete.
Shape of x_train: (8000, 8)
Shape of x_test: (2000, 8)
Shape of y_train: (8000,)
Shape of y_test: (2000,)


In [None]:
from imblearn.over_sampling import SMOTE

# Oversample the minority class of the training split with SMOTE (seeded).
# NOTE(review): in the cells visible in this notebook the resampled arrays are
# only reported here; the CatBoost model is fitted on x_train / y_train --
# confirm whether the resampled data is meant to be used.
smote = SMOTE(random_state=42)
x_train_resampled, y_train_resampled = smote.fit_resample(x_train, y_train)

# Shapes before and after resampling.
for label, shape in (
    ("Shape of x_train before oversampling:", x_train.shape),
    ("Shape of x_train after oversampling:", x_train_resampled.shape),
    ("Shape of y_train before oversampling:", y_train.shape),
    ("Shape of y_train after oversampling:", y_train_resampled.shape),
):
    print(label, shape)

# Class balance before and after resampling.
for heading, labels in (
    ("\nValue counts of y_train before oversampling:", y_train),
    ("\nValue counts of y_train after oversampling:", y_train_resampled),
):
    print(heading)
    print(labels.value_counts())

Shape of x_train before oversampling: (8000, 8)
Shape of x_train after oversampling: (15458, 8)
Shape of y_train before oversampling: (8000,)
Shape of y_train after oversampling: (15458,)

Value counts of y_train before oversampling:
Machine failure
0    7729
1     271
Name: count, dtype: int64

Value counts of y_train after oversampling:
Machine failure
0    7729
1    7729
Name: count, dtype: int64


In [None]:
from catboost import CatBoostClassifier

# Train CatBoost classifier.
# class_weights=[1, 10] gives class 1 (failure) ten times the weight of class 0
# to counter the class imbalance; verbose=0 actually suppresses the
# per-iteration training log (previously only the comment claimed this, so the
# cell printed one line per boosting iteration).
catboost_model = CatBoostClassifier(
    class_weights=[1, 10],
    random_state=42,
    verbose=0,
)

# Fit on the original training split (not the SMOTE-resampled one).
catboost_model.fit(x_train, y_train)

# Predictions on the held-out test split
y_pred_catboost = catboost_model.predict(x_test)

# Evaluation: overall accuracy plus per-class metrics, since accuracy alone
# says little about the rare failure class.
print("CatBoost Classifier performance on test data:")
print("Accuracy:", accuracy_score(y_test, y_pred_catboost))
print("Classification Report:\n", classification_report(y_test, y_pred_catboost))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_catboost))

Learning rate set to 0.025035
0:	learn: 0.6318102	total: 3.03ms	remaining: 3.02s
1:	learn: 0.5765138	total: 5.72ms	remaining: 2.86s
2:	learn: 0.5295646	total: 8.31ms	remaining: 2.76s
3:	learn: 0.4841192	total: 11.1ms	remaining: 2.77s
4:	learn: 0.4424022	total: 13.9ms	remaining: 2.77s
5:	learn: 0.4057307	total: 16.6ms	remaining: 2.75s
6:	learn: 0.3735014	total: 19.3ms	remaining: 2.73s
7:	learn: 0.3425868	total: 21.9ms	remaining: 2.72s
8:	learn: 0.3143478	total: 24.5ms	remaining: 2.7s
9:	learn: 0.2889531	total: 27.3ms	remaining: 2.7s
10:	learn: 0.2709996	total: 29.9ms	remaining: 2.68s
11:	learn: 0.2484949	total: 32.4ms	remaining: 2.67s
12:	learn: 0.2282681	total: 35ms	remaining: 2.66s
13:	learn: 0.2115218	total: 37.7ms	remaining: 2.65s
14:	learn: 0.1960673	total: 40ms	remaining: 2.63s
15:	learn: 0.1824273	total: 42.6ms	remaining: 2.62s
16:	learn: 0.1709751	total: 45.3ms	remaining: 2.62s
17:	learn: 0.1638562	total: 50.7ms	remaining: 2.77s
18:	learn: 0.1531789	total: 53.6ms	remaining: 2.77

In [None]:
from sklearn.metrics import classification_report, accuracy_score

# Score the fitted model on its own training split; comparing these numbers
# with the test-set metrics is a quick check for overfitting.
y_train_pred = catboost_model.predict(x_train)

train_accuracy = accuracy_score(y_train, y_train_pred)
print("Train Accuracy:", train_accuracy)

train_report = classification_report(y_train, y_train_pred)
print(train_report)


Train Accuracy: 1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      7729
           1       1.00      1.00      1.00       271

    accuracy                           1.00      8000
   macro avg       1.00      1.00      1.00      8000
weighted avg       1.00      1.00      1.00      8000



In [None]:
from sklearn.base import clone
from sklearn.model_selection import cross_val_score, cross_validate
import numpy as np

# 5-fold cross-validation of the CatBoost configuration.
#
# * Uses X, the feature frame the model above is trained on. The previous code
#   passed `cols`, which is not defined in any cell visible in this notebook.
#   NOTE(review): if `cols` is defined in an earlier cell and differs from X,
#   confirm which feature set is intended.
# * Works on an unfitted, silenced clone so the fitted `catboost_model` is left
#   untouched and the five fits do not flood the output with training logs.
# * Reports F1 for class 1 next to accuracy: failures are rare in y, so a model
#   that never predicts a failure would still reach a high accuracy.
cv_model = clone(catboost_model).set_params(verbose=0)
cv_results = cross_validate(cv_model, X, y, cv=5, scoring=['accuracy', 'f1'])

# `scores` keeps its original meaning: per-fold accuracy.
scores = cv_results['test_accuracy']
f1_scores = cv_results['test_f1']

print("Cross-validation scores:", scores)
print("Mean accuracy:", np.mean(scores))
print("Cross-validation F1 scores (class 1):", f1_scores)
print("Mean F1 (class 1):", np.mean(f1_scores))


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
6:	learn: 0.3721558	total: 37.4ms	remaining: 5.31s
7:	learn: 0.3454869	total: 41.9ms	remaining: 5.19s
8:	learn: 0.3200205	total: 45.6ms	remaining: 5.02s
9:	learn: 0.2952082	total: 54.3ms	remaining: 5.38s
10:	learn: 0.2792770	total: 62.7ms	remaining: 5.63s
11:	learn: 0.2585000	total: 70.6ms	remaining: 5.81s
12:	learn: 0.2394575	total: 79.5ms	remaining: 6.03s
13:	learn: 0.2220912	total: 83.2ms	remaining: 5.86s
14:	learn: 0.2068081	total: 91.3ms	remaining: 6s
15:	learn: 0.1928779	total: 101ms	remaining: 6.24s
16:	learn: 0.1843868	total: 108ms	remaining: 6.27s
17:	learn: 0.1726711	total: 112ms	remaining: 6.14s
18:	learn: 0.1625474	total: 116ms	remaining: 5.99s
19:	learn: 0.1527525	total: 125ms	remaining: 6.13s
20:	learn: 0.1444294	total: 131ms	remaining: 6.12s
21:	learn: 0.1369174	total: 140ms	remaining: 6.22s
22:	learn: 0.1293751	total: 148ms	remaining: 6.28s
23:	learn: 0.1227042	total: 157ms	remaining: 6.37s
24:	learn: 0.11

In [None]:
import joblib

# Persist the fitted CatBoost model to disk with joblib.
# NOTE(review): the filename mentions a SMOTE pipeline, but the object written
# is `catboost_model`, which the visible training cell fits on x_train rather
# than on the SMOTE-resampled split -- confirm the name is intended.
model_path = 'catboost_smote_pipeline.joblib'
joblib.dump(catboost_model, model_path)

print("CatBoost model saved successfully!")

CatBoost model saved successfully!


In [None]:
# Imbalance-related CatBoost settings, kept as a reusable keyword-argument dict
# (e.g. CatBoostClassifier(**catboost_params)).
# This cell used to be a dangling keyword-argument fragment whose indented
# second line raised an IndentationError when the cell was run.
catboost_params = dict(
    class_weights=[1, 10],  # make class 1 (failure) more important
    random_state=42,
)