In [1]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import pandas as pd
df=pd.read_csv("/content/cleaned_ai4i2020.csv")
# Prepare data
X = df[['Power', 'OSF', 'PWF', 'HDF', 'TWF', 'Torque [Nm]', 'Rotational speed [rpm]', 'Temp_Difference']] # Features
y = df['Machine failure'] # Target

# Split data into training and testing sets with shuffling
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42,stratify=y)

print("Data splitting complete.")
print("Shape of x_train:", x_train.shape)
print("Shape of x_test:", x_test.shape)
print("Shape of y_train:", y_train.shape)
print("Shape of y_test:", y_test.shape)

Data splitting complete.
Shape of x_train: (8000, 8)
Shape of x_test: (2000, 8)
Shape of y_train: (8000,)
Shape of y_test: (2000,)


In [2]:
from imblearn.over_sampling import SMOTE

smote = SMOTE(random_state=42)
x_train_resampled, y_train_resampled = smote.fit_resample(x_train, y_train)

print("Shape of x_train before oversampling:", x_train.shape)
print("Shape of x_train after oversampling:", x_train_resampled.shape)
print("Shape of y_train before oversampling:", y_train.shape)
print("Shape of y_train after oversampling:", y_train_resampled.shape)

print("\nValue counts of y_train before oversampling:")
print(y_train.value_counts())

print("\nValue counts of y_train after oversampling:")
print(y_train_resampled.value_counts())

Shape of x_train before oversampling: (8000, 8)
Shape of x_train after oversampling: (15458, 8)
Shape of y_train before oversampling: (8000,)
Shape of y_train after oversampling: (15458,)

Value counts of y_train before oversampling:
Machine failure
0    7729
1     271
Name: count, dtype: int64

Value counts of y_train after oversampling:
Machine failure
0    7729
1    7729
Name: count, dtype: int64


In [4]:
pip install catboost

Collecting catboost
  Downloading catboost-1.2.8-cp312-cp312-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.8-cp312-cp312-manylinux2014_x86_64.whl (99.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m8.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.8


In [20]:
from catboost import CatBoostClassifier

# Train CatBoost classifier
best_params= {'verbose': True, 'loss_function': 'Logloss', 'learning_rate': 0.01, 'l2_leaf_reg': 5, 'iterations': 200, 'depth': 4, 'border_count': 32}
catboost_model = CatBoostClassifier(class_weights=[1, 10],random_state=42,**best_params) # Suppress verbose output

catboost_model.fit(x_train, y_train)

# Predictions
y_pred_catboost = catboost_model.predict(x_test)

# Evaluation
print("CatBoost Classifier performance on test data:")
print("Accuracy:", accuracy_score(y_test, y_pred_catboost))
print("Classification Report:\n", classification_report(y_test, y_pred_catboost))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_catboost))

0:	learn: 0.6683824	total: 2.42ms	remaining: 482ms
1:	learn: 0.6522758	total: 5.75ms	remaining: 569ms
2:	learn: 0.6362166	total: 9.3ms	remaining: 611ms
3:	learn: 0.6248294	total: 14ms	remaining: 685ms
4:	learn: 0.6112784	total: 19.4ms	remaining: 758ms
5:	learn: 0.6003441	total: 24.2ms	remaining: 781ms
6:	learn: 0.5863586	total: 27.3ms	remaining: 752ms
7:	learn: 0.5655636	total: 32.9ms	remaining: 789ms
8:	learn: 0.5522052	total: 37.7ms	remaining: 800ms
9:	learn: 0.5404267	total: 41.3ms	remaining: 784ms
10:	learn: 0.5213672	total: 47.5ms	remaining: 816ms
11:	learn: 0.5030412	total: 53.6ms	remaining: 839ms
12:	learn: 0.4935073	total: 57.4ms	remaining: 826ms
13:	learn: 0.4855454	total: 64ms	remaining: 850ms
14:	learn: 0.4743971	total: 68.4ms	remaining: 844ms
15:	learn: 0.4619446	total: 70.5ms	remaining: 811ms
16:	learn: 0.4458883	total: 72.7ms	remaining: 782ms
17:	learn: 0.4304674	total: 76ms	remaining: 768ms
18:	learn: 0.4209822	total: 78.2ms	remaining: 745ms
19:	learn: 0.4065343	total: 8

In [21]:
from sklearn.metrics import classification_report, accuracy_score

# Predictions on train
y_train_pred = catboost_model.predict(x_train)
print("Train Accuracy:", accuracy_score(y_train, y_train_pred))
print(classification_report(y_train, y_train_pred))


Train Accuracy: 0.999125
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      7729
           1       1.00      0.97      0.99       271

    accuracy                           1.00      8000
   macro avg       1.00      0.99      0.99      8000
weighted avg       1.00      1.00      1.00      8000



In [22]:
from sklearn.model_selection import cross_val_score
import numpy as np

scores = cross_val_score(catboost_model, X, y, cv=5, scoring='accuracy')
print("Cross-validation scores:", scores)
print("Mean accuracy:", np.mean(scores))


0:	learn: 0.6688958	total: 2.38ms	remaining: 473ms
1:	learn: 0.6455046	total: 4.38ms	remaining: 434ms
2:	learn: 0.6229356	total: 6.49ms	remaining: 426ms
3:	learn: 0.6012336	total: 8.46ms	remaining: 415ms
4:	learn: 0.5877009	total: 10.4ms	remaining: 405ms
5:	learn: 0.5672798	total: 12.3ms	remaining: 399ms
6:	learn: 0.5531722	total: 14.3ms	remaining: 394ms
7:	learn: 0.5340584	total: 16.3ms	remaining: 392ms
8:	learn: 0.5216802	total: 18.3ms	remaining: 389ms
9:	learn: 0.5037700	total: 20.5ms	remaining: 390ms
10:	learn: 0.4865467	total: 22.5ms	remaining: 386ms
11:	learn: 0.4756025	total: 24.3ms	remaining: 381ms
12:	learn: 0.4594566	total: 26.2ms	remaining: 377ms
13:	learn: 0.4495300	total: 28.2ms	remaining: 374ms
14:	learn: 0.4343879	total: 30.1ms	remaining: 371ms
15:	learn: 0.4262056	total: 32.1ms	remaining: 369ms
16:	learn: 0.4119533	total: 34.1ms	remaining: 367ms
17:	learn: 0.3982759	total: 36.1ms	remaining: 365ms
18:	learn: 0.3898893	total: 39.1ms	remaining: 373ms
19:	learn: 0.3770763	t

In [44]:
import pandas as pd
import numpy as np
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, precision_recall_curve

# ============================
# 1. Load and prepare dataset
df = pd.read_csv("/content/cleaned_ai4i2020.csv")  # replace with your path
X = df[['Power', 'OSF', 'PWF', 'HDF', 'TWF', 'Torque [Nm]', 'Rotational speed [rpm]', 'Temp_Difference']]
y = df["Machine failure"]

# Split train-test
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# ============================
# 2. Train CatBoost
# ============================
model = CatBoostClassifier(
    iterations=500,
    depth=6,
    learning_rate=0.1,
    l2_leaf_reg=5,
    eval_metric="AUC",
    verbose=0,
    random_state=42
)

model.fit(X_train, y_train)

# ============================
# 3. Cross-validation check
# ============================
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = cross_val_score(model, X, y, cv=cv, scoring="accuracy")
print("Cross-validation scores:", cv_scores)
print("Mean CV accuracy:", cv_scores.mean())

# ============================
# 4. Predictions & Probabilities
# ============================
y_proba = model.predict_proba(X_test)[:, 1]

# ============================
# 5. Find Best Threshold
# ============================
precisions, recalls, thresholds = precision_recall_curve(y_test, y_proba)

# Define custom metric: maximize recall of failures while keeping precision ≥ 0.98
best_thresh, best_f1 = 0.5, 0
for t in thresholds:
    y_pred = (y_proba >= t).astype(int)
    report = classification_report(y_test, y_pred, output_dict=True)
    failure_recall = report['1']['recall']
    failure_precision = report['1']['precision']
    failure_f1 = report['1']['f1-score']

    if failure_precision >= 0.98 and failure_f1 > best_f1:
        best_thresh, best_f1 = t, failure_f1

print(f"Best Threshold: {best_thresh:.3f} | Best Failure F1: {best_f1:.3f}")

# ============================
# 6. Apply Threshold
# ============================
def final_prediction(proba, low=0.4, high=0.6):
    if proba >= high:
        return "Failure"
    elif proba < low:
        return "No Failure"
    else:
        return "Review Required"

X_test_preds = [final_prediction(p, low=best_thresh-0.1, high=best_thresh) for p in y_proba]

# ============================
# 7. Evaluation
# ============================
y_pred = [1 if p=="Failure" else 0 for p in X_test_preds]

print("\nClassification Report:")
print(classification_report(y_test, y_pred))

print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

print("AUC-ROC:", roc_auc_score(y_test, y_proba))

# Show some predictions
results = pd.DataFrame({
    "True_Label": y_test.values,
    "Predicted_Proba": y_proba,
    "Final_Label": X_test_preds
}).head(20)
print("\nSample predictions (first 20):")
print(results)

Cross-validation scores: [0.999  0.999  0.999  0.999  0.9995]
Mean CV accuracy: 0.9991


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Best Threshold: 0.955 | Best Failure F1: 0.985

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1932
           1       1.00      0.97      0.99        68

    accuracy                           1.00      2000
   macro avg       1.00      0.99      0.99      2000
weighted avg       1.00      1.00      1.00      2000

Confusion Matrix:
[[1932    0]
 [   2   66]]
AUC-ROC: 0.9877945743514798

Sample predictions (first 20):
    True_Label  Predicted_Proba Final_Label
0            0         0.000157  No Failure
1            0         0.000021  No Failure
2            0         0.000017  No Failure
3            0         0.000009  No Failure
4            0         0.000204  No Failure
5            0         0.000071  No Failure
6            0         0.001369  No Failure
7            0         0.000095  No Failure
8            0         0.000007  No Failure
9            0         0.000089  No Failure
10           