# Predictive Maintenance - Equipment Failure Prediction

This notebook demonstrates ML for predicting equipment failures based on sensor data.

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/harunpirim/IME465/blob/main/predictive_maintenance/predictive_maintenance.ipynb)

## Objectives:
1. Load and explore the AI4I 2020 Predictive Maintenance dataset
2. Perform EDA on sensor data and operational parameters
3. Engineer derived and interaction features (temperature difference, torque × speed, high-tool-wear flag)
4. Build and compare ML models
5. Evaluate with focus on recall (catching failures)

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.metrics import (accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, roc_curve)
import warnings
# Silence every Python warning for the rest of the session (keeps cell output compact)
warnings.filterwarnings('ignore')

# Plot defaults shared by all figures: seaborn grid theme and a 12x6 canvas
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (12, 6)})

## 1. Data Loading

AI4I 2020 Predictive Maintenance Dataset with operational settings and sensor measurements.

In [None]:
# AI4I 2020 predictive maintenance dataset, served from the UCI repository
# Dataset page: https://archive.ics.uci.edu/dataset/601/ai4i+2020+predictive+maintenance+dataset
# Direct CSV: https://archive.ics.uci.edu/ml/machine-learning-databases/00601/ai4i2020.csv
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/00601/ai4i2020.csv"

# Original CSV headers -> snake_case names used by the rest of the notebook
rename_map = {
    'UDI': 'udi',
    'Product ID': 'product_id',
    'Type': 'type',
    'Air temperature [K]': 'air_temp_k',
    'Process temperature [K]': 'process_temp_k',
    'Rotational speed [rpm]': 'rotational_speed_rpm',
    'Torque [Nm]': 'torque_nm',
    'Tool wear [min]': 'tool_wear_min',
    'Machine failure': 'machine_failure',
    'TWF': 'twf',
    'HDF': 'hdf',
    'PWF': 'pwf',
    'OSF': 'osf',
    'RNF': 'rnf',
}

# Identifier columns and the per-failure-mode breakdown columns: removed so they
# cannot leak the target. Only the overall machine_failure label is kept.
leakage_cols = ['udi', 'product_id', 'twf', 'hdf', 'pwf', 'osf', 'rnf']

# df_raw keeps every (renamed) column; df is the modelling frame without leakage columns
df_raw = pd.read_csv(url).rename(columns=rename_map)
df = df_raw.drop(columns=leakage_cols)

failure_rate = df['machine_failure'].mean()
print(f"Dataset shape: {df.shape}")
print("Failure rate: {:.2%}".format(failure_rate))
df.head()


## 2. Exploratory Data Analysis

In [None]:
# Class balance of the target (expected to be skewed towards "no failure")
failure_counts = df['machine_failure'].value_counts()

print("Target Distribution:")
print(failure_counts)
print(f"\nFailure rate: {df['machine_failure'].mean():.2%}")

# Bar chart of the same counts, drawn on an explicit Axes
fig, ax = plt.subplots(figsize=(8, 5))
failure_counts.plot(kind='bar', color=['skyblue', 'salmon'], rot=0, ax=ax)
ax.set_title('Machine Failure Distribution')
ax.set_xlabel('Failure (0=No, 1=Yes)')
ax.set_ylabel('Count')
plt.show()

## 3. Feature Engineering

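Create derived features (temperature difference, a torque × speed interaction, and a high-tool-wear flag), one-hot encode the categorical machine type, then split and scale the data.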

In [None]:
# Feature engineering
# Input frame `df` holds only the core inputs plus the binary machine_failure target.
# .assign returns a new frame, so `df` itself is left untouched.
df_processed = df.assign(
    # Gap between process and air temperature
    temp_diff=lambda d: d['process_temp_k'] - d['air_temp_k'],
    # Torque x rotational speed interaction
    power=lambda d: d['torque_nm'] * d['rotational_speed_rpm'],
    # 1 when tool wear exceeds 200 minutes, else 0
    tool_wear_high=lambda d: (d['tool_wear_min'] > 200).astype(int),
)

# Machine type -> indicator columns (first category dropped as the baseline)
df_processed = pd.get_dummies(df_processed, columns=['type'], prefix='type', drop_first=True)

# Separate predictors from the target
y = df_processed['machine_failure']
X = df_processed.drop(columns='machine_failure')

# Stratified 80/20 split keeps the failure rate equal in both partitions
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Standardize using statistics learned from the training partition only;
# class imbalance is addressed at model-fitting time, not here
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print(f"Training set: {X_train.shape}")
print(f"Test set: {X_test.shape}")
print("Class distribution in training set:")
print(y_train.value_counts(normalize=True))


## 4. Machine Learning Models

In [None]:
# Train and evaluate models
# Class imbalance handling (scikit-learn only):
#   - LogisticRegression / RandomForestClassifier: class_weight='balanced'
#   - GradientBoostingClassifier: has no class_weight parameter, so equivalent
#     balanced weights are passed per sample through fit(sample_weight=...).
# Weighting all three models the same way keeps the recall comparison like-for-like.


def fit_and_predict(model, X_fit, y_fit, X_eval, **fit_kwargs):
    """Fit `model` and score `X_eval`.

    Parameters
    ----------
    model : estimator exposing fit, predict and predict_proba
    X_fit, y_fit : training features and labels passed to model.fit
    X_eval : features to predict on
    **fit_kwargs : extra keyword arguments forwarded to model.fit
        (e.g. sample_weight)

    Returns
    -------
    tuple
        (predicted labels, second column of predict_proba) for X_eval; with
        0/1 labels the second column is the probability of class 1.
    """
    model.fit(X_fit, y_fit, **fit_kwargs)
    return model.predict(X_eval), model.predict_proba(X_eval)[:, 1]


models_dict = {}

# Logistic Regression (uses the standardized features)
lr = LogisticRegression(random_state=42, max_iter=1000, class_weight='balanced')
y_pred_lr, y_pred_proba_lr = fit_and_predict(lr, X_train_scaled, y_train, X_test_scaled)
models_dict['LR'] = (y_pred_lr, y_pred_proba_lr)

# Random Forest (tree-based, so the unscaled features are used)
rf = RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced')
y_pred_rf, y_pred_proba_rf = fit_and_predict(rf, X_train, y_train, X_test)
models_dict['RF'] = (y_pred_rf, y_pred_proba_rf)

# Gradient Boosting with balanced sample weights.
# Same formula as class_weight='balanced': n_samples / (n_classes * class_count),
# looked up per training row from its label.
class_counts = y_train.value_counts()
balanced_class_weight = len(y_train) / (len(class_counts) * class_counts)
gb_sample_weight = y_train.map(balanced_class_weight).to_numpy()

gb = GradientBoostingClassifier(n_estimators=100, random_state=42)
y_pred_gb, y_pred_proba_gb = fit_and_predict(
    gb, X_train, y_train, X_test, sample_weight=gb_sample_weight
)
models_dict['GB'] = (y_pred_gb, y_pred_proba_gb)

# Evaluate every model on the held-out test set (one row per model)
results = [
    {
        'Model': name,
        'Accuracy': accuracy_score(y_test, y_pred),
        'Precision': precision_score(y_test, y_pred),
        'Recall': recall_score(y_test, y_pred),
        'F1': f1_score(y_test, y_pred),
        'ROC-AUC': roc_auc_score(y_test, y_pred_proba),
    }
    for name, (y_pred, y_pred_proba) in models_dict.items()
]

results_df = pd.DataFrame(results)
print("Model Comparison:")
print(results_df.to_string(index=False))

## 5. Summary

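Key takeaway: recall is the critical metric for predictive maintenance, because a missed failure (a false negative) is costly. Compare the models above on recall first, then on precision and F1.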