# Predictive Maintenance - Equipment Failure Prediction

This notebook demonstrates ML for predicting equipment failures based on sensor data.

## Objectives:
1. Generate and explore a synthetic dataset whose feature ranges mirror the AI4I 2020 Predictive Maintenance dataset
2. Perform EDA on sensor data and operational parameters
3. Engineer interaction and threshold features
4. Build and compare ML models
5. Evaluate with focus on recall (catching failures)

In [None]:
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, roc_curve)
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.utils.class_weight import compute_sample_weight
# Silence only deprecation-style chatter. Substantive warnings (for example
# scikit-learn's ConvergenceWarning or UndefinedMetricWarning) stay visible,
# because they point at real modeling problems that should not be hidden.
warnings.filterwarnings('ignore', category=FutureWarning)
warnings.filterwarnings('ignore', category=DeprecationWarning)

# Plot defaults shared by every figure in the notebook
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (12, 6)

## 1. Data Loading

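Synthetic data whose feature ranges mirror the AI4I 2020 Predictive Maintenance Dataset (operational settings and sensor measurements). The failure label is simulated from threshold rules on those features plus random noise.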

In [None]:
# Build a seeded, synthetic predictive-maintenance dataset
np.random.seed(42)
n_samples = 10000

# (low, high) bounds for each uniformly sampled sensor channel.
# The ordering fixes the sequence of random draws, so do not reorder entries.
sensor_ranges = {
    'air_temperature': (295, 305),
    'process_temperature': (305, 315),
    'rotational_speed': (1168, 2886),
    'torque': (3.8, 77),
    'tool_wear': (0, 253),
}
data = {
    channel: np.random.uniform(low, high, n_samples)
    for channel, (low, high) in sensor_ranges.items()
}
data['machine_type'] = np.random.choice(['H', 'L', 'M'], n_samples)

df = pd.DataFrame(data)

# Each (weight, condition) pair adds `weight` to the failure score wherever
# the boolean condition holds.
risk_terms = [
    (0.3, df['tool_wear'] > 200),
    (0.2, df['process_temperature'] > 310),
    (0.15, df['rotational_speed'] > 2500),
    (0.1, df['torque'] > 60),
    (0.1, (df['process_temperature'] - df['air_temperature']) > 12),
]

failure_prob = 0.02  # base failure rate
for weight, condition in risk_terms:
    failure_prob = failure_prob + weight * condition
# Gaussian noise keeps the label from being a purely deterministic rule
failure_prob = failure_prob + np.random.normal(0, 0.05, n_samples)

# A sample is labelled as a failure once its score exceeds 0.3
df['machine_failure'] = failure_prob.gt(0.3).astype(int)

print(f"Dataset shape: {df.shape}")
print(f"\nFailure rate: {df['machine_failure'].mean():.2%}")
df.head()

## 2. Exploratory Data Analysis

In [None]:
# Target distribution (check for class imbalance)
print("Target Distribution:")
print(df['machine_failure'].value_counts())
print(f"\nFailure rate: {df['machine_failure'].mean():.2%}")

# value_counts() orders by frequency, not by label. Reindex to [0, 1] so the
# bar order, the colors (skyblue = no failure, salmon = failure) and the
# x-axis legend stay correct even if failures become the majority class or
# one class is absent.
class_counts = df['machine_failure'].value_counts().reindex([0, 1], fill_value=0)

fig, ax = plt.subplots(figsize=(8, 5))
class_counts.plot(kind='bar', color=['skyblue', 'salmon'], rot=0, ax=ax)
ax.set_title('Machine Failure Distribution')
ax.set_xlabel('Failure (0=No, 1=Yes)')
ax.set_ylabel('Count')
plt.show()

## 3. Feature Engineering

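Create interaction features (temperature difference, power), a high-tool-wear flag, and one-hot encode the categorical machine type. Then split the data into stratified train/test sets and standardize the features.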

In [None]:
# Feature engineering: `assign` works on a copy, so the raw frame `df` is left untouched.
#   temp_diff      - process temperature minus air temperature
#   power          - torque * rotational speed
#   tool_wear_high - 1 when tool wear exceeds 200, else 0
# The categorical machine type is then one-hot encoded.
df_processed = pd.get_dummies(
    df.assign(
        temp_diff=lambda frame: frame['process_temperature'] - frame['air_temperature'],
        power=lambda frame: frame['torque'] * frame['rotational_speed'],
        tool_wear_high=lambda frame: frame['tool_wear'].gt(200).astype(int),
    ),
    columns=['machine_type'],
    prefix='machine',
)

# Separate the target from the predictors
y = df_processed['machine_failure']
X = df_processed.drop(columns='machine_failure')

# Stratified 80/20 hold-out split keeps the failure ratio equal in both sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Fit the scaler on the training set only, so no test-set statistics leak in.
# (Class imbalance is handled later via class_weight='balanced' in the models.)
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print(f"Training set: {X_train.shape}")
print(f"Test set: {X_test.shape}")
print(f"\nClass distribution in training set:")
print(y_train.value_counts(normalize=True))

## 4. Machine Learning Models

In [None]:
# Train and evaluate models.
# Every model gets the same 'balanced' treatment of the failure class, so the
# recall comparison is like-for-like:
#   - LogisticRegression / RandomForest: class_weight='balanced'
#   - GradientBoosting: has no class_weight parameter, so the equivalent
#     per-sample weights are passed to fit() instead.
# models_dict maps a short model name to (hard predictions, P(failure)) on the test set.
models_dict = {}

# Logistic Regression (uses the standardized features)
lr = LogisticRegression(random_state=42, max_iter=1000, class_weight='balanced')
lr.fit(X_train_scaled, y_train)
y_pred_lr = lr.predict(X_test_scaled)
y_pred_proba_lr = lr.predict_proba(X_test_scaled)[:, 1]
models_dict['LR'] = (y_pred_lr, y_pred_proba_lr)

# Random Forest (tree models are scale-invariant, so the raw features are used)
rf = RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced')
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)
y_pred_proba_rf = rf.predict_proba(X_test)[:, 1]
models_dict['RF'] = (y_pred_rf, y_pred_proba_rf)

# Gradient Boosting: balanced sample weights stand in for class_weight='balanced'
gb_sample_weight = compute_sample_weight(class_weight='balanced', y=y_train)
gb = GradientBoostingClassifier(n_estimators=100, random_state=42)
gb.fit(X_train, y_train, sample_weight=gb_sample_weight)
y_pred_gb = gb.predict(X_test)
y_pred_proba_gb = gb.predict_proba(X_test)[:, 1]
models_dict['GB'] = (y_pred_gb, y_pred_proba_gb)

# Evaluate every model on the held-out test set.
# zero_division=0 makes the "model predicted no positives" case an explicit 0
# instead of relying on a default that also emits a warning.
results = []
for name, (y_pred, y_pred_proba) in models_dict.items():
    results.append({
        'Model': name,
        'Accuracy': accuracy_score(y_test, y_pred),
        'Precision': precision_score(y_test, y_pred, zero_division=0),
        'Recall': recall_score(y_test, y_pred, zero_division=0),
        'F1': f1_score(y_test, y_pred, zero_division=0),
        'ROC-AUC': roc_auc_score(y_test, y_pred_proba)
    })

results_df = pd.DataFrame(results)
print("Model Comparison:")
print(results_df.to_string(index=False))

## 5. Summary

Key findings: Recall is critical for predictive maintenance - missing failures is costly.