# Quality Control - Manufacturing Defect Prediction

This notebook demonstrates machine learning for predicting manufacturing defects in steel plates.
[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/harunpirim/IME465/blob/main/quality_control/quality_control.ipynb)
## Objectives:
1. Load and explore the Steel Plates Faults dataset
2. Perform comprehensive EDA
3. Engineer features to improve model performance
4. Build and compare multiple ML models
5. Evaluate and interpret results


In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.metrics import (accuracy_score, precision_score, recall_score, 
                             f1_score, roc_auc_score, confusion_matrix, 
                             classification_report, roc_curve)
import warnings
# Silence only deprecation-style chatter from the plotting / ML libraries.
# A blanket filterwarnings('ignore') would also hide warnings that matter for
# the analysis itself (e.g. a solver that did not converge, or a precision
# score that is undefined because a model never predicted the positive class).
for _category in (FutureWarning, DeprecationWarning):
    warnings.filterwarnings('ignore', category=_category)

# Set style for better visualizations
sns.set_style("whitegrid")
# Default figure size; individual cells override it with their own figsize.
plt.rcParams['figure.figsize'] = (12, 6)


## 1. Data Loading

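This section is modeled on the Steel Plates Faults dataset, which contains 27 features describing steel plate attributes and 7 types of faults. To keep the notebook self-contained, the cell below does not download the real data: it generates a 1,941-row synthetic stand-in with similarly named features and a single binary `Fault` target (1 = fault, 0 = no fault) instead of the 7 fault types.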


In [None]:
# Load the dataset
# Note: In practice, you would download this from UCI ML Repository
# For this tutorial, we'll create a synthetic dataset based on the real one
# URL: https://archive.ics.uci.edu/ml/datasets/Steel+Plates+Faults

# Create synthetic data that mimics the Steel Plates Faults dataset.
# The global seed makes every draw below reproducible. The draws must stay in
# this order: they all consume the same random stream, so reordering them
# would change the values of every later column.
np.random.seed(42)
n_samples = 1941

# Generate features similar to the real dataset
data = {
    'X_Minimum': np.random.uniform(0, 200, n_samples),
    'X_Maximum': np.random.uniform(0, 200, n_samples),
    'Y_Minimum': np.random.uniform(0, 200, n_samples),
    'Y_Maximum': np.random.uniform(0, 200, n_samples),
    'Pixels_Areas': np.random.uniform(100, 10000, n_samples),
    'X_Perimeter': np.random.uniform(50, 500, n_samples),
    'Y_Perimeter': np.random.uniform(50, 500, n_samples),
    'Sum_of_Luminosity': np.random.uniform(1000, 100000, n_samples),
    'Minimum_of_Luminosity': np.random.uniform(0, 255, n_samples),
    'Maximum_of_Luminosity': np.random.uniform(0, 255, n_samples),
    'Length_of_Conveyer': np.random.uniform(100, 1000, n_samples),
    'TypeOfSteel_A300': np.random.choice([0, 1], n_samples),
    'TypeOfSteel_A400': np.random.choice([0, 1], n_samples),
    'Steel_Plate_Thickness': np.random.uniform(0.1, 5.0, n_samples),
    'Edges_Index': np.random.uniform(0, 1, n_samples),
    'Empty_Index': np.random.uniform(0, 1, n_samples),
    'Square_Index': np.random.uniform(0, 1, n_samples),
    'Outside_X_Index': np.random.uniform(0, 1, n_samples),
    'Edges_Y_Index': np.random.uniform(0, 1, n_samples),
    'Edges_X_Index': np.random.uniform(0, 1, n_samples),
    'LogOfAreas': np.random.uniform(2, 10, n_samples),
    'Log_X_Index': np.random.uniform(0, 5, n_samples),
    'Log_Y_Index': np.random.uniform(0, 5, n_samples),
    'Orientation_Index': np.random.uniform(0, 1, n_samples),
    'Luminosity_Index': np.random.uniform(0, 1, n_samples),
    'SigmoidOfAreas': np.random.uniform(0, 1, n_samples),
}

# Each minimum/maximum pair above is drawn independently, so roughly half of
# the raw rows would have minimum > maximum. Re-order every pair element-wise
# so that extents, areas and luminosity ranges derived from them later are
# never negative. This happens after all draws, so no other column changes.
ordered_pairs = [
    ('X_Minimum', 'X_Maximum'),
    ('Y_Minimum', 'Y_Maximum'),
    ('Minimum_of_Luminosity', 'Maximum_of_Luminosity'),
]
for low_col, high_col in ordered_pairs:
    low_values = np.minimum(data[low_col], data[high_col])
    high_values = np.maximum(data[low_col], data[high_col])
    data[low_col], data[high_col] = low_values, high_values

df = pd.DataFrame(data)

# Create target variable: fault (1) or no fault (0)
# Introduce some relationships to make it realistic: a base score of 0.1 is
# raised by four threshold rules on individual features, Gaussian noise is
# added, and the noisy score is cut at 0.4 to obtain the binary label.
fault_prob = (
    0.1 + 
    0.3 * (df['Edges_Index'] > 0.7) +
    0.2 * (df['Empty_Index'] > 0.6) +
    0.2 * (df['Steel_Plate_Thickness'] < 0.5) +
    0.1 * (df['Luminosity_Index'] < 0.3) +
    np.random.normal(0, 0.1, n_samples)
)
df['Fault'] = (fault_prob > 0.4).astype(int)

print(f"Dataset shape: {df.shape}")
print(f"\nFirst few rows:")
df.head()


## 2. Exploratory Data Analysis (EDA)


In [None]:
# 2.1 Basic Information
# Headline facts about the frame: dimensions, total number of null cells,
# and how many columns there are of each dtype.
total_missing = df.isnull().sum().sum()
dtype_counts = df.dtypes.value_counts()

print("Dataset Info:", "=" * 50, f"Shape: {df.shape}", sep="\n")
print("\nMissing values:", total_missing, sep="\n")
print("\nData types:", dtype_counts, sep="\n")
print("\nBasic statistics:")
# Last expression of the cell: rendered as a rich table by the notebook.
df.describe()


In [None]:
# 2.2 Target Variable Distribution
# Absolute and relative class frequencies of the binary target.
class_counts = df['Fault'].value_counts()
class_percent = df['Fault'].value_counts(normalize=True) * 100

print("Target Variable Distribution:")
print("=" * 50)
print(class_counts)
print("\nPercentage:")
print(class_percent)

# Bar chart of the class counts; rot=0 keeps the class labels horizontal.
fig, ax = plt.subplots(figsize=(8, 5))
class_counts.plot(kind='bar', color=['skyblue', 'salmon'], ax=ax, rot=0)
ax.set_title('Distribution of Fault vs No Fault', fontsize=14, fontweight='bold')
ax.set_xlabel('Fault (0=No Fault, 1=Fault)', fontsize=12)
ax.set_ylabel('Count', fontsize=12)
plt.tight_layout()
plt.show()


In [None]:
# 2.3 Feature Distributions
# Six representative features, reused by the per-class histograms in 2.5.
key_features = [
    'X_Minimum',
    'X_Maximum',
    'Steel_Plate_Thickness',
    'Edges_Index',
    'Empty_Index',
    'Luminosity_Index',
]

# One histogram panel per feature on a 2x3 grid.
fig, axes = plt.subplots(2, 3, figsize=(15, 10))
axes = axes.ravel()

for panel, feature in zip(axes, key_features):
    panel.hist(df[feature], bins=30, color='steelblue', alpha=0.7, edgecolor='black')
    panel.set_title(f'Distribution of {feature}', fontsize=12, fontweight='bold')
    panel.set_xlabel(feature, fontsize=10)
    panel.set_ylabel('Frequency', fontsize=10)
    panel.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()


In [None]:
# 2.4 Correlation Analysis
# Pairwise correlation of every column (the Fault target is included so its
# relationship to each feature is visible in the heatmap).
corr_matrix = df.corr()

# Plot correlation matrix; the mask hides the diagonal and the redundant
# upper triangle so each pair is drawn once.
plt.figure(figsize=(14, 12))
mask = np.triu(np.ones_like(corr_matrix, dtype=bool))
sns.heatmap(corr_matrix, mask=mask, annot=False, cmap='coolwarm', center=0,
            square=True, linewidths=0.5, cbar_kws={"shrink": 0.8})
plt.title('Correlation Matrix of Features', fontsize=14, fontweight='bold', pad=20)
plt.tight_layout()
plt.show()

# Find highly correlated *feature* pairs. The target column is excluded from
# this search: a pair involving 'Fault' is a strong predictor, not a redundant
# feature, and must never end up as a candidate for removal in step 3.3.
high_corr_threshold = 0.8
feature_corr = corr_matrix.drop(index='Fault', columns='Fault', errors='ignore')
feature_names = feature_corr.columns
corr_values = feature_corr.to_numpy()

# k=1 keeps only the strict upper triangle (i < j), so each pair appears once
# and in the same row-major order a nested i/j loop would produce.
pair_rows, pair_cols = np.where(
    np.triu(np.abs(corr_values) > high_corr_threshold, k=1)
)
# Each entry is (feature_a, feature_b, correlation).
high_corr_pairs = [
    (feature_names[r], feature_names[c], corr_values[r, c])
    for r, c in zip(pair_rows, pair_cols)
]

print(f"\nHighly correlated feature pairs (|correlation| > {high_corr_threshold}):")
if not high_corr_pairs:
    print("(none found)")
for pair in high_corr_pairs[:10]:  # Show first 10
    print(f"{pair[0]} - {pair[1]}: {pair[2]:.3f}")


In [None]:
# 2.5 Feature-Target Relationships
# Overlay the per-class histogram of each key feature to see whether the
# two classes are distributed differently.
fig, axes = plt.subplots(2, 3, figsize=(15, 10))
axes = axes.ravel()

# (target value, legend label, bar colour) for each class, drawn in this order
class_styles = [
    (0, 'No Fault', 'skyblue'),
    (1, 'Fault', 'salmon'),
]

for panel, feature in zip(axes, key_features):
    for fault_value, legend_label, bar_color in class_styles:
        class_values = df.loc[df['Fault'] == fault_value, feature]
        class_values.hist(ax=panel, bins=30, alpha=0.6,
                          label=legend_label, color=bar_color)
    panel.set_title(f'{feature} by Fault Status', fontsize=12, fontweight='bold')
    panel.set_xlabel(feature, fontsize=10)
    panel.set_ylabel('Frequency', fontsize=10)
    panel.legend()
    panel.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()


## 3. Feature Engineering

Based on our EDA, we'll:
1. Handle missing values (if any)
2. Create derived features
3. Identify highly correlated features (candidates for removal; this tutorial keeps all features)
4. Scale features


In [None]:
# 3.1 / 3.2 Build the engineered frame
# `df` itself is left untouched; `assign` returns a new frame with the five
# derived columns appended after the original ones.
eps = 1e-6  # small offset added to denominators to avoid division by zero

x_extent = df['X_Maximum'] - df['X_Minimum']
y_extent = df['Y_Maximum'] - df['Y_Minimum']
bbox_area = x_extent * y_extent

df_processed = df.assign(
    # Bounding-box area of the defect region
    Area=bbox_area,
    # Width relative to height of the bounding box
    Aspect_Ratio=x_extent / (y_extent + eps),
    # X perimeter relative to Y perimeter
    Perimeter_Ratio=df['X_Perimeter'] / (df['Y_Perimeter'] + eps),
    # Spread between the brightest and darkest luminosity values
    Luminosity_Range=df['Maximum_of_Luminosity'] - df['Minimum_of_Luminosity'],
    # Pixel area relative to the bounding-box area
    Normalized_Area=df['Pixels_Areas'] / (bbox_area + eps),
)

print("Created derived features:")
for derived_name in ("Area", "Aspect_Ratio", "Perimeter_Ratio",
                     "Luminosity_Range", "Normalized_Area"):
    print(f"- {derived_name}")


In [None]:
# 3.3 Remove highly correlated features
# From every highly correlated pair, nominate the member that is less
# correlated with the target. A pair is skipped once either of its members
# has already been nominated.
features_to_remove = []
for first, second, _ in high_corr_pairs:
    if first in features_to_remove or second in features_to_remove:
        continue
    first_strength = abs(df_processed[first].corr(df_processed['Fault']))
    second_strength = abs(df_processed[second].corr(df_processed['Fault']))
    # Keep the feature with higher correlation to target
    weaker = first if first_strength < second_strength else second
    features_to_remove.append(weaker)

print(f"Features to remove due to high correlation: {features_to_remove[:5]}")

# For this tutorial, we'll keep all features but note this for production


In [None]:
# 3.4 Prepare features and target
# Target vector first, then everything else as the feature matrix.
y = df_processed['Fault']
X = df_processed.drop(columns=['Fault'])

# 80/20 hold-out split; stratifying on y keeps the class ratio the same in
# both parts, and the fixed random_state makes the split repeatable.
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

print(f"Training set size: {X_train.shape}")
print(f"Test set size: {X_test.shape}")
print("\nTraining set target distribution:")
print(y_train.value_counts(normalize=True))


In [None]:
# 3.5 Feature Scaling
# Standardize features for the scale-sensitive models. The scaler is fitted
# on the training split only and then applied to both splits, so no
# information from the test set enters the scaling statistics.
scaler = StandardScaler().fit(X_train)

# Wrap the scaled arrays back into DataFrames that keep the original column
# names and row index.
X_train_scaled = pd.DataFrame(
    scaler.transform(X_train), columns=X_train.columns, index=X_train.index
)
X_test_scaled = pd.DataFrame(
    scaler.transform(X_test), columns=X_test.columns, index=X_test.index
)

# Sanity check: the averages over columns should be close to 0 (mean) and 1 (std).
overall_mean = X_train_scaled.mean().mean()
overall_std = X_train_scaled.std().mean()
print("Features scaled using StandardScaler")
print(f"Scaled training set mean: {overall_mean:.6f}")
print(f"Scaled training set std: {overall_std:.6f}")


## 4. Machine Learning Models

We'll train and compare multiple models:
1. Logistic Regression (baseline)
2. Random Forest
3. Support Vector Machine
4. Gradient Boosting


In [None]:
# 4.1 Logistic Regression
print("Training Logistic Regression...")
# Linear baseline, fitted on the standardized features.
lr = LogisticRegression(random_state=42, max_iter=1000).fit(X_train_scaled, y_train)

# Hard class labels, plus the predicted probability of class 1 (second column)
y_pred_lr = lr.predict(X_test_scaled)
y_pred_proba_lr = lr.predict_proba(X_test_scaled)[:, 1]

# Evaluation on the held-out test split; ROC-AUC uses the probabilities,
# every other metric uses the hard labels.
lr_scores = {
    "Accuracy": accuracy_score(y_test, y_pred_lr),
    "Precision": precision_score(y_test, y_pred_lr),
    "Recall": recall_score(y_test, y_pred_lr),
    "F1-Score": f1_score(y_test, y_pred_lr),
    "ROC-AUC": roc_auc_score(y_test, y_pred_proba_lr),
}

print("\nLogistic Regression Results:")
print("=" * 50)
for metric_name, score in lr_scores.items():
    print(f"{metric_name}: {score:.4f}")


In [None]:
# 4.2 Random Forest
print("Training Random Forest...")
# Fitted on the unscaled features; n_jobs=-1 uses all available cores.
rf = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1).fit(X_train, y_train)

# Hard class labels, plus the predicted probability of class 1 (second column)
y_pred_rf = rf.predict(X_test)
y_pred_proba_rf = rf.predict_proba(X_test)[:, 1]

# Evaluation on the held-out test split; ROC-AUC uses the probabilities,
# every other metric uses the hard labels.
rf_scores = {
    "Accuracy": accuracy_score(y_test, y_pred_rf),
    "Precision": precision_score(y_test, y_pred_rf),
    "Recall": recall_score(y_test, y_pred_rf),
    "F1-Score": f1_score(y_test, y_pred_rf),
    "ROC-AUC": roc_auc_score(y_test, y_pred_proba_rf),
}

print("\nRandom Forest Results:")
print("=" * 50)
for metric_name, score in rf_scores.items():
    print(f"{metric_name}: {score:.4f}")

# Feature importance: one row per training column, most important first.
# This frame is reused by the importance plot in Section 6.
importance_table = pd.DataFrame({
    'feature': X_train.columns,
    'importance': rf.feature_importances_,
})
feature_importance = importance_table.sort_values('importance', ascending=False)

print("\nTop 10 Most Important Features:")
print(feature_importance.head(10))


In [None]:
# 4.3 Support Vector Machine
print("Training Support Vector Machine...")
# probability=True is required for predict_proba below; fitted on the
# standardized features.
svm = SVC(probability=True, random_state=42).fit(X_train_scaled, y_train)

# Hard class labels, plus the predicted probability of class 1 (second column)
y_pred_svm = svm.predict(X_test_scaled)
y_pred_proba_svm = svm.predict_proba(X_test_scaled)[:, 1]

# Evaluation on the held-out test split; ROC-AUC uses the probabilities,
# every other metric uses the hard labels.
svm_scores = {
    "Accuracy": accuracy_score(y_test, y_pred_svm),
    "Precision": precision_score(y_test, y_pred_svm),
    "Recall": recall_score(y_test, y_pred_svm),
    "F1-Score": f1_score(y_test, y_pred_svm),
    "ROC-AUC": roc_auc_score(y_test, y_pred_proba_svm),
}

print("\nSVM Results:")
print("=" * 50)
for metric_name, score in svm_scores.items():
    print(f"{metric_name}: {score:.4f}")


In [None]:
# 4.4 Gradient Boosting
print("Training Gradient Boosting...")
# Fitted on the unscaled features.
gb = GradientBoostingClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# Hard class labels, plus the predicted probability of class 1 (second column)
y_pred_gb = gb.predict(X_test)
y_pred_proba_gb = gb.predict_proba(X_test)[:, 1]

# Evaluation on the held-out test split; ROC-AUC uses the probabilities,
# every other metric uses the hard labels.
gb_scores = {
    "Accuracy": accuracy_score(y_test, y_pred_gb),
    "Precision": precision_score(y_test, y_pred_gb),
    "Recall": recall_score(y_test, y_pred_gb),
    "F1-Score": f1_score(y_test, y_pred_gb),
    "ROC-AUC": roc_auc_score(y_test, y_pred_proba_gb),
}

print("\nGradient Boosting Results:")
print("=" * 50)
for metric_name, score in gb_scores.items():
    print(f"{metric_name}: {score:.4f}")


## 5. Model Comparison and Evaluation


In [None]:
# 5.1 Compare all models
# Display name -> (hard test-set predictions, predicted probability of class 1).
# The plotting cells below iterate over this mapping as well.
models = {
    'Logistic Regression': (y_pred_lr, y_pred_proba_lr),
    'Random Forest': (y_pred_rf, y_pred_proba_rf),
    'SVM': (y_pred_svm, y_pred_proba_svm),
    'Gradient Boosting': (y_pred_gb, y_pred_proba_gb)
}

# One record per model; ROC-AUC is scored on the probabilities, the other
# metrics on the hard labels.
results = [
    {
        'Model': model_name,
        'Accuracy': accuracy_score(y_test, labels),
        'Precision': precision_score(y_test, labels),
        'Recall': recall_score(y_test, labels),
        'F1-Score': f1_score(y_test, labels),
        'ROC-AUC': roc_auc_score(y_test, probabilities),
    }
    for model_name, (labels, probabilities) in models.items()
]

results_df = pd.DataFrame(results)
print("Model Comparison:")
print("=" * 80)
print(results_df.to_string(index=False))


In [None]:
# 5.2 Visualize model comparison
# Left panel: grouped bars of every metric per model.
# Right panel: ROC curve of every model against the chance diagonal.
fig, axes = plt.subplots(1, 2, figsize=(15, 5))
ax_bars, ax_roc = axes

# Metrics comparison
metrics = ['Accuracy', 'Precision', 'Recall', 'F1-Score', 'ROC-AUC']
x = np.arange(len(metrics))

# Derive the bar layout from the number of models instead of hard-coding it
# for four: each metric group occupies 0.8 of its slot, shared equally.
n_models = max(len(models), 1)
width = 0.8 / n_models  # 0.2 when comparing four models

# Index the results by model name once, rather than filtering per bar.
scores_by_model = results_df.set_index('Model')[metrics]

for i, name in enumerate(models):
    ax_bars.bar(x + i * width, scores_by_model.loc[name].to_numpy(), width, label=name)

ax_bars.set_xlabel('Metrics', fontsize=12)
ax_bars.set_ylabel('Score', fontsize=12)
ax_bars.set_title('Model Performance Comparison', fontsize=14, fontweight='bold')
# Centre each tick under its group of bars (offset 1.5 * width for four models).
ax_bars.set_xticks(x + width * (n_models - 1) / 2)
ax_bars.set_xticklabels(metrics)
ax_bars.legend()
ax_bars.grid(True, alpha=0.3, axis='y')
# Scores lie in [0, 1]; the extra headroom leaves space above the tallest bars.
ax_bars.set_ylim([0, 1.1])

# ROC Curves
for name, (_, y_pred_proba) in models.items():
    fpr, tpr, _ = roc_curve(y_test, y_pred_proba)
    auc_value = roc_auc_score(y_test, y_pred_proba)
    ax_roc.plot(fpr, tpr, label=f'{name} (AUC = {auc_value:.3f})')

# Diagonal reference line labelled as the random baseline
ax_roc.plot([0, 1], [0, 1], 'k--', label='Random')
ax_roc.set_xlabel('False Positive Rate', fontsize=12)
ax_roc.set_ylabel('True Positive Rate', fontsize=12)
ax_roc.set_title('ROC Curves', fontsize=14, fontweight='bold')
ax_roc.legend()
ax_roc.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()


In [None]:
# 5.3 Confusion Matrices
# One annotated heatmap per model on a 2x2 grid; rows are the true labels,
# columns the predicted labels.
fig, axes = plt.subplots(2, 2, figsize=(12, 10))
axes = axes.ravel()

class_labels = ['No Fault', 'Fault']

for panel, (name, (y_pred, _)) in zip(axes, models.items()):
    cm = confusion_matrix(y_test, y_pred)
    model_accuracy = accuracy_score(y_test, y_pred)
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=panel,
                xticklabels=class_labels,
                yticklabels=class_labels)
    panel.set_title(f'{name}\nAccuracy: {model_accuracy:.3f}',
                    fontsize=12, fontweight='bold')
    panel.set_ylabel('True Label', fontsize=10)
    panel.set_xlabel('Predicted Label', fontsize=10)

plt.tight_layout()
plt.show()


## 6. Feature Importance Analysis

Understanding which features are most important helps in:
- Identifying key quality indicators
- Improving manufacturing processes
- Reducing feature dimensionality


In [None]:
# Visualize feature importance from Random Forest
# `feature_importance` is already sorted in descending order, so head(15)
# gives the 15 most important features.
top_features = feature_importance.head(15)
bar_positions = list(range(len(top_features)))

fig, ax = plt.subplots(figsize=(12, 8))
ax.barh(bar_positions, top_features['importance'], color='steelblue')
ax.set_yticks(bar_positions)
ax.set_yticklabels(top_features['feature'])
ax.set_xlabel('Importance', fontsize=12)
ax.set_title('Top 15 Most Important Features (Random Forest)', fontsize=14, fontweight='bold')
# Flip the axis so the most important feature is drawn at the top.
ax.invert_yaxis()
ax.grid(True, alpha=0.3, axis='x')
plt.tight_layout()
plt.show()


## 7. Summary and Conclusions

### Key Findings:
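1. **Best Model**: Based on the evaluation metrics, [model name] performs best (fill this in after running the notebook, using the comparison table and ROC curves in Section 5)
2. **Important Features**: [List top features] (fill this in from the Random Forest importance plot in Section 6; in the synthetic data the target is generated from `Edges_Index`, `Empty_Index`, `Steel_Plate_Thickness`, and `Luminosity_Index`, so these are expected to rank highest)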
3. **Business Impact**: The model can help identify defective products early, reducing waste and improving quality

### Next Steps:
- Collect more data to improve model performance
- Deploy model for real-time quality control
- Monitor model performance over time
- Consider ensemble methods for better accuracy
