# Preparing for Modelling (Manufacturing)

**Goal:** Predict whether a machine will fail (**Failure = 1**) based on recent sensor readings and operating context.


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Lift pandas' display truncation: None means "no limit" for both options,
# so every column and every row of a displayed frame is rendered.
for display_option in ("display.max_columns", "display.max_rows"):
    pd.set_option(display_option, None)


## Load dataset

In [None]:
# Read the synthetic predictive-maintenance readings into a DataFrame
# (the CSV is expected in the notebook's working directory).
df = pd.read_csv(filepath_or_buffer="predictive_maintenance_synthetic.csv")

# Preview the first five rows
df.head(5)


## Handling Imbalanced Data

In predictive maintenance, failures are rare.  
That often means the dataset is **imbalanced** (many 0's, fewer 1's).

If we train without handling imbalance, the model may learn to say **"No failure"** all the time and still look accurate.


### Checking Class Distribution

In [None]:
# Check class distribution: number of rows per value of the target column
failure_counts = df["Failure"].value_counts()

# Visualize class distribution.
# Sorting by class label puts 0 (no failure) on the left and 1 (failure) on
# the right regardless of which class is more frequent.
counts_by_class = failure_counts.sort_index()

fig, ax = plt.subplots(figsize=(6, 4))
sns.barplot(x=counts_by_class.index.astype(str), y=counts_by_class.values, ax=ax)
ax.set_title("Class Distribution of Failure")
ax.set_xlabel("Failure (0 = no failure, 1 = failure)")
ax.set_ylabel("Number of rows")

plt.show()


## Techniques to Handle Imbalanced Data

In [None]:
# Tabulate the number of rows per Failure value as a two-column frame
# (columns: Failure, count).
df.groupby("Failure").size().rename("count").reset_index()


In [None]:
# Dimensions of the full dataset as (rows, columns)
df.shape


### Oversampling (SMOTE)

In [None]:
# %pip install imbalanced-learn  # run once if `imblearn` is not installed (the PyPI package is named imbalanced-learn)
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder


We will **encode** categorical columns so the model can work with them:

- `Machine_Type` (CNC, Pump, etc.)  
- `Production_Line` (LINE01, LINE02, ...)  
- `Shift` (Day, Swing, Night)

> Label Encoding is okay for demos. In production, one-hot encoding is often safer for non-ordered categories.


In [None]:
# Replace each categorical column with integer codes, one LabelEncoder per
# column. The fitted encoders are kept in `label_encoders` (keyed by column
# name) so the codes can be mapped back to the original labels later.
categorical_cols = ["Machine_Type", "Production_Line", "Shift"]

label_encoders = {}
for col in categorical_cols:
    encoder = LabelEncoder()
    df[col] = encoder.fit_transform(df[col])
    label_encoders[col] = encoder


### Prepare features and target

In [None]:
# Prepare features and target
X = df.drop(columns=["Failure"])  # Features: every column except the label
y = df["Failure"]                 # Target variable

# Split data before applying SMOTE so that resampling only ever touches the
# training portion and the test set keeps the real-world class balance.
# stratify=y keeps the same failure ratio in both train and test sets;
# random_state makes the split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

X_train.shape, X_test.shape


In [None]:
# Remove the raw timestamp from the training features before resampling
# (presumably because it is not numeric -- confirm against the dataset).
# errors="ignore" keeps this cell re-runnable once the column is gone.
X_train = X_train.drop(columns=["Reading_Timestamp"], errors="ignore")

# Oversample the minority class until minority/majority = 0.9
smote = SMOTE(sampling_strategy=0.9, random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Report the class counts and the failure/no-failure ratio, first for the
# original training labels and then for the resampled ones.
for heading, labels in (
    ("Class distribution before SMOTE:\n", y_train),
    ("Class distribution after SMOTE:", y_train_resampled),
):
    print(heading, pd.Series(labels).value_counts())
    print(len(labels[labels == 1]) / len(labels[labels == 0]))

# Shapes of the original training split, followed by the resampled split
for part in (X_train, y_train, X_train_resampled, y_train_resampled):
    print(part.shape)


If you want to inspect the resampled training data as a DataFrame (useful for debugging):

In [None]:
# Combine the resampled features and target back into a single DataFrame.
# The labels are attached positionally (np.asarray) so the result does not
# depend on the index or the name carried by y_train_resampled.
df_resampled = X_train_resampled.assign(Failure=np.asarray(y_train_resampled))

df_resampled.head()




### Undersampling

In [None]:
from imblearn.under_sampling import RandomUnderSampler

# Randomly discard majority-class rows from the training split until
# minority/majority = 0.7; the fixed seed makes the selection repeatable.
undersample = RandomUnderSampler(
    sampling_strategy=0.7,
    random_state=42,
)
X_train_under, y_train_under = undersample.fit_resample(X_train, y_train)


In [None]:
# Report the class counts and the failure/no-failure ratio, first for the
# original training labels and then for the undersampled ones.
for heading, labels in (
    ("Class distribution before Undersampling:", y_train),
    ("Class distribution after Undersampling:", y_train_under),
):
    print(heading, pd.Series(labels).value_counts())
    print(len(labels[labels == 1]) / len(labels[labels == 0]))

# Shapes of the original training split, followed by the undersampled split
for part in (X_train, y_train, X_train_under, y_train_under):
    print(part.shape)


## Creating New Features from Existing Data

In manufacturing, *feature engineering* often captures the physics/behavior you already know:
- heat + vibration together can be a warning
- older assets drift more
- longer time since maintenance increases risk


### Time-Based / Age-Based Features

In [None]:
# Define asset age categories (in days).
# Bin edges correspond to 0, 1, 3, 6 and 12 years (365 days per year).
age_bins = [0, 365, 1095, 2190, 4380]
age_labels = ["<1 year", "1-3 years", "3-6 years", "6-12 years"]

# include_lowest=True puts an age of exactly 0 days into the first group.
# Ages above 4380 days (12 years) fall outside every bin and become NaN.
df["Age_Group"] = pd.cut(
    df["Asset_Age_Days"],
    bins=age_bins,
    labels=age_labels,
    include_lowest=True,
)

df[["Asset_Age_Days", "Age_Group"]].head()




### Maintenance & Load Features

In [None]:
# Simple derived features

# Days_Since_Maintenance = Hours_Since_Maintenance / 24
df["Days_Since_Maintenance"] = df["Hours_Since_Maintenance"] / 24

# Load_Stress = Load_Percent x Motor_Current_A
df["Load_Stress"] = df["Load_Percent"] * df["Motor_Current_A"]

df[["Days_Since_Maintenance", "Load_Stress"]].head()



### Interaction Features

In [None]:
# Product of vibration and temperature (common failure signature)
df["Vib_Temp_Interaction"] = df["Vibration_RMS"].mul(df["Temperature_C"])

# Rule-of-thumb limits: a reading strictly above its limit adds one point
# to the risk score, so the score ranges from 0 to 5.
risk_thresholds = {
    "Vibration_RMS": 3.5,
    "Temperature_C": 75,
    "Motor_Current_A": 28,
    "Hours_Since_Maintenance": 400,
    "Asset_Age_Days": 2000,
}
df["Failure_Risk_Score"] = sum(
    df[column].gt(limit).astype(int) for column, limit in risk_thresholds.items()
)

df[["Vib_Temp_Interaction", "Failure_Risk_Score"]].head()


## Encoding Categorical Variables in Machine Learning

After feature engineering, we still need to convert categories to numbers.
Let's do a quick one-hot example (like asset age group).


In [None]:
# Identify categorical variables.
# "object" covers raw string columns; "category" covers columns with pandas'
# categorical dtype (which is what pd.cut produces for binned features).
categorical_features = df.select_dtypes(include=["object", "category"]).columns
print("Categorical Features:", list(categorical_features))


### One-Hot Encoding (Dummy Variables)

In [None]:
# One-hot encode Age_Group (created earlier).
# drop_first=True omits the first category's column to avoid a redundant dummy.
df_one_hot = pd.get_dummies(df["Age_Group"], prefix="Age_Group", drop_first=True)

# Drop any dummy columns left by a previous run of this cell before
# concatenating, so re-running the cell does not create duplicate
# Age_Group_* columns in df.
df = pd.concat(
    [df.drop(columns=df_one_hot.columns, errors="ignore"), df_one_hot],
    axis=1,
)

df.filter(like="Age_Group").head()


## Feature Engineering

### Feature Transformation

In [None]:
# Log transformation (log1p avoids log(0)): log1p(x) = log(1 + x)
df["Log_Vibration"] = np.log1p(df["Vibration_RMS"])
df["Log_Motor_Current"] = np.log1p(df["Motor_Current_A"])

df[["Vibration_RMS", "Log_Vibration", "Motor_Current_A", "Log_Motor_Current"]].head()


### Binning (Discretization)

In [None]:
# Convert continuous variables into categories.
# Temperature is split into three bands. The upper cutoff (75) matches the
# temperature limit used in Failure_Risk_Score; the lower cutoff (60) is an
# illustrative choice -- TODO confirm against domain guidance.
# Open-ended outer edges guarantee every non-missing reading gets a group.
temp_bins = [-np.inf, 60, 75, np.inf]
temp_labels = ["Normal", "Elevated", "High"]

df["Temp_Group"] = pd.cut(df["Temperature_C"], bins=temp_bins, labels=temp_labels)

df[["Temperature_C", "Temp_Group"]].head()


### Rank Transformation

In [None]:
# Rank-based feature: percentile rank of each vibration reading within the
# dataset (values in (0, 1]; tied readings share their average rank).
# NOTE(review): the column to rank was not specified; vibration is used as
# an example -- confirm the intended variable.
df["Vibration_Rank"] = df["Vibration_RMS"].rank(pct=True)

df[["Vibration_RMS", "Vibration_Rank"]].head()




## Feature Scaling

In [None]:
# Columns to be scaled: raw sensor/context readings first, then the
# features engineered in the earlier sections.
numerical_features = [
    # raw readings
    "Load_Percent",
    "Vibration_RMS",
    "Temperature_C",
    "Pressure_bar",
    "Motor_Current_A",
    "RPM",
    "Sound_dB",
    "Humidity_%",
    "Hours_Since_Maintenance",
    "Asset_Age_Days",
    # engineered features
    "Days_Since_Maintenance",
    "Vib_Temp_Interaction",
    "Failure_Risk_Score",
]

df[numerical_features].head()


### Min-Max Scaling (Normalization)

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Rescale each numerical feature to the [0, 1] range.
# Work on a copy so the unscaled values in df stay available to later cells.
minmax_scaler = MinMaxScaler()
df_minmax = df.copy()
df_minmax[numerical_features] = minmax_scaler.fit_transform(df[numerical_features])

df_minmax[numerical_features].head()


### Standardization (Z-Score Scaling)

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardize each numerical feature to zero mean and unit variance.
# Work on a copy so the unscaled values in df stay available to later cells.
standard_scaler = StandardScaler()
df_standard = df.copy()
df_standard[numerical_features] = standard_scaler.fit_transform(df[numerical_features])

df_standard[numerical_features].head()


## Feature Selection

### Filter Methods — Correlation Method

In [None]:
target = "Failure"

# Candidate features: every int64/float64 column except the machine
# identifier and the target itself.
numerical_features_all = [
    column
    for column in df.select_dtypes(include=["int64", "float64"]).columns
    if column not in ("Machine_ID", target)
]

# Pairwise correlations between the features plus the target
# (the target is the last row/column of the matrix).
correlation_matrix = df[[*numerical_features_all, target]].corr()

# Heatmap of the correlation matrix
fig, ax = plt.subplots(figsize=(12, 7))
sns.heatmap(correlation_matrix, annot=False, cmap="coolwarm", ax=ax)
ax.set_title("Feature Correlation Matrix")
plt.show()


In [None]:
# Drop features with high correlation (threshold > 0.75)
threshold = 0.75

# Compare features only with each other. correlation_matrix also contains the
# target; leaving it in would flag the target itself for removal whenever a
# feature correlates strongly with it -- exactly the features worth keeping.
feature_corr = correlation_matrix.drop(
    index=target, columns=target, errors="ignore"
).abs()

# Keep the strict upper triangle so every pair is examined once and a column
# is only compared with the columns that precede it. Of each highly
# correlated pair, the later column is the one marked for removal.
upper_triangle = feature_corr.where(
    np.triu(np.ones(feature_corr.shape, dtype=bool), k=1)
)
correlated_features = {
    column
    for column in upper_triangle.columns
    if (upper_triangle[column] > threshold).any()
}

# Sorted so the printed list is stable from run to run
print("Highly Correlated Features to Remove:", sorted(correlated_features))

df_filtered = df.drop(columns=sorted(correlated_features))
df_filtered.head()


### Wrapper Methods (Recursive Feature Elimination - RFE)

In [None]:
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestClassifier

# Estimator whose feature importances drive the recursive elimination;
# the fixed seed makes the forest (and so the selection) repeatable.
model = RandomForestClassifier(
    random_state=42,
    n_estimators=100,
)


In [None]:
# Recursively eliminate features until the 10 strongest remain
rfe = RFE(estimator=model, n_features_to_select=10)
rfe.fit(df[numerical_features_all], df[target])

# Feature matrix restricted to the retained columns
X_rfe = rfe.transform(df[numerical_features_all])

# rfe.support_ is a boolean mask aligned with numerical_features_all;
# translate the True positions back into column names.
selected_features = [
    numerical_features_all[position] for position in np.flatnonzero(rfe.support_)
]

print("Selected Features using RFE:", selected_features)


### Decision Tree-based Feature Selection

In [None]:
# Fit a random forest on all candidate features
rf = RandomForestClassifier(random_state=42, n_estimators=100)
rf.fit(df[numerical_features_all], df[target])

# Pair each feature name with its importance and rank from most to least
# important.
importance_rows = list(zip(numerical_features_all, rf.feature_importances_))
feature_importances = pd.DataFrame(importance_rows, columns=["Feature", "Importance"])
feature_importances = feature_importances.sort_values("Importance", ascending=False)

print(feature_importances.head(20))


### Save selected features

In [None]:
# Keep the machine identifier, the RFE-selected features and the target,
# then write that subset to CSV without the row index.
export_columns = ["Machine_ID", *selected_features, target]
df_selected = df[export_columns]
df_selected.to_csv("predictive_maintenance_selected_features.csv", index=False)

df_selected.head()
