In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Load the dataset
df = pd.read_csv("data/predictive_maintenance.csv")

df.head()

Unnamed: 0,UDI,Product ID,Type,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Target,Failure Type
0,1,M14860,M,298.1,308.6,1551,42.8,0,0,No Failure
1,2,L47181,L,298.2,308.7,1408,46.3,3,0,No Failure
2,3,L47182,L,298.1,308.5,1498,49.4,5,0,No Failure
3,4,L47183,L,298.2,308.6,1433,39.5,7,0,No Failure
4,5,L47184,L,298.2,308.7,1408,40.0,9,0,No Failure


In [2]:
# Drop identifier columns that don't add predictive value
df = df.drop(['UDI', 'Product ID'], axis=1)

# Display the first few rows to verify
df.head()

Unnamed: 0,Type,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Target,Failure Type
0,M,298.1,308.6,1551,42.8,0,0,No Failure
1,L,298.2,308.7,1408,46.3,3,0,No Failure
2,L,298.1,308.5,1498,49.4,5,0,No Failure
3,L,298.2,308.6,1433,39.5,7,0,No Failure
4,L,298.2,308.7,1408,40.0,9,0,No Failure


In [3]:
# Create new feature: Torque per unit of tool wear
df['Torque_per_Wear'] = df['Torque [Nm]'] / (df['Tool wear [min]'] + 1)

# Verify the new feature
df[['Torque [Nm]', 'Tool wear [min]', 'Torque_per_Wear']].head()


Unnamed: 0,Torque [Nm],Tool wear [min],Torque_per_Wear
0,42.8,0,42.8
1,46.3,3,11.575
2,49.4,5,8.233333
3,39.5,7,4.9375
4,40.0,9,4.0


In [4]:
# One-hot encode the 'Type' column
df = pd.get_dummies(df, columns=['Type'], drop_first=True)

# Check the updated dataframe
df.head()


Unnamed: 0,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Target,Failure Type,Torque_per_Wear,Type_L,Type_M
0,298.1,308.6,1551,42.8,0,0,No Failure,42.8,False,True
1,298.2,308.7,1408,46.3,3,0,No Failure,11.575,True,False
2,298.1,308.5,1498,49.4,5,0,No Failure,8.233333,True,False
3,298.2,308.6,1433,39.5,7,0,No Failure,4.9375,True,False
4,298.2,308.7,1408,40.0,9,0,No Failure,4.0,True,False


In [5]:
from sklearn.preprocessing import StandardScaler

# Define numerical features
num_features = ['Air temperature [K]', 'Process temperature [K]', 'Rotational speed [rpm]', 
                'Torque [Nm]', 'Tool wear [min]', 'Torque_per_Wear']

scaler = StandardScaler()
df[num_features] = scaler.fit_transform(df[num_features])

# Verify scaling by checking the summary statistics
df[num_features].describe()


Unnamed: 0,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Torque_per_Wear
count,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0
mean,-9.322321e-16,-1.693934e-15,-2.33058e-16,5.424994e-16,1.051603e-16,-4.7606360000000007e-17
std,1.00005,1.00005,1.00005,1.00005,1.00005,1.00005
min,-2.352278,-2.901986,-2.068196,-3.630149,-1.695984,-0.2742589
25%,-0.8523974,-0.8125581,-0.6458012,-0.6808401,-0.8633176,-0.2291022
50%,0.04753123,0.0636534,-0.1995597,0.01134481,0.0007698234,-0.2013709
75%,0.7474757,0.7376623,0.4084443,0.6834663,0.8491466,-0.1245191
max,2.247357,2.557486,7.51484,3.672902,2.278819,14.18763


In [6]:
# For binary classification, separate the features and target variable
X = df.drop(['Target', 'Failure Type'], axis=1, errors='ignore')
y = df['Target']

# Split into training and testing sets
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Check the shape of the training data
print("Training set shape:", X_train.shape)
print("Test set shape:", X_test.shape)


Training set shape: (8000, 8)
Test set shape: (2000, 8)
