In [23]:
import pandas as pd
pred_df = pd.read_csv('/content/drive/MyDrive/Datasets/Predictive Maintenance System Data/predictive_maintenance.csv')
pred_df.head()

Unnamed: 0,UDI,Product ID,Type,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Target,Failure Type
0,1,M14860,M,298.1,308.6,1551,42.8,0,0,No Failure
1,2,L47181,L,298.2,308.7,1408,46.3,3,0,No Failure
2,3,L47182,L,298.1,308.5,1498,49.4,5,0,No Failure
3,4,L47183,L,298.2,308.6,1433,39.5,7,0,No Failure
4,5,L47184,L,298.2,308.7,1408,40.0,9,0,No Failure


## **The columns in the dataset are,**

###**UID:**
unique identifier ranging from 1 to 10000

### **productID:**
 consisting of a letter L, M, or H for low (50% of all products), medium (30%), and high (20%) as product quality variants and a variant-specific serial number

### **air temperature [K]:**
generated using a random walk process later normalized to a standard deviation of 2 K around 300 K

### **process temperature [K]:**
 generated using a random walk process normalized to a standard deviation of 1 K, added to the air temperature plus 10 K.

### **rotational speed [rpm]:**
calculated from powepower of 2860 W, overlaid with a normally distributed noise

### **torque [Nm]:**
torque values are normally distributed around 40 Nm with an Ïƒ = 10 Nm and no negative values.

### **tool wear [min]:**
The quality variants H/M/L add 5/3/2 minutes of tool wear to the used tool in the process. and a
'machine failure' label that indicates, whether the machine has failed in this particular data point for any of the following failure modes are true.

## **Important : There are two Targets - Do not make the mistake of using one of them as feature, as it will lead to leakage.**

## **Target :**  
Failure or Not

## **Failure Type :**
Type of Failure

In [24]:
pred_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 10 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   UDI                      10000 non-null  int64  
 1   Product ID               10000 non-null  object 
 2   Type                     10000 non-null  object 
 3   Air temperature [K]      10000 non-null  float64
 4   Process temperature [K]  10000 non-null  float64
 5   Rotational speed [rpm]   10000 non-null  int64  
 6   Torque [Nm]              10000 non-null  float64
 7   Tool wear [min]          10000 non-null  int64  
 8   Target                   10000 non-null  int64  
 9   Failure Type             10000 non-null  object 
dtypes: float64(3), int64(4), object(3)
memory usage: 781.4+ KB


In [25]:
pred_df['Target'].value_counts()

Unnamed: 0_level_0,count
Target,Unnamed: 1_level_1
0,9661
1,339


In [26]:
pred_df['Failure Type'].value_counts()

Unnamed: 0_level_0,count
Failure Type,Unnamed: 1_level_1
No Failure,9652
Heat Dissipation Failure,112
Power Failure,95
Overstrain Failure,78
Tool Wear Failure,45
Random Failures,18


In [27]:
X=pred_df.drop(['Target','Failure Type'],axis=1)
y_target=pred_df['Target']
y_failure=pred_df['Failure Type']

In [28]:
#Split the data into train and test
from sklearn.model_selection import train_test_split

#For Target Column
X_train, X_test, y_train, y_test = train_test_split(X, y_target, test_size=0.2, random_state=42,stratify=y_target)

#For Failure Type Column
X_train_failure, X_test_failure, y_train_failure, y_test_failure = train_test_split(X, y_failure, test_size=0.2, random_state=42,stratify=y_failure)

In [29]:
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Identify categorical columns
categorical_features = ['Product ID', 'Type']
numerical_features = X_train.select_dtypes(include=['int64', 'float64']).columns.tolist()

# Create a column transformer for one-hot encoding
preprocessor = ColumnTransformer(
    transformers=[
        ('num', 'passthrough', numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ],
    remainder='passthrough' # Keep other columns (like UDI, if not in numerical_features)
)

# Create a pipeline with preprocessing and SMOTE
pipeline = Pipeline([
    ('preprocessor', preprocessor)
])

# Apply preprocessing to the training data
X_train_processed = pipeline.fit_transform(X_train)

# Apply SMOTE to the preprocessed training data on Target column
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_processed, y_train)


print("Before:", y_train.value_counts())
print("After:", y_train_resampled.value_counts())

Before: Target
0    7729
1     271
Name: count, dtype: int64
After: Target
0    7729
1    7729
Name: count, dtype: int64


In [30]:
# Apply SMOTE to failure type column
smote = SMOTE(random_state=42)
X_train_resampled_failure, y_train_resampled_failure = smote.fit_resample(X_train_processed, y_train_failure)


print("Before:", y_train_failure.value_counts())
print("After:", y_train_resampled_failure.value_counts())

Before: Failure Type
No Failure                  7722
Heat Dissipation Failure      90
Power Failure                 76
Overstrain Failure            62
Tool Wear Failure             36
Random Failures               14
Name: count, dtype: int64
After: Failure Type
No Failure                  7722
Tool Wear Failure           7722
Random Failures             7722
Power Failure               7722
Heat Dissipation Failure    7722
Overstrain Failure          7722
Name: count, dtype: int64


In [31]:
#Feature Scaling
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

#Target (Binary)
X_train_scaled = scaler.fit_transform(X_train_resampled.toarray())
X_test_scaled = scaler.transform(pipeline.transform(X_test).toarray())

#Failure Type (Multi-Class)
X_train_scaled_failure = scaler.fit_transform(X_train_resampled_failure.toarray())
X_test_scaled_failure = scaler.transform(pipeline.transform(X_test_failure).toarray())

In [33]:
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier

#Model Training for Binary Classifiaction (Target)
Target_model=XGBClassifier(random_state=42)
Target_model.fit(X_train_scaled,y_train_resampled)

# Encode string labels to numerical labels for Failure Type
label_encoder = LabelEncoder()
y_train_resampled_failure_encoded = label_encoder.fit_transform(y_train_resampled_failure)

#Model Training for Multi-Class Classification (Failure Type)
FailureType_model=XGBClassifier(random_state=42)
FailureType_model.fit(X_train_scaled_failure,y_train_resampled_failure_encoded)

In [37]:
from sklearn.metrics import accuracy_score, classification_report

#For Target Column
y_pred = Target_model.predict(X_test_scaled)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.9805


In [39]:
#For Failure Type
y_pred_failure = FailureType_model.predict(X_test_scaled_failure)
accuracy_failure = accuracy_score(y_test_failure, y_pred_failure)
print("Accuracy:", accuracy_failure)

Accuracy: 0.0


In [40]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99      1932
           1       0.69      0.78      0.73        68

    accuracy                           0.98      2000
   macro avg       0.84      0.88      0.86      2000
weighted avg       0.98      0.98      0.98      2000



In [43]:
# Encode y_test_failure using the same encoder fitted on training data
y_test_failure_encoded = label_encoder.transform(y_test_failure)

print(classification_report(y_test_failure_encoded, y_pred_failure))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00        22
           1       0.97      0.99      0.98      1930
           2       0.00      0.00      0.00        16
           3       0.00      0.00      0.00        19
           4       0.00      0.00      0.00         4
           5       0.00      0.00      0.00         9

    accuracy                           0.95      2000
   macro avg       0.16      0.16      0.16      2000
weighted avg       0.93      0.95      0.94      2000

