In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Data Ingestion

In [72]:
df = pd.read_csv("../Machine_Predictive_Data/predictive_maintenance.csv")
print(df.head())

   UDI Product ID Type  Air temperature [K]  Process temperature [K]  \
0    1     M14860    M                298.1                    308.6   
1    2     L47181    L                298.2                    308.7   
2    3     L47182    L                298.1                    308.5   
3    4     L47183    L                298.2                    308.6   
4    5     L47184    L                298.2                    308.7   

   Rotational speed [rpm]  Torque [Nm]  Tool wear [min]  Target Failure Type  
0                    1551         42.8                0       0   No Failure  
1                    1408         46.3                3       0   No Failure  
2                    1498         49.4                5       0   No Failure  
3                    1433         39.5                7       0   No Failure  
4                    1408         40.0                9       0   No Failure  


In [73]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 10 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   UDI                      10000 non-null  int64  
 1   Product ID               10000 non-null  object 
 2   Type                     10000 non-null  object 
 3   Air temperature [K]      10000 non-null  float64
 4   Process temperature [K]  10000 non-null  float64
 5   Rotational speed [rpm]   10000 non-null  int64  
 6   Torque [Nm]              10000 non-null  float64
 7   Tool wear [min]          10000 non-null  int64  
 8   Target                   10000 non-null  int64  
 9   Failure Type             10000 non-null  object 
dtypes: float64(3), int64(4), object(3)
memory usage: 781.4+ KB


In [74]:
df.isnull().sum()

UDI                        0
Product ID                 0
Type                       0
Air temperature [K]        0
Process temperature [K]    0
Rotational speed [rpm]     0
Torque [Nm]                0
Tool wear [min]            0
Target                     0
Failure Type               0
dtype: int64

# Data Preprocessing

In [77]:
df.drop(['UDI','Product ID'],axis=1, inplace=True)
df

Unnamed: 0,Type,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Target,Failure Type
0,M,298.1,308.6,1551,42.8,0,0,No Failure
1,L,298.2,308.7,1408,46.3,3,0,No Failure
2,L,298.1,308.5,1498,49.4,5,0,No Failure
3,L,298.2,308.6,1433,39.5,7,0,No Failure
4,L,298.2,308.7,1408,40.0,9,0,No Failure
...,...,...,...,...,...,...,...,...
9995,M,298.8,308.4,1604,29.5,14,0,No Failure
9996,H,298.9,308.4,1632,31.8,17,0,No Failure
9997,M,299.0,308.6,1645,33.4,22,0,No Failure
9998,H,299.0,308.7,1408,48.5,25,0,No Failure


In [79]:
df['Air temperature [c]'] = df['Air temperature [K]'] - 273.15
df['Process temperature [c]'] = df['Process temperature [K]'] - 273.15
df.drop(['Air temperature [K]', 'Process temperature [K]'], axis=1, inplace=True)
df.head()

Unnamed: 0,Type,Rotational speed [rpm],Torque [Nm],Tool wear [min],Target,Failure Type,Air temperature [c],Process temperature [c]
0,M,1551,42.8,0,0,No Failure,24.95,35.45
1,L,1408,46.3,3,0,No Failure,25.05,35.55
2,L,1498,49.4,5,0,No Failure,24.95,35.35
3,L,1433,39.5,7,0,No Failure,25.05,35.45
4,L,1408,40.0,9,0,No Failure,25.05,35.55


In [81]:
from sklearn.preprocessing import OrdinalEncoder
encoder = OrdinalEncoder(categories=[['L','M','H']])

df['Type'] = encoder.fit_transform(df[['Type']])
df

Unnamed: 0,Type,Rotational speed [rpm],Torque [Nm],Tool wear [min],Target,Failure Type,Air temperature [c],Process temperature [c]
0,1.0,1551,42.8,0,0,No Failure,24.95,35.45
1,0.0,1408,46.3,3,0,No Failure,25.05,35.55
2,0.0,1498,49.4,5,0,No Failure,24.95,35.35
3,0.0,1433,39.5,7,0,No Failure,25.05,35.45
4,0.0,1408,40.0,9,0,No Failure,25.05,35.55
...,...,...,...,...,...,...,...,...
9995,1.0,1604,29.5,14,0,No Failure,25.65,35.25
9996,2.0,1632,31.8,17,0,No Failure,25.75,35.25
9997,1.0,1645,33.4,22,0,No Failure,25.85,35.45
9998,2.0,1408,48.5,25,0,No Failure,25.85,35.55


In [83]:
df['Failure Type'].unique()

array(['No Failure', 'Power Failure', 'Tool Wear Failure',
       'Overstrain Failure', 'Random Failures',
       'Heat Dissipation Failure'], dtype=object)

In [84]:
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()

df['Failure Type'] = encoder.fit_transform(df['Failure Type'])
df.head()

Unnamed: 0,Type,Rotational speed [rpm],Torque [Nm],Tool wear [min],Target,Failure Type,Air temperature [c],Process temperature [c]
0,1.0,1551,42.8,0,0,1,24.95,35.45
1,0.0,1408,46.3,3,0,1,25.05,35.55
2,0.0,1498,49.4,5,0,1,24.95,35.35
3,0.0,1433,39.5,7,0,1,25.05,35.45
4,0.0,1408,40.0,9,0,1,25.05,35.55


In [85]:
df['Failure Type'].unique()

array([1, 3, 5, 2, 4, 0])

In [86]:
encoder.classes_

array(['Heat Dissipation Failure', 'No Failure', 'Overstrain Failure',
       'Power Failure', 'Random Failures', 'Tool Wear Failure'],
      dtype=object)

In [87]:
classes = [0, 1, 2, 3,4, 5]
encoder.inverse_transform(classes)

array(['Heat Dissipation Failure', 'No Failure', 'Overstrain Failure',
       'Power Failure', 'Random Failures', 'Tool Wear Failure'],
      dtype=object)

In [88]:
df['Failure Type'].value_counts()

Failure Type
1    9652
0     112
3      95
2      78
5      45
4      18
Name: count, dtype: int64

In [90]:
df.groupby('Target')['Failure Type'].value_counts()

Target  Failure Type
0       1               9643
        4                 18
1       0                112
        3                 95
        2                 78
        5                 45
        1                  9
Name: count, dtype: int64

In [91]:
df['Target'].value_counts()

Target
0    9661
1     339
Name: count, dtype: int64

# Data Splitting

In [92]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from imblearn.combine import SMOTEENN

scaler = MinMaxScaler()
scaler_cols = ['Rotational speed [rpm]', 'Torque [Nm]', 'Tool wear [min]', 'Air temperature [c]', 'Process temperature [c]']

X = df.drop(['Target','Failure Type'], axis = 1)
y = df['Target']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

X_train[scaler_cols] = scaler.fit_transform(X_train[scaler_cols])
X_test[scaler_cols] = scaler.transform(X_test[scaler_cols])

smote =  SMOTEENN(sampling_strategy="minority")
X_train_resample, y_train_resample = smote.fit_resample(X_train, y_train)




In [95]:
y_train_resample

0        0
1        0
2        0
3        0
4        0
        ..
14607    1
14608    1
14609    1
14610    1
14611    1
Name: Target, Length: 14612, dtype: int64

# Model Training

In [93]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score, classification_report

lr = LogisticRegression()
sv = SVC()
dt = DecisionTreeClassifier()
rf = RandomForestClassifier()

models = [lr, sv, dt, rf]
scores = []

for model in models:
    model.fit(X_train_resample, y_train_resample)
    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test, y_pred) * 100
    prec = precision_score(y_test, y_pred) * 100
    rec = recall_score(y_test, y_pred) * 100
    f1 = f1_score(y_test, y_pred) * 100
    scores.append([acc, prec, rec, f1])

In [94]:
scores_df = pd.DataFrame(columns=['Model'], data=['Logistic Regression', 'SVC', 'Decision Tree', 'Random Forest'])
scores_df = pd.concat([scores_df, pd.DataFrame(scores, columns=['Accuracy', 'Precision', 'Recall', 'F1'])], axis=1 )
scores_df

Unnamed: 0,Model,Accuracy,Precision,Recall,F1
0,Logistic Regression,80.25,11.697248,83.606557,20.523139
1,SVC,88.5,19.928826,91.803279,32.748538
2,Decision Tree,94.2,31.034483,73.770492,43.68932
3,Random Forest,94.95,35.714286,81.967213,49.751244
