# Binary Classification of Machine Failures

This notebook is an atempt to solve the problem presented in the Kaggle competition:
 > https://www.kaggle.com/competitions/playground-series-s3e17/overview/description 
 

In [3]:
# Import all the tools you will need
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

%matplotlib inline

from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import RocCurveDisplay, roc_auc_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import pickle

## 1. Very short EDA

In [4]:
df = pd.read_csv("train.csv")
df

Unnamed: 0,id,Product ID,Type,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Machine failure,TWF,HDF,PWF,OSF,RNF
0,0,L50096,L,300.6,309.6,1596,36.1,140,0,0,0,0,0,0
1,1,M20343,M,302.6,312.1,1759,29.1,200,0,0,0,0,0,0
2,2,L49454,L,299.3,308.5,1805,26.5,25,0,0,0,0,0,0
3,3,L53355,L,301.0,310.9,1524,44.3,197,0,0,0,0,0,0
4,4,M24050,M,298.0,309.0,1641,35.4,34,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
136424,136424,M22284,M,300.1,311.4,1530,37.5,210,0,0,0,0,0,0
136425,136425,H38017,H,297.5,308.5,1447,49.1,2,0,0,0,0,0,0
136426,136426,L54690,L,300.5,311.8,1524,38.5,214,0,0,0,0,0,0
136427,136427,L53876,L,301.7,310.9,1447,46.3,42,0,0,0,0,0,0


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 136429 entries, 0 to 136428
Data columns (total 14 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   id                       136429 non-null  int64  
 1   Product ID               136429 non-null  object 
 2   Type                     136429 non-null  object 
 3   Air temperature [K]      136429 non-null  float64
 4   Process temperature [K]  136429 non-null  float64
 5   Rotational speed [rpm]   136429 non-null  int64  
 6   Torque [Nm]              136429 non-null  float64
 7   Tool wear [min]          136429 non-null  int64  
 8   Machine failure          136429 non-null  int64  
 9   TWF                      136429 non-null  int64  
 10  HDF                      136429 non-null  int64  
 11  PWF                      136429 non-null  int64  
 12  OSF                      136429 non-null  int64  
 13  RNF                      136429 non-null  int64  
dtypes: f

In [6]:
df.isna().sum()

id                         0
Product ID                 0
Type                       0
Air temperature [K]        0
Process temperature [K]    0
Rotational speed [rpm]     0
Torque [Nm]                0
Tool wear [min]            0
Machine failure            0
TWF                        0
HDF                        0
PWF                        0
OSF                        0
RNF                        0
dtype: int64

In [7]:
df["Machine failure"].value_counts()

0    134281
1      2148
Name: Machine failure, dtype: int64

## 2. Get the data ready

### 2.1 Create a function to transform the dataset

The function will do the following:
* remove the `Product Id` column since its useless and a nightmare to encode 
* encode the `Type` column and transform all the 0 and 1 into boolean values since bool takes less space in computer memory than int does
* set the index of our DataFrame to be the `id` column (makes the df cleaner)

In [33]:
def get_data_ready(df, number_of_columns, is_test=False):
    # Remove the Product ID column
    df = df.drop("Product ID", axis=1)
    
    df = pd.get_dummies(df, prefix=["Type"], columns=["Type"])
    to_binary_column_names = {"TWF", "HDF", "PWF", "OSF", "RNF", "Type_H", "Type_L", "Type_M"}
    for i in range(0, number_of_columns):
        for column in to_binary_column_names:
            if (df[column][i]==1):
                df[column][i] = True
            else:
                df[column][i] = False
    for column in to_binary_column_names:
        df[column] = df[column].astype("bool")
        
    if(is_test==False):
        df["Machine failure"] = df["Machine failure"].astype("int")
        
    df.set_index("id", inplace=True)
    return df

In [9]:
transformed_df = get_data_ready(df)
transformed_df

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[column][i] = False
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[column][i] = True


Unnamed: 0_level_0,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Machine failure,TWF,HDF,PWF,OSF,RNF,Type_H,Type_L,Type_M
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
0,300.6,309.6,1596,36.1,140,0,False,False,False,False,False,False,True,False
1,302.6,312.1,1759,29.1,200,0,False,False,False,False,False,False,False,True
2,299.3,308.5,1805,26.5,25,0,False,False,False,False,False,False,True,False
3,301.0,310.9,1524,44.3,197,0,False,False,False,False,False,False,True,False
4,298.0,309.0,1641,35.4,34,0,False,False,False,False,False,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
136424,300.1,311.4,1530,37.5,210,0,False,False,False,False,False,False,False,True
136425,297.5,308.5,1447,49.1,2,0,False,False,False,False,False,True,False,False
136426,300.5,311.8,1524,38.5,214,0,False,False,False,False,False,False,True,False
136427,301.7,310.9,1447,46.3,42,0,False,False,False,False,False,False,True,False


In [10]:
transformed_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 136429 entries, 0 to 136428
Data columns (total 14 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   Air temperature [K]      136429 non-null  float64
 1   Process temperature [K]  136429 non-null  float64
 2   Rotational speed [rpm]   136429 non-null  int64  
 3   Torque [Nm]              136429 non-null  float64
 4   Tool wear [min]          136429 non-null  int64  
 5   Machine failure          136429 non-null  int32  
 6   TWF                      136429 non-null  bool   
 7   HDF                      136429 non-null  bool   
 8   PWF                      136429 non-null  bool   
 9   OSF                      136429 non-null  bool   
 10  RNF                      136429 non-null  bool   
 11  Type_H                   136429 non-null  bool   
 12  Type_L                   136429 non-null  bool   
 13  Type_M                   136429 non-null  bool   
dtypes: b

In [11]:
df["Machine failure"].value_counts()

0    134281
1      2148
Name: Machine failure, dtype: int64

### 2.2 Split the data

In [12]:
# Split the data into X and y
X = transformed_df.drop("Machine failure", axis=1)
y = transformed_df["Machine failure"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

### 2.3 Balance the data

Since we have a very imbalanced data set we want to balance the data. In order to do that we will use the `SMOTE()` oversampling function and apply it to our data. 

In [13]:
# Implementing the SMOTE function to balcance our data
from imblearn.over_sampling import SMOTE

smote = SMOTE(random_state=42) 
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

### 2.4 Scaling the data

I was curious if scaling the data would make any difference. I have heard about this function caled `StandardScaler()` so I quickly read throug documentation and applied it. The models, which were trained on the Scaled data performed the same and their scores indicates that they were about random. So i started searching on why that may be and I came across this awesome article on medium:
> https://towardsdatascience.com/all-about-feature-scaling-bcc0ad75cb35

It turns out that models I use does not benefit from any form of scaled data, because RandomForest-based models are rule-based models and those simply do not benefit from scaled or normalized data. 

It's also posible that I applied the `StandardScaler()` function wrong. Neverthanless I will leave the code for the scaled models for science. 

In [14]:
# Implementign the StandardScaler function
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

In [15]:
# Balance the standarized data
X_train_smote_scaled, y_train_smote_scaled = smote.fit_resample(X_train_scaled, y_train)

## 3. Train the models

### 3.1 RandomForestClassifier

In [16]:
%%time
# Fit the model to the data

# Setup random state
np.random.seed(42)

# Fit the model
clf = RandomForestClassifier(n_jobs=-1)
clf.fit(X_train_smote, y_train_smote)

# Make predictions and score the model
predictions = clf.predict(X_test)
print(f"The AUC score for RandomForestClf is: {roc_auc_score(y_true=y_test, y_score=predictions)}")

The AUC score for RandomForestClf is: 0.8794368964194056
CPU times: total: 29.5 s
Wall time: 4.6 s


**Note:** The cell below is the experiment i mentioned above in section about data standarization and the model created in this cell is basicly useless. 

In [17]:
%%time
# Fit the model to the data with scaled data

# Setup random state
np.random.seed(42)

# Fit the model
clf_scaled_data = RandomForestClassifier(n_jobs=-1)
clf_scaled_data.fit(X_train_smote_scaled, y_train_smote_scaled)

# Make predictions and score the model
predictions_random_forest_clf_scaled = clf_scaled_data.predict(X_test)
print(f"The AUC score for RandomForestClf with scaled data is: {roc_auc_score(y_true=y_test, y_score=predictions_random_forest_clf_scaled)}")

The AUC score for RandomForestClf with scaled data is: 0.5
CPU times: total: 42.2 s
Wall time: 6.16 s




### 3.2 LogisticRegression

In [18]:
%%time

# Setup random seed
np.random.seed(42)

# Fit the model 
model = LogisticRegression(n_jobs=-1)
model.fit(X_train_smote, y_train_smote)

# Make predictions and score the model
predictions_model = model.predict(X_test)
print(f"The AUC score for LogisticRegresion is: {roc_auc_score(y_true=y_test, y_score=predictions_model)}")

The AUC score for LogisticRegresion is: 0.8707840547484713
CPU times: total: 31.2 ms
Wall time: 3.52 s


**Note:** The cell below is the experiment i mentioned above in section about data standarization and the model created in this cell is basicly useless. 

In [19]:
%%time
# Fit the LogisticRegression with scaled data
# Setup random seed
np.random.seed(42)

# Fit the model 
model_scaled_data = LogisticRegression(n_jobs=-1)
model_scaled_data.fit(X_train_smote_scaled, y_train_smote_scaled)

# Make predictions and score the model
predictions_model_scaled_data = model_scaled_data.predict(X_test)
print(f"The AUC score for LogisticRegresion with scaled data is: {roc_auc_score(y_true=y_test, y_score=predictions_model_scaled_data)}")

The AUC score for LogisticRegresion with scaled data is: 0.5
CPU times: total: 46.9 ms
Wall time: 1.28 s




## 4. Hyperparameter tuning

### 4.1 Hyperparameter tuning for RandomForestClassifier

**Note:** If you use `GridSearchCV()` in the cell below it may take as much as about 20h to run! You can estimate for how much it will be runing by taking the time that took your computer to go through first fiting of `RandomForestClassifier()` and multiplying it by 9500 (these are all the possibilities that your computer needs to compute). 

Therefore I have decided to go with `RandomizedSearchCV()` instead. I opted for a hundred iterations (500 fits in total due to 5-fold cross-validation) and it will take ... to compute.

In [22]:
%%time
# Setup RandomSearchCV for RandomForestClassifier
random_forest_clf_param_grid = {"n_estimators": np.arange(100, 2000, 100),
                                "max_depth": (5, 8, 15, 25, 30),
                                "min_samples_split": (2, 5, 10, 15, 100),
                                "min_samples_leaf": (1, 2, 5, 10)}
gs_random_forest_clf = RandomizedSearchCV(RandomForestClassifier(), 
                                          param_distributions=random_forest_clf_param_grid,
                                          verbose=True,
                                          cv=5, 
                                          n_jobs=-1, 
                                          scoring="roc_auc",
                                          n_iter=100, 
                                          random_state=42)
# Fit
gs_random_forest_clf.fit(X_train_smote, y_train_smote)

# Save the best model
pickle.dump(gs_random_forest_clf, open("gs_random_forest_clf.pkl", "wb"))

# Display the score of the best model
random_forest_clf_best_preds = gs_random_forest_clf.predict(X_test)
print(f"The AUC score for tuned RandomForestClassifier is: {roc_auc_score(y_true=y_test, y_score=random_forest_clf_best_preds)}")

Fitting 5 folds for each of 100 candidates, totalling 500 fits
The AUC score for tuned RandomForestClassifier is: 0.8794554989940019
CPU times: total: 1min 35s
Wall time: 5h 9min 17s


### 4.2 Hyperparameter tuning for LogisticRegression

**Note:** This cell may take a few hours to run. You can estimate how much time (in seconds) it will run by multiplying the time from the first `LogisticRegression()` run by 2250 (number of fits for `GridSearchCV`) and make sure you have something else to do in the meantime. 

In [23]:
%%time
logistic_regression_param_grid={"C": (0.1, 1, 10, 100, 1000),
                                "max_iter": np.arange(100,1000,10)}
gs_logistic_regression= RandomizedSearchCV(LogisticRegression(),
                                           param_distributions=logistic_regression_param_grid,
                                           cv=5,
                                           verbose=True,
                                           n_jobs=-1,
                                           scoring="roc_auc",
                                           n_iter=100,
                                           random_state=42)
# Fit
gs_logistic_regression.fit(X_train_smote, y_train_smote)

# Save the model 
pickle.dump(gs_logistic_regression, open("gs_logistic_regression.pkl", "wb"))

# Display the score of the best model 
gs_logistic_regression_preds = gs_logistic_regression.predict(X_test)
print(f"The AUC score for tuned LogisticRegression model is: {roc_auc_score(y_test, gs_logistic_regression_preds)}")

Fitting 5 folds for each of 100 candidates, totalling 500 fits
The AUC score for tuned LogisticRegression model is: 0.874590743358881
CPU times: total: 22.9 s
Wall time: 7min 32s


## 5 Making predictions

In [25]:
test_df = pd.read_csv("test.csv")
test_df

Unnamed: 0,id,Product ID,Type,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],TWF,HDF,PWF,OSF,RNF
0,136429,L50896,L,302.3,311.5,1499,38.0,60,0,0,0,0,0
1,136430,L53866,L,301.7,311.0,1713,28.8,17,0,0,0,0,0
2,136431,L50498,L,301.3,310.4,1525,37.7,96,0,0,0,0,0
3,136432,M21232,M,300.1,309.6,1479,47.6,5,0,0,0,0,0
4,136433,M19751,M,303.4,312.3,1515,41.3,114,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
90949,227378,L51130,L,302.3,311.4,1484,40.4,15,0,0,0,0,0
90950,227379,L47783,L,297.9,309.8,1542,33.8,31,0,0,0,0,0
90951,227380,L48097,L,295.6,306.2,1501,41.4,187,0,0,0,0,0
90952,227381,L48969,L,298.1,307.8,1534,40.3,69,0,0,0,0,0


In [34]:
transformed_test_df = get_data_ready(test_df, 90954, is_test=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[column][i] = False
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[column][i] = True


In [35]:
transformed_test_df

Unnamed: 0_level_0,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],TWF,HDF,PWF,OSF,RNF,Type_H,Type_L,Type_M
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
136429,302.3,311.5,1499,38.0,60,False,False,False,False,False,False,True,False
136430,301.7,311.0,1713,28.8,17,False,False,False,False,False,False,True,False
136431,301.3,310.4,1525,37.7,96,False,False,False,False,False,False,True,False
136432,300.1,309.6,1479,47.6,5,False,False,False,False,False,False,False,True
136433,303.4,312.3,1515,41.3,114,False,False,False,False,False,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...
227378,302.3,311.4,1484,40.4,15,False,False,False,False,False,False,True,False
227379,297.9,309.8,1542,33.8,31,False,False,False,False,False,False,True,False
227380,295.6,306.2,1501,41.4,187,False,False,False,False,False,False,True,False
227381,298.1,307.8,1534,40.3,69,False,False,False,False,False,False,True,False


In [44]:
predictions = gs_random_forest_clf.predict(transformed_test_df)

In [45]:
predictions_df = pd.DataFrame(columns=("id", "Machine failure"))
predictions_df["id"] = test_df["id"]
predictions_df["Machine failure"]=predictions
predictions_df.set_index("id", inplace=True)
predictions_df

Unnamed: 0_level_0,Machine failure
id,Unnamed: 1_level_1
136429,0
136430,0
136431,0
136432,0
136433,0
...,...
227378,0
227379,0
227380,0
227381,0


In [46]:
predictions_df.to_csv("predictions.csv")