## Model Training

### Import Data and Required Packages

#### Importing pandas, NumPy, Matplotlib, seaborn, scikit-learn, and the warnings library

In [1]:
import warnings

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Modelling
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

#### Import CSV Data as Pandas DataFrame

In [2]:
# Load the manufacturing defect dataset; the path is relative to the
# directory the notebook kernel is started from.
dataset_path = 'data/manufacturing_defect_dataset.csv'
df = pd.read_csv(dataset_path)

#### Show Top 5 Records

In [3]:
# Preview the first five rows to sanity-check the columns that were loaded.
df.head(n=5)

Unnamed: 0,ProductionVolume,ProductionCost,SupplierQuality,DeliveryDelay,DefectRate,QualityScore,MaintenanceHours,DowntimePercentage,InventoryTurnover,StockoutRate,WorkerProductivity,SafetyIncidents,EnergyConsumption,EnergyEfficiency,AdditiveProcessTime,AdditiveMaterialCost,DefectStatus
0,202,13175.403783,86.648534,1,3.121492,63.463494,9,0.052343,8.630515,0.081322,85.042379,0,2419.616785,0.468947,5.551639,236.439301,1
1,535,19770.046093,86.310664,4,0.819531,83.697818,20,4.908328,9.296598,0.038486,99.657443,7,3915.566713,0.119485,9.080754,353.957631,1
2,960,19060.820997,82.132472,0,4.514504,90.35055,1,2.464923,5.097486,0.002887,92.819264,2,3392.385362,0.496392,6.562827,396.189402,1
3,370,5647.606037,87.335966,5,0.638524,67.62869,8,4.692476,3.577616,0.055331,96.887013,8,4652.400275,0.183125,8.097496,164.13587,1
4,206,7472.222236,81.989893,3,3.867784,82.728334,9,2.746726,6.851709,0.068047,88.315554,7,1581.630332,0.263507,6.406154,365.708964,1


### Data Preparation

#### Preparing X and Y variables

In [4]:
# Feature matrix: every column except the target label.
X = df.drop(columns='DefectStatus')

In [5]:
# Target vector: the DefectStatus label the classifiers will predict.
y = df.loc[:, 'DefectStatus']

#### Scaling and Train Test Split

In [6]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split
# identical on every Restart & Run All.
# NOTE(review): the recorded confusion matrix shows an imbalanced test set
# (148 vs 824 rows); consider passing stratify=y here to keep the class ratio
# the same in both splits.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Every row must land in exactly one of the two splits.
assert len(X_train) + len(X_test) == len(X)

# Fit the scaler on the training rows only, then reuse those statistics for
# the test rows so that no test-set information leaks into training.
scaler = StandardScaler()
scaled_X_train = scaler.fit_transform(X_train)
scaled_X_test = scaler.transform(X_test)

#### Create a Function to display all Metrics after Training

In [7]:
def evaluate_model(true, predicted):
    """Score a set of predictions against the ground-truth labels.

    Args:
        true: Ground-truth class labels.
        predicted: Labels predicted by a model, aligned with ``true``.

    Returns:
        A 4-tuple ``(accuracy, precision, recall, f1)`` in that order, each
        computed by the matching ``sklearn.metrics`` function called with its
        default settings.
    """
    # The order of this tuple fixes the order of the returned scores.
    scorers = (accuracy_score, precision_score, recall_score, f1_score)
    return tuple(scorer(true, predicted) for scorer in scorers)

### Choosing the Best Model

In [8]:
# Candidate classifiers, all with default hyperparameters. The tree-based and
# boosting estimators are randomised, so they are seeded (with the same seed as
# the train/test split) to make the comparison reproducible across runs.
models = {
    "Logistic Regression": LogisticRegression(),
    "K-Neighbors Classifier": KNeighborsClassifier(),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Random Forest Classifier": RandomForestClassifier(random_state=42),
    "AdaBoost Classifier": AdaBoostClassifier(random_state=42),
    "Support Vector Classifier": SVC(),
}

# Parallel lists: model_list[i] is the name whose test F1 score is f1_list[i].
model_list = []
f1_list = []

for model_name, model in models.items():
    # Train on the scaled training split.
    model.fit(scaled_X_train, y_train)

    # Make predictions on the held-out split.
    y_test_pred = model.predict(scaled_X_test)

    # Evaluate performance on the held-out split.
    accuracy, precision, recall, f1_score_eval = evaluate_model(y_test, y_test_pred)

    model_list.append(model_name)
    f1_list.append(f1_score_eval)

    print(model_name)
    print(f"- Accuracy: {accuracy:.4f}")
    print(f"- Precision: {precision:.4f}")
    print(f"- Recall: {recall:.4f}")
    print(f"- F1 Score: {f1_score_eval:.4f}")

    print('----------------------------------')
    print('\n')

Logistic Regression
- Accuracy: 0.8807
- Precision: 0.8960
- Recall: 0.9721
- F1 Score: 0.9325
----------------------------------


K-Neighbors Classifier
- Accuracy: 0.8745
- Precision: 0.8971
- Recall: 0.9624
- F1 Score: 0.9286
----------------------------------


Decision Tree
- Accuracy: 0.9033
- Precision: 0.9620
- Recall: 0.9223
- F1 Score: 0.9418
----------------------------------


Random Forest Classifier
- Accuracy: 0.9588
- Precision: 0.9623
- Recall: 0.9903
- F1 Score: 0.9761
----------------------------------


AdaBoost Classifier
- Accuracy: 0.9136
- Precision: 0.9312
- Recall: 0.9697
- F1 Score: 0.9501
----------------------------------


Support Vector Classifier
- Accuracy: 0.8920
- Precision: 0.9053
- Recall: 0.9745
- F1 Score: 0.9386
----------------------------------




#### Results

In [9]:
# Leaderboard of the candidate models, best test F1 score first.
(
    pd.DataFrame({'Model Name': model_list, 'F1_Score': f1_list})
    .sort_values(by='F1_Score', ascending=False)
)

Unnamed: 0,Model Name,F1_Score
3,Random Forest Classifier,0.976077
4,AdaBoost Classifier,0.950059
2,Decision Tree,0.94176
5,Support Vector Classifier,0.938632
0,Logistic Regression,0.93248
1,K-Neighbors Classifier,0.928571


### Random Forest Classifier

In [10]:
# Refit the best-scoring model on its own so it can be inspected in the cells
# below. random_state pins the forest's randomness; without it the F1 score,
# confusion matrix and feature importances change on every run.
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(scaled_X_train, y_train)

# Predictions on the held-out split, reused by the confusion-matrix cell.
y_pred = rf_model.predict(scaled_X_test)

# Report the F1 score as a percentage.
score = f1_score(y_test, y_pred) * 100
print(f"F1 Score of the model is {score:.2f}")

F1 Score of the model is 97.61


#### Confusion Matrix

In [11]:
# Confusion matrix of the Random Forest on the held-out split
# (scikit-learn convention: rows are true classes, columns are predicted classes).
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
cm

array([[116,  32],
       [  8, 816]], dtype=int64)

#### Feature Importance

In [12]:
# Pair each feature name with the importance the fitted Random Forest gave it.
# X.columns is in the same order as the columns the model was trained on.
importance_columns = {
    'Feature': X.columns,
    'Importance': rf_model.feature_importances_,
}
df_feature_importances = pd.DataFrame(importance_columns)

# Show the most influential features first.
df_feature_importances.sort_values('Importance', ascending=False)

Unnamed: 0,Feature,Importance
6,MaintenanceHours,0.221633
4,DefectRate,0.203265
5,QualityScore,0.147267
0,ProductionVolume,0.110913
15,AdditiveMaterialCost,0.033101
13,EnergyEfficiency,0.030971
2,SupplierQuality,0.030649
7,DowntimePercentage,0.030006
9,StockoutRate,0.028906
10,WorkerProductivity,0.02882


## Conclusion
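- Overall, the Random Forest performs well (F1 ≈ 0.98 on the held-out test set). The confusion matrix shows that most of the remaining errors fall on the minority class (32 of its 148 test samples are misclassified), so further improvements may come from under-/over-sampling or hyperparameter tuning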
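- The Random Forest feature importances rank Maintenance Hours, Defect Rate, Quality Score, and Production Volume as the features that contribute most to predicting Defect Status (importance reflects predictive contribution in this model, not proven causation)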