# Part II: Model Development

In this part, we develop three unique pipelines for predicting backorder. We use the smart sample from Part I to fit and evaluate these pipelines. 

In [1]:
%matplotlib inline
import matplotlib.pyplot as plt

import os, sys
import itertools
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
import joblib

## Reload the smart sample

In [2]:

# Reload your smart sampling from local file 
# ----------------------------------
import joblib
X,y = joblib.load("data/sampled_data.pkl")

## Split the data into Train/Test

In [3]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2)

## Developing Pipeline

In this section, we design an operationalized machine learning pipeline, which includes:

* Anomaly detection
* Dimensionality Reduction
* Train a classification model


In [4]:
from sklearn.svm import OneClassSVM
from sklearn.neighbors import LocalOutlierFactor
from sklearn.covariance import EllipticEnvelope
from sklearn.ensemble import IsolationForest

from sklearn.decomposition import PCA, FactorAnalysis
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, chi2, f_classif, mutual_info_classif,RFE

from sklearn.pipeline import Pipeline

from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV

from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, recall_score, classification_report

In [5]:
##An attempt to make a custom class to include outlier identification in the pipeline.
from sklearn.pipeline import TransformerMixin

class OutlierExtractor(TransformerMixin):
    def __init__(self, method,**kwargs):
        """
        Create a transformer to remove outliers.
        
        method (string): The outlier identification method to use 
        """
        self.method = method
        self.kwargs = kwargs
    def transform(self, X):
        return self
    
    def fit(self, X, y, *args, **kwargs):
        X = np.asarray(X)
        y = np.asarray(y)
        
        method = self.method
        
        if method == "LocalOutlierFactor":
            outlier_method = LocalOutlierFactor(self.kwargs)
        elif method == "EllipticEnvelope":
            outlier_method = EllipticEnvelope(self.kwargs)
        elif method == "IsolationForest":
            outlier_method = IsolationForest(self.kwargs)
        elif method == "OneClassSVM":
            outlier_method = OneClassSVM(self.kwargs)
            
        lof_labels = outlier_method.fit_predict(X,y)
        inliers = lof_labels == 1
        
        return (X[inliers],y[inliers])

### 1st pipeline 
  * Anomaly detection
  * Dimensionality reduction
  * Model training/validation
  
Add cells as needed. 

In [7]:
# Add anomaly detection code  (Question #E201)
# ----------------------------------
lof = LocalOutlierFactor(n_neighbors=5)
lof_labels = lof.fit_predict(X_train, y_train)
inliers = lof_labels == 1 # select inliers
X_clean = X_train[inliers]
y_clean = y_train[inliers]

In [25]:
inlier_count = len(y_clean)
original_count = len(y_train)
print(f"Original: {original_count} \nInliers: {inlier_count}\nPoints removed: {original_count-inlier_count}")

Original: 27103 
Inliers: 23876
Points removed: 3227


In [7]:
# Add codes for feature selection and classification pipeline with grid search  (Question #E202)
# ----------------------------------
pipe = Pipeline([
     ("scale",MinMaxScaler()),
     ("PCA",PCA()),
     ("SVC",SVC(kernel="rbf"))
                  ])

In [8]:
param_grid = {
    "PCA__n_components": [10,15],
    "SVC__C": [ 1, 10, 100, 1000],
    "SVC__gamma": [ 1, 10, 100]
}

In [9]:
grid = GridSearchCV(pipe, param_grid=param_grid, cv=5, n_jobs=-1,scoring="f1",verbose=2)

In [10]:
grid.fit(X_train, y_train)

Fitting 5 folds for each of 24 candidates, totalling 120 fits


GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('scale', MinMaxScaler()),
                                       ('PCA', PCA()), ('SVC', SVC())]),
             n_jobs=-1,
             param_grid={'PCA__n_components': [10, 15],
                         'SVC__C': [1, 10, 100, 1000],
                         'SVC__gamma': [1, 10, 100]},
             scoring='f1', verbose=2)

In [11]:
joblib.dump(grid,"first_pipeline.pkl")

['first_pipeline.pkl']

In [5]:
grid = joblib.load("first_pipeline.pkl")

In [6]:
grid.cv_results_

{'mean_fit_time': array([21.67779121, 24.44914002, 24.50235605, 27.07789912, 30.34640741,
        25.91304846, 38.04518991, 41.06209698, 29.85903621, 56.34769702,
        71.73439784, 45.6243134 , 25.41245031, 28.45459981, 27.22150583,
        30.02789702, 30.92732964, 26.03255792, 32.22865658, 33.66403117,
        26.09849005, 38.95820518, 38.88577533, 29.27506137]),
 'std_fit_time': array([0.25331992, 0.97993012, 0.31559785, 1.05804571, 0.72273235,
        0.52850521, 1.11181372, 3.73032746, 0.46878718, 1.77750663,
        3.6658286 , 1.67569576, 0.24659439, 0.76571189, 0.48798744,
        0.69194372, 1.28332471, 0.57099724, 0.77761989, 1.04414943,
        0.52836137, 0.91791267, 1.11075621, 0.71219461]),
 'mean_score_time': array([5.06031423, 5.08852262, 5.03026466, 5.12937102, 5.07370739,
        5.0542078 , 5.00159307, 5.08752208, 4.8417069 , 4.99982815,
        4.90732436, 4.78644161, 5.60631967, 5.68257928, 5.65268755,
        5.64757791, 5.7568222 , 5.62793674, 5.51252928, 5.57

In [7]:
grid.best_estimator_

Pipeline(steps=[('scale', MinMaxScaler()), ('PCA', PCA(n_components=15)),
                ('SVC', SVC(C=1000, gamma=100))])

### Performance on training data

In [8]:
y_pred_train = grid.predict(X_clean)

In [9]:
print(classification_report(y_clean, y_pred_train))

              precision    recall  f1-score   support

           0       0.73      0.94      0.83     15438
           1       0.78      0.38      0.51      8438

    accuracy                           0.74     23876
   macro avg       0.76      0.66      0.67     23876
weighted avg       0.75      0.74      0.71     23876



In [11]:
pd.DataFrame(confusion_matrix(y_clean,y_pred_train))

Unnamed: 0,0,1
0,14549,889
1,5268,3170


### Performance on test data

In [15]:
# Given an unbiased evaluation  (Question #E203)
# ----------------------------------
y_pred = grid.predict(X_test)



In [16]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.75      0.93      0.83      4498
           1       0.74      0.37      0.49      2278

    accuracy                           0.74      6776
   macro avg       0.74      0.65      0.66      6776
weighted avg       0.74      0.74      0.72      6776



In [17]:
pd.DataFrame(confusion_matrix(y_test,y_pred))

Unnamed: 0,0,1
0,4201,297
1,1434,844


#### <center>Record the optimal hyperparameters and performance resulting from this pipeline.</center>

### 2nd pipeline
  * Anomaly detection
  * Dimensionality reduction
  * Model training/validation

In [12]:
# Add anomaly detection code  (Question #E205)
# ----------------------------------
envelope = EllipticEnvelope(support_fraction=1, contamination = 0.2).fit(X_train)

inliers = envelope.predict(X_train) == 1

X_clean2 = X_train[inliers]
y_clean2 = y_train[inliers]

In [24]:
inlier_count = len(y_clean2)
original_count = len(y_train)
print(f"Original: {original_count} \nInliers: {inlier_count}\nPoints removed: {original_count-inlier_count}")

Original: 27103 
Inliers: 21682
Points removed: 5421


In [19]:
# Add codes for feature selection and classification pipeline with grid search  (Question #E206)
# ----------------------------------
pipe2 = Pipeline([
     ("kbest",SelectKBest(mutual_info_classif)),
     ("Boost",GradientBoostingClassifier())
                  ])

In [20]:
param_grid = {
    "kbest__k":[15,20],
    "Boost__n_estimators":[50,60,70],
    "Boost__max_depth":[5,10,15],
    "Boost__learning_rate":[0.05,0.1,0.2]
}

In [21]:
grid2 = GridSearchCV(pipe2, param_grid=param_grid, cv=5,n_jobs=-1,scoring="f1",verbose=2)

In [22]:
grid2.fit(X_train,y_train)

Fitting 5 folds for each of 54 candidates, totalling 270 fits


GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('kbest',
                                        SelectKBest(score_func=<function mutual_info_classif at 0x7f781ef45488>)),
                                       ('Boost',
                                        GradientBoostingClassifier())]),
             n_jobs=-1,
             param_grid={'Boost__learning_rate': [0.05, 0.1, 0.2],
                         'Boost__max_depth': [5, 10, 15],
                         'Boost__n_estimators': [50, 60, 70],
                         'kbest__k': [15, 20]},
             scoring='f1', verbose=2)

In [23]:
joblib.dump(grid2,"second_pipeline.pkl")

['second_pipeline.pkl']

In [13]:
grid2 = joblib.load("second_pipeline.pkl")

In [25]:
grid2.cv_results_

{'mean_fit_time': array([ 5.55530887,  5.84385743,  6.14451556,  6.22472105,  6.60797029,
         6.78209877,  8.85131154,  9.1817462 , 10.25080376, 10.6458981 ,
        11.48495083, 11.72897401, 14.53626208, 15.00262246, 17.31867108,
        17.73155875, 20.26097908, 20.36038074,  5.45935612,  5.839359  ,
         6.18644409,  6.2365591 ,  6.61869926,  6.81709657,  8.71698203,
         9.25958662, 10.06418939, 10.52148685, 11.16578031, 11.69290729,
        15.54326758, 15.96171365, 18.13799024, 18.58834286, 20.7940649 ,
        21.73433332,  5.53198986,  5.74273143,  6.15696044,  6.24138589,
         6.50324998,  6.85216036,  8.83276992,  9.23556681, 10.00732007,
        10.32805839, 11.29495249, 11.75008097, 15.62085347, 16.32272635,
        18.26200418, 18.77120323, 20.53905177, 21.44288363]),
 'std_fit_time': array([0.09644069, 0.1963508 , 0.22650928, 0.03226254, 0.05281185,
        0.04629682, 0.10193192, 0.08381315, 0.32969183, 0.32327896,
        0.15407804, 0.07243526, 0.32436

In [26]:
grid2.best_estimator_

Pipeline(steps=[('kbest',
                 SelectKBest(k=20,
                             score_func=<function mutual_info_classif at 0x7f781ef45488>)),
                ('Boost',
                 GradientBoostingClassifier(learning_rate=0.2, max_depth=10,
                                            n_estimators=70))])

### Performance on training data

In [14]:
y_train_pred2 = grid2.predict(X_clean2)

In [16]:
print(classification_report(y_clean2,y_train_pred2))

              precision    recall  f1-score   support

           0       0.99      0.98      0.99     14317
           1       0.97      0.98      0.97      7365

    accuracy                           0.98     21682
   macro avg       0.98      0.98      0.98     21682
weighted avg       0.98      0.98      0.98     21682



In [29]:
pd.DataFrame(confusion_matrix(y_clean2,y_train_pred2))

Unnamed: 0,0,1
0,14099,218
1,153,7212


### Performance on test data

In [27]:
# Given an unbiased evaluation  (Question #E207)
# ----------------------------------
y_pred2 = grid2.predict(X_test)

In [28]:
print(classification_report(y_test, y_pred2))

              precision    recall  f1-score   support

           0       0.95      0.93      0.94      4498
           1       0.87      0.90      0.89      2278

    accuracy                           0.92      6776
   macro avg       0.91      0.92      0.92      6776
weighted avg       0.92      0.92      0.92      6776



In [29]:
pd.DataFrame(confusion_matrix(y_test,y_pred2))

Unnamed: 0,0,1
0,4200,298
1,220,2058


#### <center>Record the optimal hyperparameters and performance resulting from this pipeline.</center>

### 3rd pipeline
  * Anomaly detection
  * Dimensionality reduction
  * Model training/validation

In [27]:
# Add anomaly detection code  (Question #E209)
# ----------------------------------
iso = IsolationForest()

inliers = iso.fit(X_train,y_train).predict(X_train) == 1

X_clean3 = X_train[inliers]
y_clean3 = y_train[inliers]

In [28]:
inlier_count = len(y_clean3)
original_count = len(y_train)
print(f"Original: {original_count} \nInliers: {inlier_count}\nPoints removed: {original_count-inlier_count}")

Original: 27103 
Inliers: 25066
Points removed: 2037


In [31]:
# Add codes for feature selection and classification pipeline with grid search  (Question #E210)
# ----------------------------------
rfe = RFE(estimator=RandomForestClassifier())
pipe3 = Pipeline([
    ("RFE",rfe),
    ("RandomForest",RandomForestClassifier())
])

In [32]:
param_grid = {
    "RFE__n_features_to_select":[15,20],
    "RandomForest__n_estimators":[600,1000,1400],
    "RandomForest__max_depth":[10,30,None]
}

In [33]:
grid3 = GridSearchCV(pipe3, param_grid=param_grid, cv=5, n_jobs=-1, scoring='f1',verbose=2)

In [34]:
grid3.fit(X_train,y_train)

Fitting 5 folds for each of 18 candidates, totalling 90 fits


GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('RFE',
                                        RFE(estimator=RandomForestClassifier())),
                                       ('RandomForest',
                                        RandomForestClassifier())]),
             n_jobs=-1,
             param_grid={'RFE__n_features_to_select': [15, 20],
                         'RandomForest__max_depth': [10, 30, None],
                         'RandomForest__n_estimators': [600, 1000, 1400]},
             scoring='f1', verbose=2)

In [35]:
joblib.dump(grid3,"third_pipeline.pkl")

['third_pipeline.pkl']

In [30]:
grid3 = joblib.load("third_pipeline.pkl")

In [37]:
grid3.best_estimator_

Pipeline(steps=[('RFE',
                 RFE(estimator=RandomForestClassifier(),
                     n_features_to_select=20)),
                ('RandomForest', RandomForestClassifier(n_estimators=1400))])

### Performance on training data

In [31]:
y_train_pred3 = grid2.predict(X_clean3)

In [32]:
print(classification_report(y_clean3, y_train_pred3))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99     16690
           1       0.97      0.98      0.97      8376

    accuracy                           0.98     25066
   macro avg       0.98      0.98      0.98     25066
weighted avg       0.98      0.98      0.98     25066



In [33]:
pd.DataFrame(confusion_matrix(y_clean3, y_train_pred3))

Unnamed: 0,0,1
0,16450,240
1,194,8182


### Performance on test data

In [38]:
# Given an unbiased evaluation  (Question #E211)
# ----------------------------------
y_pred3 = grid3.predict(X_test)

In [39]:
print(classification_report(y_test,y_pred3))

              precision    recall  f1-score   support

           0       0.96      0.93      0.94      4498
           1       0.87      0.91      0.89      2278

    accuracy                           0.93      6776
   macro avg       0.91      0.92      0.92      6776
weighted avg       0.93      0.93      0.93      6776



In [40]:
pd.DataFrame(confusion_matrix(y_test,y_pred3))

Unnamed: 0,0,1
0,4194,304
1,194,2084


#### <center>Record the optimal hyperparameters and performance resulting from this pipeline.</center>

## Compare these three pipelines and discuss findings

### Pickle the required pipeline/models for Part III.

In [41]:
import joblib

joblib.dump([pipe,pipe2,pipe3],"models.pkl")




['models.pkl']