# Kaggle Backorder ML

Dataset: https://www.kaggle.com/tiredgeek/predict-bo-trial

In [1]:
%matplotlib inline
import matplotlib.pyplot as plt

import os, sys
import itertools
import numpy as np
import pandas as pd
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.svm import LinearSVC

import warnings
warnings.filterwarnings("ignore")

In [2]:
from sklearn.svm import OneClassSVM
from sklearn.neighbors import LocalOutlierFactor
from sklearn.covariance import EllipticEnvelope
from sklearn.ensemble import IsolationForest

from sklearn.decomposition import PCA, FactorAnalysis
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, chi2, f_classif, mutual_info_classif

from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

from sklearn.externals import joblib

## Load dataset

**Description**
~~~
sku - Random ID for the product
national_inv - Current inventory level for the part
lead_time - Transit time for product (if available)
in_transit_qty - Amount of product in transit from source
forecast_3_month - Forecast sales for the next 3 months
forecast_6_month - Forecast sales for the next 6 months
forecast_9_month - Forecast sales for the next 9 months
sales_1_month - Sales quantity for the prior 1 month time period
sales_3_month - Sales quantity for the prior 3 month time period
sales_6_month - Sales quantity for the prior 6 month time period
sales_9_month - Sales quantity for the prior 9 month time period
min_bank - Minimum recommend amount to stock
potential_issue - Source issue for part identified
pieces_past_due - Parts overdue from source
perf_6_month_avg - Source performance for prior 6 month period
perf_12_month_avg - Source performance for prior 12 month period
local_bo_qty - Amount of stock orders overdue
deck_risk - Part risk flag
oe_constraint - Part risk flag
ppap_risk - Part risk flag
stop_auto_buy - Part risk flag
rev_stop - Part risk flag
went_on_backorder - Product actually went on backorder. **This is the target value.**
~~~



In [3]:
# Dataset location
DATASET = 'datasets/back_order/Kaggle_Training_Dataset_v2.csv'
assert os.path.exists(DATASET)

# Load and shuffle
dataset = pd.read_csv(DATASET).sample(frac = 1).reset_index(drop = True) 
datasetCopy = dataset.copy() # in-case i need to refer to a fresh version later
dataset.describe()

Unnamed: 0,national_inv,lead_time,in_transit_qty,forecast_3_month,forecast_6_month,forecast_9_month,sales_1_month,sales_3_month,sales_6_month,sales_9_month,min_bank,pieces_past_due,perf_6_month_avg,perf_12_month_avg,local_bo_qty
count,1687860.0,1586967.0,1687860.0,1687860.0,1687860.0,1687860.0,1687860.0,1687860.0,1687860.0,1687860.0,1687860.0,1687860.0,1687860.0,1687860.0,1687860.0
mean,496.1118,7.872267,44.05202,178.1193,344.9867,506.3644,55.92607,175.0259,341.7288,525.2697,52.7723,2.043724,-6.872059,-6.437947,0.6264507
std,29615.23,7.056024,1342.742,5026.553,9795.152,14378.92,1928.196,5192.378,9613.167,14838.61,1254.983,236.0165,26.55636,25.84333,33.72224
min,-27256.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-99.0,-99.0,0.0
25%,4.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.63,0.66,0.0
50%,15.0,8.0,0.0,0.0,0.0,0.0,0.0,1.0,2.0,4.0,0.0,0.0,0.82,0.81,0.0
75%,80.0,9.0,0.0,4.0,12.0,20.0,4.0,15.0,31.0,47.0,3.0,0.0,0.97,0.95,0.0
max,12334400.0,52.0,489408.0,1427612.0,2461360.0,3777304.0,741774.0,1105478.0,2146625.0,3205172.0,313319.0,146496.0,1.0,1.0,12530.0


In [4]:
print(dataset.shape)

(1687861, 23)


## Processing

In [6]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1687861 entries, 0 to 1687860
Data columns (total 23 columns):
sku                  1687861 non-null object
national_inv         1687860 non-null float64
lead_time            1586967 non-null float64
in_transit_qty       1687860 non-null float64
forecast_3_month     1687860 non-null float64
forecast_6_month     1687860 non-null float64
forecast_9_month     1687860 non-null float64
sales_1_month        1687860 non-null float64
sales_3_month        1687860 non-null float64
sales_6_month        1687860 non-null float64
sales_9_month        1687860 non-null float64
min_bank             1687860 non-null float64
potential_issue      1687860 non-null object
pieces_past_due      1687860 non-null float64
perf_6_month_avg     1687860 non-null float64
perf_12_month_avg    1687860 non-null float64
local_bo_qty         1687860 non-null float64
deck_risk            1687860 non-null object
oe_constraint        1687860 non-null object
ppap_risk        

In [7]:
dataset.head().transpose()

Unnamed: 0,0,1,2,3,4
sku,3283978,1919342,1199464,1501957,1423191
national_inv,12509,341,38,10,4990
lead_time,8,4,8,4,52
in_transit_qty,0,299,0,0,0
forecast_3_month,0,520,0,0,0
forecast_6_month,0,968,0,0,0
forecast_9_month,0,1376,0,0,0
sales_1_month,0,287,0,0,0
sales_3_month,0,611,0,0,0
sales_6_month,0,968,0,0,0


### Take samples and examine the dataset

In [8]:
dataset.iloc[:3,:6]

Unnamed: 0,sku,national_inv,lead_time,in_transit_qty,forecast_3_month,forecast_6_month
0,3283978,12509.0,8.0,0.0,0.0,0.0
1,1919342,341.0,4.0,299.0,520.0,968.0
2,1199464,38.0,8.0,0.0,0.0,0.0


### Drop columns that are obviously irrelevant or not processable

In [12]:
del dataset["sku"]

### Find unique values of string columns

In [13]:
# All the column names of these yes/no columns
yes_no_columns = list(filter(lambda i: dataset[i].dtype != np.float64, dataset.columns))
print(yes_no_columns, "\n \n")

for c in set(yes_no_columns):
    print(c, "\n", dataset[c].unique(), "\n ================= \n")

['potential_issue', 'deck_risk', 'oe_constraint', 'ppap_risk', 'stop_auto_buy', 'rev_stop', 'went_on_backorder'] 
 

oe_constraint 
 ['No' 'Yes' nan] 

deck_risk 
 ['No' 'Yes' nan] 

ppap_risk 
 ['No' 'Yes' nan] 

potential_issue 
 ['No' 'Yes' nan] 

went_on_backorder 
 ['No' 'Yes' nan] 

rev_stop 
 ['No' 'Yes' nan] 

stop_auto_buy 
 ['Yes' 'No' nan] 



You may see **nan** also as possible values representing missing values in the dataset. I'll fill them using most popular values, the [Mode](https://en.wikipedia.org/wiki/Mode_%28statistics%29) in Stats.

In [14]:
for column_name in yes_no_columns:
    mode = dataset[column_name].apply(str).mode()[0]
    print('Filling missing values of {} with {}'.format(column_name, mode))
    dataset[column_name].fillna(mode, inplace = True)

Filling missing values of potential_issue with No
Filling missing values of deck_risk with No
Filling missing values of oe_constraint with No
Filling missing values of ppap_risk with No
Filling missing values of stop_auto_buy with Yes
Filling missing values of rev_stop with No
Filling missing values of went_on_backorder with No


### Convert yes/no columns into binary (0s and 1s)

In [15]:
for c in yes_no_columns:
    dataset[c] = dataset[c].apply(["No", "Yes"].index)
    print(c, "\n", dataset[c].unique(), "\n ================= \n")

potential_issue 
 [0 1] 

deck_risk 
 [0 1] 

oe_constraint 
 [0 1] 

ppap_risk 
 [0 1] 

stop_auto_buy 
 [1 0] 

rev_stop 
 [0 1] 

went_on_backorder 
 [0 1] 



Now all columns should be either int64 or float64.

In [16]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1687861 entries, 0 to 1687860
Data columns (total 22 columns):
national_inv         1687860 non-null float64
lead_time            1586967 non-null float64
in_transit_qty       1687860 non-null float64
forecast_3_month     1687860 non-null float64
forecast_6_month     1687860 non-null float64
forecast_9_month     1687860 non-null float64
sales_1_month        1687860 non-null float64
sales_3_month        1687860 non-null float64
sales_6_month        1687860 non-null float64
sales_9_month        1687860 non-null float64
min_bank             1687860 non-null float64
potential_issue      1687861 non-null int64
pieces_past_due      1687860 non-null float64
perf_6_month_avg     1687860 non-null float64
perf_12_month_avg    1687860 non-null float64
local_bo_qty         1687860 non-null float64
deck_risk            1687861 non-null int64
oe_constraint        1687861 non-null int64
ppap_risk            1687861 non-null int64
stop_auto_buy        

In [17]:
dataset.isnull().any()

national_inv          True
lead_time             True
in_transit_qty        True
forecast_3_month      True
forecast_6_month      True
forecast_9_month      True
sales_1_month         True
sales_3_month         True
sales_6_month         True
sales_9_month         True
min_bank              True
potential_issue      False
pieces_past_due       True
perf_6_month_avg      True
perf_12_month_avg     True
local_bo_qty          True
deck_risk            False
oe_constraint        False
ppap_risk            False
stop_auto_buy        False
rev_stop             False
went_on_backorder    False
dtype: bool

In [18]:
dataset = dataset.fillna(dataset.mean())

In [19]:
dataset.isnull().any()

national_inv         False
lead_time            False
in_transit_qty       False
forecast_3_month     False
forecast_6_month     False
forecast_9_month     False
sales_1_month        False
sales_3_month        False
sales_6_month        False
sales_9_month        False
min_bank             False
potential_issue      False
pieces_past_due      False
perf_6_month_avg     False
perf_12_month_avg    False
local_bo_qty         False
deck_risk            False
oe_constraint        False
ppap_risk            False
stop_auto_buy        False
rev_stop             False
went_on_backorder    False
dtype: bool

In [20]:
fullDataset = dataset.copy() # save for later
dataset = dataset.sample(frac = 0.50).reset_index(drop = True) # model a fraction first for speed considerations

## <div style="color: red;"> I need to make sure the number of backorders are being represented fairly in the dataset. I would imagine backorders to be a minority in this case</div>

In [22]:
num_backorders = np.sum(dataset["went_on_backorder"])
num_backorders

5654

In [23]:
num_backorders/len(dataset)

0.0066996077873757304

Less than 1% of the orders have been on backorder. This is going to suck. 

In [24]:
downsample_rate = 1/((len(dataset) - num_backorders) / num_backorders)
downsample_rate

0.0067447952702928389

In [25]:
dataset.shape

(843930, 22)

In [26]:
dataset_resampled = pd.concat([
    dataset[dataset["went_on_backorder"] == 1],
    dataset[dataset["went_on_backorder"] == 0].sample(frac = downsample_rate, replace = True).reset_index(drop = True)
])

dataset_resampled = dataset_resampled.sample(frac = 1).reset_index(drop = True) # shuffle

In [27]:
dataset_resampled.shape[0]/2

5654.0

In [28]:
# verify balance again. should be 0.5

num_backorders = np.sum(dataset_resampled["went_on_backorder"])
num_backorders/len(dataset_resampled)

0.5

In [29]:
dataset = dataset_resampled # remap it back to dataset

In [30]:
dataset.shape

(11308, 22)

### Build out X and y

In [31]:
X = np.array(dataset.iloc[:, :-1])
X.shape

(11308, 21)

In [32]:
y = np.array(dataset["went_on_backorder"])
y.shape

(11308,)

In [33]:
X_train = X
y_train = y

## Pipeline

### 1st pipeline 


In [34]:
## JACKY: Create a wrapper estimator class that does an outlier removal first BEFORE passing those values to the pipeline. 
## Parameter inputs:
## - Cleaning = OutlierRemoval object (EllipticEnvelope/IsolationForest/etc)
## - Steps = Original Pipeline

class RmOutlierPipeline(Pipeline): # inherits the Pipeline class
    def __init__(self, cleaning, steps): # takes in 2 constructor parameters (cleaning and steps)
        self.cleaning = cleaning
        super(RmOutlierPipeline, self).__init__(steps) # pass the steps parameter to the base class (pipeline)
        
    def fit(self, X, y):
        # Before passing in X and y to my original pipeline, I wanna remove all the outliers first. 
        inliers = self.cleaning.fit(X, y).predict(X) == 1 
        return super(RmOutlierPipeline, self).fit(X[inliers], y[inliers])

In [35]:
def summarizeGrid(grid):
    print("\n====================================================\n")
    print("***MEAN TEST SCORES:**** \n")
    mean_scores = grid.cv_results_["mean_test_score"]
    z = mean_scores
    print(list(z))
    print("\n====================================================\n")
    print("***BEST SCORE:*** \n")
    print(grid.best_score_)
    print("\n====================================================\n")
    print("***BEST PARAMETER(S):*** \n")
    print(grid.best_params_)
    print("\n====================================================\n")
    print("***BEST ESTIMATOR:*** \n")
    print(grid.best_estimator_)

In [36]:
Scale = StandardScaler() # transformer
DimReduction = PCA() # transformer 
Classifier = LinearSVC() # estimator
OutlierRemoval = EllipticEnvelope() # estimator to be used before the steps

steps = [
    ("scale", Scale),
    ("dimReduce", DimReduction),
    ("classify", Classifier)
]

pipe1 = RmOutlierPipeline(OutlierRemoval, steps)

N_COMPONENTS = [3, 5, 7, 10, 15] # for PCA

pGrid = [
    {"dimReduce__n_components": N_COMPONENTS}
]

grid1 = GridSearchCV(pipe1, cv = 5, n_jobs = 1, param_grid = pGrid)
grid1.fit(X_train, y_train)

GridSearchCV(cv=5, error_score='raise',
       estimator=RmOutlierPipeline(cleaning=EllipticEnvelope(assume_centered=False, contamination=0.1, random_state=None,
         store_precision=True, support_fraction=None),
         steps=[('scale', StandardScaler(copy=True, with_mean=True, with_std=True)), ('dimReduce', PCA(copy=True, iterated_power='...x_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0))]),
       fit_params=None, iid=True, n_jobs=1,
       param_grid=[{'dimReduce__n_components': [3, 5, 7, 10, 15]}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [37]:
summarizeGrid(grid1)



***MEAN TEST SCORES:**** 

[0.61823487796250443, 0.62973116377785643, 0.63724796604174039, 0.70870180403254335, 0.76750972762645919]


***BEST SCORE:*** 

0.767509727626


***BEST PARAMETER(S):*** 

{'dimReduce__n_components': 15}


***BEST ESTIMATOR:*** 

RmOutlierPipeline(cleaning=EllipticEnvelope(assume_centered=False, contamination=0.1, random_state=None,
         store_precision=True, support_fraction=None),
         steps=[('scale', StandardScaler(copy=True, with_mean=True, with_std=True)), ('dimReduce', PCA(copy=True, iterated_power='auto', n_components=15, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)), ('classify', LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0))])


###  2nd pipeline

In [38]:
Scale = StandardScaler() # transformer
DimReduction = PCA() # transformer 
Classifier = RandomForestClassifier() # estimator
OutlierRemoval = IsolationForest(n_estimators = 250, bootstrap = True) # estimator to be used before the steps

steps = [
    ("scale", Scale),
    ("dimReduce", DimReduction),
    ("classify", Classifier)
]

pipe2 = RmOutlierPipeline(OutlierRemoval, steps)

N_COMPONENTS = [3, 5, 7, 10, 15] # PCA
N_ESTIMATORS = [10, 20] # random forest

pGrid = [
    {
        "dimReduce__n_components": N_COMPONENTS, 
        "classify__n_estimators": N_ESTIMATORS
    }
]

grid2 = GridSearchCV(pipe2, cv = 5, n_jobs = 1, param_grid = pGrid)
grid2.fit(X_train, y_train)

GridSearchCV(cv=5, error_score='raise',
       estimator=RmOutlierPipeline(cleaning=IsolationForest(bootstrap=True, contamination=0.1, max_features=1.0,
        max_samples='auto', n_estimators=250, n_jobs=1, random_state=None,
        verbose=0),
         steps=[('scale', StandardScaler(copy=True, with_mean=True, with_std=True)), ('dimReduce', ..._jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))]),
       fit_params=None, iid=True, n_jobs=1,
       param_grid=[{'classify__n_estimators': [10, 20], 'dimReduce__n_components': [3, 5, 7, 10, 15]}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [39]:
summarizeGrid(grid2)



***MEAN TEST SCORES:**** 

[0.75672090555359039, 0.79881499823134061, 0.8213654050229926, 0.83383445348425889, 0.82746727980191015, 0.77352316943756627, 0.79899186416696144, 0.83003183586841178, 0.84347364697559246, 0.84285461620091973]


***BEST SCORE:*** 

0.843473646976


***BEST PARAMETER(S):*** 

{'classify__n_estimators': 20, 'dimReduce__n_components': 10}


***BEST ESTIMATOR:*** 

RmOutlierPipeline(cleaning=IsolationForest(bootstrap=True, contamination=0.1, max_features=1.0,
        max_samples='auto', n_estimators=250, n_jobs=1, random_state=None,
        verbose=0),
         steps=[('scale', StandardScaler(copy=True, with_mean=True, with_std=True)), ('dimReduce', PCA(copy=True, iterated_power='auto', n_components=10, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)), ('classify', RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
 ...n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))])


### 3rd pipeline

In [40]:
Scale = StandardScaler() # transformer
DimReduction = PCA() # transformer 
Classifier = LogisticRegression() # estimator
OutlierRemoval = IsolationForest(n_estimators = 250, bootstrap = True) # estimator to be used before the steps

steps = [
    ("scale", Scale),
    ("dimReduce", DimReduction),
    ("classify", Classifier)
]

pipe3 = RmOutlierPipeline(OutlierRemoval, steps)

N_COMPONENTS = [3, 5, 7, 10, 15] # PCA

pGrid = [
    {"dimReduce__n_components": N_COMPONENTS}
]

grid3 = GridSearchCV(pipe3, cv = 5, n_jobs = 1, param_grid = pGrid)
grid3.fit(X_train, y_train)

GridSearchCV(cv=5, error_score='raise',
       estimator=RmOutlierPipeline(cleaning=IsolationForest(bootstrap=True, contamination=0.1, max_features=1.0,
        max_samples='auto', n_estimators=250, n_jobs=1, random_state=None,
        verbose=0),
         steps=[('scale', StandardScaler(copy=True, with_mean=True, with_std=True)), ('dimReduce', ...y='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))]),
       fit_params=None, iid=True, n_jobs=1,
       param_grid=[{'dimReduce__n_components': [3, 5, 7, 10, 15]}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [41]:
summarizeGrid(grid3)



***MEAN TEST SCORES:**** 

[0.58622214361513969, 0.63220728687654759, 0.66050583657587547, 0.67757339936328265, 0.75627874071453838]


***BEST SCORE:*** 

0.756278740715


***BEST PARAMETER(S):*** 

{'dimReduce__n_components': 15}


***BEST ESTIMATOR:*** 

RmOutlierPipeline(cleaning=IsolationForest(bootstrap=True, contamination=0.1, max_features=1.0,
        max_samples='auto', n_estimators=250, n_jobs=1, random_state=None,
        verbose=0),
         steps=[('scale', StandardScaler(copy=True, with_mean=True, with_std=True)), ('dimReduce', PCA(copy=True, iterated_power='auto', n_components=15, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)), ('classify', LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))])


# Retrain a model using the full training data set

## Train

Use the full training data set to train the model.

### <div style="color:red;">I'd have to rebalance the full dataset first</div>

In [86]:
num_backorders = np.sum(fullDataset["went_on_backorder"])
num_backorders

11293

In [87]:
backorderPct = num_backorders/len(fullDataset)
backorderPct

0.0066907168303551061

### <div style="color:red;">The full dataset is way too large. It already took forever for me to model against a small % of the entire dataset. Instead of upsampling, I'm going to downsample this as a rebalancer</div>


In [88]:
downsample_rate = 1/((len(fullDataset) - num_backorders) / num_backorders)
downsample_rate

0.0067357840540914543

In [89]:
fullDataset.shape

(1687861, 22)

In [90]:
dataset_resampled = pd.concat([
    fullDataset[fullDataset["went_on_backorder"] == 1],
    fullDataset[fullDataset["went_on_backorder"] == 0].sample(frac = downsample_rate, replace = True).reset_index(drop = True)
])

dataset_resampled = dataset_resampled.sample(frac = 1).reset_index(drop = True) # shuffle
dataset = dataset_resampled
dataset.shape

(22586, 22)

In [91]:
# verify balance again. should be 0.5

num_backorders = np.sum(dataset["went_on_backorder"])
num_backorders/len(dataset)

0.5

In [92]:
X = np.array(dataset.iloc[:, :-1])
X.shape

(22586, 21)

In [93]:
y = np.array(dataset["went_on_backorder"])
y.shape

(22586,)

In [94]:
# OUTLIER REMOVAL: IsolationForest()
# SCALER: StandardScaler()
# DIMENSIONAL REDUCTION: PCA()
# CLASSIFIER: RandomForestClassifier()

print("X.shape", X.shape)
print("y.shape", y.shape)

print("============================")

iso_forest = IsolationForest(n_estimators = 250, bootstrap = True)
iso_forest.fit(X, y)
inliers = iso_forest.predict(X) == 1

X = X[inliers]
y = y[inliers]

print("Inlier X.shape", X.shape)
print("Inlier y.shape", y.shape)
print("============================")

scaler = StandardScaler()
X = scaler.fit_transform(X)

pca = PCA(n_components = 10)
X = pca.fit_transform(X)
print("PCA X.shape", X.shape)
print("PCA y.shape", y.shape)

print("============================")
model = RandomForestClassifier(n_estimators = 30)
model.fit(X, y)
scores = cross_val_score(model, X, y, cv = 5)
print("CV Scores: ", scores)
print("============================")
print("Mean CV Score", np.mean(scores))


X.shape (22586, 21)
y.shape (22586,)
Inlier X.shape (20327, 21)
Inlier y.shape (20327,)
PCA X.shape (20327, 10)
PCA y.shape (20327,)
CV Scores:  [ 0.86842105  0.84899164  0.85412054  0.86937269  0.86469865]
Mean CV Score 0.861120914505


### Reload the trained model from the pickle file
### Load the Testing Data and evaluate.

 * `datasets/back_order/Kaggle_Test_Dataset_v2.csv`

In [97]:
testSource = 'datasets/back_order/Kaggle_Test_Dataset_v2.csv'
print(os.path.exists(testSource))


testData = pd.read_csv(testSource).sample(frac = 1).reset_index(drop = True)
testData.head(10).transpose()

True


Unnamed: 0,0,1,2,3,4,5,6,7,8,9
sku,3380602,3393373,3511795,3324210,3474777,3361704,3452244,3466592,3494977,3354919
national_inv,33,0,2,403,500,21,152,2,17,8
lead_time,8,8,4,12,12,8,,8,6,2
in_transit_qty,0,0,0,0,0,0,0,0,0,0
forecast_3_month,0,6,0,0,0,0,0,2,0,0
forecast_6_month,48,6,0,0,0,0,0,3,0,0
forecast_9_month,96,8,0,0,0,0,0,4,0,0
sales_1_month,4,1,0,0,0,0,2,0,0,0
sales_3_month,29,4,0,4,0,0,10,1,1,0
sales_6_month,67,6,0,8,0,1,24,4,4,0


### <div style="color: red;"> Test data is also bad. I'd have to preprocess it before I can load it into the pickledModel </div>

In [98]:
testData.columns

Index(['sku', 'national_inv', 'lead_time', 'in_transit_qty',
       'forecast_3_month', 'forecast_6_month', 'forecast_9_month',
       'sales_1_month', 'sales_3_month', 'sales_6_month', 'sales_9_month',
       'min_bank', 'potential_issue', 'pieces_past_due', 'perf_6_month_avg',
       'perf_12_month_avg', 'local_bo_qty', 'deck_risk', 'oe_constraint',
       'ppap_risk', 'stop_auto_buy', 'rev_stop', 'went_on_backorder'],
      dtype='object')

In [99]:
del testData["sku"]

In [100]:
testData.shape

(242076, 22)

In [101]:
yes_no_columns = list(filter(lambda i: testData[i].dtype != np.float64, testData.columns))
print(yes_no_columns, "\n \n")

['potential_issue', 'deck_risk', 'oe_constraint', 'ppap_risk', 'stop_auto_buy', 'rev_stop', 'went_on_backorder'] 
 



In [102]:
for c in set(yes_no_columns):
    print(c, "\n", testData[c].unique(), "\n ========================== \n")

oe_constraint 
 ['No' 'Yes' nan] 

deck_risk 
 ['No' 'Yes' nan] 

ppap_risk 
 ['No' 'Yes' nan] 

potential_issue 
 ['No' 'Yes' nan] 

went_on_backorder 
 ['No' 'Yes' nan] 

rev_stop 
 ['No' 'Yes' nan] 

stop_auto_buy 
 ['Yes' 'No' nan] 



In [103]:
for column_name in yes_no_columns:
    mode = testData[column_name].apply(str).mode()[0]
    print("filling missing values of {} with {}".format(column_name, mode))
    testData[column_name].fillna(mode, inplace = True)

filling missing values of potential_issue with No
filling missing values of deck_risk with No
filling missing values of oe_constraint with No
filling missing values of ppap_risk with No
filling missing values of stop_auto_buy with Yes
filling missing values of rev_stop with No
filling missing values of went_on_backorder with No


In [104]:
# convert yes/no columns to binary

for c in yes_no_columns:
    testData[c] = testData[c].apply(["No", "Yes"].index)
    print(c, "\n", testData[c].unique(), "\n ================== \n")

potential_issue 
 [0 1] 

deck_risk 
 [0 1] 

oe_constraint 
 [0 1] 

ppap_risk 
 [0 1] 

stop_auto_buy 
 [1 0] 

rev_stop 
 [0 1] 

went_on_backorder 
 [0 1] 



In [105]:
testData.head(10).transpose()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
national_inv,33.0,0.0,2.0,403.0,500.0,21.0,152.0,2.0,17.0,8.0
lead_time,8.0,8.0,4.0,12.0,12.0,8.0,,8.0,6.0,2.0
in_transit_qty,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
forecast_3_month,0.0,6.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0
forecast_6_month,48.0,6.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0
forecast_9_month,96.0,8.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0
sales_1_month,4.0,1.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0
sales_3_month,29.0,4.0,0.0,4.0,0.0,0.0,10.0,1.0,1.0,0.0
sales_6_month,67.0,6.0,0.0,8.0,0.0,1.0,24.0,4.0,4.0,0.0
sales_9_month,101.0,8.0,0.0,11.0,0.0,2.0,44.0,5.0,10.0,0.0


In [106]:
testData = testData.fillna(testData.mean())

In [107]:
testData.isnull().any()

national_inv         False
lead_time            False
in_transit_qty       False
forecast_3_month     False
forecast_6_month     False
forecast_9_month     False
sales_1_month        False
sales_3_month        False
sales_6_month        False
sales_9_month        False
min_bank             False
potential_issue      False
pieces_past_due      False
perf_6_month_avg     False
perf_12_month_avg    False
local_bo_qty         False
deck_risk            False
oe_constraint        False
ppap_risk            False
stop_auto_buy        False
rev_stop             False
went_on_backorder    False
dtype: bool

In [108]:
testData.head(10).transpose()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
national_inv,33.0,0.0,2.0,403.0,500.0,21.0,152.0,2.0,17.0,8.0
lead_time,8.0,8.0,4.0,12.0,12.0,8.0,7.923018,8.0,6.0,2.0
in_transit_qty,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
forecast_3_month,0.0,6.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0
forecast_6_month,48.0,6.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0
forecast_9_month,96.0,8.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0
sales_1_month,4.0,1.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0
sales_3_month,29.0,4.0,0.0,4.0,0.0,0.0,10.0,1.0,1.0,0.0
sales_6_month,67.0,6.0,0.0,8.0,0.0,1.0,24.0,4.0,4.0,0.0
sales_9_month,101.0,8.0,0.0,11.0,0.0,2.0,44.0,5.0,10.0,0.0


In [109]:
testData.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 242076 entries, 0 to 242075
Data columns (total 22 columns):
national_inv         242076 non-null float64
lead_time            242076 non-null float64
in_transit_qty       242076 non-null float64
forecast_3_month     242076 non-null float64
forecast_6_month     242076 non-null float64
forecast_9_month     242076 non-null float64
sales_1_month        242076 non-null float64
sales_3_month        242076 non-null float64
sales_6_month        242076 non-null float64
sales_9_month        242076 non-null float64
min_bank             242076 non-null float64
potential_issue      242076 non-null int64
pieces_past_due      242076 non-null float64
perf_6_month_avg     242076 non-null float64
perf_12_month_avg    242076 non-null float64
local_bo_qty         242076 non-null float64
deck_risk            242076 non-null int64
oe_constraint        242076 non-null int64
ppap_risk            242076 non-null int64
stop_auto_buy        242076 non-null int64

In [110]:
X = np.array(testData.iloc[:, :-1])
X.shape

(242076, 21)

In [111]:
y = np.array(testData["went_on_backorder"])
y.shape

(242076,)

In [112]:
# scale the data

scaler = StandardScaler()
X = scaler.fit_transform(X)

In [113]:
pca = PCA(n_components = 10)
X = pca.fit_transform(X)
X.shape

(242076, 10)

## Test

In [114]:
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score

y_pred = pickled.predict(X)
labels = np.unique(y)

cm = confusion_matrix(y, y_pred, labels)
cm = pd.DataFrame(cm).reset_index(drop = True)
cm.columns = labels
cm.index = labels
cm

Unnamed: 0,0,1
0,154273,85115
1,1667,1021


In [116]:
scores = cross_val_score(pickled, X, y, cv = 10)
print("CV scores: ", scores)
print("\n")
print("CV Avg:", np.mean(scores))

CV scores:  [ 0.98826834  0.98864012  0.98847488  0.9885575   0.98835096  0.9885575
  0.98897059  0.98859881  0.98863918  0.98855656]


CV Avg: 0.988561444115


### <div style="color: red;">JACKY: Why are my CV scores so high but my evaluation scores so low?</div>

In [117]:
a = accuracy_score(y, y_pred)
print("Accuracy Score: ", a)

Accuracy Score:  0.641509278078


In [118]:
from sklearn.metrics import f1_score, precision_score

f1 = f1_score(y, y_pred, average = "micro")
print("F1 Score:", f1)

print("")

p = precision_score(y, y_pred, average = "micro")
print("Precision Score:", p)

F1 Score: 0.641509278078

Precision Score: 0.641509278078
