In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime as dt
import category_encoders as ce

import tensorflow as tf

from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import Dense, BatchNormalization
from tensorflow.keras.models import Sequential


from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, ExtraTreesClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression 
from sklearn.metrics import balanced_accuracy_score, confusion_matrix, ConfusionMatrixDisplay, f1_score, precision_score, recall_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

from sklearn import svm
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

import warnings
warnings.filterwarnings("ignore")

In [429]:
df = pd.read_csv('./data/ship_clean.csv')

In [430]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 180519 entries, 0 to 180518
Data columns (total 34 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   type                      180519 non-null  object 
 1   actual_ship_days          180519 non-null  int64  
 2   estimated_ship_days       180519 non-null  int64  
 3   delivery_status           180519 non-null  object 
 4   late_delivery_risk        180519 non-null  int64  
 5   category_id               180519 non-null  int64  
 6   category_name             180519 non-null  object 
 7   customer_city             180519 non-null  object 
 8   customer_id               180519 non-null  int64  
 9   customer_segment          180519 non-null  object 
 10  customer_state            180519 non-null  object 
 11  customer_zipcode          180516 non-null  float64
 12  department_name           180519 non-null  object 
 13  latitude                  180519 non-null  f

In [431]:
df.columns

Index(['type', 'actual_ship_days', 'estimated_ship_days', 'delivery_status',
       'late_delivery_risk', 'category_id', 'category_name', 'customer_city',
       'customer_id', 'customer_segment', 'customer_state', 'customer_zipcode',
       'department_name', 'latitude', 'longitude', 'market', 'order_city',
       'order_country', 'order_date', 'order_id', 'order_item_discount_rate',
       'order_item_profit_ratio', 'order_item_quantity', 'order_region',
       'order_state', 'order_status', 'product_card_id', 'product_name',
       'product_price', 'shipping_date', 'shipping_mode', 'ship_performance',
       'ontime', 'is_fraud'],
      dtype='object')

In [432]:
# change order_date and shipping_date features to datetime format-

df['order_date'] = pd.to_datetime(df['order_date'])

In [433]:
df['shipping_date'] = pd.to_datetime(df['shipping_date'])

In [434]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 180519 entries, 0 to 180518
Data columns (total 34 columns):
 #   Column                    Non-Null Count   Dtype         
---  ------                    --------------   -----         
 0   type                      180519 non-null  object        
 1   actual_ship_days          180519 non-null  int64         
 2   estimated_ship_days       180519 non-null  int64         
 3   delivery_status           180519 non-null  object        
 4   late_delivery_risk        180519 non-null  int64         
 5   category_id               180519 non-null  int64         
 6   category_name             180519 non-null  object        
 7   customer_city             180519 non-null  object        
 8   customer_id               180519 non-null  int64         
 9   customer_segment          180519 non-null  object        
 10  customer_state            180519 non-null  object        
 11  customer_zipcode          180516 non-null  float64       
 12  de

In [435]:
# check categorical column value counts to figure out which can be onehotencoded-

df['type'].unique()  # ohe

array(['DEBIT', 'TRANSFER', 'CASH', 'PAYMENT'], dtype=object)

In [436]:
df['delivery_status'].unique()    # drop, colinear with prediction column

array(['Advance shipping', 'Late delivery', 'Shipping on time',
       'Shipping canceled'], dtype=object)

In [437]:
# df['category_name'].unique()    # too many vals

In [438]:
# df['customer_city'].unique()    # too many vals

In [439]:
df['customer_segment'].unique()   # ohe

array(['Consumer', 'Home Office', 'Corporate'], dtype=object)

In [440]:
# df['customer_state'].unique()   # too many vals

In [441]:
df['department_name'].unique()    # ohe

array(['Fitness', 'Apparel', 'Golf', 'Footwear', 'Outdoors', 'Fan Shop',
       'Technology', 'Book Shop', 'Discs Shop', 'Pet Shop',
       'Health and Beauty '], dtype=object)

In [442]:
df['market'].unique() # can ohe

array(['Pacific Asia', 'USCA', 'Africa', 'Europe', 'LATAM'], dtype=object)

In [443]:
# df['order_city'].unique()      # too many vals

In [444]:
# df['order_country'].unique()   # too many vals

In [445]:
df['order_region'].unique()      # ohe

array(['Southeast Asia', 'South Asia', 'Oceania', 'Eastern Asia',
       'West Asia', 'West of USA ', 'US Center ', 'West Africa',
       'Central Africa', 'North Africa', 'Western Europe',
       'Northern Europe', 'Central America', 'Caribbean', 'South America',
       'East Africa', 'Southern Europe', 'East of USA', 'Canada',
       'Southern Africa', 'Central Asia', 'Eastern Europe',
       'South of  USA '], dtype=object)

In [446]:
# df['order_state'].unique()   # too many vals

In [447]:
df['order_status'].unique()    # ohe

array(['COMPLETE', 'PENDING', 'CLOSED', 'PENDING_PAYMENT', 'CANCELED',
       'PROCESSING', 'SUSPECTED_FRAUD', 'ON_HOLD', 'PAYMENT_REVIEW'],
      dtype=object)

In [448]:
# df['product_name'].unique()   # too many vals

In [449]:
df['shipping_mode'].unique()    # ohe

array(['Standard Class', 'First Class', 'Second Class', 'Same Day'],
      dtype=object)

In [457]:
# drop the categorical variables with too many values, IDed above-

X = df.drop(columns=['category_name', 'customer_city', 'customer_state', 'order_city', 'order_country', 'order_state', 'product_name'])

In [458]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 180519 entries, 0 to 180518
Data columns (total 27 columns):
 #   Column                    Non-Null Count   Dtype         
---  ------                    --------------   -----         
 0   type                      180519 non-null  object        
 1   actual_ship_days          180519 non-null  int64         
 2   estimated_ship_days       180519 non-null  int64         
 3   delivery_status           180519 non-null  object        
 4   late_delivery_risk        180519 non-null  int64         
 5   category_id               180519 non-null  int64         
 6   customer_id               180519 non-null  int64         
 7   customer_segment          180519 non-null  object        
 8   customer_zipcode          180516 non-null  float64       
 9   department_name           180519 non-null  object        
 10  latitude                  180519 non-null  float64       
 11  longitude                 180519 non-null  float64       
 12  ma

In [459]:
# drop features that overlap with/bias the target (ontime)-

X = X.drop(columns=['actual_ship_days', 'late_delivery_risk', 'delivery_status', 'order_status', 'ship_performance', 'ontime'])

In [460]:
# drop datetime dtypes-

# X = X.drop(columns=['order_date', 'shipping_date'])

In [461]:
# drop shipping_date (keep order_date)-

# X = X.drop(columns=['shipping_date'])

In [455]:
# change order_date from datetime to ordinal so it can be included in X-

# X['order_date'] = X['order_date'].map(dt.datetime.toordinal)

In [462]:
# drop order_date (keep shipping_date)-

X = X.drop(columns=['order_date'])

In [463]:
# change shipping_date from datetime to ordinal so it can be included in X-

X['shipping_date'] = X['shipping_date'].map(dt.datetime.toordinal)

In [464]:
X.shape

(180519, 20)

In [465]:
# drop zipcode, raises an error-

X = X.drop(columns=['customer_zipcode'])

In [466]:
X.shape

(180519, 19)

In [467]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 180519 entries, 0 to 180518
Data columns (total 19 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   type                      180519 non-null  object 
 1   estimated_ship_days       180519 non-null  int64  
 2   category_id               180519 non-null  int64  
 3   customer_id               180519 non-null  int64  
 4   customer_segment          180519 non-null  object 
 5   department_name           180519 non-null  object 
 6   latitude                  180519 non-null  float64
 7   longitude                 180519 non-null  float64
 8   market                    180519 non-null  object 
 9   order_id                  180519 non-null  int64  
 10  order_item_discount_rate  180519 non-null  float64
 11  order_item_profit_ratio   180519 non-null  float64
 12  order_item_quantity       180519 non-null  int64  
 13  order_region              180519 non-null  o

In [468]:
# set y-

y = df['ontime']

In [469]:
y

0         1
1         0
2         1
3         1
4         1
         ..
180514    1
180515    0
180516    0
180517    1
180518    1
Name: ontime, Length: 180519, dtype: int64

In [470]:
# get a baseline (57%)-

y.value_counts(normalize=True)

0    0.572793
1    0.427207
Name: ontime, dtype: float64

In [471]:
y.shape

(180519,)

In [472]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 180519 entries, 0 to 180518
Data columns (total 19 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   type                      180519 non-null  object 
 1   estimated_ship_days       180519 non-null  int64  
 2   category_id               180519 non-null  int64  
 3   customer_id               180519 non-null  int64  
 4   customer_segment          180519 non-null  object 
 5   department_name           180519 non-null  object 
 6   latitude                  180519 non-null  float64
 7   longitude                 180519 non-null  float64
 8   market                    180519 non-null  object 
 9   order_id                  180519 non-null  int64  
 10  order_item_discount_rate  180519 non-null  float64
 11  order_item_profit_ratio   180519 non-null  float64
 12  order_item_quantity       180519 non-null  int64  
 13  order_region              180519 non-null  o

In [473]:
X.shape

(180519, 19)

In [474]:
y.shape

(180519,)

In [475]:
# intitialize category encoder-

ohe = ce.OneHotEncoder()

In [476]:
# TTS-

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y)

In [477]:
# onehotencode-

X_train = ohe.fit_transform(X_train)

In [478]:
X_test = ohe.transform(X_test)

In [479]:
X_train.shape

(135389, 63)

In [480]:
X_test.shape

(45130, 63)

In [481]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 135389 entries, 21790 to 27668
Data columns (total 63 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   type_1                    135389 non-null  int64  
 1   type_2                    135389 non-null  int64  
 2   type_3                    135389 non-null  int64  
 3   type_4                    135389 non-null  int64  
 4   estimated_ship_days       135389 non-null  int64  
 5   category_id               135389 non-null  int64  
 6   customer_id               135389 non-null  int64  
 7   customer_segment_1        135389 non-null  int64  
 8   customer_segment_2        135389 non-null  int64  
 9   customer_segment_3        135389 non-null  int64  
 10  department_name_1         135389 non-null  int64  
 11  department_name_2         135389 non-null  int64  
 12  department_name_3         135389 non-null  int64  
 13  department_name_4         135389 non-null

In [482]:
# scale-

ss = StandardScaler()
X_train_sc = ss.fit_transform(X_train)
X_test_sc = ss.transform(X_test)

In [483]:
X_train_sc

array([[ 1.26991279, -0.61921414, -0.3489862 , ..., -0.42721069,
        -0.23853901, -0.15119978],
       [-0.78745565,  1.61495021, -0.3489862 , ..., -0.42721069,
        -0.23853901, -0.15119978],
       [ 1.26991279, -0.61921414, -0.3489862 , ...,  2.34076539,
        -0.23853901, -0.15119978],
       ...,
       [ 1.26991279, -0.61921414, -0.3489862 , ..., -0.42721069,
        -0.23853901, -0.15119978],
       [-0.78745565,  1.61495021, -0.3489862 , ..., -0.42721069,
        -0.23853901, -0.15119978],
       [ 1.26991279, -0.61921414, -0.3489862 , ..., -0.42721069,
        -0.23853901, -0.15119978]])

In [484]:
# make into a df (not sure this is necessary but whateva)-

X_train_sc = pd.DataFrame(X_train_sc)

In [485]:
X_train_sc

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,53,54,55,56,57,58,59,60,61,62
0,1.269913,-0.619214,-0.348986,-0.548889,0.778116,-0.181328,0.628145,0.966146,-0.46614,-0.661342,...,-0.073374,-0.054504,-0.149048,-0.794442,-0.037416,0.822811,-0.492998,-0.427211,-0.238539,-0.1512
1,-0.787456,1.614950,-0.348986,-0.548889,-0.677093,0.712381,-0.105876,0.966146,-0.46614,-0.661342,...,-0.073374,-0.054504,0.785903,1.134205,-1.303284,-1.215346,2.028404,-0.427211,-0.238539,-0.1512
2,1.269913,-0.619214,-0.348986,-0.548889,-1.404697,0.840054,-0.859566,-1.035040,2.14528,-0.661342,...,-0.073374,-0.054504,0.925403,1.848571,-0.379102,-1.215346,-0.492998,2.340765,-0.238539,-0.1512
3,1.269913,-0.619214,-0.348986,-0.548889,0.778116,0.840054,-0.810152,0.966146,-0.46614,-0.661342,...,-0.073374,-0.054504,0.925403,1.848571,-0.336798,0.822811,-0.492998,-0.427211,-0.238539,-0.1512
4,-0.787456,1.614950,-0.348986,-0.548889,0.778116,-0.500510,0.922952,0.966146,-0.46614,-0.661342,...,-0.073374,-0.054504,-0.564581,-0.651568,-0.219649,0.822811,-0.492998,-0.427211,-0.238539,-0.1512
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
135384,1.269913,-0.619214,-0.348986,-0.548889,0.778116,-0.947365,-1.204508,0.966146,-0.46614,-0.661342,...,-0.073374,-0.054504,-0.971211,-0.580203,0.398642,0.822811,-0.492998,-0.427211,-0.238539,-0.1512
135385,1.269913,-0.619214,-0.348986,-0.548889,0.778116,-0.181328,0.922712,0.966146,-0.46614,-0.661342,...,-0.073374,-0.054504,-0.193569,-0.723076,1.208928,0.822811,-0.492998,-0.427211,-0.238539,-0.1512
135386,1.269913,-0.619214,-0.348986,-0.548889,0.778116,0.840054,1.314910,-1.035040,-0.46614,1.512078,...,-0.073374,-0.054504,0.925403,1.848571,0.405150,0.822811,-0.492998,-0.427211,-0.238539,-0.1512
135387,-0.787456,1.614950,-0.348986,-0.548889,0.778116,-0.883528,-0.477684,-1.035040,-0.46614,1.512078,...,-0.073374,-0.054504,-0.858423,-0.080147,-0.509269,0.822811,-0.492998,-0.427211,-0.238539,-0.1512


In [486]:
X_train_sc.shape

(135389, 63)

In [487]:
# make into a df-

X_test_sc = pd.DataFrame(X_test_sc)

In [488]:
X_test_sc.shape

(45130, 63)

In [489]:
# define Rachael's function-

def pipe(model):
    #instantiate model
    model = model()
    #fit to scaled data
    model.fit(X_train_sc, y_train)
    
    #make predictions
    predictions = model.predict(X_test_sc)
    #print accuracy scores for training and testing groups
    print(f'{model} training score: {model.score(X_train_sc, y_train)}')
    print(f'{model} testing score: {model.score(X_test_sc, y_test)}')
    
    return

# from:
# Rachael Friedman
# https://towardsdatascience.com/how-to-make-your-modeling-process-more-efficient-89e70259839d

In [318]:
# run it across the various models to get an initial sense of performance-

pipe(LogisticRegression)

# this is with shipping_date included in X, so it may be gaming the model

LogisticRegression() training score: 0.8037506739838539
LogisticRegression() testing score: 0.7992687790826501


In [319]:
pipe(DecisionTreeClassifier)

# same

DecisionTreeClassifier() training score: 1.0
DecisionTreeClassifier() testing score: 0.9286727232439619


In [None]:
pipe(KNeighborsClassifier)

# these 2 kept sticking so I didn't run them

In [None]:
pipe(SVC)

In [402]:
pipe(RandomForestClassifier)

RandomForestClassifier() training score: 1.0
RandomForestClassifier() testing score: 0.7831376024817195


In [405]:
pipe(AdaBoostClassifier)

AdaBoostClassifier() training score: 0.7009875248358434
AdaBoostClassifier() testing score: 0.6997784179038333


In [406]:
pipe(GradientBoostingClassifier)

GradientBoostingClassifier() training score: 0.7057441889666073
GradientBoostingClassifier() testing score: 0.7046532240194993


In [407]:
pipe(ExtraTreesClassifier)

ExtraTreesClassifier() training score: 1.0
ExtraTreesClassifier() testing score: 0.7567693330378905
