In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, recall_score, accuracy_score, classification_report

import matplotlib.pyplot as plt

plt.rcParams['figure.facecolor'] = (1,1,1,1)


# Pipeline

In [3]:
df = pd.read_csv('./data.csv')
print(df.shape)
df.sample()

(10999, 12)


Unnamed: 0,ID,Warehouse_block,Mode_of_Shipment,Customer_care_calls,Customer_rating,Cost_of_the_Product,Prior_purchases,Product_importance,Gender,Discount_offered,Weight_in_gms,Reached.on.Time_Y.N
1665,1666,B,Ship,5,2,266,7,low,M,40,2304,1


# Step 0. Binarize the data

In [4]:
def bin_num_column(df_bin, col, start, finish, step):
    for i in range(start, finish, step):
        df_bin[col+'('+str(i) + ';' + str(i+step) + ']'] = (df[col] >i ) & (df[col] <= (i+step))

In [5]:
def bin_cat_column(df_bin, col, cats):
    for i in cats:
        df_bin[col+str(i)] = df[col] == i

In [6]:
df_bin = pd.DataFrame()
bin_cat_column(df_bin, 'Warehouse_block', ['D', 'F', 'A', 'B', 'C'])
bin_cat_column(df_bin, 'Warehouse_block', ['Flight', 'Ship', 'Road'])
bin_num_column(df_bin, 'Customer_care_calls', 2, 7, 2)
bin_num_column(df_bin, 'Customer_care_calls', 1, 5, 3)
bin_num_column(df_bin, 'Cost_of_the_Product', 96, 310, 400)
bin_num_column(df_bin, 'Prior_purchases', 2, 10, 3)
bin_cat_column(df_bin, 'Product_importance', ['low', 'medium', 'high'])
bin_cat_column(df_bin, 'Gender', ['M', 'F'])
bin_num_column(df_bin, 'Discount_offered', 1, 65, 20)
bin_num_column(df_bin, 'Weight_in_gms', 1000, 8000, 5000)
df_bin.head()

Unnamed: 0,Warehouse_blockD,Warehouse_blockF,Warehouse_blockA,Warehouse_blockB,Warehouse_blockC,Warehouse_blockFlight,Warehouse_blockShip,Warehouse_blockRoad,Customer_care_calls(2;4],Customer_care_calls(4;6],...,Product_importancemedium,Product_importancehigh,GenderM,GenderF,Discount_offered(1;21],Discount_offered(21;41],Discount_offered(41;61],Discount_offered(61;81],Weight_in_gms(1000;6000],Weight_in_gms(6000;11000]
0,True,False,False,False,False,False,False,False,True,False,...,False,False,False,True,False,False,True,False,True,False
1,False,True,False,False,False,False,False,False,True,False,...,False,False,True,False,False,False,True,False,True,False
2,False,False,True,False,False,False,False,False,False,False,...,False,False,True,False,False,False,True,False,True,False
3,False,False,False,True,False,False,False,False,True,False,...,True,False,True,False,True,False,False,False,True,False
4,False,False,False,False,True,False,False,False,False,False,...,True,False,False,True,False,False,True,False,True,False


In [8]:
df_bin['Reached_on_time'] = df['Reached.on.Time_Y.N'] == 0

In [10]:
y_feat = 'Reached_on_time'
df_train, df_test = train_test_split(df_bin, train_size=0.7, random_state=0)

X_train, y_train = df_train.drop(y_feat, axis=1), df_train[y_feat]
X_test, y_test = df_test.drop(y_feat, axis=1), df_test[y_feat]
X_train, y_train,  X_test, y_test = X_train.to_numpy(dtype='int'), y_train.to_numpy(dtype='int'), X_test.to_numpy(dtype='int'), y_test.to_numpy(dtype='int')
X_train[:10]


array([[0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1,
        1, 0, 0, 0, 1, 0],
       [0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1,
        1, 0, 0, 0, 1, 0],
       [0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0,
        0, 0, 1, 0, 1, 0],
       [0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1,
        1, 0, 0, 0, 1, 0],
       [0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1,
        0, 1, 0, 0, 1, 0],
       [1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1,
        1, 0, 0, 0, 1, 0],
       [0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0,
        1, 0, 0, 0, 1, 0],
       [0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0,
        1, 0, 0, 0, 1, 0],
       [1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1,
        1, 0, 0, 0, 1, 0],
       [0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0,
        0, 0, 0, 

Default tree

In [11]:
from sklearn.tree import DecisionTreeClassifier
clf = DecisionTreeClassifier(random_state=0)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print('Recall score:', recall_score(y_test, y_pred))
print('F1     score:', f1_score(y_test, y_pred))
print('Accuracy score:', accuracy_score(y_test, y_pred))

Recall score: 0.4720812182741117
F1     score: 0.49206349206349204
Accuracy score: 0.5927272727272728


In [12]:
clf.get_depth()

17

GridsearchCV with trees

(scoring with recall)

In [13]:
from sklearn.model_selection import GridSearchCV
parameters = {'criterion': ['gini', 'entropy', 'log_loss'],
              'max_depth' : [1,2,3,4,5,6],
              'min_samples_split': [2,5, 8, 10],
              'min_samples_leaf': [1,2,3,5, 7, 10]}
tree = DecisionTreeClassifier(random_state=0)
clf = GridSearchCV(tree, parameters, verbose = 3, scoring = 'recall')
clf.fit(X_train, y_train)

Fitting 5 folds for each of 432 candidates, totalling 2160 fits
[CV 1/5] END criterion=gini, max_depth=1, min_samples_leaf=1, min_samples_split=2;, score=0.000 total time=   0.0s
[CV 2/5] END criterion=gini, max_depth=1, min_samples_leaf=1, min_samples_split=2;, score=0.000 total time=   0.0s
[CV 3/5] END criterion=gini, max_depth=1, min_samples_leaf=1, min_samples_split=2;, score=0.000 total time=   0.0s
[CV 4/5] END criterion=gini, max_depth=1, min_samples_leaf=1, min_samples_split=2;, score=0.000 total time=   0.0s
[CV 5/5] END criterion=gini, max_depth=1, min_samples_leaf=1, min_samples_split=2;, score=0.000 total time=   0.0s
[CV 1/5] END criterion=gini, max_depth=1, min_samples_leaf=1, min_samples_split=5;, score=0.000 total time=   0.0s
[CV 2/5] END criterion=gini, max_depth=1, min_samples_leaf=1, min_samples_split=5;, score=0.000 total time=   0.0s
[CV 3/5] END criterion=gini, max_depth=1, min_samples_leaf=1, min_samples_split=5;, score=0.000 total time=   0.0s
[CV 4/5] END cri

In [14]:
y_pred = clf.predict(X_test)
print('Recall score:', recall_score(y_test, y_pred))
print('F1     score:', f1_score(y_test, y_pred))
print('Accuracy score:', accuracy_score(y_test, y_pred))

Recall score: 0.3343002175489485
F1     score: 0.40563132424109105
Accuracy score: 0.5906060606060606


(scoring with f1)

In [15]:

tree = DecisionTreeClassifier(random_state=0)
clf = GridSearchCV(tree, parameters, verbose = 3, scoring = 'f1')
clf.fit(X_train, y_train)

Fitting 5 folds for each of 432 candidates, totalling 2160 fits
[CV 1/5] END criterion=gini, max_depth=1, min_samples_leaf=1, min_samples_split=2;, score=0.000 total time=   0.0s
[CV 2/5] END criterion=gini, max_depth=1, min_samples_leaf=1, min_samples_split=2;, score=0.000 total time=   0.0s
[CV 3/5] END criterion=gini, max_depth=1, min_samples_leaf=1, min_samples_split=2;, score=0.000 total time=   0.0s
[CV 4/5] END criterion=gini, max_depth=1, min_samples_leaf=1, min_samples_split=2;, score=0.000 total time=   0.0s
[CV 5/5] END criterion=gini, max_depth=1, min_samples_leaf=1, min_samples_split=2;, score=0.000 total time=   0.0s
[CV 1/5] END criterion=gini, max_depth=1, min_samples_leaf=1, min_samples_split=5;, score=0.000 total time=   0.0s
[CV 2/5] END criterion=gini, max_depth=1, min_samples_leaf=1, min_samples_split=5;, score=0.000 total time=   0.0s
[CV 3/5] END criterion=gini, max_depth=1, min_samples_leaf=1, min_samples_split=5;, score=0.000 total time=   0.0s
[CV 4/5] END cri

In [16]:
y_pred = clf.predict(X_test)
print('Recall score:', recall_score(y_test, y_pred))
print('F1     score:', f1_score(y_test, y_pred))
print('Accuracy score:', accuracy_score(y_test, y_pred))

Recall score: 0.3343002175489485
F1     score: 0.40563132424109105
Accuracy score: 0.5906060606060606


Simple Gradient boosting

In [17]:
from sklearn.ensemble import GradientBoostingClassifier

clf = GradientBoostingClassifier(random_state=0)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print('Recall score:', recall_score(y_test, y_pred))
print('F1     score:', f1_score(y_test, y_pred))
print('Accuracy score:', accuracy_score(y_test, y_pred))

Recall score: 0.38941261783901376
F1     score: 0.44070578580221587
Accuracy score: 0.5869696969696969


GridSearchCV with gradient Boosting

(scoring with recall)

In [18]:
parameters = {'learning_rate': [1,0.1, 0.01, 0.001],
              'n_estimators':[10, 50, 100, 200],
              'subsample' : [1.0, 0.75, 0.5, 0.25]}
gb = GradientBoostingClassifier(random_state=0)
clf = GridSearchCV(gb, parameters, verbose = 3, scoring = 'recall')
clf.fit(X_train, y_train)

Fitting 5 folds for each of 64 candidates, totalling 320 fits
[CV 1/5] END learning_rate=1, n_estimators=10, subsample=1.0;, score=0.460 total time=   0.1s
[CV 2/5] END learning_rate=1, n_estimators=10, subsample=1.0;, score=0.463 total time=   0.0s
[CV 3/5] END learning_rate=1, n_estimators=10, subsample=1.0;, score=0.391 total time=   0.0s
[CV 4/5] END learning_rate=1, n_estimators=10, subsample=1.0;, score=0.397 total time=   0.0s
[CV 5/5] END learning_rate=1, n_estimators=10, subsample=1.0;, score=0.493 total time=   0.0s
[CV 1/5] END learning_rate=1, n_estimators=10, subsample=0.75;, score=0.445 total time=   0.0s
[CV 2/5] END learning_rate=1, n_estimators=10, subsample=0.75;, score=0.496 total time=   0.0s
[CV 3/5] END learning_rate=1, n_estimators=10, subsample=0.75;, score=0.438 total time=   0.0s
[CV 4/5] END learning_rate=1, n_estimators=10, subsample=0.75;, score=0.469 total time=   0.0s
[CV 5/5] END learning_rate=1, n_estimators=10, subsample=0.75;, score=0.511 total time= 

In [19]:
y_pred = clf.predict(X_test)
print('Recall score:', recall_score(y_test, y_pred))
print('F1     score:', f1_score(y_test, y_pred))
print('Accuracy score:', accuracy_score(y_test, y_pred))

Recall score: 0.4205946337926033
F1     score: 0.4599524187153053
Accuracy score: 0.5872727272727273


(scoring with f1)

In [20]:
gb = GradientBoostingClassifier(random_state=0)
clf = GridSearchCV(gb, parameters, verbose = 3, scoring = 'f1')
clf.fit(X_train, y_train)


Fitting 5 folds for each of 64 candidates, totalling 320 fits
[CV 1/5] END learning_rate=1, n_estimators=10, subsample=1.0;, score=0.490 total time=   0.1s
[CV 2/5] END learning_rate=1, n_estimators=10, subsample=1.0;, score=0.484 total time=   0.0s
[CV 3/5] END learning_rate=1, n_estimators=10, subsample=1.0;, score=0.434 total time=   0.0s
[CV 4/5] END learning_rate=1, n_estimators=10, subsample=1.0;, score=0.435 total time=   0.0s
[CV 5/5] END learning_rate=1, n_estimators=10, subsample=1.0;, score=0.484 total time=   0.0s
[CV 1/5] END learning_rate=1, n_estimators=10, subsample=0.75;, score=0.468 total time=   0.0s
[CV 2/5] END learning_rate=1, n_estimators=10, subsample=0.75;, score=0.492 total time=   0.0s
[CV 3/5] END learning_rate=1, n_estimators=10, subsample=0.75;, score=0.450 total time=   0.0s
[CV 4/5] END learning_rate=1, n_estimators=10, subsample=0.75;, score=0.483 total time=   0.0s
[CV 5/5] END learning_rate=1, n_estimators=10, subsample=0.75;, score=0.500 total time= 

In [21]:
y_pred = clf.predict(X_test)
print('Recall score:', recall_score(y_test, y_pred))
print('F1     score:', f1_score(y_test, y_pred))
print('Accuracy score:', accuracy_score(y_test, y_pred))

Recall score: 0.4205946337926033
F1     score: 0.4599524187153053
Accuracy score: 0.5872727272727273
