### This notebook will only work with th data which already clean by the EDA notebook.

In [44]:
#base import
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
#Model Import
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from xgboost import XGBRegressor
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import Sequential
from tensorflow.keras import layers
from tensorflow.keras.layers import Dense, Dropout, Embedding, LSTM
from tensorflow.keras.optimizers import Adam
#preprocessing import
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
#accuracy calculate import
from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score
np.random.seed(123)


In [45]:
ebay_clean= pd.read_csv('../data/cleaned/Ebay_cleaned.csv', index_col=0)
pd.set_option('display.max_columns', None)
ebay_clean.head()

Unnamed: 0,b2c_c2c,seller_id,declared_handling_days,acceptance_scan_timestamp,shipment_method_id,shipping_fee,carrier_min_estimate,carrier_max_estimate,item_zip,buyer_zip,category_id,item_price,quantity,payment_datetime,delivery_date,weight,package_size,record_number,distance,seller_lat,seller_lon,buyer_lat,buyer_lon,handling_date,shipping_date,total_time,pay_year,pay_month,pay_date
0,1,25454,3,2019-03-27,0,0,3,5,97219,49040,13,28,1,2019-03-24,2019-03-29,5,1,1,3001,45.45,-122.69,41.95,-85.32,3,2,5,2019,3,24
1,0,6727381,2,2018-06-03,0,3,3,5,11415-3528,62521,0,20,1,2018-06-02,2018-06-05,0,4,2,1282,40.71,-73.83,39.85,-88.93,1,2,3,2018,6,2
2,1,18507,1,2019-01-08,0,4,3,5,27292,53010,1,20,1,2019-01-06,2019-01-10,9,4,3,1104,35.8,-80.25,43.59,-88.28,2,2,4,2019,1,6
3,1,4677,1,2018-12-18,0,0,3,5,90703,80022,1,36,1,2018-12-17,2018-12-21,8,4,4,1353,33.86,-118.05,39.88,-104.8,1,3,4,2018,12,17
4,1,4677,1,2018-07-28,0,0,3,5,90703,55070,1,25,1,2018-07-27,2018-07-30,3,4,5,2456,33.86,-118.05,45.39,-93.38,1,2,3,2018,7,27


In [46]:
ebay_clean.isna().sum()

b2c_c2c                      0
seller_id                    0
declared_handling_days       0
acceptance_scan_timestamp    0
shipment_method_id           0
shipping_fee                 0
carrier_min_estimate         0
carrier_max_estimate         0
item_zip                     0
buyer_zip                    0
category_id                  0
item_price                   0
quantity                     0
payment_datetime             0
delivery_date                0
weight                       0
package_size                 0
record_number                0
distance                     0
seller_lat                   0
seller_lon                   0
buyer_lat                    0
buyer_lon                    0
handling_date                0
shipping_date                0
total_time                   0
pay_year                     0
pay_month                    0
pay_date                     0
dtype: int64

In [47]:
ebay_clean.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 493951 entries, 0 to 499999
Data columns (total 29 columns):
 #   Column                     Non-Null Count   Dtype  
---  ------                     --------------   -----  
 0   b2c_c2c                    493951 non-null  int64  
 1   seller_id                  493951 non-null  int64  
 2   declared_handling_days     493951 non-null  int64  
 3   acceptance_scan_timestamp  493951 non-null  object 
 4   shipment_method_id         493951 non-null  int64  
 5   shipping_fee               493951 non-null  int64  
 6   carrier_min_estimate       493951 non-null  int64  
 7   carrier_max_estimate       493951 non-null  int64  
 8   item_zip                   493951 non-null  object 
 9   buyer_zip                  493951 non-null  object 
 10  category_id                493951 non-null  int64  
 11  item_price                 493951 non-null  int64  
 12  quantity                   493951 non-null  int64  
 13  payment_datetime           49

In [48]:
X= ebay_clean[['b2c_c2c', 'declared_handling_days', 'shipment_method_id', 'shipping_fee', 'item_price', 'weight', 'package_size', 'distance']]
y= ebay_clean['total_time']

In [49]:
print(y.shape)
print(X.shape)

(493951,)
(493951, 8)


In [50]:
#split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

Scale data


In [51]:
scaler= StandardScaler()
X_train_ss= scaler.fit_transform(X_train, y_train)
X_test_ss= scaler.transform(X_test)

In [52]:
y_true_test= y_test.values
y_true_train= y_train.values

**Accuracy function**

In [53]:
def define_late(y_actual, pred):
    '''
        This function is using for calculate the accurancy of the model in the differnce aspect:
        when the model run, the accuracy will calculate the exacly match number for the true predict, the predict give late or early delivery will be count as false.
        But in the business point of view, The order which is delivered early than the prediction will not get any complaint from customer and will be consider as the ontime deliver.
        In this function, we will modify the accuracy of the model base on the logic above:
            day predict > actual delivered: Ontime
            day predict = actual delivered: Ontime
            day predict < actual delivered: Late

    '''
    ontime= 0
    late= 0
    accuracy_sc=0
    for i in range(len(pred)):
        if pred[i] == y_actual[i] or pred[i]> y_actual[i]:
            ontime+=1
        else:
            late+=1
    accuracy_sc= ontime/len(y_actual)*100
    return accuracy_sc

**Loss Function Calulate**
$$L = \frac{1}{N}.abs([P_E.\sum_{early shipments}(actual delivery days - predicted deliveryday)+ P_L.\sum_{late shipments}(actual delivery days - predicted deliveryday)])$$
while $P_E = 0.4$, $P_L = 0.6$ and N is number of record in the dataset

In [54]:
def evaluate_loss(preds, actual):
    ''''
        This Loss function was provided by the ebay team, who was given out the dataset for their Machine learning challenge
        From a business point of view, it is a worse experience for a buyer if a shipment arrives after the estimated delivery date (“late shipment”) 
            as compared to arriving before the estimated delivery date (“early shipment”). 
            The formula for the loss function was mentioned above.
        
    '''
    early_loss, late_loss = 0,0 
    for i in range(len(preds)):
        if preds[i] < actual[i]:
            #early shipment
            early_loss += actual[i] - preds[i]
        elif preds[i] > actual[i]:
            #late shipment
            late_loss += preds[i] - actual[i]
    loss = (1/len(preds)) * (0.4 * (early_loss) + 0.6 * (late_loss))
    return loss

#### LinearRegression model

In [55]:
#Initialize
linear_model= LinearRegression()
#fit model
linear_model.fit(X_train_ss, y_train)
linear_preds= linear_model.predict(X_test_ss)
linear_train_pred= linear_model.predict(X_train_ss)
linear_train_pred= np.round(linear_train_pred)
linear_preds= np.round(linear_preds)
linear_score_test=  linear_model.score(X_test_ss, y_test)
linear_score_train=  linear_model.score(X_train_ss, y_train)
linear_accuracy_test= define_late(y_true_test, linear_preds)
linear_accuracy_train= define_late(y_true_train, linear_train_pred)

In [56]:
print(f'Accuracy score test:  {linear_accuracy_test}')
print(f'Accuracy score train: {linear_accuracy_train}')

Accuracy score test:  70.61371987326781
Accuracy score train: 70.62810001012248


#### Ridge model

In [57]:
# Initialize
ridge_model= Ridge(solver='lsqr')

# fit
ridge_model.fit(X_train_ss, y_train)
ridge_preds= ridge_model.predict(X_test_ss)
ridge_preds= np.round(ridge_preds)
ridge_train_pred= ridge_model.predict(X_train_ss)
ridge_train_pred= np.round(ridge_train_pred)
ridge_score_test= ridge_model.score(X_test_ss, y_test)
ridge_score_train= ridge_model.score(X_train_ss, y_train)
ridge_accuracy_test= define_late(y_true_test, ridge_preds)
ridge_accuracy_train= define_late(y_true_train, ridge_train_pred)


In [58]:
print(f'Ridge Accuracy score test:  {ridge_accuracy_test}')
print(f'Ridge Accuracy score train: {ridge_accuracy_train}')

Ridge Accuracy score test:  70.61878106305231
Ridge Accuracy score train: 70.62734082397004


#### XGboost

In [59]:
# initialize 
xg_boost= XGBRegressor()
#fit
xg_boost.fit(X_train_ss, y_train)
xg_pred= xg_boost.predict(X_test_ss)
xg_pred= np.round(xg_pred)
xg_train_pred= xg_boost.predict(X_train_ss)
xg_train_pred= np.round(xg_train_pred)
xg_train_accuracy=define_late(y_true_train, xg_train_pred)
xg_test_accuracy= define_late(y_true_test, xg_pred)
print(f'Train Score: {xg_boost.score(X_train_ss, y_train)}')
print(f'test score: {xg_boost.score(X_test_ss, y_test)}')
print(f'Test Accuracy : {xg_test_accuracy}')
print(f'Train Accuracy : {xg_train_accuracy}')

Train Score: 0.2243377815372739
test score: 0.13916641540699792
Test Accuracy : 71.60368859511495
Train Accuracy : 71.63528697236562


In [60]:
print(f'rigde lost= {evaluate_loss(ridge_preds, y_true_test)} -- linear loss= {evaluate_loss(linear_preds, y_true_test)} --xg lost= {evaluate_loss(xg_pred, y_true_test)}')

rigde lost= 0.8006822483829498 -- linear loss= 0.8006579546719841 --xg lost= 0.758265429037058


In [61]:
accuracy = [linear_accuracy_test, linear_accuracy_train, ridge_accuracy_test, ridge_accuracy_train, xg_test_accuracy, xg_train_accuracy]
columns=['linear test', 'linear train', 'ridge test', 'ridge train', 'xg test', 'xg train']

#### Neural Network

In [62]:
#build model
tf.random.set_seed(123)
# Create a new sequential model
nn_model= keras.Sequential()
regularizer= keras.regularizers.l2(0.02)
#hidden layers
nn_model.add(Dense(40, activation="relu", kernel_regularizer=regularizer))
nn_model.add(Dropout(0.2))
nn_model.add(Dense(40, activation="relu", kernel_regularizer=regularizer))
nn_model.add(Dense(40, activation="relu", kernel_regularizer=regularizer))
# nn_model.add(Dropout(0.2))
#output layer
nn_model.add(Dense(1))

#compile nn_model
nn_model.compile(
     optimizer=keras.optimizers.Adam(),
     loss=keras.losses.MeanSquaredError(),
     metrics=[keras.metrics.BinaryAccuracy()]
)

In [63]:
history= nn_model.fit(X_train_ss, y_train, epochs=50,batch_size=64, verbose=0)

In [67]:
#evaluate
NN_train_accuracy=history.history['binary_accuracy'][-1] 
result = nn_model.evaluate(X_test_ss,y_test, verbose=0)
NN_pred_test = np.round(nn_model.predict(X_test_ss))
NN_pred_train = np.round(nn_model.predict(X_train_ss))



In [68]:
print(f'Test Accuracy: {define_late(y_true_test, NN_pred_test)}')
print(f'Train Accuracy: {define_late(y_true_train, NN_pred_train)}')

Test Accuracy: 73.81846524480974
Train Accuracy: 73.85767790262172


#### Recurrent Neural Network


In [70]:
print(X_train_ss.shape, y_train.shape)

(395160, 8) (395160,)


In [34]:
#Make embedding
number_class= X_train_ss.shape[1]
embedding_dim= 8


In [71]:
tf.random.set_seed(123)

#define rnn
rnn_mode= keras.Sequential()
# add layers
embedding_layer=Embedding(number_class, embedding_dim)

rnn_mode.add(LSTM(64, activation='relu', input_shape=(X_train_ss.shape[1], 1)))

rnn_mode.add(Dense(64, activation= 'relu'))

#output layer
rnn_mode.add(Dense(1))

# Compile mode
rnn_mode.compile(
    loss='mean_squared_error', 
    optimizer=Adam(learning_rate=0.02),
    metrics='accuracy'
)

In [72]:
# fit model
rnn_history= rnn_mode.fit(X_train_ss, y_train, epochs=50, batch_size=32, validation_split=0.2)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [74]:
rnn_train_accu= rnn_history.history['accuracy'][-1]
result= rnn_mode.evaluate(X_test_ss, y_test)
rnn_pred_test= np.round(rnn_mode.predict(X_test_ss))
rnn_pred_train= np.round(rnn_mode.predict(X_train_ss))



In [75]:
rnn_mode.summary()

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm_1 (LSTM)               (None, 64)                16896     
                                                                 
 dense_10 (Dense)            (None, 64)                4160      
                                                                 
 dense_11 (Dense)            (None, 1)                 65        
                                                                 
Total params: 21,121
Trainable params: 21,121
Non-trainable params: 0
_________________________________________________________________


In [76]:
print(f'Test  accuracy= {define_late(y_true_test, rnn_pred_test)}')
print(f'Train  accuracy= {define_late(y_true_train, rnn_pred_train)}')

Test  accuracy= 71.93873935884848
Train  accuracy= 71.86354894220062


So far we have tried : Linear Regression, Ridge regression, XGboost, Neural Network, and Recurrent Neural Network

The score we have for each model is:

  | Model | Accurancy Score |
  | ----------- | ----------- |
  | Linear Regression | 70.49 |
  | Ridge Regression | 70.48 |
  | XGboost | 71.51 |
  | Neural Network | 72.21 |
  | Recurrent Neural Network | 71.9486 |

At the moment, Neural Network have the best accuracy for the datase with 72.21 percent of accuracy.  We are going to do tune hyperparameter to see if we can imporve the accuracy of the model.

### Tuning Hyperparameter


##### Turning Hyperparameter for `Ridge` and `XGboost`

In [102]:
estimators= [
    ('normalise', StandardScaler()),
    ('model', LinearRegression())
] 

my_pipe= Pipeline(estimators)
grid1= [
    {
        'model': [Ridge()],
        'normalise':[StandardScaler()],
        'model__alpha':[0.001, 0.01, 0.1, 1],
        'model__solver':['auto', ]
    }]
gridCV1= GridSearchCV(my_pipe, grid1, cv=5, verbose=0)
fit_grid1= gridCV1.fit(X_train, y_train)


In [107]:
grid2=[    
    {
        'model':[XGBRegressor()],
        'normalise':[StandardScaler()],
        'model__subsample': np.arange(0.1, 1, 0.2),
        'model__max_depth': range (4, 12, 2),
        'model__n_estimators': [60, 120, 180],
        'model__learning_rate': [0.1, 0.01, 0.05]
    }
]
gridCV2= GridSearchCV(my_pipe, grid2, cv=10, verbose=0)
fit_grid2= gridCV2.fit(X_train, y_train)

In [103]:
fit_grid1.best_estimator_

In [104]:
fit_grid1.best_params_

{'model': Ridge(alpha=1),
 'model__alpha': 1,
 'model__solver': 'auto',
 'normalise': StandardScaler()}

In [None]:
fit_grid2.best_estimator_

NameError: name 'fit_grid2' is not defined

In [None]:
fit_grid2.best_params_

NameError: name 'fit_grid2' is not defined

In [105]:
grid1_pred_test= np.round(fit_grid1.predict(X_test))
grid1_pred_train= np.round(fit_grid1.predict(X_train))

In [106]:
print(f'Ridge tuned hyperparameter Test Accuracy: {define_late(y_true_test, grid1_pred_test)}')
print(f'Ridge tuned hyperparameter Train Accuracy: {define_late( y_true_train, grid1_pred_train)}')

Test Accuracy: 70.61371987326781
Train Accuracy: 70.6283530721733


In [None]:
print(f'XGBoost Test Accuracy: {define_late(y_true_test), np.round(fit_grid2.predict(X_test))}')
print(f'XGBoost  Train Accuracy: {define_late(y_true_train), np.round(fit_grid2.predict(X_train))}')