In [2]:
# imports
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import sklearn.model_selection as ms
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split, cross_val_score, cross_validate
from joblib import load, dump

%matplotlib inline

In [3]:
# loading data
clearedDF = '../../data/processed/casted_clearedDF.jsonl'
data = pd.read_json(clearedDF, lines = True)
data = data.drop(columns=["session_id", "user_id", "purchase_timestamp", "delivery_timestamp"])
data = data.drop(columns=["product_name", "price", "category_path", "week_day"])
data = data.drop(columns=["delivery_company"])
data.head()

Unnamed: 0,city,street,delivery_time,purchase_week_day_plus_hour
0,Radom,ul. Bukowa 64,2,14
1,Radom,ul. Bukowa 64,3,109
2,Gdynia,ul. Bałtycka 29,1,3
3,Gdynia,ul. Bałtycka 29,3,113
4,Gdynia,ul. Bałtycka 29,3,3


In [4]:
# converting strings to integers
label_city = LabelEncoder()
data['city'] = label_city.fit_transform(data['city'])
label_street = LabelEncoder()
data['street'] = label_street.fit_transform(data['street'])
#label_product_name = LabelEncoder()
#data['product_name'] = label_product_name.fit_transform(data['product_name'])
#label_category_path = LabelEncoder()
#data['category_path'] = label_category_path.fit_transform(data['category_path'])

In [5]:
# extract label from data
X = data.drop(columns=["delivery_time"])
y = data["delivery_time"]
data.head()

Unnamed: 0,city,street,delivery_time,purchase_week_day_plus_hour
0,3,139,2,14
1,3,139,3,109
2,0,136,1,3
3,0,136,3,113
4,0,136,3,3


In [6]:
# dividing data to train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

In [7]:
# scaling data
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [8]:
# random forest classifier
rfc = RandomForestClassifier(n_estimators = 128, min_samples_split = 16)
rfc.fit(X_train, y_train)
pred_rfc = rfc.predict(X_test)

rfc_Cross = cross_val_score(rfc, X, y, n_jobs = -1, cv = ms.KFold(shuffle = True))

In [9]:
print(classification_report(y_test, pred_rfc, zero_division=0))
print(confusion_matrix(y_test, pred_rfc))
print("%0.2f accuracy with a standard deviation of %0.2f" % (rfc_Cross.mean(), rfc_Cross.std()))

              precision    recall  f1-score   support

           1       0.33      0.06      0.10        81
           2       0.70      0.77      0.73       443
           3       0.76      0.87      0.81       692
           4       0.77      0.44      0.56       182
           5       0.00      0.00      0.00         9

    accuracy                           0.73      1407
   macro avg       0.51      0.43      0.44      1407
weighted avg       0.71      0.73      0.71      1407

[[  5  66  10   0   0]
 [  9 342  81  11   0]
 [  1  76 603  12   0]
 [  0   8  94  80   0]
 [  0   0   8   1   0]]
0.74 accuracy with a standard deviation of 0.01


In [10]:
# svm classifier
svmc = svm.SVC(C = 174, gamma = 0.1)
svmc.fit(X_train, y_train)
pred_svmc = svmc.predict(X_test)

svmc_Cross = cross_val_score(svmc, X, y, n_jobs = -1, cv = ms.KFold(shuffle = True))


In [11]:
print(classification_report(y_test, pred_svmc, zero_division=0))
print(confusion_matrix(y_test, pred_svmc))
print(svmc_Cross)
print("%0.2f accuracy with a standard deviation of %0.2f" % (svmc_Cross.mean(), svmc_Cross.std()))

              precision    recall  f1-score   support

           1       0.00      0.00      0.00        81
           2       0.58      0.81      0.68       443
           3       0.67      0.76      0.72       692
           4       0.00      0.00      0.00       182
           5       0.00      0.00      0.00         9

    accuracy                           0.63      1407
   macro avg       0.25      0.31      0.28      1407
weighted avg       0.51      0.63      0.56      1407

[[  0  79   2   0   0]
 [  0 359  84   0   0]
 [  0 163 529   0   0]
 [  0  19 163   0   0]
 [  0   0   9   0   0]]
[0.56787491 0.57325747 0.56685633 0.58321479 0.54267425]
0.57 accuracy with a standard deviation of 0.01


In [12]:
# mulit-layer perceptron
mlpc = MLPClassifier(activation = 'logistic', max_iter = 1024)
mlpc.fit(X_train, y_train)
pred_mlpc = mlpc.predict(X_test)

mlpc_Cross = cross_val_score(mlpc, X, y, n_jobs = -1, cv = ms.KFold(shuffle = True))


In [13]:
print(classification_report(y_test, pred_mlpc, zero_division=0))
print(confusion_matrix(y_test, pred_mlpc))
print(mlpc_Cross)
print("%0.2f accuracy with a standard deviation of %0.2f" % (mlpc_Cross.mean(), mlpc_Cross.std()))

              precision    recall  f1-score   support

           1       0.00      0.00      0.00        81
           2       0.37      0.41      0.39       443
           3       0.56      0.74      0.64       692
           4       0.00      0.00      0.00       182
           5       0.00      0.00      0.00         9

    accuracy                           0.49      1407
   macro avg       0.19      0.23      0.21      1407
weighted avg       0.39      0.49      0.44      1407

[[  0  58  23   0   0]
 [  0 181 262   0   0]
 [  0 179 513   0   0]
 [  0  65 117   0   0]
 [  0   5   4   0   0]]
[0.65174129 0.57467994 0.66429587 0.66785206 0.62944523]
0.64 accuracy with a standard deviation of 0.03


In [14]:
#persit models
dump(svmc, '../../models/svm.joblib')
dump(rfc, '../../models/random-forest.joblib')
dump(mlpc, '../../models/ml-perceptron.joblib')

['../../models/ml-perceptron.joblib']