In [3]:
# imports
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from joblib import load, dump

%matplotlib inline

In [4]:
# load the processed dataset (JSON Lines: one record per line)
clearedDF = '../../data/processed/casted_clearedDF.jsonl'

# identifier and raw timestamp columns are removed right after loading
# (an earlier experiment also dropped product_name, price, category_path and
# week_day; those columns are kept here)
id_and_timestamp_columns = [
    "session_id",
    "user_id",
    "purchase_timestamp",
    "delivery_timestamp",
]
data = pd.read_json(clearedDF, lines=True).drop(columns=id_and_timestamp_columns)
data.head()

Unnamed: 0,city,street,product_name,category_path,price,delivery_company,delivery_time,week_day,purchase_week_day_plus_hour
0,Radom,ul. Bukowa 64,Plantronics Savi W710,Sprzęt RTV,553,360,2,0,14
1,Radom,ul. Bukowa 64,Plantronics Savi W710,Sprzęt RTV,553,620,3,4,109
2,Gdynia,ul. Bałtycka 29,Plantronics Savi W710,Sprzęt RTV,553,620,1,0,3
3,Gdynia,ul. Bałtycka 29,Plantronics Savi W710,Sprzęt RTV,553,360,3,4,113
4,Gdynia,ul. Bałtycka 29,Plantronics Savi W710,Sprzęt RTV,553,360,3,0,3


In [5]:
# converting strings to integers: every text column gets its own LabelEncoder,
# kept under its own name so the fitted string<->integer mapping stays available
# NOTE(review): LabelEncoder numbers classes in sorted order, which gives these
# nominal columns an artificial ordering for the models — confirm this is intended
label_city = LabelEncoder()
label_street = LabelEncoder()
label_product_name = LabelEncoder()
label_category_path = LabelEncoder()

encoder_for_column = {
    'city': label_city,
    'street': label_street,
    'product_name': label_product_name,
    'category_path': label_category_path,
}
for column, encoder in encoder_for_column.items():
    # fit on the column's values and overwrite it with the integer codes
    data[column] = encoder.fit_transform(data[column])

In [6]:
# split the frame into the label (delivery_time) and the feature columns
target_column = "delivery_time"
y = data[target_column]
X = data.drop(columns=[target_column])
data.head()

Unnamed: 0,city,street,product_name,category_path,price,delivery_company,delivery_time,week_day,purchase_week_day_plus_hour
0,3,139,66,3,553,360,2,0,14
1,3,139,66,3,553,620,3,4,109
2,0,136,66,3,553,620,1,0,3
3,0,136,66,3,553,360,3,4,113
4,0,136,66,3,553,360,3,0,3


In [7]:
# hold out 20% of the rows for testing; the fixed seed keeps the split stable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [8]:
# standardize features: the scaler is fitted on the training split only and
# the same fitted transformation is then applied to both splits
sc = StandardScaler().fit(X_train)
X_train, X_test = sc.transform(X_train), sc.transform(X_test)

In [9]:
# random forest classifier
# random_state fixes the bootstrap samples and per-split feature sampling, so
# the metrics printed for this model are reproducible after Restart & Run All
# (0 matches the seed already used for train_test_split)
rfc = RandomForestClassifier(n_estimators=128, random_state=0)
rfc.fit(X_train, y_train)
pred_rfc = rfc.predict(X_test)

In [10]:
# random forest: per-class metrics, then the confusion matrix
# (rows are the true delivery_time classes, columns the predicted ones)
rfc_report = classification_report(y_test, pred_rfc)
rfc_confusion = confusion_matrix(y_test, pred_rfc)
print(rfc_report)
print(rfc_confusion)

              precision    recall  f1-score   support

           1       0.23      0.06      0.10        81
           2       0.69      0.73      0.71       443
           3       0.76      0.87      0.81       692
           4       0.71      0.48      0.58       182
           5       0.00      0.00      0.00         9

    accuracy                           0.72      1407
   macro avg       0.48      0.43      0.44      1407
weighted avg       0.70      0.72      0.70      1407

[[  5  66  10   0   0]
 [ 17 325  88  12   1]
 [  0  69 600  21   2]
 [  0  12  82  88   0]
 [  0   0   6   3   0]]


In [11]:
# support vector classifier on the standardized features
# NOTE(review): C=158 and gamma=0.11 look like the result of a hyper-parameter
# search that is not part of this notebook — confirm their origin
svmc = svm.SVC(C=158, gamma=0.11).fit(X_train, y_train)  # fit() returns the estimator
pred_svmc = svmc.predict(X_test)


In [12]:
# SVM: per-class metrics, then the confusion matrix
# (rows are the true delivery_time classes, columns the predicted ones)
svmc_report = classification_report(y_test, pred_svmc)
svmc_confusion = confusion_matrix(y_test, pred_svmc)
print(svmc_report)
print(svmc_confusion)

              precision    recall  f1-score   support

           1       0.22      0.10      0.14        81
           2       0.57      0.65      0.61       443
           3       0.68      0.72      0.70       692
           4       0.50      0.35      0.41       182
           5       0.20      0.11      0.14         9

    accuracy                           0.61      1407
   macro avg       0.43      0.39      0.40      1407
weighted avg       0.59      0.61      0.60      1407

[[  8  58  14   1   0]
 [ 22 286 119  14   2]
 [  5 140 500  45   2]
 [  2  18  98  64   0]
 [  0   0   3   5   1]]


In [13]:
# multi-layer perceptron
# random_state fixes weight initialisation and mini-batch shuffling, so the
# metrics printed for this model are reproducible after Restart & Run All
# (0 matches the seed already used for train_test_split)
mlpc = MLPClassifier(max_iter=1024, random_state=0)
mlpc.fit(X_train, y_train)
pred_mlpc = mlpc.predict(X_test)


In [14]:
# MLP: per-class metrics, then the confusion matrix
# (rows are the true delivery_time classes, columns the predicted ones)
mlpc_report = classification_report(y_test, pred_mlpc)
mlpc_confusion = confusion_matrix(y_test, pred_mlpc)
print(mlpc_report)
print(mlpc_confusion)

              precision    recall  f1-score   support

           1       0.26      0.07      0.12        81
           2       0.67      0.74      0.71       443
           3       0.76      0.85      0.80       692
           4       0.75      0.46      0.57       182
           5       0.33      0.22      0.27         9

    accuracy                           0.72      1407
   macro avg       0.55      0.47      0.49      1407
weighted avg       0.70      0.72      0.70      1407

[[  6  67   8   0   0]
 [ 16 329  88  10   0]
 [  1  82 590  15   4]
 [  0  12  87  83   0]
 [  0   0   4   3   2]]


In [15]:
# persist models
# The estimators were fitted on label-encoded, standardized features, so the
# fitted encoders and scaler are saved next to them; without these a loaded
# model cannot be given raw records at prediction time.
dump(
    {
        'city': label_city,
        'street': label_street,
        'product_name': label_product_name,
        'category_path': label_category_path,
    },
    '../../models/label-encoders.joblib',
)
dump(sc, '../../models/scaler.joblib')

# the three classifiers keep their original file names; the last call stays a
# bare expression so the cell still displays the path it returns
dump(svmc, '../../models/svm.joblib')
dump(rfc, '../../models/random-forest.joblib')
dump(mlpc, '../../models/ml-perceptron.joblib')



['../../models/ml-perceptron.joblib']