In [1]:
# imports
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import sklearn.model_selection as ms
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split, cross_val_score, cross_validate, GridSearchCV
from joblib import load, dump

%matplotlib inline

In [2]:
# loading and clearing data
clearedDF = '../../data/processed/casted_clearedDF.jsonl'
data = pd.read_json(clearedDF, lines = True)
data = data.drop(columns=["session_id", "user_id", "purchase_timestamp", "delivery_timestamp"])
data = data.drop(columns=["product_name", "price", "category_path", "week_day"])
data = data.drop(columns=["delivery_company"])
data.head()

Unnamed: 0,city,street,delivery_time,purchase_week_day_plus_hour
0,Radom,ul. Bukowa 64,2,14
1,Radom,ul. Bukowa 64,3,109
2,Gdynia,ul. Bałtycka 29,1,3
3,Gdynia,ul. Bałtycka 29,3,113
4,Gdynia,ul. Bałtycka 29,3,3


In [3]:
# converting strings to integers
label_city = LabelEncoder()
data['city'] = label_city.fit_transform(data['city'])
label_street = LabelEncoder()
data['street'] = label_street.fit_transform(data['street'])

In [4]:
# extract label from data
X = data.drop(columns=["delivery_time"])
y = data["delivery_time"]
data.head()

Unnamed: 0,city,street,delivery_time,purchase_week_day_plus_hour
0,3,139,2,14
1,3,139,3,109
2,0,136,1,3
3,0,136,3,113
4,0,136,3,3


In [5]:
# dividing data to train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 1)

# SVM Classifier

Poszukiwanie jak najlepszych paramatrów modelu wykonaliśmy w excelu:

![alt text](images/przeszukiwanie.png "Przeszukiwanie")

In [6]:
svmc = svm.SVC(C = 174, gamma = 0.1)
svmc.fit(X_train, y_train)
pred_svmc = svmc.predict(X_test)

svmc_Cross = cross_val_score(svmc, X, y, n_jobs = -1, cv = ms.KFold(shuffle = True))

print(classification_report(y_test, pred_svmc, zero_division=0))
print(confusion_matrix(y_test, pred_svmc))
print(svmc_Cross)
print("%0.2f accuracy with a standard deviation of %0.2f" % (svmc_Cross.mean(), svmc_Cross.std()))

              precision    recall  f1-score   support

           1       0.19      0.14      0.16        86
           2       0.55      0.59      0.57       458
           3       0.65      0.68      0.67       649
           4       0.51      0.45      0.48       202
           5       0.33      0.17      0.22        12

    accuracy                           0.58      1407
   macro avg       0.45      0.40      0.42      1407
weighted avg       0.57      0.58      0.57      1407

[[ 12  49  23   2   0]
 [ 39 268 129  21   1]
 [ 12 132 441  62   2]
 [  1  32  77  91   1]
 [  0   4   4   2   2]]
[0.56858564 0.57965861 0.57681366 0.58463727 0.58605974]
0.58 accuracy with a standard deviation of 0.01


# Mulit-layer perceptron

Dla kolejnego modelu skorzystaliśmy z GridSearchCV:

In [7]:
mlpc_grid = GridSearchCV(MLPClassifier(), {
    'hidden_layer_sizes': [(27,18,9), (81,54,27), (81,81,81), (81)],
    'activation': ['identity','logistic', 'tanh','relu'],
    'solver': ['lbfgs', 'sgd', 'adam']
}, n_jobs = -1, cv = ms.KFold(shuffle = True))
mlpc_grid.fit(X, y)
df = pd.DataFrame(mlpc_grid.cv_results_)
df.drop(columns=["split0_test_score", "split1_test_score", "split2_test_score", "split3_test_score", "split4_test_score"])



Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_activation,param_hidden_layer_sizes,param_solver,params,mean_test_score,std_test_score,rank_test_score
0,2.512202,0.903593,0.005403,0.001496,identity,"(27, 18, 9)",lbfgs,"{'activation': 'identity', 'hidden_layer_sizes...",0.463943,0.068809,46
1,2.577606,0.687006,0.003398,0.000494,identity,"(27, 18, 9)",sgd,"{'activation': 'identity', 'hidden_layer_sizes...",0.485282,0.013306,41
2,1.318801,0.264588,0.004,0.000631,identity,"(27, 18, 9)",adam,"{'activation': 'identity', 'hidden_layer_sizes...",0.466076,0.03112,45
3,9.309007,1.71475,0.007396,0.000795,identity,"(81, 54, 27)",lbfgs,"{'activation': 'identity', 'hidden_layer_sizes...",0.490828,0.006901,37
4,3.762796,0.792139,0.006401,0.0008,identity,"(81, 54, 27)",sgd,"{'activation': 'identity', 'hidden_layer_sizes...",0.48784,0.009489,39
5,1.110003,0.089699,0.006193,0.001163,identity,"(81, 54, 27)",adam,"{'activation': 'identity', 'hidden_layer_sizes...",0.438637,0.047457,48
6,25.333794,17.467873,0.009404,0.0012,identity,"(81, 81, 81)",lbfgs,"{'activation': 'identity', 'hidden_layer_sizes...",0.4914,0.017685,35
7,6.4892,1.627218,0.009201,0.0016,identity,"(81, 81, 81)",sgd,"{'activation': 'identity', 'hidden_layer_sizes...",0.500924,0.007956,31
8,2.669399,1.691941,0.009798,0.001944,identity,"(81, 81, 81)",adam,"{'activation': 'identity', 'hidden_layer_sizes...",0.444879,0.095869,47
9,5.685199,1.553635,0.004796,0.000742,identity,81,lbfgs,"{'activation': 'identity', 'hidden_layer_sizes...",0.492392,0.008638,34


odrzucamy parametry dające najgorsze wyniki:
- 'hidden_layer_sizes': [(27,18,9), (81,54,27), (81)],
- 'activation': ['identity', 'relu'],
- 'solver': ['sgd']

zauważamy że ilość i wielkość poziomów poprawia wynik sprawdzamy czy uda nam się jeszcze go ulepszyć

In [8]:
mlpc_grid = GridSearchCV(MLPClassifier(), {
    'hidden_layer_sizes': [(81,81,81), (81,81,81,81,81,81), (120,120,120,120)],
    'activation': ['logistic', 'tanh'],
    'solver': ['lbfgs', 'adam']
}, n_jobs = -1, cv = ms.KFold(shuffle = True))
mlpc_grid.fit(X, y)
df = pd.DataFrame(mlpc_grid.cv_results_)
df.drop(columns=["split0_test_score", "split1_test_score", "split2_test_score", "split3_test_score", "split4_test_score"])

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_activation,param_hidden_layer_sizes,param_solver,params,mean_test_score,std_test_score,rank_test_score
0,21.470207,0.448276,0.013603,0.000799,logistic,"(81, 81, 81)",lbfgs,"{'activation': 'logistic', 'hidden_layer_sizes...",0.518131,0.020843,11
1,21.085409,0.957915,0.013997,0.001412,logistic,"(81, 81, 81)",adam,"{'activation': 'logistic', 'hidden_layer_sizes...",0.679133,0.013669,5
2,2.129802,0.158069,0.0242,0.000746,logistic,"(81, 81, 81, 81, 81, 81)",lbfgs,"{'activation': 'logistic', 'hidden_layer_sizes...",0.476176,0.020208,12
3,24.349806,8.282902,0.025394,0.001353,logistic,"(81, 81, 81, 81, 81, 81)",adam,"{'activation': 'logistic', 'hidden_layer_sizes...",0.571614,0.044293,6
4,48.0788,1.186166,0.0276,0.001857,logistic,"(120, 120, 120, 120)",lbfgs,"{'activation': 'logistic', 'hidden_layer_sizes...",0.526668,0.014413,7
5,45.741394,0.880564,0.029606,0.00408,logistic,"(120, 120, 120, 120)",adam,"{'activation': 'logistic', 'hidden_layer_sizes...",0.731474,0.013131,1
6,24.200797,0.519829,0.015401,0.000797,tanh,"(81, 81, 81)",lbfgs,"{'activation': 'tanh', 'hidden_layer_sizes': (...",0.525102,0.012766,8
7,23.483799,0.264963,0.019004,0.001906,tanh,"(81, 81, 81)",adam,"{'activation': 'tanh', 'hidden_layer_sizes': (...",0.710708,0.017739,2
8,47.050797,2.30275,0.027201,0.000979,tanh,"(81, 81, 81, 81, 81, 81)",lbfgs,"{'activation': 'tanh', 'hidden_layer_sizes': (...",0.524674,0.022227,9
9,33.655599,12.751207,0.032199,0.000746,tanh,"(81, 81, 81, 81, 81, 81)",adam,"{'activation': 'tanh', 'hidden_layer_sizes': (...",0.702743,0.019553,3


odrzucamy:
- 'solver': ['lbfgs'] - 'adam' daje lepsze wyniki

parametry modelu wyglądają na dobrze dobrane, sprawdzamy który 'learning_rate' da lepszy wynik

In [9]:
mlpc_grid = GridSearchCV(MLPClassifier(), {
    'hidden_layer_sizes': [(120, 120, 120, 120), (243, 243, 243)],
    'activation': ['logistic', 'tanh'],
    'solver': ['adam'],
    'learning_rate': ['constant', 'invscaling', 'adaptive']
}, n_jobs = -1, cv = ms.KFold(shuffle = True))
mlpc_grid.fit(X, y)
df = pd.DataFrame(mlpc_grid.cv_results_)
df.drop(columns=["split0_test_score", "split1_test_score", "split2_test_score", "split3_test_score", "split4_test_score"])



Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_activation,param_hidden_layer_sizes,param_learning_rate,param_solver,params,mean_test_score,std_test_score,rank_test_score
0,41.655212,2.406887,0.025401,0.001018,logistic,"(120, 120, 120, 120)",constant,adam,"{'activation': 'logistic', 'hidden_layer_sizes...",0.729907,0.016084,3
1,43.709408,2.444512,0.026597,0.000799,logistic,"(120, 120, 120, 120)",invscaling,adam,"{'activation': 'logistic', 'hidden_layer_sizes...",0.731614,0.017739,2
2,45.008208,3.133974,0.027798,0.000979,logistic,"(120, 120, 120, 120)",adaptive,adam,"{'activation': 'logistic', 'hidden_layer_sizes...",0.72493,0.018986,6
3,95.065402,10.047094,0.053801,0.004708,logistic,"(243, 243, 243)",constant,adam,"{'activation': 'logistic', 'hidden_layer_sizes...",0.729623,0.012277,4
4,109.892797,13.138803,0.054593,0.008409,logistic,"(243, 243, 243)",invscaling,adam,"{'activation': 'logistic', 'hidden_layer_sizes...",0.73261,0.014363,1
5,101.149405,5.11633,0.045796,0.002402,logistic,"(243, 243, 243)",adaptive,adam,"{'activation': 'logistic', 'hidden_layer_sizes...",0.729197,0.009598,5
6,44.035395,8.659477,0.035398,0.001349,tanh,"(120, 120, 120, 120)",constant,adam,"{'activation': 'tanh', 'hidden_layer_sizes': (...",0.717249,0.012714,10
7,39.355994,7.368435,0.034599,0.0012,tanh,"(120, 120, 120, 120)",invscaling,adam,"{'activation': 'tanh', 'hidden_layer_sizes': (...",0.724647,0.007676,7
8,42.417204,9.398621,0.037197,0.001716,tanh,"(120, 120, 120, 120)",adaptive,adam,"{'activation': 'tanh', 'hidden_layer_sizes': (...",0.70089,0.028354,11
9,78.010804,10.562645,0.065603,0.004082,tanh,"(243, 243, 243)",constant,adam,"{'activation': 'tanh', 'hidden_layer_sizes': (...",0.718246,0.009851,8


In [10]:
mlpc = MLPClassifier(hidden_layer_sizes = (243, 243, 243), activation = 'logistic', learning_rate = 'constant', solver = 'adam', max_iter = 1024)
mlpc.fit(X_train, y_train)
pred_mlpc = mlpc.predict(X_test)

mlpc_Cross = cross_val_score(mlpc, X, y, n_jobs = -1, cv = ms.KFold(shuffle = True))

print(classification_report(y_test, pred_mlpc, zero_division=0))
print(confusion_matrix(y_test, pred_mlpc))
print(mlpc_Cross)
print("%0.2f accuracy with a standard deviation of %0.2f" % (mlpc_Cross.mean(), mlpc_Cross.std()))

              precision    recall  f1-score   support

           1       0.00      0.00      0.00        86
           2       0.69      0.81      0.74       458
           3       0.75      0.84      0.79       649
           4       0.77      0.55      0.64       202
           5       0.00      0.00      0.00        12

    accuracy                           0.73      1407
   macro avg       0.44      0.44      0.44      1407
weighted avg       0.68      0.73      0.70      1407

[[  0  76  10   0   0]
 [  0 369  76  13   0]
 [  0  85 544  20   0]
 [  0   5  85 112   0]
 [  0   1  10   1   0]]
[0.73631841 0.7311522  0.73897582 0.71763869 0.72972973]
0.73 accuracy with a standard deviation of 0.01


In [11]:
#persit models
dump(svmc, '../../models/svm.joblib')
dump(mlpc, '../../models/ml-perceptron.joblib')

['../../models/ml-perceptron.joblib']