In [1]:
_author_ = 'Julia Schmid'

In [2]:
import pandas as pd 
import numpy as np 
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt
import seaborn as sns


In [3]:
df = pd.read_csv('shipping.csv')

In [4]:
df.head(10)

Unnamed: 0,ID,Warehouse_block,Mode_of_Shipment,Customer_care_calls,Customer_rating,Cost_of_the_Product,Prior_purchases,Product_importance,Gender,Discount_offered,Weight_in_gms,Reached.on.Time_Y.N
0,1,D,Flight,4,2,177,3,low,F,44,1233,1
1,2,F,Flight,4,5,216,2,low,M,59,3088,1
2,3,A,Flight,2,2,183,4,low,M,48,3374,1
3,4,B,Flight,3,3,176,4,medium,M,10,1177,1
4,5,C,Flight,2,2,184,3,medium,F,46,2484,1
5,6,F,Flight,3,1,162,3,medium,F,12,1417,1
6,7,D,Flight,3,4,250,3,low,F,3,2371,1
7,8,F,Flight,4,1,233,2,low,F,48,2804,1
8,9,A,Flight,3,4,150,3,low,F,11,1861,1
9,10,B,Flight,3,2,164,3,medium,F,29,1187,1


In [7]:
df.shape

(10999, 12)

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10999 entries, 0 to 10998
Data columns (total 12 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   ID                   10999 non-null  int64 
 1   Warehouse_block      10999 non-null  object
 2   Mode_of_Shipment     10999 non-null  object
 3   Customer_care_calls  10999 non-null  int64 
 4   Customer_rating      10999 non-null  int64 
 5   Cost_of_the_Product  10999 non-null  int64 
 6   Prior_purchases      10999 non-null  int64 
 7   Product_importance   10999 non-null  object
 8   Gender               10999 non-null  object
 9   Discount_offered     10999 non-null  int64 
 10  Weight_in_gms        10999 non-null  int64 
 11  Reached.on.Time_Y.N  10999 non-null  int64 
dtypes: int64(8), object(4)
memory usage: 1.0+ MB


In [7]:
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
ID,10999.0,5500.0,3175.28214,1.0,2750.5,5500.0,8249.5,10999.0
Customer_care_calls,10999.0,4.054459,1.14149,2.0,3.0,4.0,5.0,7.0
Customer_rating,10999.0,2.990545,1.413603,1.0,2.0,3.0,4.0,5.0
Cost_of_the_Product,10999.0,210.196836,48.063272,96.0,169.0,214.0,251.0,310.0
Prior_purchases,10999.0,3.567597,1.52286,2.0,3.0,3.0,4.0,10.0
Discount_offered,10999.0,13.373216,16.205527,1.0,4.0,7.0,10.0,65.0
Weight_in_gms,10999.0,3634.016729,1635.377251,1001.0,1839.5,4149.0,5050.0,7846.0
Reached.on.Time_Y.N,10999.0,0.596691,0.490584,0.0,0.0,1.0,1.0,1.0


In [8]:
categoricalVar = [col for col in df if df[col].dtype == 'object']
print('Kategorische Variablen: ' + str(categoricalVar))

numericalVar = [col for col in df if df[col].dtype != 'object']
print('Numerische Variablen: '+ str(numericalVar))

Kategorische Variablen: ['Warehouse_block', 'Mode_of_Shipment', 'Product_importance', 'Gender']
Numerische Variablen: ['ID', 'Customer_care_calls', 'Customer_rating', 'Cost_of_the_Product', 'Prior_purchases', 'Discount_offered', 'Weight_in_gms', 'Reached.on.Time_Y.N']


In [9]:
notRelevantVar = ['ID']
df.drop(notRelevantVar, axis = 1, inplace = True)

In [10]:
for var in categoricalVar:
    dummies = pd.get_dummies(df[var], prefix=var, dtype=int)
    df = pd.concat([df, dummies], axis=1)
    df.drop(var, axis=1, inplace=True)  

In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10999 entries, 0 to 10998
Data columns (total 20 columns):
 #   Column                     Non-Null Count  Dtype
---  ------                     --------------  -----
 0   Customer_care_calls        10999 non-null  int64
 1   Customer_rating            10999 non-null  int64
 2   Cost_of_the_Product        10999 non-null  int64
 3   Prior_purchases            10999 non-null  int64
 4   Discount_offered           10999 non-null  int64
 5   Weight_in_gms              10999 non-null  int64
 6   Reached.on.Time_Y.N        10999 non-null  int64
 7   Warehouse_block_A          10999 non-null  int64
 8   Warehouse_block_B          10999 non-null  int64
 9   Warehouse_block_C          10999 non-null  int64
 10  Warehouse_block_D          10999 non-null  int64
 11  Warehouse_block_F          10999 non-null  int64
 12  Mode_of_Shipment_Flight    10999 non-null  int64
 13  Mode_of_Shipment_Road      10999 non-null  int64
 14  Mode_of_Shipment_Ship 

In [12]:
print(df['Reached.on.Time_Y.N'].value_counts())
print(df.shape)

Reached.on.Time_Y.N
1    6563
0    4436
Name: count, dtype: int64
(10999, 20)


In [13]:
y = df['Reached.on.Time_Y.N']
X = df.loc[:,df.columns != 'Reached.on.Time_Y.N']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 123)

In [14]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, roc_curve

rf = RandomForestClassifier(criterion='entropy', n_estimators=500, random_state=33)
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)

accuracy = accuracy_score(y_test, y_pred_rf)
precision = precision_score(y_test, y_pred_rf)
recall = recall_score(y_test, y_pred_rf)
f1 = f1_score(y_test, y_pred_rf)
roc_auc = roc_auc_score(y_test, y_pred_rf)

# Ergebnisse ausgeben
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("ROC AUC:", roc_auc)


Accuracy: 0.6536363636363637
Precision: 0.7451838879159369
Recall: 0.6437216338880484
F1 Score: 0.6907467532467533
ROC AUC: 0.6561432770806985


In [18]:
# Hyperparameter Tuning 
from sklearn.model_selection import RandomizedSearchCV

random_grid_rf = {'n_estimators': [100, 500, 1000],
               'max_features': ['auto', 'sqrt', 'log2'],
               'max_depth': [20, 50, 100],
               'min_samples_split': [2, 5, 10],
               'min_samples_leaf': [1, 3, 5],
               'bootstrap': [True, False], 
               'criterion': ['gini', 'entropy']}

print(random_grid_rf)

rf = RandomForestClassifier()
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid_rf, cv = 5, random_state=123, n_jobs=-1)
rf_random.fit(X_train, y_train)

rf_random.best_params_


{'n_estimators': [100, 500, 1000], 'max_features': ['auto', 'sqrt', 'log2'], 'max_depth': [20, 50, 100], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 3, 5], 'bootstrap': [True, False], 'criterion': ['gini', 'entropy']}


20 fits failed out of a total of 50.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
11 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/juliaschmid/Desktop/MALE01_TEST/.venv/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/juliaschmid/Desktop/MALE01_TEST/.venv/lib/python3.11/site-packages/sklearn/base.py", line 1467, in wrapper
    estimator._validate_params()
  File "/Users/juliaschmid/Desktop/MALE01_TEST/.venv/lib/python3.11/site-packages/sklearn/base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "/Users/juliaschmid/Desktop/MALE01_TEST/.venv/lib/python3

{'n_estimators': 100,
 'min_samples_split': 10,
 'min_samples_leaf': 1,
 'max_features': 'log2',
 'max_depth': 20,
 'criterion': 'entropy',
 'bootstrap': True}

In [19]:
from sklearn.ensemble import RandomForestClassifier

rf_ht = RandomForestClassifier(
    random_state=123, 
    n_estimators=100,  
    min_samples_split = 10, 
    min_samples_leaf = 1, 
    max_features = 'log2', 
    max_depth = 20, 
    criterion='entropy', 
    bootstrap = True
    )
rf_ht.fit(X_train, y_train)
y_predicted_rf_ht = rf_ht.predict(X_test)

accuracy = accuracy_score(y_test, y_predicted_rf_ht)
precision = precision_score(y_test, y_predicted_rf_ht)
recall = recall_score(y_test, y_predicted_rf_ht)
f1 = f1_score(y_test, y_predicted_rf_ht)
roc_auc = roc_auc_score(y_test, y_predicted_rf_ht)

# Ergebnisse ausgeben
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("ROC AUC:", roc_auc)

Accuracy: 0.655
Precision: 0.761860465116279
Recall: 0.6195158850226928
F1 Score: 0.6833541927409261
ROC AUC: 0.6639720655181802


In [15]:
import xgboost as xgb

xgb_model = xgb.XGBClassifier(objective="binary:logistic", eval_metric = 'auc', 
random_state=33, n_estimators=500, learning_rate=0.2)
xgb_model.fit(X_train, y_train)

y_pred_xgb = xgb_model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred_xgb)
precision = precision_score(y_test, y_pred_xgb)
recall = recall_score(y_test, y_pred_xgb)
f1 = f1_score(y_test, y_pred_xgb)
roc_auc = roc_auc_score(y_test, y_pred_xgb)

# Ergebnisse ausgeben
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("ROC AUC:", roc_auc)


Accuracy: 0.6527272727272727
Precision: 0.7217806041335453
Recall: 0.686838124054463
F1 Score: 0.703875968992248
ROC AUC: 0.644102433325637


In [16]:
from sklearn.linear_model import LinearRegression

regressor = LinearRegression()
regressor.fit(X_train, y_train)
y_pred_continuous = regressor.predict(X_test)


# Umwandlung in Binäre Klassifikation (>= 0 -> 1, <0 -> 0) ?????????????????
temp = 0.5
y_pred_linRegression_binary = (y_pred_continuous >= temp).astype(int)


accuracy = accuracy_score(y_test, y_pred_linRegression_binary)
precision = precision_score(y_test, y_pred_linRegression_binary)
recall = recall_score(y_test, y_pred_linRegression_binary)
f1 = f1_score(y_test, y_pred_linRegression_binary)
roc_auc = roc_auc_score(y_test, y_pred_linRegression_binary) # !!!!!!

# Ergebnisse ausgeben
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("ROC AUC:", roc_auc)

Accuracy: 0.6613636363636364
Precision: 0.727344365642238
Recall: 0.6981845688350984
F1 Score: 0.7124662292551138
ROC AUC: 0.652053560043973


In [17]:
from sklearn.svm import SVC

svc = SVC(kernel='poly', probability=True)  
svc.fit(X_train, y_train)
y_pred_svm = svc.predict(X_test)

accuracy = accuracy_score(y_test, y_pred_svm)
precision = precision_score(y_test, y_pred_svm)
recall = recall_score(y_test, y_pred_svm)
f1 = f1_score(y_test, y_pred_svm)
roc_auc = roc_auc_score(y_test, y_pred_svm) # !!!!!!

# Ergebnisse ausgeben
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("ROC AUC:", roc_auc)


Accuracy: 0.6418181818181818
Precision: 0.680161943319838
Recall: 0.762481089258699
F1 Score: 0.7189728958630528
ROC AUC: 0.61130888175919
