In [178]:
# Import the modules
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import GridSearchCV

In [179]:
#load csv file into dataframe DataCoSupplyChainDataset_VL.csv from Resources folder
file_path = Path("Resources/DataCoSupplyChainDataset_VL.csv")
sales_df = pd.read_csv(file_path)
sales_df.head()


Unnamed: 0,Type,Days for shipping (real),Days for shipment (scheduled),Delivery Status,Late_delivery_risk,Category Name,Customer City,Customer Country,Customer Id,Customer Segment,...,Order State,Order Status,Order Zipcode,Product Card Id,Product Category Id,Product Name,Product Price,shipping date (DateOrders),Shipping Mode,order_date_VL
0,DEBIT,3,4,Advance shipping,0,Sporting Goods,Caguas,Puerto Rico,20755,Consumer,...,Java Occidental,COMPLETE,,1360,73,Smart watch,327.75,2/3/2018 22:56,Standard Class,2018-01-31
1,TRANSFER,5,4,Late delivery,1,Sporting Goods,Caguas,Puerto Rico,19492,Consumer,...,Rajastán,PENDING,,1360,73,Smart watch,327.75,1/18/2018 12:27,Standard Class,2018-01-13
2,CASH,4,4,Shipping on time,0,Sporting Goods,San Jose,EE. UU.,19491,Consumer,...,Rajastán,CLOSED,,1360,73,Smart watch,327.75,1/17/2018 12:06,Standard Class,2018-01-13
3,DEBIT,3,4,Advance shipping,0,Sporting Goods,Los Angeles,EE. UU.,19490,Home Office,...,Queensland,COMPLETE,,1360,73,Smart watch,327.75,1/16/2018 11:45,Standard Class,2018-01-13
4,PAYMENT,2,4,Advance shipping,0,Sporting Goods,Caguas,Puerto Rico,19489,Corporate,...,Queensland,PENDING_PAYMENT,,1360,73,Smart watch,327.75,1/15/2018 11:24,Standard Class,2018-01-13


In [180]:
sales_df.columns

Index(['Type', 'Days for shipping (real)', 'Days for shipment (scheduled)',
       'Delivery Status', 'Late_delivery_risk', 'Category Name',
       'Customer City', 'Customer Country', 'Customer Id', 'Customer Segment',
       'Customer State', 'Customer Zipcode', 'Department Id',
       'Department Name', 'Latitude', 'Longitude', 'Market', 'Order City',
       'Order Country', 'Order Customer Id', 'order date (DateOrders)',
       'Order Id', 'Order Item Cardprod Id', 'Order Item Discount',
       'Order Item Discount Rate', 'Order Item Id', 'Order Item Product Price',
       'Order Item Profit Ratio', 'Order Item Quantity', 'Sales',
       'Order Item Total', 'Order Profit Per Order', 'Order Region',
       'Order State', 'Order Status', 'Order Zipcode', 'Product Card Id',
       'Product Category Id', 'Product Name', 'Product Price',
       'shipping date (DateOrders)', 'Shipping Mode', 'order_date_VL'],
      dtype='object')

Unique Order dataframe preparation 

In [181]:
#remove rows with "Delivery Status"="Shipping canceled"
sales_df = sales_df[sales_df["Delivery Status"] != "Shipping canceled"]

In [182]:
# create a new dataframe orders_df listing all unique "Order ID" as index with min "Type", "Days for shipment (scheduled)"
orders_df = sales_df.groupby("Order Id").agg({"Type": "min", "Days for shipment (scheduled)": "min"})
#add columns to orders_df from sales_df with 'Customer Segment', 'Department Id', 'Order Country'
orders_df = orders_df.join(sales_df.groupby("Order Id").agg({"Customer Segment": "min","Market":"min", "Order Country": "min", "Order State":"min",
                                                             "Shipping Mode":"min", "Late_delivery_risk":"min","Order Region":"min",
                                                             "Order Status":"min"}))
#add columns to orders_df from sales_df with sum of "Order Item Product Price", "Order Item Discount", 'Order Item Quantity',"Order Item Total"
orders_df = orders_df.join(sales_df.groupby("Order Id").agg({"Order Item Product Price": "sum", "Order Item Discount": "sum", "Order Item Quantity": "sum", "Order Item Total": "sum"}))
# add column with number of unique "Product Card Id" per "Order Id"
orders_df["Number of Products"] = sales_df.groupby("Order Id")["Product Card Id"].nunique()
#add column with number of unique "Category Name" per "Order Id"
orders_df["Number of Categories"] = sales_df.groupby("Order Id")["Category Name"].nunique()
#add column with number of unique "Department Name" per "Order Id"
orders_df["Number of Departments"] = sales_df.groupby("Order Id")["Department Name"].nunique()
orders_df.tail()

Unnamed: 0_level_0,Type,Days for shipment (scheduled),Customer Segment,Market,Order Country,Order State,Shipping Mode,Late_delivery_risk,Order Region,Order Status,Order Item Product Price,Order Item Discount,Order Item Quantity,Order Item Total,Number of Products,Number of Categories,Number of Departments
Order Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
77200,TRANSFER,4,Consumer,Pacific Asia,Indonesia,Java Occidental,Standard Class,1,Southeast Asia,PROCESSING,215.820007,53.959999,1,161.869995,1,1,1
77201,DEBIT,4,Consumer,Pacific Asia,Indonesia,Java Occidental,Standard Class,0,Southeast Asia,COMPLETE,215.820007,43.16,1,172.660004,1,1,1
77202,DEBIT,4,Consumer,Pacific Asia,Indonesia,Java Occidental,Standard Class,0,Southeast Asia,COMPLETE,327.75,13.11,1,314.640015,1,1,1
77203,PAYMENT,4,Corporate,Pacific Asia,Indonesia,Java Occidental,Standard Class,0,Southeast Asia,PENDING_PAYMENT,11.54,0.63,1,10.91,1,1,1
77204,CASH,0,Corporate,Pacific Asia,Australia,Queensland,Same Day,1,Oceania,CLOSED,39.75,4.77,1,34.98,1,1,1


In [183]:
orders_df.columns

Index(['Type', 'Days for shipment (scheduled)', 'Customer Segment', 'Market',
       'Order Country', 'Order State', 'Shipping Mode', 'Late_delivery_risk',
       'Order Region', 'Order Status', 'Order Item Product Price',
       'Order Item Discount', 'Order Item Quantity', 'Order Item Total',
       'Number of Products', 'Number of Categories', 'Number of Departments'],
      dtype='object')

In [184]:
#change column name "Order Item Total" to "order_total", change column "Order Item Product Price" to "order_original_price", "Order Item Discount" to "order_discount", "Order Item Quantity" to "no_of_items"
orders_df = orders_df.rename(columns={"Order Item Total": "order_total", "Order Item Product Price": "order_original_price", 
                                      "Order Item Discount": "order_discount", "Order Item Quantity": "no_of_items",
                                      "Number of Products": "no_of_unique_items", "Number of Categories": "no_of_unique_categories","Number of Departments": "no_of_unique_departments"})

In [185]:
#show order id ="49405"
orders_df.loc[orders_df.index == 68703]

Unnamed: 0_level_0,Type,Days for shipment (scheduled),Customer Segment,Market,Order Country,Order State,Shipping Mode,Late_delivery_risk,Order Region,Order Status,order_original_price,order_discount,no_of_items,order_total,no_of_unique_items,no_of_unique_categories,no_of_unique_departments
Order Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
68703,DEBIT,4,Consumer,Europe,Reino Unido,Inglaterra,Standard Class,0,Northern Europe,COMPLETE,3089.949993,681.5,13,2768.409988,5,5,3


In [186]:
#save orders_df to csv file
orders_df.to_csv("Resources/orders_df.csv")

DATA CLEANING 

In [187]:
#show data type for each column
orders_df.dtypes

Type                              object
Days for shipment (scheduled)      int64
Customer Segment                  object
Market                            object
Order Country                     object
Order State                       object
Shipping Mode                     object
Late_delivery_risk                 int64
Order Region                      object
Order Status                      object
order_original_price             float64
order_discount                   float64
no_of_items                        int64
order_total                      float64
no_of_unique_items                 int64
no_of_unique_categories            int64
no_of_unique_departments           int64
dtype: object

In [188]:
#dummy code categorical columns
orders_df = pd.get_dummies(orders_df, columns=["Type", "Customer Segment", "Order Country", "Order State", "Shipping Mode", "Market", "Order Region", "Order Status"])
orders_df.head()

Unnamed: 0_level_0,Days for shipment (scheduled),Late_delivery_risk,order_original_price,order_discount,no_of_items,order_total,no_of_unique_items,no_of_unique_categories,no_of_unique_departments,Type_CASH,...,Order Region_West Asia,Order Region_West of USA,Order Region_Western Europe,Order Status_CLOSED,Order Status_COMPLETE,Order Status_ON_HOLD,Order Status_PAYMENT_REVIEW,Order Status_PENDING,Order Status_PENDING_PAYMENT,Order Status_PROCESSING
Order Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4,0,299.980011,60.0,1,239.979996,1,1,1,True,...,False,False,False,True,False,False,False,False,False,False
2,4,0,379.980011,50.6,7,529.380005,3,3,3,False,...,False,False,False,False,False,False,False,False,True,False
4,4,1,184.960001,78.98,14,620.870014,4,4,4,True,...,False,False,False,True,False,False,False,False,False,False
5,4,1,839.920029,142.789999,10,987.070007,4,4,2,False,...,False,False,False,False,True,False,False,False,False,False
7,2,1,515.960016,54.4,7,525.520004,3,3,2,False,...,False,False,False,False,True,False,False,False,False,False


In [189]:
#show basic statistics for the dataframe
orders_df.describe()

Unnamed: 0,Days for shipment (scheduled),Late_delivery_risk,order_original_price,order_discount,no_of_items,order_total,no_of_unique_items,no_of_unique_categories,no_of_unique_departments
count,62897.0,62897.0,62897.0,62897.0,62897.0,62897.0,62897.0,62897.0,62897.0
mean,2.935625,0.573127,388.062967,56.756416,5.844333,503.11883,2.431006,2.426666,2.00256
std,1.37376,0.494627,288.728212,46.618533,4.170329,321.290779,1.258391,1.254526,0.935946
min,0.0,0.0,9.99,0.0,1.0,7.49,1.0,1.0,1.0
25%,2.0,0.0,159.98,20.0,2.0,244.349998,1.0,1.0,1.0
50%,4.0,1.0,339.970013,47.4,5.0,456.869995,2.0,2.0,2.0
75%,4.0,1.0,547.960016,82.48,9.0,721.960007,3.0,3.0,3.0
max,4.0,1.0,3089.949993,681.5,24.0,2768.409988,5.0,5.0,5.0


In [190]:
# change column name Late_delivery_risk to late_delivery
orders_df = orders_df.rename(columns={"Late_delivery_risk": "late_delivery"})
#show the number of orders in each delivery time bucket
orders_df["late_delivery"].value_counts()

late_delivery
1    36048
0    26849
Name: count, dtype: int64

LOGISTIC REGRESSION - DELIVERY TIME PREDICTION MODEL

In [191]:
# Separate the data into labels and features
# Separate the y variable, the labels
y = orders_df["late_delivery"]

# Separate the X variable, the features
X = orders_df.drop(columns="late_delivery")

In [192]:
# Review the y variable Series
print(f"Labels: {y[:10]}")

Labels: Order Id
1     0
2     0
4     1
5     1
7     1
8     0
9     1
10    1
11    0
12    0
Name: late_delivery, dtype: int64


In [193]:
# Review the X variable DataFrame
print(f"Data: {X[:10]}")

Data:           Days for shipment (scheduled)  order_original_price  order_discount  \
Order Id                                                                        
1                                     4            299.980011       60.000000   
2                                     4            379.980011       50.600000   
4                                     4            184.960001       78.980000   
5                                     4            839.920029      142.789999   
7                                     2            515.960016       54.400000   
8                                     4            219.960003      142.070000   
9                                     4            499.970009       12.000000   
10                                    4            601.940016      126.150000   
11                                    4            299.930000       72.990000   
12                                    4            524.950008      103.490000   

          no_of_items

In [194]:
# Import the train_test_learn module
from sklearn.model_selection import train_test_split

# Split the data using train_test_split
# Assign a random_state of 1 to the function
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, stratify=y)
X_train.shape

(47172, 1301)

In [195]:
X_test.shape

(15725, 1301)

In [196]:
#concatenate our training data back together
train_data = pd.concat([X_train, y_train], axis=1)

In [197]:
#Create a logistic regression model
# Import the LogisticRegression module from SKLearn
from sklearn.linear_model import LogisticRegression

# Instantiate the Logistic Regression model with class weight to handle imbalance in the dataset
classifier = LogisticRegression(max_iter=10000)


# Fit the model using scaled training data
classifier.fit(X_train, y_train)

In [198]:
from sklearn.calibration import CalibratedClassifierCV

# Instantiate the calibrated classifier with sigmoid calibration
calibrated_clf = CalibratedClassifierCV(classifier, method='sigmoid', cv='prefit')

# Fit the calibrated classifier using the validation data (you might need to split your data again)
calibrated_clf.fit(X_train, y_train)



In [199]:
# Get calibrated probabilities
calibrated_probs = calibrated_clf.predict_proba(X_test)

# Predict labels (if needed)
calibrated_preds = calibrated_clf.predict(X_test)

In [200]:
from sklearn.metrics import accuracy_score, log_loss

print("Accuracy: ", accuracy_score(y_test, calibrated_preds))
print("Log loss: ", log_loss(y_test, calibrated_probs))

Accuracy:  0.6927821939586646
Log loss:  0.5414004367066779


In [201]:
# Make a prediction using the testing data with the calibrated classifier
calibrated_predictions = calibrated_clf.predict(X_test)

pd.DataFrame({"Prediction": calibrated_predictions, "Actual": y_test})

Unnamed: 0_level_0,Prediction,Actual
Order Id,Unnamed: 1_level_1,Unnamed: 2_level_1
2658,1,1
21959,0,1
77119,0,1
6286,1,0
2450,0,1
...,...,...
14644,1,1
8832,0,0
5990,1,1
76735,0,1


In [202]:
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

# Generate a confusion matrix for the model
confusion_matrix(y_test, calibrated_predictions)

array([[5855,  858],
       [3973, 5039]], dtype=int64)

In [203]:
# Calculating the confusion matrix
cm = confusion_matrix(y_test, calibrated_predictions)
cm_df = pd.DataFrame(
    cm, index=["Actual On Time [0]", "Actual Late [1]"], columns=["Predicted On Time [0]", "Predicted Late [1]"]
)

# Calculating the accuracy score
acc_score = accuracy_score(y_test, calibrated_predictions)

# Display results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, calibrated_predictions))


Confusion Matrix


Unnamed: 0,Predicted On Time [0],Predicted Late [1]
Actual On Time [0],5855,858
Actual Late [1],3973,5039


Accuracy Score : 0.6927821939586646
Classification Report
              precision    recall  f1-score   support

           0       0.60      0.87      0.71      6713
           1       0.85      0.56      0.68      9012

    accuracy                           0.69     15725
   macro avg       0.73      0.72      0.69     15725
weighted avg       0.74      0.69      0.69     15725



Trying to imporve the accuracy score by training the model on more detailed individual product order infromation 

In [204]:
#load csv file into dataframe DataCoSupplyChainDataset_VL.csv from Resources folder
file_path = Path("Resources/DataCoSupplyChainDataset_VL.csv")
sales_df = pd.read_csv(file_path)
sales_df.head()

Unnamed: 0,Type,Days for shipping (real),Days for shipment (scheduled),Delivery Status,Late_delivery_risk,Category Name,Customer City,Customer Country,Customer Id,Customer Segment,...,Order State,Order Status,Order Zipcode,Product Card Id,Product Category Id,Product Name,Product Price,shipping date (DateOrders),Shipping Mode,order_date_VL
0,DEBIT,3,4,Advance shipping,0,Sporting Goods,Caguas,Puerto Rico,20755,Consumer,...,Java Occidental,COMPLETE,,1360,73,Smart watch,327.75,2/3/2018 22:56,Standard Class,2018-01-31
1,TRANSFER,5,4,Late delivery,1,Sporting Goods,Caguas,Puerto Rico,19492,Consumer,...,Rajastán,PENDING,,1360,73,Smart watch,327.75,1/18/2018 12:27,Standard Class,2018-01-13
2,CASH,4,4,Shipping on time,0,Sporting Goods,San Jose,EE. UU.,19491,Consumer,...,Rajastán,CLOSED,,1360,73,Smart watch,327.75,1/17/2018 12:06,Standard Class,2018-01-13
3,DEBIT,3,4,Advance shipping,0,Sporting Goods,Los Angeles,EE. UU.,19490,Home Office,...,Queensland,COMPLETE,,1360,73,Smart watch,327.75,1/16/2018 11:45,Standard Class,2018-01-13
4,PAYMENT,2,4,Advance shipping,0,Sporting Goods,Caguas,Puerto Rico,19489,Corporate,...,Queensland,PENDING_PAYMENT,,1360,73,Smart watch,327.75,1/15/2018 11:24,Standard Class,2018-01-13


In [205]:
#drop rows with "Delivery Status"="Shipping canceled"
sales_df = sales_df[sales_df["Delivery Status"] != "Shipping canceled"]


In [206]:
#drop column "Delivery Status" & "Days for shipment (real)"
sales_df = sales_df.drop(columns=["Delivery Status", "Days for shipping (real)"])


In [207]:
sales_df.dtypes

Type                              object
Days for shipment (scheduled)      int64
Late_delivery_risk                 int64
Category Name                     object
Customer City                     object
Customer Country                  object
Customer Id                        int64
Customer Segment                  object
Customer State                    object
Customer Zipcode                 float64
Department Id                      int64
Department Name                   object
Latitude                         float64
Longitude                        float64
Market                            object
Order City                        object
Order Country                     object
Order Customer Id                  int64
order date (DateOrders)           object
Order Id                           int64
Order Item Cardprod Id             int64
Order Item Discount              float64
Order Item Discount Rate         float64
Order Item Id                      int64
Order Item Produ

In [208]:
#drrop columns shipping date (DateOrders), order date (DateOrders)   
sales_df = sales_df.drop(columns=["order date (DateOrders)", "shipping date (DateOrders)","Product Name","order_date_VL"])

In [209]:
#dummy code categorical columns
sales_df = pd.get_dummies(sales_df, columns=["Type", "Category Name", "Customer City","Customer Country",
                                             "Customer Segment", "Customer State", "Department Name","Order Country", "Order State", 
                                             "Shipping Mode", "Market","Order City", "Order Region", "Order Status"])
sales_df.head()

Unnamed: 0,Days for shipment (scheduled),Late_delivery_risk,Customer Id,Customer Zipcode,Department Id,Latitude,Longitude,Order Customer Id,Order Id,Order Item Cardprod Id,...,Order Region_West Asia,Order Region_West of USA,Order Region_Western Europe,Order Status_CLOSED,Order Status_COMPLETE,Order Status_ON_HOLD,Order Status_PAYMENT_REVIEW,Order Status_PENDING,Order Status_PENDING_PAYMENT,Order Status_PROCESSING
0,4,0,20755,725.0,2,18.251453,-66.037056,20755,77202,1360,...,False,False,False,False,True,False,False,False,False,False
1,4,1,19492,725.0,2,18.279451,-66.037064,19492,75939,1360,...,False,False,False,False,False,False,False,True,False,False
2,4,0,19491,95125.0,2,37.292233,-121.881279,19491,75938,1360,...,False,False,False,True,False,False,False,False,False,False
3,4,0,19490,90027.0,2,34.125946,-118.291016,19490,75937,1360,...,False,False,False,False,True,False,False,False,False,False
4,4,0,19489,725.0,2,18.253769,-66.037048,19489,75936,1360,...,False,False,False,False,False,False,False,False,True,False


In [210]:
#replace Nan with 0
sales_df = sales_df.fillna(0)

In [211]:
sales_df.describe()

Unnamed: 0,Days for shipment (scheduled),Late_delivery_risk,Customer Id,Customer Zipcode,Department Id,Latitude,Longitude,Order Customer Id,Order Id,Order Item Cardprod Id,...,Order Item Product Price,Order Item Profit Ratio,Order Item Quantity,Sales,Order Item Total,Order Profit Per Order,Order Zipcode,Product Card Id,Product Category Id,Product Price
count,172765.0,172765.0,172765.0,172765.0,172765.0,172765.0,172765.0,172765.0,172765.0,172765.0,...,172765.0,172765.0,172765.0,172765.0,172765.0,172765.0,172765.0,172765.0,172765.0,172765.0
mean,2.9331,0.5729,6687.759048,35930.98891,5.444095,29.724145,-84.91261,6687.759048,36215.16721,692.618858,...,141.278595,0.120801,2.127694,203.828493,183.165948,22.03236,7597.117813,692.618858,31.856065,141.278595
std,1.373405,0.494659,4165.06918,37537.246337,1.629248,9.811816,21.410511,4165.06918,21036.193887,336.47117,...,139.862956,0.46661,1.453663,132.39252,120.141871,104.355313,22415.287313,336.47117,15.642216,139.862956
min,0.0,0.0,1.0,0.0,2.0,-33.937553,-158.025986,1.0,1.0,19.0,...,9.99,-2.75,1.0,9.99,7.49,-4274.97998,0.0,19.0,2.0,9.99
25%,2.0,0.0,3252.0,725.0,4.0,18.265436,-98.443069,3252.0,18069.0,403.0,...,50.0,0.08,1.0,119.980003,104.379997,7.03,0.0,403.0,18.0,50.0
50%,4.0,1.0,6457.0,19380.0,5.0,33.144863,-76.847908,6457.0,36126.0,627.0,...,59.990002,0.27,1.0,199.919998,163.990005,31.52,0.0,627.0,29.0,59.990002
75%,4.0,1.0,9782.0,78207.0,7.0,39.279617,-66.370583,9782.0,54111.0,1004.0,...,199.990005,0.36,3.0,299.950012,247.399994,64.800003,0.0,1004.0,45.0,199.990005
max,4.0,1.0,20757.0,99205.0,12.0,48.781933,115.263077,20757.0,77204.0,1363.0,...,1999.98999,0.5,5.0,1999.98999,1939.98999,911.799988,99301.0,1363.0,76.0,1999.98999


In [212]:
# Separate the data into labels and features
# Separate the y variable, the labels
y = sales_df["Late_delivery_risk"]

# Separate the X variable, the features
X = sales_df.drop(columns="Late_delivery_risk")

In [213]:
# Import the train_test_learn module
from sklearn.model_selection import train_test_split

# Split the data using train_test_split
# Assign a random_state of 1 to the function
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, stratify=y)
X_train.shape

(129573, 5572)

In [214]:
y_train.shape

(129573,)

In [215]:
#concatenate our training data back together
train_data = pd.concat([X_train, y_train], axis=1)

In [216]:
# Import the LogisticRegression module from SKLearn
from sklearn.linear_model import LogisticRegression

# Instantiate the Logistic Regression model
classifier = LogisticRegression(max_iter=10000)

# Fit the model using scaled training data
classifier.fit(X_train, y_train)


In [217]:
from sklearn.calibration import CalibratedClassifierCV

# Instantiate the calibrated classifier with sigmoid calibration
calibrated_clf = CalibratedClassifierCV(classifier, method='sigmoid', cv='prefit')

# Fit the calibrated classifier using the validation data (you might need to split your data again)
calibrated_clf.fit(X_train, y_train)

# Get calibrated probabilities
calibrated_probs = calibrated_clf.predict_proba(X_test)

# Predict labels (if needed)
calibrated_preds = calibrated_clf.predict(X_test)

In [218]:
# Make a prediction using the testing data
predictions = classifier.predict(X_test)

pd.DataFrame({"Prediction": predictions, "Actual": y_test})


Unnamed: 0,Prediction,Actual
46463,0,0
153114,1,1
179302,1,1
127823,0,0
106978,1,1
...,...,...
53595,0,0
78304,0,1
168391,1,1
149009,0,1


In [219]:
# Calculating the confusion matrix
cm = confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(
    cm, index=["Actual On-Time Delivery", "Actual Late Delivery"], columns=["Predicted On-Time Delivery", "Predicted Late Delivery"]
)
display(cm_df)
# Print the classification report for the model
target_names = ["On-Time Delivery", "Late Delivery"]
print(classification_report(y_test, predictions, target_names=target_names))
reg_acc_score = accuracy_score(y_test, predictions)
print(f"Accuracy Score: {reg_acc_score}")

Unnamed: 0,Predicted On-Time Delivery,Predicted Late Delivery
Actual On-Time Delivery,15328,3119
Actual Late Delivery,10054,14691


                  precision    recall  f1-score   support

On-Time Delivery       0.60      0.83      0.70     18447
   Late Delivery       0.82      0.59      0.69     24745

        accuracy                           0.70     43192
       macro avg       0.71      0.71      0.69     43192
    weighted avg       0.73      0.70      0.69     43192

Accuracy Score: 0.6950129653639563
