<a href="https://colab.research.google.com/github/itsalanthomas/shipping-optimization/blob/dev/src/Ecommerce_Optimization_Logistic_Regression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [19]:
# load libraries
from google.colab import files
from io import StringIO
import pandas  as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
from sklearn.preprocessing import LabelEncoder,StandardScaler
from sklearn.decomposition import PCA
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix


# load data in with file simulation
uploaded = files.upload()
filename = 'ecommerce_shipping.csv'

with open(filename, 'r') as file:
  csv_text = file.read()

# create df
df = pd.read_csv(StringIO(csv_text))

# EDA - Exploratory Data Analysis
df.info()
df.describe()
df.isnull().sum() #check count of null values in columns
df[df.isnull().any(axis = 1)] #check if there are any missing values in rows

# rename columns
new_cols = []
for col in df.columns:
    new_col = col.replace('_', ' ').title()
    new_cols.append(new_col)

df.columns = new_cols
df.rename(columns={'Reached.On.Time Y.N': 'On-time Delivery'},inplace=True)


Saving ecommerce_shipping.csv to ecommerce_shipping (3).csv
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10999 entries, 0 to 10998
Data columns (total 12 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   ID                   10999 non-null  int64 
 1   Warehouse_block      10999 non-null  object
 2   Mode_of_Shipment     10999 non-null  object
 3   Customer_care_calls  10999 non-null  int64 
 4   Customer_rating      10999 non-null  int64 
 5   Cost_of_the_Product  10999 non-null  int64 
 6   Prior_purchases      10999 non-null  int64 
 7   Product_importance   10999 non-null  object
 8   Gender               10999 non-null  object
 9   Discount_offered     10999 non-null  int64 
 10  Weight_in_gms        10999 non-null  int64 
 11  Reached.on.Time_Y.N  10999 non-null  int64 
dtypes: int64(8), object(4)
memory usage: 1.0+ MB


In [20]:
#creating dictionary to map the order for importance
product_order = {'low': 0 ,'medium':1, 'high':2}
df['Product Importance'] = df['Product Importance'].map(product_order)

#New features from dataset.
df['Total Price'] = df['Cost Of The Product'] - df['Discount Offered']
df['Cost Rate'] = round((df['Cost Of The Product']/df['Discount Offered']) + 1, 2)

In [21]:
#columns to encode and initialize encoder
columns = ['Warehouse Block','Mode Of Shipment', 'Gender']
encoder = pd.get_dummies(df[columns], drop_first=True)
encoder = encoder.astype(int)
df_new = pd.concat([df.drop(columns = columns), encoder], axis=1)

In [23]:
#Scaling nominal data
scaler = StandardScaler()
columns_to_scale = ['Customer Rating','Cost Of The Product',
                    'Prior Purchases', 'Product Importance',
                    'Discount Offered', 'Weight In Gms',
                    'Total Price','Cost Rate']
df_new_scaled = scaler.fit_transform(df_new[columns_to_scale])
df_new_scaled = pd.DataFrame(df_new_scaled, columns = columns_to_scale)
df_new_scaled = pd.concat([df_new.drop(columns = columns_to_scale), df_new_scaled], axis = 1)

In [24]:
#Checking for multicollinearity values greater than 5 (ignoring the total price since that was created from two other features)
#taking out correlated columns (price ones)
df_vif = df_new_scaled.drop(columns = ['Total Price','Cost Of The Product','Discount Offered','Cost Rate'])

vif = pd.DataFrame()
vif['feature']  = df_vif.columns
vif['VIF'] = [variance_inflation_factor(df_vif.values, i) for i in range(df_vif.shape[1])]
print(vif)

                  feature        VIF
0                      Id   5.081097
1     Customer Care Calls  10.057208
2        On-time Delivery   2.626789
3       Warehouse Block_B   1.817151
4       Warehouse Block_C   1.824856
5       Warehouse Block_D   1.820068
6       Warehouse Block_F   2.653188
7   Mode Of Shipment_Road   1.784536
8   Mode Of Shipment_Ship   4.316716
9                Gender_M   1.922786
10        Customer Rating   1.000519
11        Prior Purchases   1.075453
12     Product Importance   1.019861
13          Weight In Gms   1.302657


In [25]:
#Drop the columns that are linearly correlated and target variable.
X = df_new_scaled.drop(columns = ['On-time Delivery','Cost Of The Product','Discount Offered','Id','Customer Care Calls'])
y = df_new_scaled['On-time Delivery']

In [26]:
#Splitting data
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size = 0.3, random_state = 42)


In [27]:
#Create Model
model_logreg = LogisticRegression()

#train model
model_logreg.fit(X_train, y_train)

#predict
y_pred = model_logreg.predict(X_test)

In [28]:
print('Accuracy:', round(accuracy_score(y_test, y_pred),2))
print("Classification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.63
Classification Report:
               precision    recall  f1-score   support

           0       0.55      0.43      0.48      1312
           1       0.67      0.76      0.71      1988

    accuracy                           0.63      3300
   macro avg       0.61      0.60      0.60      3300
weighted avg       0.62      0.63      0.62      3300



In [29]:
# Compute the confusion matrix
print(confusion_matrix(y_test, y_pred))


[[ 565  747]
 [ 468 1520]]
