In [1]:
# Import the modules
import numpy as np
import pandas as pd
import codecs
from pathlib import Path
from sklearn.metrics import balanced_accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from imblearn.over_sampling import RandomOverSampler

In [2]:
#readining in CSV file 
with codecs.open("/Users/aileen/Downloads/DataCoSupplyChainDataset_Original.csv", 'r', encoding='ISO-8859-1') as f:
    dataco_df = pd.read_csv(f)

#printing df
dataco_df.head()

Unnamed: 0,Type,Days for shipping (real),Days for shipment (scheduled),Benefit per order,Sales per customer,Delivery Status,Late_delivery_risk,Category Id,Category Name,Customer City,...,Order Zipcode,Product Card Id,Product Category Id,Product Description,Product Image,Product Name,Product Price,Product Status,shipping date (DateOrders),Shipping Mode
0,DEBIT,3,4,91.25,314.640015,Advance shipping,0,73,Sporting Goods,Caguas,...,,1360,73,,http://images.acmesports.sports/Smart+watch,Smart watch,327.75,0,2/3/2018 22:56,Standard Class
1,TRANSFER,5,4,-249.089996,311.359985,Late delivery,1,73,Sporting Goods,Caguas,...,,1360,73,,http://images.acmesports.sports/Smart+watch,Smart watch,327.75,0,1/18/2018 12:27,Standard Class
2,CASH,4,4,-247.779999,309.720001,Shipping on time,0,73,Sporting Goods,San Jose,...,,1360,73,,http://images.acmesports.sports/Smart+watch,Smart watch,327.75,0,1/17/2018 12:06,Standard Class
3,DEBIT,3,4,22.860001,304.809998,Advance shipping,0,73,Sporting Goods,Los Angeles,...,,1360,73,,http://images.acmesports.sports/Smart+watch,Smart watch,327.75,0,1/16/2018 11:45,Standard Class
4,PAYMENT,2,4,134.210007,298.25,Advance shipping,0,73,Sporting Goods,Caguas,...,,1360,73,,http://images.acmesports.sports/Smart+watch,Smart watch,327.75,0,1/15/2018 11:24,Standard Class


In [3]:
all_columns = dataco_df.columns
all_columns

Index(['Type', 'Days for shipping (real)', 'Days for shipment (scheduled)',
       'Benefit per order', 'Sales per customer', 'Delivery Status',
       'Late_delivery_risk', 'Category Id', 'Category Name', 'Customer City',
       'Customer Country', 'Customer Email', 'Customer Fname', 'Customer Id',
       'Customer Lname', 'Customer Password', 'Customer Segment',
       'Customer State', 'Customer Street', 'Customer Zipcode',
       'Department Id', 'Department Name', 'Latitude', 'Longitude', 'Market',
       'Order City', 'Order Country', 'Order Customer Id',
       'order date (DateOrders)', 'Order Id', 'Order Item Cardprod Id',
       'Order Item Discount', 'Order Item Discount Rate', 'Order Item Id',
       'Order Item Product Price', 'Order Item Profit Ratio',
       'Order Item Quantity', 'Sales', 'Order Item Total',
       'Order Profit Per Order', 'Order Region', 'Order State', 'Order Status',
       'Order Zipcode', 'Product Card Id', 'Product Category Id',
       'Product De

In [4]:
#all delivery statuses
unique_delivery_values = dataco_df['Delivery Status'].unique()
unique_delivery_values

array(['Advance shipping', 'Late delivery', 'Shipping on time',
       'Shipping canceled'], dtype=object)

In [5]:
#all zipcodes
unique_zipcode_values = dataco_df['Order Zipcode'].unique()
unique_zipcode_values

array([   nan, 99301., 90049., 94110., 66212., 77041., 85234., 95123.,
       93727., 85345., 45231., 47201., 19134., 60016., 60126., 98103.,
       94601., 10009., 28110., 80219., 90008., 77095., 32216., 77036.,
       74133., 94109., 92553., 20735., 60543., 60174., 31907., 62521.,
       92627., 28806., 35630.,  1852., 10024., 44312., 22153., 89015.,
       90045., 10011., 49505., 78664., 76017., 92105., 85705., 90032.,
       48227., 55407., 28205., 23602., 94122., 11572., 78207., 68104.,
       21215., 75081., 10035., 98115., 49201., 43229., 60653.,  6708.,
       98105., 19143., 42420., 80134., 94533., 92037., 90036., 22304.,
       35810., 19711., 38109., 72701., 23223., 40475., 65807., 33801.,
       13021., 45503., 78577., 11550., 40214., 67212., 76106., 21044.,
       30318., 79762., 60610., 75217., 90004., 89115., 95037., 97206.,
       14215., 79109., 33614., 75034., 28540., 32303.,  4240., 34952.,
       63116., 77070.,  1841., 97756., 19140., 28314., 13440., 60623.,
      

In [6]:
#all order statuses 
order_statuses = dataco_df['Order Status'].unique()
order_statuses

array(['COMPLETE', 'PENDING', 'CLOSED', 'PENDING_PAYMENT', 'CANCELED',
       'PROCESSING', 'SUSPECTED_FRAUD', 'ON_HOLD', 'PAYMENT_REVIEW'],
      dtype=object)

In [7]:
#looking at the most frequent order status
counts = dataco_df['Order Status'].value_counts()
counts

COMPLETE           59491
PENDING_PAYMENT    39832
PROCESSING         21902
PENDING            20227
CLOSED             19616
ON_HOLD             9804
SUSPECTED_FRAUD     4062
CANCELED            3692
PAYMENT_REVIEW      1893
Name: Order Status, dtype: int64

In [8]:
#all order regions 
unique_regions = dataco_df['Order Region'].unique()
unique_regions

array(['Southeast Asia', 'South Asia', 'Oceania', 'Eastern Asia',
       'West Asia', 'West of USA ', 'US Center ', 'West Africa',
       'Central Africa', 'North Africa', 'Western Europe',
       'Northern Europe', 'Central America', 'Caribbean', 'South America',
       'East Africa', 'Southern Europe', 'East of USA', 'Canada',
       'Southern Africa', 'Central Asia', 'Eastern Europe',
       'South of  USA '], dtype=object)

In [9]:
# Use str.replace to update USA region values 
dataco_df['Order Region'] = dataco_df['Order Region'].str.replace('West of USA', 'USA').str.replace('South of USA', 'USA').str.replace('South of USA ', 'USA').str.replace('East of USA', 'USA').str.replace('US Center', 'USA').str.replace('South of  USA ', 'USA').str.replace('USA ', 'USA')

#checking to make sure data is compiled
unique_regions = dataco_df['Order Region'].unique()
unique_regions

array(['Southeast Asia', 'South Asia', 'Oceania', 'Eastern Asia',
       'West Asia', 'USA', 'West Africa', 'Central Africa',
       'North Africa', 'Western Europe', 'Northern Europe',
       'Central America', 'Caribbean', 'South America', 'East Africa',
       'Southern Europe', 'Canada', 'Southern Africa', 'Central Asia',
       'Eastern Europe'], dtype=object)

In [10]:
#checking to see if there are any null values in order region
result = dataco_df.loc[dataco_df['Order Region'].isnull()]
result

Unnamed: 0,Type,Days for shipping (real),Days for shipment (scheduled),Benefit per order,Sales per customer,Delivery Status,Late_delivery_risk,Category Id,Category Name,Customer City,...,Order Zipcode,Product Card Id,Product Category Id,Product Description,Product Image,Product Name,Product Price,Product Status,shipping date (DateOrders),Shipping Mode


In [11]:
selected_data = dataco_df.drop(['Delivery Status', 'Product Description','Customer Email', 'Customer Fname', 'Customer Id',
       'Customer Lname', 'Customer Password', 'Customer Segment',  'Order Profit Per Order','Order Item Cardprod Id',
       'Order Item Discount', 'Order Item Discount Rate', 'Order Item Id',
       'Order Item Product Price', 'Order Item Profit Ratio','Product Card Id', 'Product Category Id',
       'Product Description', 'Product Image', 'Benefit per order', 'Category Id', 'Order Status', 'Product Status', 'Order Item Total','Department Id', 
       'Department Name', 'Order Id', 'Order Customer Id', 'Order Zipcode','Days for shipping (real)', 'Sales per customer', 'Category Name', 'Customer City', 'Customer Country', 'Customer State',
       'Customer Street',], axis=1)
selected_data

Unnamed: 0,Type,Days for shipment (scheduled),Late_delivery_risk,Customer Zipcode,Latitude,Longitude,Market,Order City,Order Country,order date (DateOrders),Order Item Quantity,Sales,Order Region,Order State,Product Name,Product Price,shipping date (DateOrders),Shipping Mode
0,DEBIT,4,0,725.0,18.251453,-66.037056,Pacific Asia,Bekasi,Indonesia,1/31/2018 22:56,1,327.750000,Southeast Asia,Java Occidental,Smart watch,327.750000,2/3/2018 22:56,Standard Class
1,TRANSFER,4,1,725.0,18.279451,-66.037064,Pacific Asia,Bikaner,India,1/13/2018 12:27,1,327.750000,South Asia,Rajastán,Smart watch,327.750000,1/18/2018 12:27,Standard Class
2,CASH,4,0,95125.0,37.292233,-121.881279,Pacific Asia,Bikaner,India,1/13/2018 12:06,1,327.750000,South Asia,Rajastán,Smart watch,327.750000,1/17/2018 12:06,Standard Class
3,DEBIT,4,0,90027.0,34.125946,-118.291016,Pacific Asia,Townsville,Australia,1/13/2018 11:45,1,327.750000,Oceania,Queensland,Smart watch,327.750000,1/16/2018 11:45,Standard Class
4,PAYMENT,4,0,725.0,18.253769,-66.037048,Pacific Asia,Townsville,Australia,1/13/2018 11:24,1,327.750000,Oceania,Queensland,Smart watch,327.750000,1/15/2018 11:24,Standard Class
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
180514,CASH,4,0,11207.0,40.640930,-73.942711,Pacific Asia,Shanghái,China,1/16/2016 3:40,1,399.980011,Eastern Asia,Shanghái,Field & Stream Sportsman 16 Gun Fire Safe,399.980011,1/20/2016 3:40,Standard Class
180515,DEBIT,2,1,93304.0,35.362545,-119.018700,Pacific Asia,Hirakata,Japón,1/16/2016 1:34,1,399.980011,Eastern Asia,Osaka,Field & Stream Sportsman 16 Gun Fire Safe,399.980011,1/19/2016 1:34,Second Class
180516,TRANSFER,4,1,6010.0,41.629959,-72.967155,Pacific Asia,Adelaide,Australia,1/15/2016 21:00,1,399.980011,Oceania,Australia del Sur,Field & Stream Sportsman 16 Gun Fire Safe,399.980011,1/20/2016 21:00,Standard Class
180517,PAYMENT,4,0,725.0,18.213350,-66.370575,Pacific Asia,Adelaide,Australia,1/15/2016 20:18,1,399.980011,Oceania,Australia del Sur,Field & Stream Sportsman 16 Gun Fire Safe,399.980011,1/18/2016 20:18,Standard Class


In [12]:
selected_columns = selected_data.columns
selected_columns

Index(['Type', 'Days for shipment (scheduled)', 'Late_delivery_risk',
       'Customer Zipcode', 'Latitude', 'Longitude', 'Market', 'Order City',
       'Order Country', 'order date (DateOrders)', 'Order Item Quantity',
       'Sales', 'Order Region', 'Order State', 'Product Name', 'Product Price',
       'shipping date (DateOrders)', 'Shipping Mode'],
      dtype='object')

In [13]:
# Convert categorical data to numeric with `pd.get_dummies`
clean_data = pd.get_dummies(selected_data)
clean_data

In [None]:
clean_columns = clean_data.columns
clean_columns

Index(['Days for shipment (scheduled)', 'Late_delivery_risk',
       'Customer Zipcode', 'Latitude', 'Longitude', 'Order Item Quantity',
       'Sales', 'Product Price', 'Type_CASH', 'Type_DEBIT',
       ...
       'shipping date (DateOrders)_9/9/2017 6:20',
       'shipping date (DateOrders)_9/9/2017 6:41',
       'shipping date (DateOrders)_9/9/2017 7:02',
       'shipping date (DateOrders)_9/9/2017 7:23',
       'shipping date (DateOrders)_9/9/2017 8:47',
       'shipping date (DateOrders)_9/9/2017 9:08', 'Shipping Mode_First Class',
       'Shipping Mode_Same Day', 'Shipping Mode_Second Class',
       'Shipping Mode_Standard Class'],
      dtype='object', length=134462)

In [None]:
# Separate the data into labels and features

# Separate the y variable, the labels
y = clean_data['Late_delivery_risk']

# Separate the X variable, the features
x = clean_data.drop('Late_delivery_risk', axis = 1)

In [None]:
# Review the y variable Series
y.head()

0    0
1    1
2    0
3    0
4    0
Name: Late_delivery_risk, dtype: int64

In [None]:
# Check the balance of our target values
value_cnts = y.value_counts()
print(value_cnts)

1    98977
0    81542
Name: Late_delivery_risk, dtype: int64


In [None]:
# Split the data using train_test_split
# Assign a random_state of 1 to the function
x_train, x_test, y_train, y_test = train_test_split(x, 
                                                    y, 
                                                    random_state=1, 
                                                    stratify=y)

: 

: 

In [None]:
# Instantiate the Logistic Regression model
# Assign a random_state parameter of 1 to the model
classifier = LogisticRegression(solver= 'lbfgs', random_state = 1)

# Fit the model using training data
classifier.fit(x_train, y_train)

ValueError: could not convert string to float: 'CASH'

In [None]:
# Make a prediction using the testing data
predictions = classifier.predict(x_test)

In [None]:
# Print the balanced_accuracy score of the model
balanced_accuracy_score = balanced_accuracy_score(y_test, predictions)
print(balanced_accuracy_score)

1.0


In [None]:
# Generate a confusion matrix for the model
confusion_matrix(y_test, predictions)

array([[20386,     0],
       [    0, 24744]])

In [None]:
# Print the classification report for the model
target_names = ["no late delivery risk", "late delivery risk"]
print(classification_report(y_test, predictions, target_names=target_names))

                       precision    recall  f1-score   support

no late delivery risk       1.00      1.00      1.00     20386
   late delivery risk       1.00      1.00      1.00     24744

             accuracy                           1.00     45130
            macro avg       1.00      1.00      1.00     45130
         weighted avg       1.00      1.00      1.00     45130

