## Part II: Data Cleaning, Preparation and Feature Engineering

## Imports and Setup

In [97]:
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer

### Load Dataset and Interrogate

In [98]:
datafile = './data/shipping.csv'
TARGET = 'Reached.on.Time_Y.N'

# Load dataset
df = pd.read_csv(datafile)

# Drop the ID column
df = df.drop(columns='ID')

# Display dataset
df

Unnamed: 0,Warehouse_block,Mode_of_Shipment,Customer_care_calls,Customer_rating,Cost_of_the_Product,Prior_purchases,Product_importance,Gender,Discount_offered,Weight_in_gms,Reached.on.Time_Y.N
0,D,Flight,4,2,177,3,low,F,44,1233,1
1,F,Flight,4,5,216,2,low,M,59,3088,1
2,A,Flight,2,2,183,4,low,M,48,3374,1
3,B,Flight,3,3,176,4,medium,M,10,1177,1
4,C,Flight,2,2,184,3,medium,F,46,2484,1
...,...,...,...,...,...,...,...,...,...,...,...
10994,A,Ship,4,1,252,5,medium,F,1,1538,1
10995,B,Ship,4,1,232,5,medium,F,6,1247,0
10996,C,Ship,5,4,242,5,low,F,4,1155,0
10997,F,Ship,5,2,223,6,medium,M,2,1210,0


In [99]:
# separate features from target
X = df.drop(columns=TARGET)
y = df[TARGET]

### Encode Categorical Features

There are several categorical features present in the dataset that must be encoded for use by any downstream models.  These features will be one-hot encoded.


In [100]:
# label encode the product_importance column since it is ordinal
le = LabelEncoder()

X_encoded = X.copy()

categorical_columns = ['Product_importance']

# Apply LabelEncoder to each categorical column
for col in categorical_columns:
    X_encoded[col] = le.fit_transform(X_encoded[col])

In [101]:
# inspect the results
X_encoded.head()

Unnamed: 0,Warehouse_block,Mode_of_Shipment,Customer_care_calls,Customer_rating,Cost_of_the_Product,Prior_purchases,Product_importance,Gender,Discount_offered,Weight_in_gms
0,D,Flight,4,2,177,3,1,F,44,1233
1,F,Flight,4,5,216,2,1,M,59,3088
2,A,Flight,2,2,183,4,1,M,48,3374
3,B,Flight,3,3,176,4,2,M,10,1177
4,C,Flight,2,2,184,3,2,F,46,2484


In [102]:
# one hot encode the remaining categorical features
X_encoded = pd.get_dummies(X_encoded)

In [103]:
# inspect results
X_encoded.head()

Unnamed: 0,Customer_care_calls,Customer_rating,Cost_of_the_Product,Prior_purchases,Product_importance,Discount_offered,Weight_in_gms,Warehouse_block_A,Warehouse_block_B,Warehouse_block_C,Warehouse_block_D,Warehouse_block_F,Mode_of_Shipment_Flight,Mode_of_Shipment_Road,Mode_of_Shipment_Ship,Gender_F,Gender_M
0,4,2,177,3,1,44,1233,0,0,0,1,0,1,0,0,1,0
1,4,5,216,2,1,59,3088,0,0,0,0,1,1,0,0,0,1
2,2,2,183,4,1,48,3374,1,0,0,0,0,1,0,0,0,1
3,3,3,176,4,2,10,1177,0,1,0,0,0,1,0,0,0,1
4,2,2,184,3,2,46,2484,0,0,1,0,0,1,0,0,1,0


### Perform Train/Test Split

Split the prepared dataset into train/test/val datasets for use by modeling in next stages.  Implement an 80/10/10 split.

In [104]:
# split the data into train and test sets (80% and 20%)
X_train, X_test_val, y_train, y_test_val = train_test_split(X_encoded, y, test_size=0.2, random_state=42)

# split the test_val into test and val sets (10% and 10%)
X_test, X_val, y_test, y_val = train_test_split(X_test_val, y_test_val, test_size=0.5, random_state=42)

### Normalize Continuous Numerical Features

For the remaining continuous features, a scaler will be fit to the dataset to normalize the numerical range

In [105]:
#list for cols to scale
cols_to_scale = ['Cost_of_the_Product', 'Discount_offered', 'Weight_in_gms', 'Prior_purchases', 'Customer_care_calls']

#create and fit scaler using X train set
scaler = StandardScaler()
scaler.fit(X_train[cols_to_scale])

#scale X train, test, val data
X_train[cols_to_scale] = scaler.transform(X_train[cols_to_scale])
X_val[cols_to_scale] = scaler.transform(X_val[cols_to_scale])
X_test[cols_to_scale] = scaler.transform(X_test[cols_to_scale])

### Write prepared data to file for use by modeling experiments

In [106]:
# train data
X_train.to_csv('./data/x_train.csv', index=False)
y_train.to_csv('./data/y_train.csv', index=False)

# test data
X_test.to_csv('./data/x_test.csv', index=False)
y_test.to_csv('./data/y_test.csv', index=False)

# val data
X_val.to_csv('./data/x_val.csv', index=False)
y_val.to_csv('./data/y_val.csv', index=False)