Predict Order Status based on Features

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [33]:
df = pd.read_csv('datasets/Retail_Store.csv' , encoding='latin-1')

In [34]:
df = df.drop(columns=['Customer Lname','Customer Zipcode','Order Zipcode','Product Description',
                      'Customer Password','Order Customer Id','Order Item Total','Order Profit Per Order',
                      'Product Card Id','Product Category Id','Product Price','Customer Fname',
                      'Product Description','Customer Email','Product Image','Product Status',
                      'Order Id','Order Item Id','Order Item Cardprod Id'])

In [35]:
df['shipment_date'] = pd.to_datetime(df['shipping date (DateOrders)'])
df['shipment_year']=df['shipment_date'].dt.year
df['shipment_month']=df['shipment_date'].dt.month
df['shipment_week']=df['shipment_date'].dt.weekday

df['order_date'] = pd.to_datetime(df['order date (DateOrders)'])
df['order_year']=df['order_date'].dt.year
df['order_month']=df['order_date'].dt.month
df['order_week']=df['order_date'].dt.weekday

df = df.drop(columns=['order date (DateOrders)','shipping date (DateOrders)'])

In [36]:
df['shipping_time_diff']=df['Days for shipping (real)']-df['Days for shipment (scheduled)']

df = df[df['Department Name'] != 'Technology']

In [7]:
categorical_cols = ['Delivery Status', 'Category Name', 'Customer City', 'Customer Country', 
                    'Customer Segment', 'Order Region', 'Shipping Mode','Market','Department Name',
                    'Order City','Order State','Type']  

numerical_cols = ['Days for shipping (real)', 'Days for shipment (scheduled)', 'Benefit per order', 
                  'Sales per customer', 'shipping_time_diff', 'Sales', 'Order Item Quantity']

In [54]:
features_df = ['Type','Order Status','Delivery Status','Category Name','Customer Segment','Order Region','Shipping Mode',
               'Market','Department Name','Benefit per order','Sales','Order Item Quantity','Customer Id','shipment_year',
               'shipment_month','shipment_week','order_year','order_month','order_week']

len(features_df)

19

In [55]:
new_df = df[features_df]
model_df = new_df
new_df.head(5)

Unnamed: 0,Type,Order Status,Delivery Status,Category Name,Customer Segment,Order Region,Shipping Mode,Market,Department Name,Benefit per order,Sales,Order Item Quantity,Customer Id,shipment_year,shipment_month,shipment_week,order_year,order_month,order_week
0,DEBIT,COMPLETE,Advance shipping,Sporting Goods,Consumer,Southeast Asia,Standard Class,Pacific Asia,Fitness,91.25,327.75,1,20755,2018,2,5,2018,1,2
1,TRANSFER,PENDING,Late delivery,Sporting Goods,Consumer,South Asia,Standard Class,Pacific Asia,Fitness,-249.089996,327.75,1,19492,2018,1,3,2018,1,5
2,CASH,CLOSED,Shipping on time,Sporting Goods,Consumer,South Asia,Standard Class,Pacific Asia,Fitness,-247.779999,327.75,1,19491,2018,1,2,2018,1,5
3,DEBIT,COMPLETE,Advance shipping,Sporting Goods,Home Office,Oceania,Standard Class,Pacific Asia,Fitness,22.860001,327.75,1,19490,2018,1,1,2018,1,5
4,PAYMENT,PENDING_PAYMENT,Advance shipping,Sporting Goods,Corporate,Oceania,Standard Class,Pacific Asia,Fitness,134.210007,327.75,1,19489,2018,1,0,2018,1,5


In [57]:
# Prepare data for training
x = model_df.drop('Order Status', axis=1)  # Features
y = model_df['Order Status']  # Target variable

x.shape, y.shape

((179054, 18), (179054,))

In [71]:
# One-hot encode only the feature columns
x = pd.get_dummies(x, drop_first=True)

# Combine features and target if needed
model_df = pd.concat([x, y], axis=1)

print(model_df.shape)


(179054, 103)


In [72]:
# Convert bool columns to integers
x_train = x_train.astype({col: 'int' for col in x_train.select_dtypes(include='bool').columns})
x_test = x_test.astype({col: 'int' for col in x_test.select_dtypes(include='bool').columns})

# Scale numeric data
scaler = StandardScaler()

x_train_scaled = scaler.fit_transform(x_train)
x_test_scaled = scaler.transform(x_test)

# Confirm the scaling process
print(x_train_scaled[:5])  # First 5 rows of scaled data


[[ 1.70437331e-01 -4.30046359e-01  5.89453712e-01 -1.19742801e+00
   1.24004652e+00 -6.43298373e-02 -4.99745640e-01  1.25359006e+00
  -5.88831731e-02  4.94003208e-01 -7.89130674e-01  1.82409474e+00
  -6.17770897e-01  9.07582589e-01 -2.10756248e-01 -4.67604275e-01
  -2.08090929e-02 -3.34399772e-02 -5.94806746e-02 -1.94196792e-02
  -4.73917749e-02 -4.75395162e-02 -3.87711513e-02 -2.87556915e-01
  -2.74483854e-01 -5.98929668e-02 -3.99534236e-01 -5.19132337e-02
  -5.08892183e-02 -1.34169905e-01 -3.26447374e-01 -4.12225751e-02
  -5.34456991e-02 -8.23559120e-02 -4.97034385e-02 -1.83086656e-02
  -9.03561953e-02 -7.76722681e-02 -5.40985998e-02 -4.55818787e-02
  -5.95397458e-02 -5.09581195e-02 -3.48335144e-01 -4.58884428e-02
  -4.40173103e-02 -3.43686911e-02 -3.77006421e-01 -3.91306982e-02
  -4.91356236e-02 -5.19132337e-02 -2.55609835e-01 -2.75957273e-02
  -4.48063693e-02 -2.58967039e-02 -4.28884378e-02 -5.52546073e-02
  -7.39462512e-02 -6.87058089e-02 -3.06854385e-01  2.74065148e+00
  -6.00688

In [68]:
print(x_train.dtypes)
print(x_test.dtypes)


Benefit per order                     float64
Sales                                 float64
Order Item Quantity                     int64
Customer Id                             int64
shipment_year                           int32
                                       ...   
Department Name_Footwear                int64
Department Name_Golf                    int64
Department Name_Health and Beauty       int64
Department Name_Outdoors                int64
Department Name_Pet Shop                int64
Length: 102, dtype: object
Benefit per order                     float64
Sales                                 float64
Order Item Quantity                     int64
Customer Id                             int64
shipment_year                           int32
                                       ...   
Department Name_Footwear                int64
Department Name_Golf                    int64
Department Name_Health and Beauty       int64
Department Name_Outdoors                int64
Departm

In [73]:
# Train-test split
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [66]:
scaler = StandardScaler()

# Fit the scaler on the training data and transform both training and test data
x_train_scaled = scaler.fit_transform(x_train)
x_test_scaled = scaler.transform(x_test)

x_train.dtypes

Benefit per order                     float64
Sales                                 float64
Order Item Quantity                     int64
Customer Id                             int64
shipment_year                           int32
                                       ...   
Department Name_Footwear                 bool
Department Name_Golf                     bool
Department Name_Health and Beauty        bool
Department Name_Outdoors                 bool
Department Name_Pet Shop                 bool
Length: 102, dtype: object

In [29]:
# Define categorical and numerical columns
categorical_cols = ['Delivery Status', 'Category Name', 'Customer City', 'Customer Country', 
                    'Customer Segment', 'Order Region', 'Order Status', 'Shipping Mode']  # Update as needed

numerical_cols = ['Days for shipping (real)', 'Days for shipment (scheduled)', 'Benefit per order', 
                  'Sales per customer', 'shipping_time_diff', 'Sales', 'Order Item Quantity']  # Update as needed

# Define the preprocessing steps
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_cols),  # Scale numerical features
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)  # One-hot encode categorical features
    ])


In [74]:
# Create the pipeline with preprocessor and classifier
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(n_estimators=100, random_state=42))  # Example classifier
])


In [75]:
# Train the model
model.fit(x_train, y_train)

# Evaluate the model
y_pred = model.predict(x_test)

# Evaluate performance
from sklearn.metrics import classification_report, accuracy_score
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


ValueError: A given column is not a column of the dataframe

In [32]:
print(df.dtypes)


Type                                     object
Days for shipping (real)                  int64
Days for shipment (scheduled)             int64
Benefit per order                       float64
Sales per customer                      float64
Delivery Status                          object
Late_delivery_risk                        int64
Category Id                               int64
Category Name                            object
Customer City                            object
Customer Country                         object
Customer Id                               int64
Customer Segment                         object
Customer State                           object
Customer Street                          object
Department Id                             int64
Department Name                          object
Latitude                                float64
Longitude                               float64
Market                                   object
Order City                              