### **Import Required Libraries**

In [34]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from imblearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder,OrdinalEncoder
from sklearn.metrics import accuracy_score,classification_report
from sklearn.model_selection import GridSearchCV
from imblearn.over_sampling import SMOTE

### **Import Datasets**

In [35]:
train_data=pd.read_csv("../dataset/train_data.csv")
test_data=pd.read_csv("../dataset/test_data.csv")

In [36]:
print(f"Shape of the Training data:{train_data.shape}")
print(f"Shape of the Testing data:{test_data.shape}")

Shape of the Training data:(1550, 13)
Shape of the Testing data:(388, 13)


### **Drop Unecessary Columns**

In [37]:
train_data=train_data.drop(["Unnamed: 0"],axis=1)
test_data=test_data.drop(["Unnamed: 0"],axis=1)

In [38]:
##split train data into the X_train and Y_train
X_train=train_data.drop(["delivery_status"],axis=1)
Y_train=train_data["delivery_status"]

In [39]:
##Adjust the datatype of  assembly_service_requested column
X_train["assembly_service_requested"]=X_train["assembly_service_requested"].astype("object")

### **Model Building**

In [40]:
# ### Numbercal columns 
numerical_features=X_train.select_dtypes(include="number").columns.to_list()
categorical_features=X_train.select_dtypes(include="object").columns.to_list()

# ##Nominal variables
categorical_features.remove("brand")
##Ordinal variables
ordinal_features=["brand"]

In [41]:
### Define pipelines
nominal_pipeline=Pipeline(steps=[
    ("One-Hot-Encoder",OneHotEncoder(sparse_output=False,handle_unknown="ignore"))
])
ordinal_pipeline=Pipeline(steps=[
    ("Ordinal-Encoder",OrdinalEncoder())
])

transfomer=ColumnTransformer(transformers=[
    ("Nominal Pipeline",nominal_pipeline,categorical_features),
    ("Ordinal Pipeline",ordinal_pipeline,ordinal_features)
])

final_pipeline=Pipeline(steps=[
    ("Transfomer",transfomer),
    ("Resampling SMOTE",SMOTE(random_state=42)),
    ("DTC",DecisionTreeClassifier(random_state=42))
])

## Execute Pipeline
final_pipeline.fit(X_train,Y_train)

0,1,2
,steps,"[('Transfomer', ...), ('Resampling SMOTE', ...), ...]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('Nominal Pipeline', ...), ('Ordinal Pipeline', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,'auto'
,drop,
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,categories,'auto'
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,unknown_value,
,encoded_missing_value,
,min_frequency,
,max_categories,

0,1,2
,sampling_strategy,'auto'
,random_state,42
,k_neighbors,5

0,1,2
,criterion,'gini'
,splitter,'best'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,42
,max_leaf_nodes,
,min_impurity_decrease,0.0


### **Model Evaluation**

In [42]:
##split test data into the X_test and Y_test
X_test=test_data.drop(["delivery_status"],axis=1)
Y_test=test_data["delivery_status"]

##Adjust the datatype of  assembly_service_requested column
X_test["assembly_service_requested"]=X_test["assembly_service_requested"].astype("object")

In [43]:
y_pred=final_pipeline.predict(X_test) ##get the predicted values

In [44]:
##print the classification report
print(classification_report(Y_test,y_pred))

                 precision    recall  f1-score   support

      Delivered       0.18      0.22      0.19        69
Failed Delivery       0.50      0.54      0.52       192
       On Going       0.34      0.26      0.29       127

       accuracy                           0.39       388
      macro avg       0.34      0.34      0.34       388
   weighted avg       0.39      0.39      0.39       388



In [45]:
y_pred_train=final_pipeline.predict(X_train)

In [46]:
##print the classification report
print(classification_report(Y_train,y_pred_train))

                 precision    recall  f1-score   support

      Delivered       0.42      0.51      0.46       275
Failed Delivery       0.65      0.71      0.68       767
       On Going       0.57      0.42      0.48       508

       accuracy                           0.58      1550
      macro avg       0.54      0.54      0.54      1550
   weighted avg       0.58      0.58      0.57      1550



### **Hyper Parameter Tune**

In [47]:
##define parameter grid for hyperparameter tuning
param_grid = {
    "DTC__criterion": ["gini", "entropy"],
    "DTC__max_depth": [3, 5, 10, 15, None],
    "DTC__min_samples_split": [2, 5, 10],
    "DTC__min_samples_leaf": [1, 2, 5],
    "DTC__max_features": [None, "sqrt", "log2"]
}

In [48]:
grid_search = GridSearchCV(
    estimator=final_pipeline,
    param_grid=param_grid,
    cv=5,
    scoring="f1_macro",
    n_jobs=-1,
)

grid_search.fit(X_train, Y_train)

0,1,2
,estimator,Pipeline(step...m_state=42))])
,param_grid,"{'DTC__criterion': ['gini', 'entropy'], 'DTC__max_depth': [3, 5, ...], 'DTC__max_features': [None, 'sqrt', ...], 'DTC__min_samples_leaf': [1, 2, ...], ...}"
,scoring,'f1_macro'
,n_jobs,-1
,refit,True
,cv,5
,verbose,0
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,transformers,"[('Nominal Pipeline', ...), ('Ordinal Pipeline', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,'auto'
,drop,
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,categories,'auto'
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,unknown_value,
,encoded_missing_value,
,min_frequency,
,max_categories,

0,1,2
,sampling_strategy,'auto'
,random_state,42
,k_neighbors,5

0,1,2
,criterion,'gini'
,splitter,'best'
,max_depth,10
,min_samples_split,10
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,42
,max_leaf_nodes,
,min_impurity_decrease,0.0


In [49]:
##best parameters
print("Best Parameters:",grid_search.best_params_)

Best Parameters: {'DTC__criterion': 'gini', 'DTC__max_depth': 10, 'DTC__max_features': None, 'DTC__min_samples_leaf': 1, 'DTC__min_samples_split': 10}


### **Model Evaluation**

In [50]:
# Evaluate best model
best_rf =grid_search.best_estimator_
y_pred_cv = best_rf.predict(X_test)

In [51]:
print("\nTest Accuracy:", accuracy_score(Y_test, y_pred_cv))
print("\nClassification Report:\n", classification_report(Y_test, y_pred_cv))


Test Accuracy: 0.39948453608247425

Classification Report:
                  precision    recall  f1-score   support

      Delivered       0.18      0.23      0.21        69
Failed Delivery       0.51      0.53      0.52       192
       On Going       0.37      0.30      0.33       127

       accuracy                           0.40       388
      macro avg       0.35      0.35      0.35       388
   weighted avg       0.41      0.40      0.40       388



In [52]:
y_pred_train_cv=best_rf.predict(X_train)

In [53]:
##print the classification report
print("\nTrain Accuracy:", accuracy_score(Y_train, y_pred_train_cv))
print(classification_report(Y_train,y_pred_train_cv))


Train Accuracy: 0.5129032258064516
                 precision    recall  f1-score   support

      Delivered       0.35      0.43      0.39       275
Failed Delivery       0.61      0.62      0.61       767
       On Going       0.48      0.40      0.43       508

       accuracy                           0.51      1550
      macro avg       0.48      0.48      0.48      1550
   weighted avg       0.52      0.51      0.51      1550

