### **Import Required Libraries**

In [22]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTE
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder,OneHotEncoder
from sklearn.metrics import accuracy_score,precision_score,recall_score,classification_report
from sklearn.model_selection import GridSearchCV
from imblearn.pipeline import Pipeline

### **Import Datasets**

In [23]:
train_data=pd.read_csv("../dataset/train_data.csv")
test_data=pd.read_csv("../dataset/test_data.csv")

In [24]:
print(f"Shape of the Training data:{train_data.shape}")
print(f"Shape of the Testing data:{test_data.shape}")

Shape of the Training data:(1550, 11)
Shape of the Testing data:(388, 11)


### **Drop Unecessary Columns**

In [25]:
train_data=train_data.drop(["Unnamed: 0"],axis=1)
test_data=test_data.drop(["Unnamed: 0"],axis=1)

In [26]:
##split train data into the X_train and Y_train
X_train=train_data.drop(["delivery_status"],axis=1)
Y_train=train_data["delivery_status"]

In [27]:
##Adjust the datatype of  assembly_service_requested column
X_train["assembly_service_requested"]=X_train["assembly_service_requested"].astype("object")

### **Model Building**

In [None]:
# ### Numbercal columns 
numerical_features=X_train.select_dtypes(include="number").columns.to_list()
categorical_features=X_train.select_dtypes(include="object").columns.to_list()

# ##Nominal variables
categorical_features.remove("brand")
##Ordinal variables
ordinal_features=["brand"]


In [None]:
### Define pipelines
nominal_pipeline=Pipeline(steps=[
    ("One-Hot-Encoder",OneHotEncoder(sparse_output=False,handle_unknown="ignore"))
])
ordinal_pipeline=Pipeline(steps=[
    ("Ordinal-Encoder",OrdinalEncoder())
])

transfomer=ColumnTransformer(transformers=[
    ("Nominal Pipeline",nominal_pipeline,categorical_features),
    ("Ordinal Pipeline",ordinal_pipeline,ordinal_features)
])

final_pipeline=Pipeline(steps=[
    ("Transfomer",transfomer),
    ("Resampling SMOTE",SMOTE(random_state=42)), ##resample
    ("rf",RandomForestClassifier(random_state=42))
])

## Execute Pipeline
final_pipeline.fit(X_train,Y_train)

0,1,2
,steps,"[('Transfomer', ...), ('Resampling SMOTE', ...), ...]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('Nominal Pipeline', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,'auto'
,drop,
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,sampling_strategy,'auto'
,random_state,42
,k_neighbors,5

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


### **Model Evaluation**

In [30]:
##split test data into the X_test and Y_test
X_test=test_data.drop(["delivery_status"],axis=1)
Y_test=test_data["delivery_status"]

##Adjust the datatype of  assembly_service_requested column
X_test["assembly_service_requested"]=X_test["assembly_service_requested"].astype("object")

In [31]:
y_pred=final_pipeline.predict(X_test) ##get the predicted values

In [32]:
##print the classification report
print(classification_report(Y_test,y_pred))

                 precision    recall  f1-score   support

      Delivered       0.18      0.37      0.24        62
Failed Delivery       0.48      0.30      0.37       209
       On Going       0.28      0.31      0.29       117

       accuracy                           0.31       388
      macro avg       0.31      0.33      0.30       388
   weighted avg       0.37      0.31      0.32       388



In [33]:
y_pred_train=final_pipeline.predict(X_train)

In [34]:
##print the classification report
print(classification_report(Y_train,y_pred_train))

                 precision    recall  f1-score   support

      Delivered       0.19      0.40      0.25       282
Failed Delivery       0.52      0.29      0.37       750
       On Going       0.36      0.36      0.36       518

       accuracy                           0.33      1550
      macro avg       0.35      0.35      0.33      1550
   weighted avg       0.40      0.33      0.35      1550



### **Hyper Parameter Tune**

In [35]:
# Define parameter grid
param_grid = {
    'rf__n_estimators': [100, 200, 300],
    'rf__max_depth': [None, 10, 20, 30],
    'rf__min_samples_split': [2, 5, 10],
    'rf__min_samples_leaf': [1, 2, 4],
    'rf__max_features': ['sqrt', 'log2'],
    'rf__bootstrap': [True, False]
}

In [36]:
# Grid Search
grid_search = GridSearchCV(
    final_pipeline,
    param_grid=param_grid,
    cv=5,            
    n_jobs=-1,         
    scoring='accuracy'
)

##Fit on training data
grid_search.fit(X_train,Y_train)

0,1,2
,estimator,Pipeline(step...m_state=42))])
,param_grid,"{'rf__bootstrap': [True, False], 'rf__max_depth': [None, 10, ...], 'rf__max_features': ['sqrt', 'log2'], 'rf__min_samples_leaf': [1, 2, ...], ...}"
,scoring,'accuracy'
,n_jobs,-1
,refit,True
,cv,5
,verbose,0
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,transformers,"[('Nominal Pipeline', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,'auto'
,drop,
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,sampling_strategy,'auto'
,random_state,42
,k_neighbors,5

0,1,2
,n_estimators,200
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [37]:
##best parameters
print("Best Parameters:",grid_search.best_params_)

Best Parameters: {'rf__bootstrap': True, 'rf__max_depth': None, 'rf__max_features': 'sqrt', 'rf__min_samples_leaf': 1, 'rf__min_samples_split': 2, 'rf__n_estimators': 200}


### **Model Evaluation**

In [38]:
# Evaluate best model
best_rf =grid_search.best_estimator_
y_pred_cv = best_rf.predict(X_test)

In [39]:
print("\nTest Accuracy:", accuracy_score(Y_test, y_pred_cv))
print("\nClassification Report:\n", classification_report(Y_test, y_pred_cv))


Test Accuracy: 0.3118556701030928

Classification Report:
                  precision    recall  f1-score   support

      Delivered       0.18      0.37      0.24        62
Failed Delivery       0.48      0.30      0.37       209
       On Going       0.28      0.31      0.29       117

       accuracy                           0.31       388
      macro avg       0.31      0.33      0.30       388
   weighted avg       0.37      0.31      0.32       388



In [40]:
y_pred_train_cv=best_rf.predict(X_train)

In [41]:
##print the classification report
print("\nTrain Accuracy:", accuracy_score(Y_train, y_pred_train_cv))
print(classification_report(Y_train,y_pred_train_cv))


Train Accuracy: 0.33419354838709675
                 precision    recall  f1-score   support

      Delivered       0.19      0.40      0.25       282
Failed Delivery       0.52      0.29      0.37       750
       On Going       0.36      0.36      0.36       518

       accuracy                           0.33      1550
      macro avg       0.35      0.35      0.33      1550
   weighted avg       0.40      0.33      0.35      1550

