In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


In [3]:
#LOADING THE PREPROCESSED AND CLEANED DATASET

In [2]:
df = pd.read_csv("cleaned_in_vehicle_coupon_data.csv")

In [4]:
df.head()

Unnamed: 0,temperature,Bar,CoffeeHouse,Restaurant20To50,toCoupon_GEQ5min,toCoupon_GEQ15min,toCoupon_GEQ25min,Y,destination_No Urgent Place,destination_Work,...,weather_Sunny,coupon_Carry out & Take away,coupon_Coffee House,coupon_Restaurant(20-50),coupon_Restaurant(<20),passanger_Friend(s),time_10PM,time_2PM,expiration_2h,maritalStatus_Single
0,55,4,4,0,1,0,0,1,True,False,...,True,False,False,False,True,False,False,True,False,False
1,80,4,4,0,1,0,0,0,True,False,...,True,False,True,False,False,True,False,False,True,False
2,80,4,4,0,1,1,0,1,True,False,...,True,True,False,False,False,True,False,False,True,False
3,80,4,4,0,1,1,0,0,True,False,...,True,False,True,False,False,True,False,True,True,False
4,80,4,4,0,1,1,0,0,True,False,...,True,False,True,False,False,True,False,True,False,False


In [5]:
from sklearn.model_selection import train_test_split

X = df.drop(columns=["Y"])
y = df["Y"]
#20% TESTING DATA AND 80% TRAINING ONE
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [7]:
# BEFORE HYPER PARAMETER TUNING, LETS TRAIN MODELS TO COMPARE PERFORMANCE
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score

# AS FEATURE NAMES CONTAIN INVALID CHARACTERS
X_train.columns = X_train.columns.astype(str)
X_train.columns = X_train.columns.str.replace('[\[\]\<]', '', regex=True)
X_test.columns = X_test.columns.astype(str)
X_test.columns = X_test.columns.str.replace('[\[\]\<]', '', regex=True)

rf = RandomForestClassifier(random_state=42)
lr = LogisticRegression(random_state=42, max_iter=500)
xgb = XGBClassifier(random_state=42)

rf.fit(X_train, y_train)
lr.fit(X_train, y_train)
xgb.fit(X_train, y_train)

rf_preds = rf.predict(X_test)
lr_preds = lr.predict(X_test)
xgb_preds = xgb.predict(X_test)

print("Random Forest Accuracy:", accuracy_score(y_test, rf_preds))
print("Logistic Regression Accuracy:", accuracy_score(y_test, lr_preds))
print("XGBoost Accuracy:", accuracy_score(y_test, xgb_preds))


Random Forest Accuracy: 0.658651951123374
Logistic Regression Accuracy: 0.6708711076074103
XGBoost Accuracy: 0.7114702404414663


In [8]:
# XGBOOST'S PERFORMANCE IS GOOD FOR NOW

In [9]:
# APPLYING RANDOMISED SEARCH CV FOR RANDOM FOREST
from sklearn.model_selection import RandomizedSearchCV
import numpy as np
#PARAMETER GRID
param_grid = {
    "n_estimators": [50, 100, 200],
    "max_depth": [None, 10, 20, 30],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 4]
}
rf_random = RandomizedSearchCV(RandomForestClassifier(random_state=42), param_grid, cv=5, n_iter=10, n_jobs=-1, verbose=2)
rf_random.fit(X_train, y_train)

#PRINT BEST PARAMETRS FOUND BY RANDOM SEARCH CV!
print("Best parameters for Random Forest:", rf_random.best_params_)


Fitting 5 folds for each of 10 candidates, totalling 50 fits
Best parameters for Random Forest: {'n_estimators': 200, 'min_samples_split': 10, 'min_samples_leaf': 4, 'max_depth': 10}


In [10]:
# NOW APPLYING GRID SEARCH CV FOR LOGISTIC REG
from sklearn.model_selection import GridSearchCV

#PARAMETER GRID
param_grid_lr = {
    "C": [0.01, 0.1, 1, 10],
    "solver": ["liblinear", "lbfgs"],
}

grid_search_lr = GridSearchCV(LogisticRegression(random_state=42, max_iter=500), param_grid_lr, cv=5, n_jobs=-1, verbose=2)
grid_search_lr.fit(X_train, y_train)
print("Best parameters for Logistic Regression:", grid_search_lr.best_params_)


Fitting 5 folds for each of 8 candidates, totalling 40 fits
Best parameters for Logistic Regression: {'C': 0.1, 'solver': 'liblinear'}


In [11]:
# NOW APPLYING GRID SEARCH CV FOR XGBOOST
param_grid_xgb = {
    "learning_rate": [0.01, 0.05, 0.1],
    "n_estimators": [100, 200, 300],
    "max_depth": [3, 5, 7],
}

grid_search_xgb = GridSearchCV(XGBClassifier(random_state=42), param_grid_xgb, cv=5, n_jobs=-1, verbose=2)
grid_search_xgb.fit(X_train, y_train)

print("Best parameters for XGBoost:", grid_search_xgb.best_params_)


Fitting 5 folds for each of 27 candidates, totalling 135 fits
Best parameters for XGBoost: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 300}


In [13]:
# LETS COMPARE MODEL PERFORMANCE BEFORE AND AFTER TUNING
from sklearn.metrics import classification_report  # PROVIDED BY SKLEARN

#BEST MODELS
best_rf = rf_random.best_estimator_
best_lr = grid_search_lr.best_estimator_
best_xgb = grid_search_xgb.best_estimator_
rf_preds_tuned = best_rf.predict(X_test)
lr_preds_tuned = best_lr.predict(X_test)
xgb_preds_tuned = best_xgb.predict(X_test)

#EVALUATING
print("Random Forest After Tuning:\n", classification_report(y_test, rf_preds_tuned))
print("Logistic Regression After Tuning:\n", classification_report(y_test, lr_preds_tuned))
print("XGBoost After Tuning:\n", classification_report(y_test, xgb_preds_tuned))


Random Forest After Tuning:
               precision    recall  f1-score   support

           0       0.71      0.60      0.65      1128
           1       0.72      0.80      0.76      1409

    accuracy                           0.71      2537
   macro avg       0.71      0.70      0.70      2537
weighted avg       0.71      0.71      0.71      2537

Logistic Regression After Tuning:
               precision    recall  f1-score   support

           0       0.65      0.55      0.60      1128
           1       0.68      0.76      0.72      1409

    accuracy                           0.67      2537
   macro avg       0.67      0.66      0.66      2537
weighted avg       0.67      0.67      0.67      2537

XGBoost After Tuning:
               precision    recall  f1-score   support

           0       0.70      0.58      0.64      1128
           1       0.71      0.80      0.75      1409

    accuracy                           0.70      2537
   macro avg       0.70      0.69      0.

In [14]:
# SELECTING THE BEST WORKING MODEL
models = {"Random Forest": best_rf, "Logistic Regression": best_lr, "XGBoost": best_xgb}
best_model_name = max(models, key=lambda x: accuracy_score(y_test, models[x].predict(X_test)))

print("Best Model:", best_model_name)


Best Model: Random Forest


In [15]:
# ==> AFTER TUNING, RANDOM FOREST'S PERFORMANCE INCREASED MUCH MORE COMPARED
#     XGBOOST, AS PREVIOUSLY (BEFORE TUNING) XGBOOST WAS PERFORMING BETTER