In [15]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, StratifiedKFold, RandomizedSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from imblearn.over_sampling import SMOTE
from imblearn.combine import SMOTETomek


In [16]:
from google.colab import drive
drive.mount('/content/drive')
# Load dataset
uber_data = pd.read_csv("/content/drive/MyDrive/uber request /Uber Request Data.csv")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [17]:
uber_data.head()

Unnamed: 0,Request id,Pickup point,Driver id,Status,Request timestamp,Drop timestamp
0,619,Airport,1.0,Trip Completed,11/7/2016 11:51,11/7/2016 13:00
1,867,Airport,1.0,Trip Completed,11/7/2016 17:57,11/7/2016 18:47
2,1807,City,1.0,Trip Completed,12/7/2016 9:17,12/7/2016 9:58
3,2532,Airport,1.0,Trip Completed,12/7/2016 21:08,12/7/2016 22:03
4,3112,City,1.0,Trip Completed,13-07-2016 08:33:16,13-07-2016 09:25:47


In [18]:
uber_data.shape

(6745, 6)

In [19]:
# Convert timestamps to datetime
uber_data["Request timestamp"] = pd.to_datetime(uber_data["Request timestamp"], errors='coerce', dayfirst=True)
uber_data["Drop timestamp"] = pd.to_datetime(uber_data["Drop timestamp"], errors='coerce', dayfirst=True)

In [20]:
# checking for the null values
uber_data.isnull().sum()

Unnamed: 0,0
Request id,0
Pickup point,0
Driver id,2650
Status,0
Request timestamp,4071
Drop timestamp,5595


In [21]:
# Handle missing values
uber_data.loc[:, "Driver id"] = uber_data["Driver id"].fillna(-1)

# Define fraudulent cancellations
uber_data["Fraudulent Cancellation"] = ((uber_data["Status"] == "Cancelled") & (uber_data["Driver id"] != -1)).astype(int)

In [22]:
# Feature Engineering: Extract time-based features
uber_data["Request Hour"] = uber_data["Request timestamp"].dt.hour
uber_data["Request Weekday"] = uber_data["Request timestamp"].dt.weekday
uber_data["Time Bin"] = pd.cut(uber_data["Request Hour"], bins=[0, 6, 12, 18, 24], labels=[0, 1, 2, 3])

In [23]:
# Drop unnecessary columns
uber_data.drop(["Request id", "Request timestamp", "Drop timestamp", "Status"], axis=1, inplace=True)

In [None]:
uber_data.shape

(6745, 6)

In [None]:
uber_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6745 entries, 0 to 6744
Data columns (total 6 columns):
 #   Column                   Non-Null Count  Dtype   
---  ------                   --------------  -----   
 0   Pickup point             6745 non-null   object  
 1   Driver id                6745 non-null   float64 
 2   Fraudulent Cancellation  6745 non-null   int64   
 3   Request Hour             2674 non-null   float64 
 4   Request Weekday          2674 non-null   float64 
 5   Time Bin                 2642 non-null   category
dtypes: category(1), float64(3), int64(1), object(1)
memory usage: 270.4+ KB


In [27]:
# Encode categorical variables
label_enc = LabelEncoder()
uber_data["Pickup point"] = label_enc.fit_transform(uber_data["Pickup point"])
uber_data["Time Bin"] = label_enc.fit_transform(uber_data["Time Bin"])

In [28]:

# Define features and target
X = uber_data.drop("Fraudulent Cancellation", axis=1)
y = uber_data["Fraudulent Cancellation"]
# Handle missing values
X.fillna(X.median(), inplace=True)

# Handle class imbalance using SMOTE-Tomek (better than SMOTE)
smt = SMOTETomek(random_state=42)
X_resampled, y_resampled = smt.fit_resample(X, y)
# Split dataset
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42, stratify=y_resampled)

# Scale features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [29]:
# Train a Random Forest Classifier with better hyperparameters
rf = RandomForestClassifier(random_state=42, class_weight='balanced')

param_dist = {
    'n_estimators': [100, 200, 300],
    'max_depth': [5, 10, 15, None],
    'min_samples_split': [5, 10, 20],
    'min_samples_leaf': [2, 5, 10],
    'max_features': ['sqrt', 'log2']
}

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
random_search = RandomizedSearchCV(rf, param_dist, cv=skf, scoring='accuracy', n_jobs=-1, n_iter=20, random_state=42)
random_search.fit(X_train, y_train)


In [30]:

# Best model
best_rf = random_search.best_estimator_
y_train_pred = best_rf.predict(X_train)
y_test_pred = best_rf.predict(X_test)

# Evaluate model
train_accuracy = accuracy_score(y_train, y_train_pred)
test_accuracy = accuracy_score(y_test, y_test_pred)
report = classification_report(y_test, y_test_pred)

In [31]:
print(f"Best Model Accuracy on Training Data: {train_accuracy:.4f}")
print(f"Best Model Accuracy on Test Data: {test_accuracy:.4f}")
print("Classification Report:\n", report)

Best Model Accuracy on Training Data: 0.9091
Best Model Accuracy on Test Data: 0.8528
Classification Report:
               precision    recall  f1-score   support

           0       0.88      0.82      0.85      1084
           1       0.83      0.89      0.86      1083

    accuracy                           0.85      2167
   macro avg       0.85      0.85      0.85      2167
weighted avg       0.85      0.85      0.85      2167

