The goal of this model is to predict whether a customer is a generous tipper, defined here as tipping at least 20% of the fare amount.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import (
    classification_report, confusion_matrix,
    accuracy_score, precision_score, recall_score, f1_score,
    ConfusionMatrixDisplay, RocCurveDisplay
)
from sklearn.linear_model import LogisticRegression

# Load the dataset
df = pd.read_csv('taxidataset.csv')

# Keep only rows with a positive fare: the tip ratio below divides by
# fare_amount. Take an explicit copy so the column assignments that follow
# write to an independent frame rather than to a filtered selection of the
# raw data (the pattern pandas may flag with SettingWithCopyWarning).
df = df[df['fare_amount'] > 0].copy()

# Define target: is_generous is True when the tip is at least 20% of the fare
df['is_generous'] = (df['tip_amount'] / df['fare_amount']) >= 0.2

# Convert pickup and dropoff time to datetime
df['tpep_pickup_datetime'] = pd.to_datetime(df['tpep_pickup_datetime'])
df['tpep_dropoff_datetime'] = pd.to_datetime(df['tpep_dropoff_datetime'])

# Time-based features
# trip_duration is expressed in minutes (total seconds / 60)
df['trip_duration'] = (df['tpep_dropoff_datetime'] - df['tpep_pickup_datetime']).dt.total_seconds() / 60
df['pickup_hour'] = df['tpep_pickup_datetime'].dt.hour
# pandas numbers days Monday=0 ... Sunday=6, so 5 and 6 are Saturday and Sunday
df['pickup_day_of_week'] = df['tpep_pickup_datetime'].dt.dayofweek
df['is_weekend'] = df['pickup_day_of_week'].isin([5, 6])

# Drop columns that are not used as features; the raw timestamps have
# already been replaced by the derived time features above
df_cleaned = df.drop(columns=['Unnamed: 0', 'tpep_pickup_datetime', 'tpep_dropoff_datetime', 'store_and_fwd_flag'])

# One-hot encode categorical features; drop_first=True omits the first level
# of each variable so the remaining dummies are not redundant
df_encoded = pd.get_dummies(df_cleaned, columns=['RatecodeID', 'payment_type', 'PULocationID', 'DOLocationID'], drop_first=True)

# Select features and target.
# The target and the two columns it is computed from (tip_amount, fare_amount)
# are excluded from the feature set. total_amount is excluded as well --
# presumably because it includes the tip; confirm against the data dictionary.
features_to_exclude = ['tip_amount', 'total_amount', 'fare_amount', 'is_generous']
feature_columns = [col for col in df_encoded.columns if col not in features_to_exclude]

X = df_encoded[feature_columns]
y = df_encoded['is_generous']

# Split the dataset: 20% held out for testing, fixed seed for a reproducible split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the result extraction function
def make_results(model_name: str, model_object, metric: str) -> pd.DataFrame:
    """Summarise the best cross-validation row of a fitted multi-metric search.

    The row of ``model_object.cv_results_`` with the highest mean test score
    for ``metric`` is selected, and its mean test precision, recall, F1 and
    accuracy are returned together with ``model_name``.

    Args:
        model_name: Label written to the ``model`` column of the result.
        model_object: Fitted search object exposing ``cv_results_`` with the
            columns ``mean_test_precision``, ``mean_test_recall``,
            ``mean_test_f1`` and ``mean_test_accuracy``.
        metric: Metric used to pick the best row; one of ``'precision'``,
            ``'recall'``, ``'f1'`` or ``'accuracy'``.

    Returns:
        A one-row DataFrame with columns ``model``, ``precision``,
        ``recall``, ``F1`` and ``accuracy``.

    Raises:
        KeyError: If ``metric`` is not one of the four supported names, or if
            ``cv_results_`` lacks one of the expected ``mean_test_*`` columns.
    """
    metric_dict = {
        'precision': 'mean_test_precision',
        'recall': 'mean_test_recall',
        'f1': 'mean_test_f1',
        'accuracy': 'mean_test_accuracy',
    }

    cv_results = pd.DataFrame(model_object.cv_results_)

    # idxmax() returns an index *label*, so the row is fetched with the
    # label-based .loc rather than the positional .iloc.
    best_row_label = cv_results[metric_dict[metric]].idxmax()
    best_estimator_results = cv_results.loc[best_row_label]

    table = pd.DataFrame({
        'model': [model_name],
        'precision': [best_estimator_results['mean_test_precision']],
        'recall': [best_estimator_results['mean_test_recall']],
        'F1': [best_estimator_results['mean_test_f1']],
        'accuracy': [best_estimator_results['mean_test_accuracy']],
    })

    return table

# Logistic regression tuned with a cross-validated grid search
log_reg = LogisticRegression(max_iter=1000)

# Hyperparameter grid: four candidate values of C, liblinear solver only
log_reg_params = dict(C=[0.01, 0.1, 1, 10], solver=['liblinear'])

# Every metric is registered under its own scorer name, which is what yields
# the mean_test_<name> columns that make_results reads from cv_results_
scoring = {
    metric_name: metric_name
    for metric_name in ('precision', 'recall', 'f1', 'accuracy')
}

# 5-fold search scored on all four metrics; the final model is refit on the
# full training set using the parameters with the best F1
log_reg_grid = GridSearchCV(
    estimator=log_reg,
    param_grid=log_reg_params,
    scoring=scoring,
    refit='f1',
    cv=5,
    return_train_score=True,
)
log_reg_grid.fit(X_train, y_train)

# Report the cross-validation scores of the best-F1 parameter combination
log_reg_results = make_results("Logistic Regression", log_reg_grid, metric="f1")
print(log_reg_results)


                 model  precision    recall        F1  accuracy
0  Logistic Regression   0.725817  0.996612  0.839884  0.814474
