In [1]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from pathlib import Path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
data = Path('Resource/sampled_rideshare.csv')
df = pd.read_csv(data)
df.head()

Unnamed: 0,id,timestamp,hour,day,month,datetime,timezone,source,destination,cab_type,...,temperatureMinTime,temperatureMax,temperatureMaxTime,apparentTemperatureMin,apparentTemperatureMinTime,apparentTemperatureMax,apparentTemperatureMaxTime,weekday,day_of_week,is_weekend
0,10c861ba-0259-4307-be1c-7177201f21f5,1969-12-31 19:00:01.543322722-05:00,12,27,11,2018-11-27 12:45:22,America/New_York,north station,south station,uber,...,1543377600,46.89,1543320000,31.86,1543377600,43.85,1543320000,1,1,False
1,3dd287e1-7ea7-4dcb-8cbf-fb41b43f621d,1969-12-31 19:00:01.544813113-05:00,18,14,12,2018-12-14 18:45:12,America/New_York,haymarket square,theatre district,uber,...,1544781600,46.65,1544814000,24.51,1544785200,43.86,1544817600,4,4,False
2,c8db3ae3-b4db-46a0-b0ea-9e0119efb287,1969-12-31 19:00:01.543499283-05:00,13,29,11,2018-11-29 13:48:03,America/New_York,fenway,back bay,uber,...,1543550400,44.89,1543510800,31.25,1543550400,38.68,1543510800,3,3,False
3,2bb2794f-6a41-421b-971e-a5b10cbdaa0e,1969-12-31 19:00:01.543527183-05:00,21,29,11,2018-11-29 21:33:03,America/New_York,haymarket square,theatre district,lyft,...,1543550400,44.76,1543510800,30.85,1543550400,38.44,1543510800,3,3,False
4,a416d46b-4548-4b26-8b50-5b8437ffd63a,1969-12-31 19:00:01.543756982-05:00,13,2,12,2018-12-02 13:23:02,America/New_York,theatre district,haymarket square,uber,...,1543726800,52.86,1543788000,35.33,1543744800,52.19,1543788000,6,6,True


In [3]:
# Pivot data so we compare Uber and Lyft prices for the same trip
df_pivot = df.pivot_table(index=['timestamp', 'source', 'destination'], 
                          columns='cab_type', 
                          values='price')

# Create the target column (1 if Uber is cheaper, 0 if Lyft is cheaper)
df_pivot['cheaper_service'] = (df_pivot['uber'] < df_pivot['lyft']).astype(int)

# Merge back to the original dataframe
df = df.merge(df_pivot[['cheaper_service']], left_on=['timestamp', 'source', 'destination'], right_index=True)


In [15]:
features = ['hour', 'day_of_week', 'distance', 'month', 'precipIntensity', 'precipProbability', 'temperature', 'temperatureHigh', 'surge_multiplier']
target = 'cheaper_service'  # 1 if Uber is cheaper, 0 if Lyft is cheaper

# Split data into features (X) and target (y)
X = df[features]
y = df[target]

In [18]:
model_pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),  # Standardize numerical features
    ('classifier', LogisticRegression(class_weight='balanced', C=1.0))  # Classifier (you can choose another model here)
])

In [19]:
# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [20]:
# Apply SMOTE only to the training data
from imblearn.over_sampling import SMOTE
from collections import Counter
smote = SMOTE(sampling_strategy='auto', random_state=42)  
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Print class distribution before and after
print("Before SMOTE:", Counter(y_train))
print("After SMOTE:", Counter(y_train_resampled))

Before SMOTE: Counter({0: 81736, 1: 1432})
After SMOTE: Counter({0: 81736, 1: 81736})


In [21]:
# Train your model on the resampled data
# Train your model on the resampled data
model_pipeline.fit(X_train_resampled, y_train_resampled)

In [22]:
# Predict on test set
y_pred = model_pipeline.predict(X_test)


In [23]:
# Evaluate model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

Accuracy: 0.58


In [24]:
# Display confusion matrix and classification report
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))


Confusion Matrix:
[[11868  8556]
 [  147   222]]


In [25]:
print("\nClassification Report:")
print(classification_report(y_test, y_pred))


Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.58      0.73     20424
           1       0.03      0.60      0.05       369

    accuracy                           0.58     20793
   macro avg       0.51      0.59      0.39     20793
weighted avg       0.97      0.58      0.72     20793



In [26]:

# Evaluate model
y_pred = model_pipeline.predict(X_test)

# Print classification report
print("\nClassification Report:")
print(classification_report(y_test, y_pred))


Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.58      0.73     20424
           1       0.03      0.60      0.05       369

    accuracy                           0.58     20793
   macro avg       0.51      0.59      0.39     20793
weighted avg       0.97      0.58      0.72     20793

