In [1]:
import pandas as pd
import numpy as np
from joblib import Parallel, delayed
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, r2_score
import xgboost as xgb
import catboost as cb
from sklearn.linear_model import SGDRegressor


In [2]:
# Load dataset
df = pd.read_csv("/Users/omarmedhat/Documents/Cellula ML/Week 5/Task/final_internship_data.csv")


In [3]:
df_sampled = df.sample(n=100000, random_state=42)  # Use only 100K rows


In [4]:
# Step 1: Data Preprocessing
# Drop irrelevant columns
irrelevant_columns = ["User ID", "User Name", "Driver Name", "key", "pickup_datetime", "Cluster"]
df = df.drop(columns=irrelevant_columns, errors='ignore')


In [5]:
# Remove unreasonable fare amounts
df = df[(df["fare_amount"] > 0) & (df["fare_amount"] < 10000)]


In [6]:
# Handle missing values
for col in df.select_dtypes(include=["float64", "int64"]).columns:
    df[col] = df[col].fillna(df[col].median())  # Fix inplace warning

for col in df.select_dtypes(include=["object"]).columns:
    df[col] = df[col].fillna("Unknown")  # Fix inplace warning

In [7]:
# Encode categorical variables
label_encoders = {}
categorical_cols = df.select_dtypes(include=["object"]).columns.tolist()
for col in categorical_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

In [8]:
# Feature Scaling
scaler = StandardScaler()
X = df.drop(columns=["fare_amount"])
y = df["fare_amount"]
X_scaled = scaler.fit_transform(X)


In [9]:
# Split data
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)


In [10]:
# Define models

models = {
    "Linear Regression": LinearRegression(),
    "Random Forest": RandomForestRegressor(n_estimators=50, max_depth=10, n_jobs=-1, random_state=42),  # Reduced complexity
    "Gradient Boosting": GradientBoostingRegressor(n_estimators=50, max_depth=5, random_state=42),  # Reduced complexity
    "XGBoost": xgb.XGBRegressor(n_estimators=50, max_depth=5, tree_method='hist', verbosity=0, random_state=42),  # Faster training
    "CatBoost": cb.CatBoostRegressor(n_estimators=50, depth=5, verbose=0, random_state=42),  # Reduced logging
    "SGD Regressor":  SGDRegressor(max_iter=1000, tol=1e-3, random_state=42)
}

In [11]:
def train_and_evaluate(name, model):
    print(f"Training {name}...")
    model.fit(X_train, y_train.copy())  # Ensure writable y_train
    y_pred = model.predict(X_test)
    r2 = r2_score(y_test, y_pred) * 100
    rmse = mean_squared_error(y_test, y_pred, squared=False)
    print(f"{name} R² Score: {r2:.2f}% | RMSE: {rmse:.2f}")
    return name, (r2, rmse)

results = {}
for name, model in models.items():
    results[name] = train_and_evaluate(name, model)

print("Final Results:", results)  # Check if results are being stored

Training Linear Regression...
Linear Regression R² Score: 33.84% | RMSE: 8.25
Training Random Forest...




Random Forest R² Score: 76.24% | RMSE: 4.94
Training Gradient Boosting...




Gradient Boosting R² Score: 76.26% | RMSE: 4.94
Training XGBoost...




XGBoost R² Score: 77.04% | RMSE: 4.86
Training CatBoost...




CatBoost R² Score: 76.93% | RMSE: 4.87
Training SGD Regressor...
SGD Regressor R² Score: -15069880476.49% | RMSE: 124446.22
Final Results: {'Linear Regression': ('Linear Regression', (33.83876218010128, 8.245720887022326)), 'Random Forest': ('Random Forest', (76.24076591387318, 4.941320655693121)), 'Gradient Boosting': ('Gradient Boosting', (76.26123927527685, 4.939191225800297)), 'XGBoost': ('XGBoost', (77.04033410698689, 4.857464117601402)), 'CatBoost': ('CatBoost', (76.92808563828203, 4.86932356974666)), 'SGD Regressor': ('SGD Regressor', (-15069880476.492352, 124446.22425635459))}




In [12]:
# Find the best model based on the highest R² score
best_model_name = max(results, key=lambda k: results[k][0])  # Select model with highest R²
print(f"Best Model: {best_model_name}")


Best Model: XGBoost


In [13]:
# Step 4: Hyperparameter Tuning (Only for Best Model)
if best_model_name in ["XGBoost", "Random Forest", "CatBoost"]:
    param_dist = {
        'n_estimators': [50, 100, 200],  # Reduced values for speed
        'learning_rate': [0.01, 0.03, 0.05],
        'max_depth': [5, 10, 15],  # Reduced depth to avoid overfitting
        'subsample': [0.6, 0.8, 1.0],  # Use a fraction of the data per tree
        'colsample_bytree': [0.6, 0.8, 1.0]  # Use a fraction of features per tree
    }

    best_model = models[best_model_name]
    random_search = RandomizedSearchCV(
        best_model, 
        param_distributions=param_dist, 
        n_iter=5,  # Keep this small to reduce training time
        cv=2,  # Reduce cross-validation folds
        scoring='r2', 
        n_jobs=-1,
        random_state=42  # Ensures reproducibility
    )

    random_search.fit(X_train, y_train)

    best_params = random_search.best_params_
    best_tuned_r2 = random_search.best_score_ * 100  # Convert to percentage
    print(f"Best Hyperparameters for {best_model_name}: {best_params}")
    print(f"Best Tuned R² Score: {best_tuned_r2:.2f}%")


Best Hyperparameters for XGBoost: {'subsample': 1.0, 'n_estimators': 200, 'max_depth': 10, 'learning_rate': 0.05, 'colsample_bytree': 0.8}
Best Tuned R² Score: 79.63%


In [14]:
# Step 5: Final Model Selection
print(f"Final Best Model: {best_model_name} with Tuned R² Score: {best_tuned_r2:.2f}%")


Final Best Model: XGBoost with Tuned R² Score: 79.63%
