In [16]:
import numpy as np
import pandas as pd 
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import OrdinalEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.linear_model import LinearRegression
from scipy.stats import randint
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import accuracy_score, f1_score

In [26]:

# Load the cleaned dataset, drop the columns that are not used as features
# ('pickup_datetime' and 'key'), then drop any row with a missing value.
# The chained, non-inplace form builds `df` in a single expression.
df = (
    pd.read_csv("processed_data.csv")
    .drop(columns=['pickup_datetime', 'key'])
    .dropna()
)

# Define features and target
X = df.drop(columns=['fare_amount'])  # Features: every remaining column
y = df['fare_amount']  # Target

# Identify categorical and numerical columns.
# Every column that is not listed as categorical is treated as numerical.
categorical_features = ['Weather', 'Traffic Condition', 'Car Condition']
numerical_features = [col for col in X.columns if col not in categorical_features]

# Preprocessing: standard-scale the numerical columns and ordinal-encode the
# categorical ones (ordinal encoding is used instead of one-hot encoding to
# reduce memory use).
# handle_unknown/unknown_value: a category that was not seen during fit is
# encoded as -1 instead of raising, so a cross-validation fold whose training
# part lacks a category no longer fails at transform time.
preprocessor = ColumnTransformer([
    ('num', StandardScaler(), numerical_features),
    ('cat',
     OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1),
     categorical_features)
])

# Split the dataset (80% train / 20% test) with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# # Load your dataset
# df = pd.read_csv("final_internship_data.csv")

# # Drop identifier columns and the original datetime column
# df.drop(columns=['User ID', 'User Name', 'Driver Name', 'pickup_datetime'], inplace=True)
# df.drop(columns=['key'], inplace=True)
# # Drop any rows with missing values
# df.dropna(inplace=True)

# # Clip every numeric column to its 1.5 * IQR fences
# for column in df.select_dtypes(include=['float64', 'int64']):
#     q1 = df[column].quantile(0.25)
#     q3 = df[column].quantile(0.75)
#     iqr = q3 - q1
#     lower_bound = q1 - 1.5 * iqr
#     upper_bound = q3 + 1.5 * iqr
#     df[column] = df[column].apply(lambda x: min(upper_bound, max(lower_bound, x)))

# # Define features and target
# X = df.drop(columns=['fare_amount'])  # Features
# y = df['fare_amount']  # Target

# # Identify categorical and numerical columns
# categorical_features = ['Weather', 'Traffic Condition', 'Car Condition']
# numerical_features = [col for col in X.columns if col not in categorical_features]

# # Check for multicollinearity using VIF
# def calculate_vif(df, features):
#     vif_data = pd.DataFrame()
#     vif_data["feature"] = features
#     vif_data["VIF"] = [variance_inflation_factor(df[features].values, i) for i in range(len(features))]
#     return vif_data

# vif_data = calculate_vif(X, numerical_features)
# print(vif_data)

# # Remove features with VIF > 10
# high_vif_features = vif_data[vif_data["VIF"] > 10]["feature"].tolist()
# X.drop(columns=high_vif_features, inplace=True)
# numerical_features = [col for col in numerical_features if col not in high_vif_features]

# # Preprocessing: MinMax Scaling for numerical & Ordinal Encoding for categorical
# preprocessor = ColumnTransformer([
#     ('num', MinMaxScaler(), numerical_features),
#     ('cat', OrdinalEncoder(), categorical_features)
# ])

# # Split the dataset (80-20)
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [22]:

# ----------------------- LINEAR REGRESSION -----------------------

# Pipeline for Linear Regression: shared preprocessing followed by the model
lr_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', LinearRegression())
])

# 5-fold cross-validation on the training split.
# The scorer returns *negated* MAE (higher is better), so flip the sign
# to obtain the per-fold MAE before reporting it.
lr_scores = cross_val_score(lr_pipeline, X_train, y_train, cv=5, scoring='neg_mean_absolute_error')
cv_mae_lr = -lr_scores

# Fit on the full training split and predict on the held-out test split
lr_pipeline.fit(X_train, y_train)
y_pred_lr = lr_pipeline.predict(X_test)

# Regression metrics.
# RMSE is computed as sqrt(MSE): the `squared=False` argument of
# mean_squared_error is deprecated in scikit-learn 1.4 and removed in 1.6.
mae_lr = mean_absolute_error(y_test, y_pred_lr)
rmse_lr = np.sqrt(mean_squared_error(y_test, y_pred_lr))

# Binarize targets and predictions at the median of y_test so that
# classification-style metrics can be reported for a regressor.
# `threshold` and `y_test_class` are reused by the later model cells.
threshold = y_test.median()
y_pred_lr_class = (y_pred_lr >= threshold).astype(int)
y_test_class = (y_test >= threshold).astype(int)

# Accuracy and F1 Score on the binarized values
accuracy_lr = accuracy_score(y_test_class, y_pred_lr_class)
f1_lr = f1_score(y_test_class, y_pred_lr_class)

print(f"Linear Regression - CV MAE: {cv_mae_lr.mean()} (+/- {cv_mae_lr.std()})")
print(f"Linear Regression - MAE: {mae_lr}, RMSE: {rmse_lr}")
print(f"Linear Regression - Accuracy: {accuracy_lr}, F1 Score: {f1_lr}")

Linear Regression - MAE: 1.503137409082195, RMSE: 2.1209490676919125
Linear Regression - Accuracy: 0.8392184267138022, F1 Score: 0.8452387232920073




In [None]:
from scipy.stats import randint
from sklearn.model_selection import RandomizedSearchCV

# ---------------- FAST RANDOM FOREST (FIXED MEMORY) ----------------
# Search space for the randomized search. The 'model__' prefix routes each
# parameter to the 'model' step of the pipeline.
param_dist = {
    'model__n_estimators': randint(100, 200),
    'model__max_depth': [10, 20, None],
    'model__min_samples_split': randint(2, 10),
    'model__min_samples_leaf': randint(1, 4),
    'model__max_features': ['sqrt', 'log2']
}

# Pipeline for Random Forest: shared preprocessing followed by the model
rf_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', RandomForestRegressor(random_state=42, n_jobs=-1))
])

# Randomized Search for best parameters (reduced `n_iter=10` for speed).
# NOTE(review): both the forest and the search request n_jobs=-1; this nested
# parallelism may oversubscribe CPUs and raise peak memory - possibly related
# to the failed fit seen in earlier runs. Confirm, and consider parallelizing
# at one level only.
random_search = RandomizedSearchCV(
    rf_pipeline, param_distributions=param_dist,
    n_iter=10, cv=3, scoring='neg_mean_absolute_error',
    n_jobs=-1, verbose=1, random_state=42
)

# Tune on at most `subset_size` leading rows of the training split to save
# memory. `.iloc` makes the slice explicitly positional for both the frame
# and the target; if the split has fewer rows, all of them are used.
subset_size = 250000
random_search.fit(X_train.iloc[:subset_size], y_train.iloc[:subset_size])

# Best pipeline found by the search. With the default refit behaviour it was
# refit on the data passed to fit() above, i.e. the tuning subset only.
best_rf = random_search.best_estimator_

# Predictions on test set
y_pred_rf = best_rf.predict(X_test)

# Final Metrics.
# RMSE is computed as sqrt(MSE): the `squared=False` argument of
# mean_squared_error is deprecated in scikit-learn 1.4 and removed in 1.6.
mae_rf = mean_absolute_error(y_test, y_pred_rf)
rmse_rf = np.sqrt(mean_squared_error(y_test, y_pred_rf))

# Binarize predictions with the same threshold as the Linear Regression cell
# (`threshold` and `y_test_class` are defined there).
y_pred_rf_class = (y_pred_rf >= threshold).astype(int)

# Accuracy and F1 Score on the binarized values
accuracy_rf = accuracy_score(y_test_class, y_pred_rf_class)
f1_rf = f1_score(y_test_class, y_pred_rf_class)

# Report each result once
print(f"Best Random Forest Parameters: {random_search.best_params_}")
print(f"Random Forest (Optimized) - MAE: {mae_rf}, RMSE: {rmse_rf}")
print(f"Random Forest (Optimized) - Accuracy: {accuracy_rf}, F1 Score: {f1_rf}")

Fitting 3 folds for each of 10 candidates, totalling 30 fits


1 fits failed out of a total of 30.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
1 fits failed with the following error:
joblib.externals.loky.process_executor._RemoteTraceback: 
"""
Traceback (most recent call last):
  File "C:\Users\LENOVO\AppData\Roaming\Python\Python311\site-packages\joblib\_utils.py", line 72, in __call__
    return self.func(**kwargs)
           ^^^^^^^^^^^^^^^^^^^
  File "C:\Users\LENOVO\AppData\Roaming\Python\Python311\site-packages\joblib\parallel.py", line 598, in __call__
    return [func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\LENOVO\AppData\Roaming\Python\Python311\site-packages\joblib\parallel.py", line 598, in <listcomp>
    return [func(*args, **kwargs)
            ^^^^^^^^^

Best Random Forest Parameters: {'model__max_depth': None, 'model__max_features': 'log2', 'model__min_samples_leaf': 3, 'model__min_samples_split': 5, 'model__n_estimators': 163}
Random Forest (Optimized) - MAE: 1.1819829275781135, RMSE: 1.7629483210635255
Best Random Forest Parameters: {'model__max_depth': None, 'model__max_features': 'log2', 'model__min_samples_leaf': 3, 'model__min_samples_split': 5, 'model__n_estimators': 163}
Random Forest (Optimized) - MAE: 1.1819829275781135, RMSE: 1.7629483210635255
Random Forest (Optimized) - Accuracy: 0.8767761472163987, F1 Score: 0.8802163112537794




In [23]:
import xgboost as xgb

# Apply the shared preprocessing (scaling + categorical encoding).
# The preprocessor is fitted on the training split only and then applied
# unchanged to the test split.
X_train_encoded = preprocessor.fit_transform(X_train)
X_test_encoded = preprocessor.transform(X_test)

# XGBoost model with fixed (untuned) hyperparameters
xgb_model = xgb.XGBRegressor(n_estimators=100, max_depth=6, learning_rate=0.1, random_state=42, n_jobs=-1)
xgb_model.fit(X_train_encoded, y_train)

# Predictions
y_pred_xgb = xgb_model.predict(X_test_encoded)

# Evaluation.
# RMSE is computed as sqrt(MSE): the `squared=False` argument of
# mean_squared_error is deprecated in scikit-learn 1.4 and removed in 1.6.
mae_xgb = mean_absolute_error(y_test, y_pred_xgb)
rmse_xgb = np.sqrt(mean_squared_error(y_test, y_pred_xgb))

print(f"XGBoost - MAE: {mae_xgb}, RMSE: {rmse_xgb}")

# Binarize predictions with the same threshold as the Linear Regression cell
# (`threshold` and `y_test_class` are defined there).
y_pred_xgb_class = (y_pred_xgb >= threshold).astype(int)

# Accuracy and F1 Score on the binarized values
accuracy_xgb = accuracy_score(y_test_class, y_pred_xgb_class)
f1_xgb = f1_score(y_test_class, y_pred_xgb_class)

print(f"XGBoost - Accuracy: {accuracy_xgb}, F1 Score: {f1_xgb}")


XGBoost - MAE: 1.141176266577775, RMSE: 1.7278117048109183
XGBoost - Accuracy: 0.8792836491689617, F1 Score: 0.8822444396920445




In [24]:
import lightgbm as lgb

# Apply the shared preprocessing (scaling + categorical encoding).
# The preprocessor is fitted on the training split only and then applied
# unchanged to the test split.
X_train_encoded = preprocessor.fit_transform(X_train)
X_test_encoded = preprocessor.transform(X_test)

# LightGBM model with fixed (untuned) hyperparameters
lgb_model = lgb.LGBMRegressor(n_estimators=100, max_depth=6, learning_rate=0.1, random_state=42, n_jobs=-1)
lgb_model.fit(X_train_encoded, y_train)

# Predictions
y_pred_lgb = lgb_model.predict(X_test_encoded)

# Evaluation.
# RMSE is computed as sqrt(MSE): the `squared=False` argument of
# mean_squared_error is deprecated in scikit-learn 1.4 and removed in 1.6.
mae_lgb = mean_absolute_error(y_test, y_pred_lgb)
rmse_lgb = np.sqrt(mean_squared_error(y_test, y_pred_lgb))

print(f"LightGBM - MAE: {mae_lgb}, RMSE: {rmse_lgb}")

# Binarize predictions with the same threshold as the Linear Regression cell
# (`threshold` and `y_test_class` are defined there).
y_pred_lgb_class = (y_pred_lgb >= threshold).astype(int)

# Accuracy and F1 Score on the binarized values
accuracy_lgb = accuracy_score(y_test_class, y_pred_lgb_class)
f1_lgb = f1_score(y_test_class, y_pred_lgb_class)

print(f"LightGBM - Accuracy: {accuracy_lgb}, F1 Score: {f1_lgb}")


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.042930 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2908
[LightGBM] [Info] Number of data points in the train set: 291920, number of used features: 20
[LightGBM] [Info] Start training from score 8.603115
LightGBM - MAE: 1.1491589450057116, RMSE: 1.737150668473005
LightGBM - Accuracy: 0.8784067085953878, F1 Score: 0.8813001605136437


