<a href="https://colab.research.google.com/github/harshachourey2/TripFare-Taxi-Fare-Prediction/blob/main/03_Streamlit_Preparation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#Finalize Best Model + Hyperparameter Tuning + Streamlit UI

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


1. Load model-ready data
2. Hyperparameter tuning (GridSearchCV / RandomizedSearchCV)
3. Finalize best-performing model
4. Save trained model
5. Prepare Streamlit application code


##Import Required Libraries

In [2]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

import pickle


###Load Model-Ready Dataset

In [3]:
df = pd.read_csv("model_ready_data.csv")
df.head()


Unnamed: 0,VendorID,passenger_count,pickup_longitude,pickup_latitude,RatecodeID,store_and_fwd_flag,dropoff_longitude,dropoff_latitude,payment_type,fare_amount,...,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,pickup_hour,pickup_day,is_weekend,is_night,trip_duration_min
0,1,1,-73.976746,40.765152,1,0,-74.004265,40.746128,1,9.0,...,0.5,2.05,0.0,0.3,12.35,0,1,0,1,7.916667
1,1,1,-73.983482,40.767925,1,0,-74.005943,40.733166,1,11.0,...,0.5,3.05,0.0,0.3,15.35,0,1,0,1,11.1
2,2,2,-73.782021,40.64481,1,0,-73.974541,40.67577,1,54.5,...,0.5,8.0,0.0,0.3,63.8,0,1,0,1,31.1
3,1,1,-73.788773,40.647758,1,0,-73.829208,40.712345,3,20.5,...,0.5,0.0,0.0,0.3,21.8,0,1,0,1,16.05
4,1,1,-73.958221,40.764641,1,0,-73.967896,40.762901,1,5.5,...,0.5,2.0,0.0,0.3,8.8,0,1,0,1,4.983333


In [4]:
selected_features = [
    "passenger_count",
    "trip_duration_min",
    "pickup_hour",
    "is_weekend",
    "is_night"
]

X = df[selected_features]
y = df["total_amount"]


In [5]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


In [6]:
param_grid = {
    "n_estimators": [100, 200],
    "max_depth": [None, 10, 20],
    "min_samples_split": [2, 5],
    "min_samples_leaf": [1, 2]
}


In [7]:
rf = RandomForestRegressor(random_state=42)

grid_search = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    cv=3,
    scoring="r2",
    n_jobs=-1
)

# Handle NaN values in y_train before fitting
# Create a mask for non-NaN values in y_train
not_nan_mask = y_train.notna()

# Filter X_train and y_train based on the mask to remove rows with NaN in y_train
X_train_cleaned = X_train[not_nan_mask]
y_train_cleaned = y_train[not_nan_mask]

grid_search.fit(X_train_cleaned, y_train_cleaned)




In [8]:
grid_search.best_params_


{'max_depth': 10,
 'min_samples_leaf': 2,
 'min_samples_split': 5,
 'n_estimators': 200}

In [9]:
best_model = grid_search.best_estimator_
best_model


In [10]:
y_pred = best_model.predict(X_test)

r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

print("R2 Score:", r2)
print("MAE:", mae)
print("MSE:", mse)
print("RMSE:", rmse)


R2 Score: 0.7943520995914799
MAE: 2.7695074398147437
MSE: 26.950538116606793
RMSE: 5.1913907690142915


In [11]:
best_model.fit(X_train, y_train)
pickle.dump(best_model, open("final_taxi_fare_model.pkl", "wb"))


##Streamlit App Code (app.py)

In [18]:
best_model.n_features_in_

5

In [1]:
pip install streamlit pyngrok




In [3]:
!pip install pyngrok
from pyngrok import ngrok
ngrok.kill()
ngrok.set_auth_token("368liuocCKZG1vR7GmCld7eLRCa_6dLMc5qBt6FpXrKgdH3ff")
public_url = ngrok.connect(8501)
print(public_url)
!streamlit run app.py --server.port 8501 --server.address 0.0.0.0 &>/dev/null&

NgrokTunnel: "https://heedfully-brickish-stormy.ngrok-free.dev" -> "http://localhost:8501"
