In [1]:
import numpy as np 
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV


In [2]:
chunk_size = 100000  # rows parsed per chunk
max_chunks = 10      # stop after this many chunks (caps the sample at chunk_size * max_chunks rows)
chunks = []

use_cols = ['fare_amount', 'pickup_datetime', 'pickup_longitude', 'pickup_latitude',
            'dropoff_longitude', 'dropoff_latitude', 'passenger_count']

# Stream the raw CSV chunk by chunk instead of loading it whole. The reader is used as a
# context manager so the file handle is released even though iteration stops early.
with pd.read_csv(r'C:\Users\himan\OneDrive\Desktop\Taxi-Fare-Prediciton\data\taxi.csv',
                 usecols=use_cols, chunksize=chunk_size) as reader:
    for chunk in reader:
        chunks.append(chunk)
        if len(chunks) >= max_chunks:
            break


In [3]:
# Stitch the buffered chunks into one frame with a fresh 0..n-1 index,
# report how many rows were loaded, and preview the first few.
df = pd.concat(chunks, ignore_index=True)
print("Loaded rows:", len(df))
df.head()

Loaded rows: 974995


Unnamed: 0,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,4.5,2009-06-15 17:26:21+00:00,-73.844311,40.721319,-73.84161,40.712278,1
1,16.9,2010-01-05 16:52:16+00:00,-74.016048,40.711303,-73.979268,40.782004,1
2,5.7,2011-08-18 00:35:00+00:00,-73.982738,40.76127,-73.991242,40.750562,2
3,7.7,2012-04-21 04:30:42+00:00,-73.98713,40.733143,-73.991567,40.758092,1
4,5.3,2010-03-09 07:51:00+00:00,-73.968095,40.768008,-73.956655,40.783762,1


In [4]:
# Only the last expression of a cell is rendered automatically, so the shape and the
# descriptive statistics are shown explicitly instead of being evaluated and discarded.
print("Shape:", df.shape)
df.info()
display(df.describe())
df.isna().sum()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 974995 entries, 0 to 974994
Data columns (total 7 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   fare_amount        974995 non-null  float64
 1   pickup_datetime    974995 non-null  object 
 2   pickup_longitude   974995 non-null  float64
 3   pickup_latitude    974995 non-null  float64
 4   dropoff_longitude  974995 non-null  float64
 5   dropoff_latitude   974995 non-null  float64
 6   passenger_count    974995 non-null  int64  
dtypes: float64(5), int64(1), object(1)
memory usage: 52.1+ MB


fare_amount          0
pickup_datetime      0
pickup_longitude     0
pickup_latitude      0
dropoff_longitude    0
dropoff_latitude     0
passenger_count      0
dtype: int64

In [5]:
# Remove rows that contain null values; dropping whole rows is acceptable because nulls are rare.
# Assigning the result (instead of inplace=True) keeps the step explicit and chainable.
df = df.dropna()

The dataset has the columns fare_amount, pickup_datetime, pickup_longitude, pickup_latitude, dropoff_longitude, dropoff_latitude, and passenger_count. We first convert the pickup_datetime column to a datetime format using pd.to_datetime(). We then clean the data by removing missing values, invalid entries (such as negative fares or bad coordinates), and outliers to prepare it for analysis.

In [7]:
# Convert pickup_datetime to datetime, e.g. "2009-06-15 17:26:21 UTC" -> 2009-06-15 17:26:21+00:00.
# errors='coerce' turns invalid values into NaT; those rows are then removed.
df['pickup_datetime'] = pd.to_datetime(df['pickup_datetime'], errors='coerce')
df = df[df['pickup_datetime'].notna()]

In [8]:
# Preview the first rows to check the parsed pickup_datetime values.
df.head()

Unnamed: 0,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,4.5,2009-06-15 17:26:21+00:00,-73.844311,40.721319,-73.84161,40.712278,1
1,16.9,2010-01-05 16:52:16+00:00,-74.016048,40.711303,-73.979268,40.782004,1
2,5.7,2011-08-18 00:35:00+00:00,-73.982738,40.76127,-73.991242,40.750562,2
3,7.7,2012-04-21 04:30:42+00:00,-73.98713,40.733143,-73.991567,40.758092,1
4,5.3,2010-03-09 07:51:00+00:00,-73.968095,40.768008,-73.956655,40.783762,1


In [9]:
# List the distinct passenger counts present in the data.
df['passenger_count'].unique()

array([1, 2, 3, 6, 5, 4], dtype=int64)

In [10]:
# Keep only rides with a positive fare and a passenger count above 0 and at most 6.
has_positive_fare = df['fare_amount'].gt(0)
has_valid_passengers = df['passenger_count'].gt(0) & df['passenger_count'].le(6)
df = df[has_positive_fare & has_valid_passengers]

In [11]:
# Remove out-of-range NYC lat/lon: a trip is kept only when both its pickup and its
# dropoff fall inside the bounding box (bounds are inclusive).
lat_range = (40.5, 41)
lon_range = (-74.5, -73)

pickup_in_box = (
    df['pickup_latitude'].between(*lat_range)
    & df['pickup_longitude'].between(*lon_range)
)
dropoff_in_box = (
    df['dropoff_latitude'].between(*lat_range)
    & df['dropoff_longitude'].between(*lon_range)
)
df = df[pickup_in_box & dropoff_in_box]


In [12]:
# Row/column counts after the fare, passenger and coordinate filters.
df.shape

(974995, 7)

In [13]:
# Save the cleaned data (index column omitted) so later steps can start from this file.
clean_csv_path = r'C:\Users\himan\OneDrive\Desktop\Taxi-Fare-Prediciton\data\clean.csv'
df.to_csv(clean_csv_path, index=False)

In [14]:
# Reload the cleaned data from disk. No date parsing is requested here, so
# pickup_datetime is read back as text.
clean_csv_path = r'C:\Users\himan\OneDrive\Desktop\Taxi-Fare-Prediciton\data\clean.csv'
df = pd.read_csv(clean_csv_path)

In [15]:
# Preview the frame reloaded from clean.csv.
df.head()

Unnamed: 0,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,4.5,2009-06-15 17:26:21+00:00,-73.844311,40.721319,-73.84161,40.712278,1
1,16.9,2010-01-05 16:52:16+00:00,-74.016048,40.711303,-73.979268,40.782004,1
2,5.7,2011-08-18 00:35:00+00:00,-73.982738,40.76127,-73.991242,40.750562,2
3,7.7,2012-04-21 04:30:42+00:00,-73.98713,40.733143,-73.991567,40.758092,1
4,5.3,2010-03-09 07:51:00+00:00,-73.968095,40.768008,-73.956655,40.783762,1


In [16]:
# Re-parse the timestamps, then derive calendar features from them.
df['pickup_datetime'] = pd.to_datetime(df['pickup_datetime'])
pickup_dt = df['pickup_datetime'].dt

df['hour'] = pickup_dt.hour
df['day'] = pickup_dt.day
df['month'] = pickup_dt.month

# dayofweek counts from Monday = 0, so 5 and 6 are Saturday and Sunday.
df['day_of_week'] = pickup_dt.dayofweek
df['is_weekend'] = df['day_of_week'].isin([5, 6]).astype(int)

In [17]:
def haversine_distance(lat1, lon1, lat2, lon2, radius_km=6371):
    """Great-circle distance between two points given in decimal degrees.

    Parameters
    ----------
    lat1, lon1 : float or array-like
        Latitude and longitude of the first point(s), in degrees.
    lat2, lon2 : float or array-like
        Latitude and longitude of the second point(s), in degrees.
    radius_km : float, default 6371
        Radius of the sphere. The default is the Earth's radius in kilometers;
        the result is expressed in the same unit as this radius.

    Returns
    -------
    float or array-like
        Distance measured along the sphere's surface, computed element-wise
        when arrays or pandas Series are passed.
    """
    lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])
    dlat = lat2 - lat1
    dlon = lon2 - lon1

    # Haversine of the central angle between the two points.
    a = np.sin(dlat / 2.0)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2.0)**2
    # Floating-point rounding can push `a` marginally above 1 for near-antipodal
    # points, which would make arcsin return NaN; cap it at 1.
    a = np.minimum(a, 1.0)
    c = 2 * np.arcsin(np.sqrt(a))
    return radius_km * c

# Trip length in kilometers between the pickup and dropoff coordinates of each ride.
df['distance_km'] = haversine_distance(
    lat1=df['pickup_latitude'],
    lon1=df['pickup_longitude'],
    lat2=df['dropoff_latitude'],
    lon2=df['dropoff_longitude'],
)


The Haversine formula calculates the shortest distance between two points on a sphere, measured along the surface, using their latitudes and longitudes. It is important for use in navigation. The haversine can be expressed as a trigonometric function:
$$\operatorname{haversine}(\theta) = \sin^2\left(\frac{\theta}{2}\right)$$
The haversine of the central angle (which is $d/r$, where $d$ is the distance and $r$ is the radius of the sphere) is calculated by the following formula, where $\Phi_1, \Phi_2$ are the latitudes and $\lambda_1, \lambda_2$ the longitudes of the two points:
$$\operatorname{haversine}\left(\frac{d}{r}\right) = \operatorname{haversine}(\Phi_2 - \Phi_1) + \cos(\Phi_1)\cos(\Phi_2)\,\operatorname{haversine}(\lambda_2 - \lambda_1)$$

In [19]:
# Preview the frame with the new time and distance feature columns.
df.head()

Unnamed: 0,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,hour,day,month,day_of_week,is_weekend,distance_km
0,4.5,2009-06-15 17:26:21+00:00,-73.844311,40.721319,-73.84161,40.712278,1,17,15,6,0,0,1.030764
1,16.9,2010-01-05 16:52:16+00:00,-74.016048,40.711303,-73.979268,40.782004,1,16,5,1,1,0,8.450134
2,5.7,2011-08-18 00:35:00+00:00,-73.982738,40.76127,-73.991242,40.750562,2,0,18,8,3,0,1.389525
3,7.7,2012-04-21 04:30:42+00:00,-73.98713,40.733143,-73.991567,40.758092,1,4,21,4,5,1,2.79927
4,5.3,2010-03-09 07:51:00+00:00,-73.968095,40.768008,-73.956655,40.783762,1,7,9,3,1,0,1.999157


Removing trips with a distance of 0.1 km or less

In [21]:
# Keep only trips whose distance is strictly above 0.1 km.
is_long_enough = df['distance_km'].gt(0.1)
df = df[is_long_enough]

In [22]:
# Save the feature-engineered data (index column omitted) for the modelling step.
featured_csv_path = r'C:\Users\himan\OneDrive\Desktop\Taxi-Fare-Prediciton\data\featured_data.csv'
df.to_csv(featured_csv_path, index=False)


# Training the model

In [24]:

# Start the modelling step from the saved feature file.
featured_csv_path = r'C:\Users\himan\OneDrive\Desktop\Taxi-Fare-Prediciton\data\featured_data.csv'
df = pd.read_csv(featured_csv_path)

# Model inputs (engineered columns only) and the regression target.
features = [
    'passenger_count',
    'hour',
    'day',
    'month',
    'day_of_week',
    'is_weekend',
    'distance_km',
]
X, y = df[features], df['fare_amount']

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [25]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Baseline model: a linear regression fitted on the training split.
Linear = LinearRegression()
Linear.fit(X_train, y_train)
y_pred_linear = Linear.predict(X_test)

# RMSE is computed as the square root of the MSE. The `squared=False` shortcut of
# mean_squared_error is deprecated since scikit-learn 1.4 and removed in 1.6, where
# it raises a TypeError.
rmse_lr = np.sqrt(mean_squared_error(y_test, y_pred_linear))
print(f"Linear Regression RMSE: {rmse_lr:.2f}")


Linear Regression RMSE: 4.15




In [26]:
# Random forest tuned with a small grid search.
rf = RandomForestRegressor(random_state=42)

# 2 x 2 x 2 = 8 hyper-parameter combinations.
param_grid = {
    'n_estimators': [10, 50],
    'max_depth': [10, 20],
    'min_samples_split': [2, 5]
}

# 3-fold cross-validation over the grid, scored by (negated) RMSE, on all available cores.
grid_search = GridSearchCV( estimator=rf, param_grid=param_grid,
    cv=3,
    n_jobs=-1,
    scoring='neg_root_mean_squared_error',
    verbose=1
)

print("Starting model training...")

grid_search.fit(X_train, y_train)

# Estimator with the best cross-validated score.
best_rf = grid_search.best_estimator_


# Evaluate on the held-out test split. RMSE is computed as sqrt(MSE) because the
# `squared=False` argument of mean_squared_error is deprecated since scikit-learn 1.4
# and removed in 1.6.
y_pred_rf = best_rf.predict(X_test)
rmse_rf = np.sqrt(mean_squared_error(y_test, y_pred_rf))

print(f"Random Forest RMSE: {rmse_rf:.2f}")
print(f"Best Parameters: {grid_search.best_params_}")


Starting model training...
Fitting 3 folds for each of 8 candidates, totalling 24 fits
Random Forest RMSE: 4.01
Best Parameters: {'max_depth': 10, 'min_samples_split': 5, 'n_estimators': 50}




In [33]:
import joblib

# Persist the tuned random forest to disk so it can be reused without retraining.
model_path = r'C:\Users\himan\OneDrive\Desktop\Taxi-Fare-Prediciton\models\random_forest_model.pkl'
joblib.dump(best_rf, model_path)
print("Model saved successfully.")

Model saved successfully.


In [39]:
# Load the model from file
loaded_model = joblib.load(r'C:\Users\himan\OneDrive\Desktop\Taxi-Fare-Prediciton\models\random_forest_model.pkl')

# Use it for prediction
y_pred_loaded = loaded_model.predict(X_test)


In [41]:
from sklearn.metrics import mean_squared_error

# Score the predictions of the model reloaded from disk (y_pred_loaded) rather than the
# in-memory predictions, so this cell actually checks that the saved model reproduces
# the earlier result. RMSE is computed as sqrt(MSE) because the `squared=False` argument
# of mean_squared_error is deprecated since scikit-learn 1.4 and removed in 1.6.
rmse_rf = np.sqrt(mean_squared_error(y_test, y_pred_loaded))
print(f"Random Forest RMSE: {rmse_rf:.2f}")


Random Forest RMSE: 4.01




In [54]:
# Distinct fare values, in order of first appearance, converted to a plain Python list.
distinct_fares = df['fare_amount'].unique()
u = distinct_fares.tolist()
print(u)

[4.5, 16.9, 5.7, 7.7, 5.3, 12.1, 7.5, 16.5, 9.0, 8.9, 4.1, 7.0, 12.5, 4.0, 10.5, 11.5, 4.9, 6.1, 7.3, 9.3, 5.5, 22.54, 31.9, 18.1, 58.0, 9.8, 8.1, 10.9, 17.5, 6.9, 3.3, 9.7, 8.0, 25.5, 15.3, 34.67, 13.7, 6.5, 5.0, 8.5, 3.7, 10.1, 15.07, 24.9, 17.0, 12.0, 11.0, 13.5, 14.0, 10.0, 14.1, 11.3, 12.9, 9.5, 35.0, 14.5, 32.5, 17.7, 19.0, 49.57, 22.5, 52.5, 16.0, 17.3, 21.5, 14.9, 35.5, 11.7, 23.0, 24.5, 6.0, 27.0, 16.1, 15.7, 26.9, 42.5, 15.5, 30.9, 6.7, 13.0, 18.0, 25.3, 57.33, 26.67, 25.0, 43.5, 8.7, 20.5, 52.83, 20.0, 26.5, 10.6, 18.5, 34.0, 23.5, 20.33, 33.07, 36.5, 33.83, 25.7, 29.0, 21.0, 32.9, 15.0, 49.8, 13.3, 56.8, 22.0, 24.0, 28.0, 38.5, 3.0, 38.8, 38.1, 52.0, 2.9, 38.83, 28.5, 66.3, 3.5, 19.7, 38.33, 18.9, 45.0, 31.2, 34.1, 35.07, 28.1, 23.7, 31.83, 25.450000000000003, 6.6, 53.0, 30.83, 31.5, 22.1, 29.3, 6.3, 49.15, 30.8, 36.1, 19.3, 9.4, 24.1, 14.7, 54.0, 6.2, 20.7, 31.87, 40.33, 29.5, 33.0, 20.9, 27.1, 48.33, 19.5, 32.83, 43.0, 30.1, 39.7, 28.27, 40.83, 55.83, 43.54, 21.7, 29.7, 3

In [66]:
# Test-set features side by side with the actual fare and the random forest prediction.
df_results = X_test.assign(
    actual_fare=y_test.values,
    predicted_fare=y_pred_rf,
)

# Save for Power BI
powerbi_csv_path = r'C:\Users\himan\OneDrive\Desktop\Taxi-Fare-Prediciton\data\powerbi_predictions.csv'
df_results.to_csv(powerbi_csv_path, index=False)
print("Exported Power BI data to powerbi_predictions.csv")


Exported Power BI data to powerbi_predictions.csv
