In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor

# Load dataset from Colab's sample_data folder
df = pd.read_csv("/content/sample_data/final_internship_data.csv", low_memory=True)
df.head()

Unnamed: 0,User ID,User Name,Driver Name,Car Condition,Weather,Traffic Condition,key,fare_amount,pickup_datetime,pickup_longitude,...,month,weekday,year,jfk_dist,ewr_dist,lga_dist,sol_dist,nyc_dist,distance,bearing
0,KHVrEVlD,Kimberly Adams,Amy Butler,Very Good,windy,Congested Traffic,2009-06-15 17:26:21.0000001,4.5,2009-06-15 17:26:21,-1.288826,...,6,0,2009,20.26584,55.176046,14.342611,34.543548,27.572573,1.030764,-2.918897
1,lPxIuEri,Justin Tapia,Hannah Zimmerman,Excellent,cloudy,Flow Traffic,2010-01-05 16:52:16.0000002,16.9,2010-01-05 16:52:16,-1.291824,...,1,1,2010,44.667679,31.832358,23.130775,15.125872,8.755732,8.450134,-0.375217
2,gsVN8JLS,Elizabeth Lopez,Amanda Jackson,Bad,stormy,Congested Traffic,2011-08-18 00:35:00.00000049,5.7,2011-08-18 00:35:00,-1.291242,...,8,3,2011,43.597686,33.712082,19.865289,17.722624,9.847344,1.389525,2.599961
3,9I7kWFgd,Steven Wilson,Amy Horn,Very Good,stormy,Flow Traffic,2012-04-21 04:30:42.0000001,7.7,2012-04-21 04:30:42,-1.291319,...,4,5,2012,42.642965,32.556289,21.063132,15.738963,7.703421,2.79927,0.133905
4,8QN5ZaGN,Alexander Andrews,Cassandra Larson,Bad,stormy,Congested Traffic,2010-03-09 07:51:00.000000135,5.3,2010-03-09 07:51:00,-1.290987,...,3,1,2010,43.329953,39.406828,15.219339,23.732406,15.600745,1.999157,-0.502703


In [7]:
# Check for missing values and drop them
df.dropna(inplace=True)

In [8]:
# Convert categorical columns to numerical
categorical_cols = df.select_dtypes(include=["object"]).columns
for col in categorical_cols:
    df[col] = df[col].astype("category")

In [9]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [None]:
# One-Hot Encoding (Memory Efficient)
df = pd.get_dummies(df, columns=categorical_cols, drop_first=True, dtype="uint8")

# Define features (X) and target (y)
X = df.drop(columns=["fare_amount", "pickup_datetime"], errors="ignore")
y = df["fare_amount"]

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.metrics import r2_score

# ✅ Load dataset
df = pd.read_csv('/content/sample_data/final_internship_data.csv')  # Change to your file name

# ✅ Check for missing values
print("Missing values before handling:\n", df.isnull().sum())

# ✅ Fill missing values
df.fillna(df.median(numeric_only=True), inplace=True)  # Fill numerical NaNs with median
for col in ['Car Condition', 'Weather', 'Traffic Condition']:
    df[col].fillna(df[col].mode()[0], inplace=True)  # Fill categorical NaNs with mode

# ✅ Verify missing values are handled
print("Missing values after handling:\n", df.isnull().sum())

# ✅ Encode categorical variables
label_encoders = {}
for col in ['User Name', 'Driver Name', 'Car Condition', 'Weather', 'Traffic Condition']:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le  # Store encoders if needed later

# ✅ Select features (X) and target (y)
X = df.drop(['fare_amount', 'User ID', 'key', 'pickup_datetime'], axis=1)
y = df['fare_amount']

# ✅ Split dataset into train (80%) and test (20%)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# ✅ Define models
models = {
    "Linear Regression": LinearRegression(),
    "Random Forest": RandomForestRegressor(n_estimators=100, random_state=42),
    "Gradient Boosting": GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, random_state=42),
    "Support Vector Regression": SVR(kernel='rbf')
}

# ✅ Train & Evaluate Models
results = {}
for name, model in models.items():
    model.fit(X_train, y_train)  # Train the model
    y_pred = model.predict(X_test)  # Make predictions
    accuracy = r2_score(y_test, y_pred)  # Calculate R² score
    results[name] = accuracy
    print(f"{name} R² Score: {accuracy:.4f}")

# ✅ Find Best Model
best_model_name = max(results, key=results.get)
best_model_score = results[best_model_name]
print(f"\n🚀 Best Model: {best_model_name} with R² Score: {best_model_score:.4f}")

# ✅ Hyperparameter Tuning (For Best Model)
if best_model_name == "Random Forest":
    param_grid = {'n_estimators': [50, 100, 150], 'max_depth': [None, 10, 20]}
elif best_model_name == "Gradient Boosting":
    param_grid = {'n_estimators': [50, 100, 150], 'learning_rate': [0.01, 0.1, 0.2]}
elif best_model_name == "Support Vector Regression":
    param_grid = {'C': [0.1, 1, 10], 'epsilon': [0.01, 0.1, 0.5]}
else:
    param_grid = {}

if param_grid:
    print("\n🔍 Performing Hyperparameter Tuning...")
    grid_search = GridSearchCV(models[best_model_name], param_grid, cv=5, scoring='r2', n_jobs=-1)
    grid_search.fit(X_train, y_train)
    print(f"✅ Best Parameters for {best_model_name}: {grid_search.best_params_}")
    best_model_score = grid_search.best_score_

print(f"\n🎯 Final Best Model: {best_model_name} with R² Score: {best_model_score:.4f}")


Missing values before handling:
 User ID              0
User Name            0
Driver Name          0
Car Condition        0
Weather              0
Traffic Condition    0
key                  0
fare_amount          0
pickup_datetime      0
pickup_longitude     0
pickup_latitude      0
dropoff_longitude    5
dropoff_latitude     5
passenger_count      0
hour                 0
day                  0
month                0
weekday              0
year                 0
jfk_dist             5
ewr_dist             5
lga_dist             5
sol_dist             5
nyc_dist             5
distance             5
bearing              5
dtype: int64


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].mode()[0], inplace=True)  # Fill categorical NaNs with mode


Missing values after handling:
 User ID              0
User Name            0
Driver Name          0
Car Condition        0
Weather              0
Traffic Condition    0
key                  0
fare_amount          0
pickup_datetime      0
pickup_longitude     0
pickup_latitude      0
dropoff_longitude    0
dropoff_latitude     0
passenger_count      0
hour                 0
day                  0
month                0
weekday              0
year                 0
jfk_dist             0
ewr_dist             0
lga_dist             0
sol_dist             0
nyc_dist             0
distance             0
bearing              0
dtype: int64
Linear Regression R² Score: 0.2980
Random Forest R² Score: 0.7870
Gradient Boosting R² Score: 0.7761
