# In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error

# Read uber.csv into a DataFrame; the relative path is resolved against the
# current working directory.
df = pd.read_csv(filepath_or_buffer="uber.csv")

# Create distance feature
def haversine(lon1, lat1, lon2, lat2, radius_km=6371.0):
    """Return the great-circle distance between two points (haversine formula).

    All arithmetic is element-wise NumPy, so each coordinate may be a scalar
    or any array-like that NumPy ufuncs accept; array-likes are broadcast
    against each other.

    Args:
        lon1: Longitude of the first point, in degrees.
        lat1: Latitude of the first point, in degrees.
        lon2: Longitude of the second point, in degrees.
        lat2: Latitude of the second point, in degrees.
        radius_km: Radius of the sphere. The result is expressed in the same
            unit; the default of 6371 is the value the original code used
            (Earth's mean radius in kilometres).

    Returns:
        The distance along the sphere's surface, with the same shape as the
        broadcast inputs. NaN inputs yield NaN outputs.
    """
    lon1, lat1, lon2, lat2 = (np.radians(v) for v in (lon1, lat1, lon2, lat2))

    half_dlat = (lat2 - lat1) / 2
    half_dlon = (lon2 - lon1) / 2
    a = np.sin(half_dlat) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(half_dlon) ** 2

    # For (nearly) antipodal points rounding can push `a` a hair above 1,
    # which would make arcsin return NaN. Cap it at 1; np.minimum still
    # propagates NaN so missing coordinates remain NaN.
    a = np.minimum(a, 1.0)

    central_angle = 2 * np.arcsin(np.sqrt(a))
    return radius_km * central_angle

# Trip length in kilometres, computed row-wise from the pickup and dropoff
# coordinates (argument order expected by haversine: lon, lat, lon, lat).
coordinate_columns = (
    "pickup_longitude",
    "pickup_latitude",
    "dropoff_longitude",
    "dropoff_latitude",
)
df["distance_km"] = haversine(*(df[column] for column in coordinate_columns))

# Keep only rows with a positive fare and a distance strictly between 0 and
# 100 km. Comparisons against NaN evaluate to False, so rows whose fare or
# distance is missing are dropped by the same masks.
has_positive_fare = df["fare_amount"] > 0
has_plausible_distance = (df["distance_km"] > 0) & (df["distance_km"] < 100)
df = df[has_positive_fare & has_plausible_distance]

# Single-feature design matrix (kept 2-D by selecting with a list) and the
# fare column as the regression target.
feature_columns = ["distance_km"]
X = df[feature_columns]
y = df["fare_amount"]

# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Linear Regression: fit on the training split, predict on the held-out split.
lr = LinearRegression()
lr.fit(X_train, y_train)
pred_lr = lr.predict(X_test)

# Random Forest: same fit/predict flow. The estimator is seeded so its
# randomised tree construction is repeatable; without random_state the
# reported scores change from run to run even though the train/test split
# itself is seeded.
rf = RandomForestRegressor(random_state=42)
rf.fit(X_train, y_train)
pred_rf = rf.predict(X_test)

# Report R^2 and RMSE on the held-out split for each model.
# RMSE is computed as sqrt(MSE) instead of passing squared=False: that keyword
# was deprecated in scikit-learn 1.4 and removed in 1.6, where it raises
# TypeError. Taking the square root works on every version.
for model_label, predictions in (
    ("Linear Regression", pred_lr),
    ("Random Forest", pred_rf),
):
    print(f"{model_label} R2:", r2_score(y_test, predictions))
    print(f"{model_label} RMSE:", np.sqrt(mean_squared_error(y_test, predictions)))
