Predict the price of an Uber ride from a given pickup point to the agreed drop-off location. Perform
the following tasks:
1. Pre-process the dataset.
2. Identify outliers.
3. Check the correlation.
4. Implement a linear regression model.
5. Evaluate the model using R2, RMSE, etc.

Use Uber Dataset:
[Uber.csv](https://www.kaggle.com/datasets/yasserh/uber-fares-dataset)

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as  plt
import seaborn as sns
from google.colab import files

# Upload the dataset through Colab's file picker (google.colab is only
# available when the notebook runs inside Colab).
uploaded = files.upload()

In [None]:
# Load the dataset from the working directory. The file name is hard-coded,
# so the uploaded file is expected to be called 'uber.csv'.
data = pd.read_csv('uber.csv')

# Preview the first rows to confirm the load worked.
print(data.head())

# Data Preprocessing

In [None]:
# Drop irrelevant columns
data = data.drop(columns=["Unnamed: 0", "key"])

# Count missing values per column on the raw data (displayed at the end of the cell)
missing_values = data.isnull().sum()

# Parse pickup_datetime BEFORE dropping rows: errors='coerce' turns unparseable
# timestamps into NaT, and parsing first lets dropna() remove those rows too
# instead of leaving NaT/NaN behind in the derived time features.
data['pickup_datetime'] = pd.to_datetime(data['pickup_datetime'], errors='coerce')

# Drop rows with missing values (as they are minimal)
data = data.dropna()

# Extract useful time features from the parsed timestamp
data['pickup_day'] = data['pickup_datetime'].dt.day
data['pickup_hour'] = data['pickup_datetime'].dt.hour
data['pickup_dayofweek'] = data['pickup_datetime'].dt.dayofweek

# pickup_datetime is kept on purpose: later cells read it again to derive
# week_day, Year, Month and Hour before dropping it.

# Verify the preprocessing
data.head(), missing_values


# Summary statistics and null check (null rows were dropped above)

In [None]:
# Summary statistics, transposed so that each described column becomes a row.
data.describe().T

In [None]:
# Per-column count of missing values remaining after preprocessing.
data.isna().sum()

# Boxplots

In [None]:
# Draw one box plot per non-object column, each on its own figure.
for column_name in data.select_dtypes(exclude=['object']).columns:
    plt.figure()
    sns.boxplot(x=column_name, data=data)

# Dropping outliers

In [None]:
# Keep only rows with coordinates strictly inside the valid latitude/longitude
# ranges, a positive fare, and a passenger count strictly between 0 and 50.
# The symmetric coordinate bounds are expressed through the absolute value.
data = data.loc[
    data['pickup_latitude'].abs().lt(90)
    & data['dropoff_latitude'].abs().lt(90)
    & data['pickup_longitude'].abs().lt(180)
    & data['dropoff_longitude'].abs().lt(180)
    & data['fare_amount'].gt(0)
    & data['passenger_count'].gt(0)
    & data['passenger_count'].lt(50)
]

# Calculating Distance

In [None]:
from math import cos, asin, sqrt, pi

def distance(lat_1,lon_1,lat_2,lon_2):
    """Return the great-circle (haversine) distance in kilometres.

    Works element-wise, so each argument may be a scalar, a NumPy array or a
    pandas Series of coordinates given in degrees.

    Args:
        lat_1: latitude(s) of the first point, in degrees.
        lon_1: longitude(s) of the first point, in degrees.
        lat_2: latitude(s) of the second point, in degrees.
        lon_2: longitude(s) of the second point, in degrees.

    Returns:
        Distance(s) in km, computed with an Earth radius of 6371 km.
    """
    earth_radius_km = 6371

    # Degrees to radians
    lon_1, lon_2, lat_1, lat_2 = map(np.radians, [lon_1, lon_2, lat_1, lat_2])

    diff_lon = lon_2 - lon_1
    diff_lat = lat_2 - lat_1

    haversine = (
        np.sin(diff_lat / 2.0) ** 2
        + np.cos(lat_1) * np.cos(lat_2) * np.sin(diff_lon / 2.0) ** 2
    )

    # Floating-point rounding can push the term marginally above 1 for
    # near-antipodal points, which would make arcsin return NaN; cap it at 1.
    km = 2 * earth_radius_km * np.arcsin(np.minimum(1.0, np.sqrt(haversine)))

    return km

In [None]:
# Trip distance in km for every row (the coordinate columns are passed as whole Series).
temp = distance(data['pickup_latitude'],data['pickup_longitude'],data['dropoff_latitude'],data['dropoff_longitude'])
temp.head()

In [None]:
# Attach the computed distances as a 'Distance' column on a copy of the frame,
# then make that copy the working frame.
data_new = data.assign(Distance=temp)
data = data_new
data.head()

In [None]:
# Box plot of the computed distances to spot extreme values.
sns.boxplot(data=data,x='Distance')

In [None]:
# Keep only trips with a plausible distance (strictly between 0 and 200 km).
# .copy() makes the result an independent frame, so the column assignments in
# the following cells do not raise pandas' SettingWithCopyWarning.
data = data[(data['Distance'] > 0) & (data['Distance'] < 200)].copy()

# Extracting date and time features

In [None]:
# Ensure pickup_datetime is a datetime column. It was already parsed in the
# preprocessing cell, so this is a re-conversion of already-parsed values.
data['pickup_datetime'] = pd.to_datetime(data['pickup_datetime'])

In [None]:
# Derive calendar features from the pickup timestamp.
data['week_day'] = data['pickup_datetime'].dt.day_name()  # day name as a string, e.g. 'Monday'
data['Year'] = data['pickup_datetime'].dt.year
data['Month'] = data['pickup_datetime'].dt.month
data['Hour'] = data['pickup_datetime'].dt.hour

In [None]:
# Remove the raw timestamp and coordinates now that Distance and the time
# features have been derived from them (modifies `data` in place).
data.drop(columns=['pickup_datetime','pickup_latitude','pickup_longitude','dropoff_latitude','dropoff_longitude'],inplace=True)

In [None]:
# Inspect the frame after dropping the raw columns.
data.head()

In [None]:
# Snapshot of the frame: the encoders below read the raw 'week_day' and 'Hour'
# values from this copy while the same columns in `data` are overwritten.
temp = data.copy()

def convert_week_day(day):
    """Encode a day name as a binary flag.

    Returns 0 for 'Monday' through 'Thursday' and 1 for any other value.
    NOTE(review): 'Friday' therefore falls in the 1 ("weekend") group;
    confirm this grouping is intended.
    """
    midweek_days = ('Monday', 'Tuesday', 'Wednesday', 'Thursday')
    return 0 if day in midweek_days else 1

def convert_hour(hour):
    """Bucket an hour of the day into one of four bands.

    Returns:
        1 for 5 to 12 inclusive, 2 for above 12 up to 17, 3 for above 17 and
        below 24, and 0 for every other value (including hours before 5).
    """
    if hour >= 5 and hour <= 12:
        band = 1
    elif hour > 12 and hour <= 17:
        band = 2
    elif hour > 17 and hour < 24:
        band = 3
    else:
        band = 0
    return band

# Replace the raw day name and hour with their encoded categories.
# assign() builds the columns afresh, so 'week_day' gets an integer dtype;
# writing through data.loc[:, 'week_day'] sets values in place on pandas >= 2.0
# and leaves the column with its original object dtype.
data = data.assign(
    week_day=temp['week_day'].apply(convert_week_day),
    Hour=temp['Hour'].apply(convert_hour),
)
data.head()


# Correlation Matrix

In [None]:
# Pairwise correlation between the columns of the prepared frame.
data.corr()

In [None]:
# Fare against trip distance, to eyeball the relationship the model will fit.
sns.scatterplot(y=data['fare_amount'],x=data['Distance'])

In [None]:
from sklearn.preprocessing import StandardScaler
# Single feature: trip distance, selected with a list so the array is 2-D.
x = data[['Distance']].values
# Target: fare amount, reshaped into a single column so it can be scaled below.
y = data['fare_amount'].values.reshape(-1,1)

In [None]:
from sklearn.model_selection import train_test_split
# Split into train and test sets. No test_size is passed, so scikit-learn's
# default proportion applies; random_state=10 makes the split reproducible.
x_train, x_test, y_train,y_test = train_test_split(x,y,random_state=10)

In [None]:
# Fit the feature scaler on the training split only, then standardise that split.
std_x = StandardScaler()
x_train = std_x.fit_transform(x_train)

In [None]:
# Apply the scaling learned from the training split to the test features (no re-fitting).
x_test = std_x.transform(x_test)

In [None]:
# Standardise the target as well, fitting on the training split only.
# NOTE(review): the metrics printed by fit_predict are computed on these scaled
# values, so RMSE and MAE are in standardised units rather than fare units.
std_y = StandardScaler()
y_train = std_y.fit_transform(y_train)
y_test = std_y.transform(y_test)

In [None]:
from sklearn.metrics import mean_squared_error,r2_score, mean_absolute_error
def fit_predict(model):
    """Fit ``model`` on the training split and print its test-set metrics.

    Reads the module-level ``x_train``, ``y_train``, ``x_test`` and ``y_test``
    arrays. Prints R-squared, RMSE and MAE and returns nothing.

    Args:
        model: an estimator exposing ``fit(X, y)`` and ``predict(X)``.
    """
    # ravel() flattens the single-column target into the 1-D shape passed to fit
    model.fit(x_train,y_train.ravel())
    y_pred = model.predict(x_test)
    r_squared = r2_score(y_test,y_pred)
    # Take the square root explicitly: the ``squared=False`` keyword of
    # mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6.
    RMSE = np.sqrt(mean_squared_error(y_test, y_pred))
    MAE = mean_absolute_error(y_test,y_pred)
    print('R-squared: ', r_squared)
    print('RMSE: ', RMSE)
    print("MAE:  ",MAE)

In [None]:
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

def fit_predict(model):
    """Fit ``model`` on the training split and print its test-set metrics.

    Reads the module-level ``x_train``, ``y_train``, ``x_test`` and ``y_test``
    arrays. Prints R-squared, RMSE and MAE and returns nothing.

    Args:
        model: an estimator exposing ``fit(X, y)`` and ``predict(X)``.
    """
    # ravel() flattens the single-column target into the 1-D shape passed to fit
    model.fit(x_train, y_train.ravel())
    y_pred = model.predict(x_test)
    r_squared = r2_score(y_test, y_pred)
    # Take the square root explicitly: the ``squared=False`` keyword of
    # mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6.
    RMSE = np.sqrt(mean_squared_error(y_test, y_pred))
    MAE = mean_absolute_error(y_test, y_pred)
    print('R-squared: ', r_squared)
    print('RMSE: ', RMSE)
    print("MAE:  ", MAE)

In [None]:
from sklearn.linear_model import LinearRegression
# Linear regression of the (scaled) fare on the (scaled) trip distance.
fit_predict(LinearRegression())

In [None]:
from sklearn.ensemble import RandomForestRegressor
# Comparison model with default hyper-parameters.
# NOTE(review): no random_state is passed, so results may vary between runs.
fit_predict(RandomForestRegressor())