In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import r2_score, adjusted_rand_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [None]:
# Read the ride records CSV into the working DataFrame used by every later cell.
data = pd.read_csv(filepath_or_buffer='uber_rides_data.xlsx - sample_train.csv')

In [None]:
# Dimensions of the loaded frame as a (row count, column count) tuple.
shape = (len(data.index), len(data.columns))
print(shape)

(200000, 8)


In [None]:
# Restrict the frame to its int64 columns, then count how many there are.
integer_columns = data.select_dtypes(include='int64')

num_integer_columns = integer_columns.shape[1]

print(f"Number of integer columns in the dataset: {num_integer_columns}")

Number of integer columns in the dataset: 2


In [None]:
# Count the null entries in the dropoff longitude column.
missing_values = data['dropoff_longitude'].isna().sum()

print(f"Number of missing values in 'dropoff_longitude' column: {missing_values}")

Number of missing values in 'dropoff_longitude' column: 1


In [None]:
# Look up the storage dtype of the pickup timestamp column as loaded.
pickup_datetime_dtype = data.dtypes['pickup_datetime']

print(f"Data type of 'pickup_datetime' feature: {pickup_datetime_dtype}")

Data type of 'pickup_datetime' feature: object


In [None]:
# Discard rows that have no fare recorded, then average the remaining fares.
data = data.dropna(axis=0, how='any', subset=['fare_amount'])

average_fare = data.loc[:, 'fare_amount'].mean()

print(f"Average fare amount: {average_fare}")

Average fare amount: 11.359955250000002


In [None]:
def haversine(lat1, lon1, lat2, lon2, radius=6371):
    """Great-circle distance between two points via the haversine formula.

    Note the argument order: latitude comes before longitude for each point.

    Parameters
    ----------
    lat1, lon1 : float or array-like
        Latitude and longitude of the first point, in degrees.
    lat2, lon2 : float or array-like
        Latitude and longitude of the second point, in degrees.
    radius : float, default 6371
        Radius of the sphere. The result is expressed in the same unit as
        this value; the default of 6371 yields kilometres on Earth.

    Returns
    -------
    float or array-like
        Distance along the sphere's surface. Array-like inputs (NumPy arrays,
        pandas Series) are processed element-wise.
    """
    # The trigonometric functions below work in radians, so convert first.
    lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])
    dlon = lon2 - lon1
    dlat = lat2 - lat1
    # `a` is the haversine of the central angle between the two points.
    a = np.sin(dlat / 2) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2) ** 2
    # `c` is the central angle in radians.
    c = 2 * np.arcsin(np.sqrt(a))
    return c * radius


In [None]:
# Per-ride great-circle distance from pickup to dropoff; arguments are passed
# by keyword so the latitude/longitude order is explicit.
data['haversine_distance'] = haversine(
    lat1=data['pickup_latitude'],
    lon1=data['pickup_longitude'],
    lat2=data['dropoff_latitude'],
    lon2=data['dropoff_longitude'],
)

median_haversine_distance = data.loc[:, 'haversine_distance'].median()

print(f"Median Haversine distance between pickup and dropoff location: {median_haversine_distance:.2f} kilometers")

Median Haversine distance between pickup and dropoff location: 2.12 kilometers


In [None]:
# Largest pickup-to-dropoff distance found in the dataset.
max_haversine_distance = data.loc[:, 'haversine_distance'].max()

print(f"Maximum Haversine distance between pickup and dropoff location: {max_haversine_distance:.2f} kilometers")

Maximum Haversine distance between pickup and dropoff location: 16409.24 kilometers


In [None]:
# Number of rides whose computed distance is exactly zero.
zero_haversine_count = data['haversine_distance'].eq(0.0).sum()

print(f"Number of rides with 0.0 Haversine distance: {zero_haversine_count}")

Number of rides with 0.0 Haversine distance: 5632


In [None]:
# Select the rides with an exactly-zero distance and average their fares.
zero_haversine_rides = data.loc[data['haversine_distance'].eq(0.0)]

mean_fare_zero_haversine = zero_haversine_rides.loc[:, 'fare_amount'].mean()

print(f"Mean 'fare_amount' for rides with 0 Haversine distance: {mean_fare_zero_haversine:.2f}")

Mean 'fare_amount' for rides with 0 Haversine distance: 11.59


In [None]:
# Highest fare charged for any single ride.
max_fare_amount = data.loc[:, 'fare_amount'].max()

print(f"Maximum 'fare_amount' for a ride: {max_fare_amount}")

Maximum 'fare_amount' for a ride: 499.0


In [None]:
# Rows whose fare equals the dataset maximum; the first such row is used below.
costliest_ride = data.loc[data['fare_amount'].eq(data['fare_amount'].max())]

# Distance for that first costliest ride, computed from its own coordinates.
haversine_distance_costliest = haversine(
    lat1=costliest_ride['pickup_latitude'].iloc[0],
    lon1=costliest_ride['pickup_longitude'].iloc[0],
    lat2=costliest_ride['dropoff_latitude'].iloc[0],
    lon2=costliest_ride['dropoff_longitude'].iloc[0],
)

print(f"Haversine distance for the costliest ride: {haversine_distance_costliest:.5f} kilometers")

Haversine distance for the costliest ride: 0.00079 kilometers


In [None]:
# Parse the pickup timestamps so the `.dt` accessors become available.
data['pickup_datetime'] = pd.to_datetime(arg=data['pickup_datetime'])

data['pickup_year'] = data['pickup_datetime'].dt.year

# Count the rides whose pickup year is 2014.
rides_2014 = data['pickup_year'].eq(2014).sum()

print(f"Number of rides recorded in the year 2014: {rides_2014}")

Number of rides recorded in the year 2014: 29968


In [None]:
# Calendar quarter (1-4) of each pickup.
data['pickup_quarter'] = data['pickup_datetime'].dt.quarter

# Rides that fall in both year 2014 and quarter 1.
rides_first_quarter_2014 = (
    data['pickup_year'].eq(2014) & data['pickup_quarter'].eq(1)
).sum()

print(f"Number of rides recorded in the first quarter of 2014: {rides_first_quarter_2014}")

Number of rides recorded in the first quarter of 2014: 7687


In [None]:
# Month number and weekday name of each pickup.
data['pickup_month'] = data['pickup_datetime'].dt.month
data['pickup_day_of_week'] = data['pickup_datetime'].dt.day_name()

# Boolean mask selecting the rides picked up in September 2010.
september_2010_rides = data['pickup_year'].eq(2010) & data['pickup_month'].eq(9)

# Most frequent weekday among those rides; the first mode is taken on ties.
most_rides_day = data.loc[september_2010_rides, 'pickup_day_of_week'].mode().iloc[0]

print(f"Day of the week in September 2010 with the maximum rides: {most_rides_day}")

Day of the week in September 2010 with the maximum rides: Thursday


In [None]:
# Remove every row that still has a missing value in any column before modelling.
data.dropna(axis=0, how='any', inplace=True)

In [None]:
# Endpoint coordinates of each ride.
pickup_lon = data['pickup_longitude']
pickup_lat = data['pickup_latitude']
dropoff_lon = data['dropoff_longitude']
dropoff_lat = data['dropoff_latitude']

# haversine() is declared as (lat1, lon1, lat2, lon2). The previous positional
# call passed longitudes in the latitude slots, which produced wrong distances.
# Passing by keyword makes the pairing explicit and order-proof.
data['distance'] = haversine(
    lat1=pickup_lat,
    lon1=pickup_lon,
    lat2=dropoff_lat,
    lon2=dropoff_lon,
)

# Weekday name of each pickup, one-hot encoded into ride_week_day_* columns.
data['ride_week_day'] = data['pickup_datetime'].dt.day_name()

data = pd.get_dummies(data, columns=['ride_week_day'])

# Model inputs: passenger count, trip distance and the weekday indicator columns.
features = ['passenger_count', 'distance'] + [col for col in data.columns if col.startswith('ride_week_day_')]
target = 'fare_amount'

X = data[features]
y = data[target]

# Hold out 30% of the rides for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [None]:
# Candidate regressors, all with default hyperparameters.
# The tree-based models are randomised, so they are seeded with the same value
# as the train/test split to keep scores reproducible across fresh kernel runs.
linear_regression = LinearRegression()
decision_tree = DecisionTreeRegressor(random_state=42)
random_forest = RandomForestRegressor(random_state=42)
knn = KNeighborsRegressor()

In [None]:
# Fit on the training split, predict the held-out split, and score with R².
y_pred_linear = linear_regression.fit(X_train, y_train).predict(X_test)
r2_linear = r2_score(y_true=y_test, y_pred=y_pred_linear)

In [None]:
# Fit on the training split, predict the held-out split, and score with R².
y_pred_tree = decision_tree.fit(X_train, y_train).predict(X_test)
r2_tree = r2_score(y_true=y_test, y_pred=y_pred_tree)

In [None]:
# Fit on the training split, predict the held-out split, and score with R².
y_pred_forest = random_forest.fit(X_train, y_train).predict(X_test)
r2_forest = r2_score(y_true=y_test, y_pred=y_pred_forest)

In [None]:
# Fit on the training split, predict the held-out split, and score with R².
y_pred_knn = knn.fit(X_train, y_train).predict(X_test)
r2_knn = r2_score(y_true=y_test, y_pred=y_pred_knn)

In [None]:
def _adjusted_r2(r2, n_samples, n_features):
    """Adjust an R² score for the number of predictors used.

    Computes 1 - (1 - r2) * (n_samples - 1) / (n_samples - n_features - 1).
    """
    return 1 - (1 - r2) * ((n_samples - 1) / (n_samples - n_features - 1))


# Sample and feature counts are taken from the held-out split the scores came from.
n = len(y_test)
p = X_test.shape[1]
adjusted_r2_linear = _adjusted_r2(r2_linear, n, p)
adjusted_r2_tree = _adjusted_r2(r2_tree, n, p)
adjusted_r2_forest = _adjusted_r2(r2_forest, n, p)
adjusted_r2_knn = _adjusted_r2(r2_knn, n, p)

In [None]:
# Report the adjusted R² of every model, one per line.
print(
    f"Adjusted R-squared for Linear Regression: {adjusted_r2_linear}",
    f"Adjusted R-squared for Decision Tree: {adjusted_r2_tree}",
    f"Adjusted R-squared for Random Forest: {adjusted_r2_forest}",
    f"Adjusted R-squared for KNN: {adjusted_r2_knn}",
    sep="\n",
)

Adjusted R-squared for Linear Regression: -0.0022801679311490464
Adjusted R-squared for Decision Tree: 0.35255447056878086
Adjusted R-squared for KNN: 0.5651122739056964
