In [1]:
# importing important library
import numpy as np
import pandas as pd

from numpy import cos, sin, arcsin, sqrt
from math import radians

In [2]:
df = pd.read_csv("uber_rides_data.csv")
df.head()

Unnamed: 0,ride_id,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,24238194,7.5,2015-05-07 19:52:06 UTC,-73.999817,40.738354,-73.999512,40.723217,1
1,27835199,7.7,2009-07-17 20:04:56 UTC,-73.994355,40.728225,-73.99471,40.750325,1
2,44984355,12.9,2009-08-24 21:45:00 UTC,-74.005043,40.74077,-73.962565,40.772647,1
3,25894730,5.3,2009-06-26 08:22:21 UTC,-73.976124,40.790844,-73.965316,40.803349,3
4,17610152,16.0,2014-08-28 17:47:00 UTC,-73.925023,40.744085,-73.973082,40.761247,5


## Data Understanding

In [22]:
df.shape

(200000, 8)

In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200000 entries, 0 to 199999
Data columns (total 8 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   ride_id            200000 non-null  int64  
 1   fare_amount        200000 non-null  float64
 2   pickup_datetime    200000 non-null  object 
 3   pickup_longitude   200000 non-null  float64
 4   pickup_latitude    200000 non-null  float64
 5   dropoff_longitude  199999 non-null  float64
 6   dropoff_latitude   199999 non-null  float64
 7   passenger_count    200000 non-null  int64  
dtypes: float64(5), int64(2), object(1)
memory usage: 12.2+ MB


In [9]:
df.describe()

Unnamed: 0,ride_id,fare_amount,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
count,200000.0,200000.0,200000.0,200000.0,199999.0,199999.0,200000.0
mean,27712500.0,11.359955,-72.527638,39.935885,-72.525292,39.92389,1.684535
std,16013820.0,9.901776,11.437787,7.720539,13.117408,6.794829,1.385997
min,1.0,-52.0,-1340.64841,-74.015515,-3356.6663,-881.985513,0.0
25%,13825350.0,6.0,-73.992065,40.734796,-73.991407,40.733823,1.0
50%,27745500.0,8.5,-73.981823,40.752592,-73.980093,40.753042,1.0
75%,41555300.0,12.5,-73.967153,40.767158,-73.963659,40.768001,2.0
max,55423570.0,499.0,57.418457,1644.421482,1153.572603,872.697628,208.0


In [5]:
df.isnull().sum()

ride_id              0
fare_amount          0
pickup_datetime      0
pickup_longitude     0
pickup_latitude      0
dropoff_longitude    1
dropoff_latitude     1
passenger_count      0
dtype: int64

In [11]:
dd = df.copy()

### converting to datatime

In [13]:
pd.to_datetime(df['pickup_datetime'])

0        2015-05-07 19:52:06+00:00
1        2009-07-17 20:04:56+00:00
2        2009-08-24 21:45:00+00:00
3        2009-06-26 08:22:21+00:00
4        2014-08-28 17:47:00+00:00
                    ...           
199995   2012-10-28 10:49:00+00:00
199996   2014-03-14 01:09:00+00:00
199997   2009-06-29 00:42:00+00:00
199998   2015-05-20 14:56:25+00:00
199999   2010-05-15 04:08:00+00:00
Name: pickup_datetime, Length: 200000, dtype: datetime64[ns, UTC]

In [14]:
df['pickup_datetime'].astype('datetime64[ns]')

0        2015-05-07 19:52:06
1        2009-07-17 20:04:56
2        2009-08-24 21:45:00
3        2009-06-26 08:22:21
4        2014-08-28 17:47:00
                 ...        
199995   2012-10-28 10:49:00
199996   2014-03-14 01:09:00
199997   2009-06-29 00:42:00
199998   2015-05-20 14:56:25
199999   2010-05-15 04:08:00
Name: pickup_datetime, Length: 200000, dtype: datetime64[ns]

In [23]:
# Dropping Null values
df.dropna(inplace = True)

In [24]:
df.isnull().sum()

ride_id              0
fare_amount          0
pickup_datetime      0
pickup_longitude     0
pickup_latitude      0
dropoff_longitude    0
dropoff_latitude     0
passenger_count      0
dtype: int64

In [25]:
avg_fare = df['fare_amount'].mean()
avg_fare

11.359891549457748

### Calculating Haversine Distance

In [31]:
def haversine(row):
    lon1 = row['pickup_longitude']
    lat1 = row['pickup_latitude']
    lon2 = row['dropoff_longitude']
    lat2 = row['dropoff_latitude']
    lon1, lat1, lon2, lat2 = map(radians, [lon1, lat1, lon2, lat2])
    dlon = lon2 - lon1 
    dlat = lat2 - lat1 
    a = sin(dlat/2)**2 + cos(lat1) * cos(lat2) * sin(dlon/2)**2
    c = 2 * arcsin(sqrt(a)) 
    km = 6371 * c
    return km

df['distance'] = df.apply(lambda row: haversine(row), axis=1)
df.head()

Unnamed: 0,ride_id,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,distance
0,24238194,7.5,2015-05-07 19:52:06 UTC,-73.999817,40.738354,-73.999512,40.723217,1,1.683323
1,27835199,7.7,2009-07-17 20:04:56 UTC,-73.994355,40.728225,-73.99471,40.750325,1,2.45759
2,44984355,12.9,2009-08-24 21:45:00 UTC,-74.005043,40.74077,-73.962565,40.772647,1,5.036377
3,25894730,5.3,2009-06-26 08:22:21 UTC,-73.976124,40.790844,-73.965316,40.803349,3,1.661683
4,17610152,16.0,2014-08-28 17:47:00 UTC,-73.925023,40.744085,-73.973082,40.761247,5,4.47545


In [32]:
mean_dist = df['distance'].mean()
mean_dist

20.855349825111237

In [41]:
# What is the median haversine distance between pickup and dropoff location according to the given dataset?
median_dist = df['distance'].median()
median_dist

2.1209923961833708

In [33]:
max_dist = df['distance'].max()
max_dist

16409.239135313164

In [36]:
zero_dist_count = df['fare_amount'][df.distance == 0.0].count()
zero_dist_count

5632

In [37]:
zero_dist_mean = df['fare_amount'][df.distance == 0.0].mean()
zero_dist_mean

# What is the mean 'fare_amount' for rides with 0 haversine distance?
# Do you sense something fishy? Try to analyze, and give your expert opinion in Jupyter Notebook.

11.585317826704546

### It is fishy. If distance is zero there should not be any amount charged for the ride. The cab driver is doing some incorrect activity.

In [38]:
max_fare_amount = df['fare_amount'].max()
max_fare_amount

499.0

In [40]:
dist_for_max_amount = df[df.fare_amount == max_fare_amount]['distance']
dist_for_max_amount

# What is the haversine distance between pickup and dropoff location for the costliest ride?
# Do you sense something fishy? Try to analyze, and give your expert opinion in Jupyter Notebook.

170081    0.00079
Name: distance, dtype: float64

### Very much incorrect.  How it is possible that for this much small ride they charged maximum. Some outlier present in the data.

## Working with DateTimeIndex to Extract Feature

In [56]:
df['year'] = pd.DatetimeIndex(df['pickup_datetime']).year
df['month'] = pd.DatetimeIndex(df['pickup_datetime']).month
df['day'] = pd.DatetimeIndex(df['pickup_datetime']).day_name()

In [46]:
ride_2014_count = df[df.year == 2014]['year'].count()
ride_2014_count

29968

In [53]:
ride_2014_1st_count = df[(df.year == 2014) & (df.month < 4)]['ride_id'].count()
ride_2014_1st_count

7687

In [59]:
ride_2010_sept_day = df[(df.year == 2010) & (df.month == 9)].groupby("day")["ride_id"].count()
ride_2010_sept_day

day
Friday       354
Monday       265
Saturday     362
Sunday       331
Thursday     457
Tuesday      322
Wednesday    391
Name: ride_id, dtype: int64

In [77]:
df.sample(10)

Unnamed: 0,ride_id,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,distance,year,month,day
106170,10033786,5.0,2014-11-21 17:19:48 UTC,-73.957986,40.764925,-73.951847,40.773614,1,1.095793,2014,11,Friday
35828,15823888,8.9,2011-01-23 20:17:00 UTC,-73.955798,40.779177,-73.98624,40.763995,2,3.069452,2011,1,Sunday
24261,48909730,4.9,2011-12-09 12:00:57 UTC,-73.990215,40.771838,-73.982583,40.774309,1,0.698948,2011,12,Friday
53679,32055287,6.9,2012-04-04 00:10:54 UTC,-73.976073,40.765499,-73.995529,40.759676,1,1.7619,2012,4,Wednesday
8899,49949691,6.5,2011-02-04 12:53:00 UTC,-73.978897,40.76147,-73.98392,40.748797,1,1.471318,2011,2,Friday
176432,40505320,6.0,2013-01-29 16:30:00 UTC,-73.994793,40.75921,-73.982478,40.754343,1,1.169969,2013,1,Tuesday
61255,4115665,7.0,2013-03-02 12:48:13 UTC,-74.000946,40.725674,-74.00321,40.738915,1,1.484639,2013,3,Saturday
11717,47498131,31.33,2014-08-28 20:52:00 UTC,-73.873293,40.77411,-73.966467,40.789267,2,8.02399,2014,8,Thursday
74720,32296785,3.7,2009-05-28 19:49:51 UTC,-74.001272,40.736495,-74.003996,40.72984,1,0.774779,2009,5,Thursday
177602,28739240,31.83,2014-04-09 19:03:00 UTC,-73.87488,40.774067,-73.99108,40.75055,6,10.129908,2014,4,Wednesday


# Preparing data for training ML 

In [69]:
ml_df = df[["passenger_count","distance","day","fare_amount"]]
ml_df.shape

(199999, 4)

### Mapping data column to numeric

In [78]:
weekday_val = {"Monday" : 1, "Tuesday" : 2, "Wednesday" : 3, "Thursday" : 4, "Friday" : 5, "Saturday" : 6, "Sunday" : 7}
ml_df["day"] = ml_df["day"].map(weekday_val)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ml_df["day"] = ml_df["day"].map(weekday_val)


#### Spliting the data into train-test

In [79]:
from sklearn.model_selection import train_test_split
X = ml_df.drop('fare_amount', axis=1)
y = ml_df['fare_amount']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

#### Scalling and Standardizing data

In [80]:
from sklearn.preprocessing import StandardScaler    
st_x= StandardScaler()    
X_train= st_x.fit_transform(X_train)    
X_test= st_x.transform(X_test)  

#### training data

In [82]:
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor 

lr = LinearRegression()
lr.fit(X_train, y_train)

dtr = DecisionTreeRegressor()
dtr.fit(X_train, y_train)

rfr = RandomForestRegressor(random_state=0)
rfr.fit(X_train, y_train)

knn_r = KNeighborsRegressor(n_neighbors=3)
knn_r.fit(X_train, y_train)


# from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, ConfusionMatrixDisplay

#### Prediction for Test data

In [84]:
pred_lr = lr.predict(X_test)
pred_dtr = dtr.predict(X_test)
pred_rfr = rfr.predict(X_test)
pred_knn_r = knn_r.predict(X_test)

### R2 score calculation

In [85]:
from sklearn.metrics import r2_score

r2_lr = r2_score(y_test, pred_lr)
r2_dtr = r2_score(y_test, pred_dtr)
r2_rfr = r2_score(y_test, pred_rfr)
r2_knn_r = r2_score(y_test, pred_knn_r)

#### Adjusted R2 calculation

In [88]:
def adj_r2(score):
    return (1-(1-score)*((len(X_test)-1)/(len(X_test)-len(X_test[0])-1)))

lr_ad_score = adj_r2(r2_lr)
dtr_ad_score = adj_r2(r2_dtr)
rfr_ad_score = adj_r2(r2_rfr)
knn_ad_score = adj_r2(r2_knn_r)

print("Linear Regression Adjusted R2 Value: ", lr_ad_score)
print("Decission Tree Regression Adjusted R2 Value: ", dtr_ad_score)
print("random Forest Regression Adjusted R2 Value: ", rfr_ad_score)
print("KNN Regression Adjusted R2 Value: ", knn_ad_score)

Linear Regression Adjusted R2 Value:  0.0007124050615815447
Decission Tree Regression Adjusted R2 Value:  0.5182425470357652
random Forest Regression Adjusted R2 Value:  0.6624242815597181
KNN Regression Adjusted R2 Value:  0.6216590558644375
