## Imports

In [42]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

In [43]:
warnings.filterwarnings('ignore')

## Data Understanding

In [44]:
data = pd.read_csv('rideshare_kaggle.csv')

In [45]:
data['datetime'] = pd.to_datetime(data['datetime'])

In [46]:
data.sort_values(by='datetime', inplace=True)

In [47]:
data.head()

Unnamed: 0,id,timestamp,hour,day,month,datetime,timezone,source,destination,cab_type,...,precipIntensityMax,uvIndexTime,temperatureMin,temperatureMinTime,temperatureMax,temperatureMaxTime,apparentTemperatureMin,apparentTemperatureMinTime,apparentTemperatureMax,apparentTemperatureMaxTime
66422,a7b50600-c6c5-4e6c-bea9-4487344196d4,1543204000.0,3,26,11,2018-11-26 03:40:46,America/New_York,North Station,Haymarket Square,Uber,...,0.1396,1543161600,40.61,1543122000,46.15,1543154400,38.23,1543136400,43.17,1543186800
446073,9962f244-8fce-4ae9-a583-139d5d7522e1,1543204000.0,3,26,11,2018-11-26 03:40:46,America/New_York,Theatre District,North End,Uber,...,0.1396,1543161600,40.61,1543122000,46.15,1543154400,38.23,1543136400,43.17,1543186800
184332,4aa68a5d-abc0-4fdf-a47f-0003617afbae,1543204000.0,3,26,11,2018-11-26 03:40:46,America/New_York,North End,West End,Lyft,...,0.1396,1543161600,40.61,1543122000,46.15,1543154400,38.23,1543136400,43.17,1543186800
167114,ef8b695c-c24d-4ac1-b3fe-4aa1a7ed79f4,1543204000.0,3,26,11,2018-11-26 03:40:46,America/New_York,Boston University,Beacon Hill,Lyft,...,0.1396,1543161600,40.61,1543122000,46.15,1543154400,38.23,1543136400,43.17,1543186800
184333,89f35ef7-7129-483d-b3e6-d89afdf6946d,1543204000.0,3,26,11,2018-11-26 03:40:46,America/New_York,North End,West End,Lyft,...,0.1396,1543161600,40.61,1543122000,46.15,1543154400,38.23,1543136400,43.17,1543186800


In [48]:
data.shape

(693071, 57)

In [49]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 693071 entries, 66422 to 166551
Data columns (total 57 columns):
 #   Column                       Non-Null Count   Dtype         
---  ------                       --------------   -----         
 0   id                           693071 non-null  object        
 1   timestamp                    693071 non-null  float64       
 2   hour                         693071 non-null  int64         
 3   day                          693071 non-null  int64         
 4   month                        693071 non-null  int64         
 5   datetime                     693071 non-null  datetime64[ns]
 6   timezone                     693071 non-null  object        
 7   source                       693071 non-null  object        
 8   destination                  693071 non-null  object        
 9   cab_type                     693071 non-null  object        
 10  product_id                   693071 non-null  object        
 11  name                       

In [50]:
data.describe().T

Unnamed: 0,count,mean,min,25%,50%,75%,max,std
timestamp,693071.0,1544045709.755097,1543203646.0,1543443968.0,1543737478.0,1544827509.0,1545160511.0,689192.492586
hour,693071.0,11.619137,0.0,6.0,12.0,18.0,23.0,6.948114
day,693071.0,17.794365,1.0,13.0,17.0,28.0,30.0,9.982286
month,693071.0,11.586684,11.0,11.0,12.0,12.0,12.0,0.492429
datetime,693071.0,2018-12-05 21:35:09.269777408,2018-11-26 03:40:46,2018-11-28 22:26:08,2018-12-02 07:57:57,2018-12-14 22:45:08,2018-12-18 19:15:10,
price,637976.0,16.545125,2.5,9.0,13.5,22.5,97.5,9.324359
distance,693071.0,2.18943,0.02,1.28,2.16,2.92,7.86,1.138937
surge_multiplier,693071.0,1.01387,1.0,1.0,1.0,1.0,3.0,0.091641
latitude,693071.0,42.338172,42.2148,42.3503,42.3519,42.3647,42.3661,0.04784
longitude,693071.0,-71.066151,-71.1054,-71.081,-71.0631,-71.0542,-71.033,0.020302


In [51]:
data.nunique()

id                             693071
timestamp                       36179
hour                               24
day                                17
month                               2
datetime                        31350
timezone                            1
source                             12
destination                        12
cab_type                            2
product_id                         13
name                               13
price                             147
distance                          549
surge_multiplier                    7
latitude                           11
longitude                          12
temperature                       308
apparentTemperature               319
short_summary                       9
long_summary                       11
precipIntensity                    63
precipProbability                  29
humidity                           51
windSpeed                         291
windGust                          286
windGustTime

In [52]:
data.shape

(693071, 57)

In [53]:
data.isnull().sum()/len(data)*100

id                             0.000000
timestamp                      0.000000
hour                           0.000000
day                            0.000000
month                          0.000000
datetime                       0.000000
timezone                       0.000000
source                         0.000000
destination                    0.000000
cab_type                       0.000000
product_id                     0.000000
name                           0.000000
price                          7.949402
distance                       0.000000
surge_multiplier               0.000000
latitude                       0.000000
longitude                      0.000000
temperature                    0.000000
apparentTemperature            0.000000
short_summary                  0.000000
long_summary                   0.000000
precipIntensity                0.000000
precipProbability              0.000000
humidity                       0.000000
windSpeed                      0.000000


**Price is target variable and it has null values. We can drop those rows**

In [54]:
data = data.dropna()

In [55]:
data[['id', 'timestamp', 'hour', 'day', 'month', 'datetime', 'timezone']].head(5)

Unnamed: 0,id,timestamp,hour,day,month,datetime,timezone
66422,a7b50600-c6c5-4e6c-bea9-4487344196d4,1543204000.0,3,26,11,2018-11-26 03:40:46,America/New_York
184332,4aa68a5d-abc0-4fdf-a47f-0003617afbae,1543204000.0,3,26,11,2018-11-26 03:40:46,America/New_York
167114,ef8b695c-c24d-4ac1-b3fe-4aa1a7ed79f4,1543204000.0,3,26,11,2018-11-26 03:40:46,America/New_York
184333,89f35ef7-7129-483d-b3e6-d89afdf6946d,1543204000.0,3,26,11,2018-11-26 03:40:46,America/New_York
184334,9e6a67e6-9628-4fb1-94e5-bf426f61b038,1543204000.0,3,26,11,2018-11-26 03:40:46,America/New_York


**We can drop following columns**
- id: just the id of the observation
- datetime: `hour, day and month` are dervied from it
- timezone: only one time zone `America/New_York`
- timestamp: similar to `datetime`

In [69]:
data = data.drop(['id', 'datetime', 'timezone', 'timestamp', 'long_summary', 'visibility.1'], axis=1)

KeyError: "['id', 'datetime', 'timezone', 'timestamp', 'long_summary', 'visibility.1'] not found in axis"

In [57]:
'short_summary'

'short_summary'

In [58]:
temp_time_cols = ['temperatureHigh','temperatureHighTime', 'temperatureLow', 'temperatureLowTime',
 'apparentTemperatureHigh', 'apparentTemperatureHighTime',
 'apparentTemperatureLow', 'apparentTemperatureLowTime',
 'temperatureMin', 'temperatureMinTime', 'temperatureMax','temperatureMaxTime',
 'apparentTemperatureMin','apparentTemperatureMinTime',
 'apparentTemperatureMax','apparentTemperatureMaxTime',
 'uvIndexTime', 'windGustTime', 'sunriseTime','sunsetTime',
 'latitude','longitude'
]

In [59]:
data = data.drop(temp_time_cols, axis=1)

In [60]:
data.head()

Unnamed: 0,hour,day,month,source,destination,cab_type,product_id,name,price,distance,...,visibility,icon,dewPoint,pressure,windBearing,cloudCover,uvIndex,ozone,moonPhase,precipIntensityMax
66422,3,26,11,North Station,Haymarket Square,Uber,55c66225-fbe7-4fd5-9072-eab1ece5e23e,UberX,7.0,0.56,...,1.685,fog,39.62,1014.1,57,1.0,0,317.9,0.6,0.1396
184332,3,26,11,North End,West End,Lyft,lyft,Lyft,7.0,1.23,...,1.685,fog,39.62,1014.1,57,1.0,0,317.9,0.6,0.1396
167114,3,26,11,Boston University,Beacon Hill,Lyft,lyft_premier,Lux,19.5,2.66,...,1.685,fog,39.62,1014.1,57,1.0,0,317.9,0.6,0.1396
184333,3,26,11,North End,West End,Lyft,lyft_line,Shared,5.0,1.23,...,1.685,fog,39.62,1014.1,57,1.0,0,317.9,0.6,0.1396
184334,3,26,11,North End,West End,Lyft,lyft_premier,Lux,13.5,1.23,...,1.685,fog,39.62,1014.1,57,1.0,0,317.9,0.6,0.1396


**Analyzing data in the categorical features**

In [61]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 637976 entries, 66422 to 166551
Data columns (total 29 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   hour                 637976 non-null  int64  
 1   day                  637976 non-null  int64  
 2   month                637976 non-null  int64  
 3   source               637976 non-null  object 
 4   destination          637976 non-null  object 
 5   cab_type             637976 non-null  object 
 6   product_id           637976 non-null  object 
 7   name                 637976 non-null  object 
 8   price                637976 non-null  float64
 9   distance             637976 non-null  float64
 10  surge_multiplier     637976 non-null  float64
 11  temperature          637976 non-null  float64
 12  apparentTemperature  637976 non-null  float64
 13  short_summary        637976 non-null  object 
 14  precipIntensity      637976 non-null  float64
 15  precipProbability 

In [62]:
cat_col = data.select_dtypes(include=['object','category']).columns.tolist()
data_new = data[cat_col]
data_new.head()

Unnamed: 0,source,destination,cab_type,product_id,name,short_summary,icon
66422,North Station,Haymarket Square,Uber,55c66225-fbe7-4fd5-9072-eab1ece5e23e,UberX,Foggy,fog
184332,North End,West End,Lyft,lyft,Lyft,Foggy,fog
167114,Boston University,Beacon Hill,Lyft,lyft_premier,Lux,Foggy,fog
184333,North End,West End,Lyft,lyft_line,Shared,Foggy,fog
184334,North End,West End,Lyft,lyft_premier,Lux,Foggy,fog


In [63]:
for col in data_new:
    print(f'{col} : {data_new[col].unique()}')
    print()

source : ['North Station' 'North End' 'Boston University' 'South Station'
 'Beacon Hill' 'Fenway' 'Theatre District' 'Financial District'
 'Haymarket Square' 'Northeastern University' 'Back Bay' 'West End']

destination : ['Haymarket Square' 'West End' 'Beacon Hill' 'Boston University'
 'Theatre District' 'Fenway' 'South Station' 'Northeastern University'
 'Financial District' 'North End' 'Back Bay' 'North Station']

cab_type : ['Uber' 'Lyft']

product_id : ['55c66225-fbe7-4fd5-9072-eab1ece5e23e' 'lyft' 'lyft_premier' 'lyft_line'
 'lyft_lux' 'lyft_plus' 'lyft_luxsuv'
 '9a0e7b09-b92b-4c41-9779-2ad22b4d779d'
 '6f72dfc5-27f1-42e8-84db-ccc7a75f6969'
 '6d318bcc-22a3-4af6-bddd-b409bfce1546'
 '6c84fd89-3f11-4782-9b50-97c468b19529'
 '997acbb5-e102-41e1-b155-9df7de0a73f2']

name : ['UberX' 'Lyft' 'Lux' 'Shared' 'Lux Black' 'Lyft XL' 'Lux Black XL' 'WAV'
 'UberXL' 'Black SUV' 'Black' 'UberPool']

short_summary : [' Foggy ' ' Overcast ' ' Possible Drizzle ' ' Drizzle ' ' Light Rain '
 ' Rain ' ' 

In [64]:
data.groupby(['short_summary', 'icon'])['icon'].count()

short_summary       icon                 
 Clear               clear-day                24679
                     clear-night              55577
 Drizzle             rain                      6725
 Foggy               fog                       8292
 Light Rain          rain                     50488
 Mostly Cloudy       partly-cloudy-day        55769
                     partly-cloudy-night      78834
 Overcast            cloudy                  201429
 Partly Cloudy       partly-cloudy-day        50568
                     partly-cloudy-night      66658
 Possible Drizzle    rain                     17176
 Rain                rain                     21781
Name: icon, dtype: int64

**Analyzing features with numeric data types**

In [65]:
num_col = data.select_dtypes(include=['int64','float64']).columns.tolist()
data_new = data[num_col]
data_new.columns

Index(['hour', 'day', 'month', 'price', 'distance', 'surge_multiplier',
       'temperature', 'apparentTemperature', 'precipIntensity',
       'precipProbability', 'humidity', 'windSpeed', 'windGust', 'visibility',
       'dewPoint', 'pressure', 'windBearing', 'cloudCover', 'uvIndex', 'ozone',
       'moonPhase', 'precipIntensityMax'],
      dtype='object')

In [66]:
data_new.head()

Unnamed: 0,hour,day,month,price,distance,surge_multiplier,temperature,apparentTemperature,precipIntensity,precipProbability,...,windGust,visibility,dewPoint,pressure,windBearing,cloudCover,uvIndex,ozone,moonPhase,precipIntensityMax
66422,3,26,11,7.0,0.56,1.0,41.83,41.83,0.0,0.0,...,0.8,1.685,39.62,1014.1,57,1.0,0,317.9,0.6,0.1396
184332,3,26,11,7.0,1.23,1.0,41.83,41.83,0.0,0.0,...,0.8,1.685,39.62,1014.1,57,1.0,0,317.9,0.6,0.1396
167114,3,26,11,19.5,2.66,1.0,41.83,41.83,0.0,0.0,...,0.8,1.685,39.62,1014.1,57,1.0,0,317.9,0.6,0.1396
184333,3,26,11,5.0,1.23,1.0,41.83,41.83,0.0,0.0,...,0.8,1.685,39.62,1014.1,57,1.0,0,317.9,0.6,0.1396
184334,3,26,11,13.5,1.23,1.0,41.83,41.83,0.0,0.0,...,0.8,1.685,39.62,1014.1,57,1.0,0,317.9,0.6,0.1396


In [67]:
plt.figure(figsize=(20,20))
sns.heatmap(round(data.corr(), 4),annot=True, mask=np.triu(data.corr()))

ValueError: could not convert string to float: 'North Station'

<Figure size 2000x2000 with 0 Axes>

**All the climate related features has correlation almost 0 with respect to price column. Dropping these columns does'nt effect the accurracy prediction of price**

In [None]:
cli_col = ['precipIntensity', 'precipProbability', 'humidity', 'windSpeed',
       'windGust', 'visibility', 'dewPoint', 'pressure', 'windBearing',
       'cloudCover', 'uvIndex', 'ozone', 'moonPhase',
       'precipIntensityMax']
data = data.drop(cli_col,axis=1)

In [None]:
data.groupby(['cab_type', 'name'])['price'].agg(['sum', np.mean, 'min', 'max'])

In [None]:
data.groupby(['product_id', 'name'])['price'].agg(['sum', 'count'])

Looks like 'product_id', 'name' are same. We can drop `product_id`

In [None]:
data = data.drop(['product_id'],axis=1)

In [None]:
data = data.drop(['short_summary', 'icon'],axis=1)

In [None]:
data.head()

**Encoding all the category columns with the Onehot Encoder**

In [None]:
scale_columns = data.select_dtypes(include=['int64','float64']).columns.tolist()
scale_columns

In [None]:
cat_col = data.select_dtypes(include=['object','category']).columns.tolist()
data[cat_col].head()

In [None]:
data[cat_col].nunique()

In [None]:
dummy_features = pd.get_dummies(data[cat_col], drop_first=True)

In [None]:
# create new dataframe after creating dummy var
data = pd.concat([data, dummy_features], axis=1)

In [None]:
data = data.drop(cat_col, axis=1)

In [None]:
data.head()

## Splitting the Data into Training and Testing Sets

In [None]:
import sklearn
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.linear_model import LinearRegression, Ridge, Lasso

from sklearn.feature_selection import RFE
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error

In [None]:
df_train, df_test = train_test_split(data, train_size=0.9, random_state = 100)

In [None]:
print("X train data size ->", df_train.shape)
print("X test data size ->", df_test.shape)

Scaling train data

In [None]:
scale_columns.remove('price')

In [None]:
# scaler = StandardScaler()

# df_train[scale_columns] = scaler.fit_transform(df_train[scale_columns])

# df_test[scale_columns] = scaler.transform(df_test[scale_columns])

**Split train data => Predictors and response variables**

In [None]:
y_train = df_train.pop('price')
X_train = df_train

**Split test data => Predictors and response variables**

In [None]:
y_test = df_test.pop('price')
X_test = df_test

## Model

In [None]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras import Sequential
from tensorflow.keras import optimizers
from tensorflow.keras.layers import Dense


In [None]:
X_train.shape[1]

In [None]:
tf.random.set_seed(42)

model=Sequential()
model.add(Dense(10,activation='relu',input_dim=X_train.shape[1]))
model.add(Dense(20,activation='relu'))
model.add(Dense(1,activation='linear'))

model.compile(optimizer='adam', loss='mse', metrics=['mse'])#,metrics=['mean_absolute_error'])

In [None]:
epochs=10
history = model.fit(
    np.array(X_train.values),y_train,
    epochs=epochs,
    validation_split=0.2,
    batch_size=128
)

In [None]:
del data

In [None]:
def visualize_train_result(history, epochs):
    acc = history.history['mse']
    val_acc = history.history['val_mse']

    loss = history.history['loss']
    val_loss = history.history['val_loss']

    epochs_range = range(epochs)

    plt.figure(figsize=(16, 6))
    plt.subplot(1, 2, 1)
    plt.plot(epochs_range, acc, label='Training mse')
    plt.plot(epochs_range, val_acc, label='Validation mse')
    plt.legend(loc='lower right')
    plt.title('Training and Validation Accuracy')

    plt.subplot(1, 2, 2)
    plt.plot(epochs_range, loss, label='Training Loss')
    plt.plot(epochs_range, val_loss, label='Validation Loss')
    plt.legend(loc='upper right')
    plt.title('Training and Validation Loss')
    plt.show()

visualize_train_result(history, epochs)

In [None]:
from sklearn.metrics import r2_score

y_pred_test = model.predict(np.array(X_test.values))
R2 = r2_score(y_test, y_pred_test)
print("R2 =",R2 )

In [None]:
mse_test_lr = mean_squared_error(y_test, y_pred_test)
print("MSE for test:", mse_test_lr)
print("--"*20)

rmse_test_lr = mse_test_lr**0.5
print("RMSE for test:", rmse_test_lr)

In [None]:
y_test.values - np.squeeze(y_pred_test, axis=1)

In [None]:
# residual = y_train - y_pred_train
y_pred_test = np.squeeze(y_pred_test, axis=1)
residual = y_test.values - y_pred_test

In [None]:
plt.figure(figsize=(16, 6))

plt.subplot(1,2,1)
plt.scatter(y_pred_test , residual)
plt.axhline(y=0, color='r', linestyle=':')
# plt.xscale('log')
# plt.yscale('log')
plt.xlabel("Predictions")
plt.ylabel("Residual")
plt.title("Residual v/s predictions plot")


plt.subplot(1,2,2)
sns.distplot(y_pred_test, kde=True)
plt.title('Normality of error terms/residuals')
plt.xlabel("Residuals")

plt.show()

In [None]:
plt.figure(figsize = (16, 8))
plt.scatter(x=y_test, y=y_pred_test, color='b')
plt.scatter(x=y_test, y=y_test, color='r')
plt.xlabel('Actual Points (y_test)', fontdict={'fontsize': 10})
plt.ylabel('Predicted Points (y_pred)', fontdict={'fontsize': 10})
