# Machine learning with Ensemble models - Regression  

### Problem Statement:
Given pickup and dropoff locations, the pickup timestamp, and the passenger count, the objective is to predict the fare of the taxi ride using ensemble models 

In [37]:
# import libraries
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import warnings
warnings.filterwarnings("ignore")

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor
import xgboost as xg
from sklearn.metrics import mean_squared_error,r2_score

### Read the data 

In [3]:
# Read the data from local drive
taxi = pd.read_csv(r"D:\new Data science class\project\4. random forest\Taxi Fare Prediction\Dataset\TaxiFare.csv")

In [4]:
#Primary data inspection 
taxi.head()

Unnamed: 0,unique_id,amount,date_time_of_pickup,longitude_of_pickup,latitude_of_pickup,longitude_of_dropoff,latitude_of_dropoff,no_of_passenger
0,26:21.0,4.5,2009-06-15 17:26:21 UTC,-73.844311,40.721319,-73.84161,40.712278,1
1,52:16.0,16.9,2010-01-05 16:52:16 UTC,-74.016048,40.711303,-73.979268,40.782004,1
2,35:00.0,5.7,2011-08-18 00:35:00 UTC,-73.982738,40.76127,-73.991242,40.750562,2
3,30:42.0,7.7,2012-04-21 04:30:42 UTC,-73.98713,40.733143,-73.991567,40.758092,1
4,51:00.0,5.3,2010-03-09 07:51:00 UTC,-73.968095,40.768008,-73.956655,40.783762,1


### Data fields 
* unique_id -- A unique identifier or key for each record in the dataset 
* date_time_of_pickup -- The time when the ride started 
* longitude_of_pickup -- Longitude of the taxi ride pickup point 
* latitude_of_pickup -- Latitude of the taxi ride pickup point 
* longitude__of_dropoff -- Longitude of the taxi ride dropoff point 
* latitude_of_dropoff -- Latitude of the taxi ride dropoff point 
* no_of_passenger -- count of the passengers during the ride 
* amount(target variable) -- dollar amount of the cost of the taxi ride

### EDA

In [5]:
# Check no of rows and columns in dataset
taxi.shape

(50000, 8)

* There are 50,000 records
* There are 8 fields 

In [6]:
# check for nulls 
taxi.isnull().sum()

unique_id               0
amount                  0
date_time_of_pickup     0
longitude_of_pickup     0
latitude_of_pickup      0
longitude_of_dropoff    0
latitude_of_dropoff     0
no_of_passenger         0
dtype: int64

* There are no nulls in the dataset 

In [7]:
# checking the datatypes for fields 
taxi.dtypes

unique_id                object
amount                  float64
date_time_of_pickup      object
longitude_of_pickup     float64
latitude_of_pickup      float64
longitude_of_dropoff    float64
latitude_of_dropoff     float64
no_of_passenger           int64
dtype: object

* The date_time_of_pickup field is object, we need to convert it into datetime 

In [8]:
# checking all columns in the dataset 
taxi.columns

Index(['unique_id', 'amount', 'date_time_of_pickup', 'longitude_of_pickup',
       'latitude_of_pickup', 'longitude_of_dropoff', 'latitude_of_dropoff',
       'no_of_passenger'],
      dtype='object')

In [9]:
# Data description of numeric columns 
taxi.describe()

Unnamed: 0,amount,longitude_of_pickup,latitude_of_pickup,longitude_of_dropoff,latitude_of_dropoff,no_of_passenger
count,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0
mean,11.364171,-72.509756,39.933759,-72.504616,39.926251,1.66784
std,9.685557,10.39386,6.224857,10.40757,6.014737,1.289195
min,-5.0,-75.423848,-74.006893,-84.654241,-74.006377,0.0
25%,6.0,-73.992062,40.73488,-73.991152,40.734372,1.0
50%,8.5,-73.98184,40.752678,-73.980082,40.753372,1.0
75%,12.5,-73.967148,40.76736,-73.963584,40.768167,2.0
max,200.0,40.783472,401.083332,40.851027,43.41519,6.0


### Pre- processing 

#### Dropping rows having negative negative amount 

In [10]:
# Amount cannot be negative, hence dropping rows having negative amount 
taxi = taxi.drop(taxi[taxi["amount"]<1].index, axis = 0)

In [11]:
# Checking wheter the negative amount have been dropped 
taxi.amount.describe()

count    49990.000000
mean        11.366813
std          9.684697
min          2.500000
25%          6.000000
50%          8.500000
75%         12.500000
max        200.000000
Name: amount, dtype: float64

#### Converting date_time of pickup into datetime and extracting useful information 

In [12]:
# as date_time of pickup is object converting it to datetime 

taxi['date_time_of_pickup'] = pd.to_datetime(taxi['date_time_of_pickup'])
taxi.date_time_of_pickup.dtypes

datetime64[ns, UTC]

In [13]:
# Extraxting information from date_time of pickup

taxi["year"] = taxi["date_time_of_pickup"].dt.year
taxi["month"] = taxi["date_time_of_pickup"].dt.month
taxi["date"]=taxi["date_time_of_pickup"].dt.day
taxi["day_of_week"] = taxi["date_time_of_pickup"].dt.dayofweek
taxi["hour"] = taxi["date_time_of_pickup"].dt.hour

#### Creating a column distance to calculate fare as fare depends on distance

In [14]:
# creating distance column to calculate fare as fare depends on distance 
taxi["diff_long"] = (taxi["longitude_of_dropoff"]- taxi["longitude_of_pickup"]).abs()
taxi["diff_lat"] = (taxi["latitude_of_dropoff"]- taxi["latitude_of_pickup"]).abs()

#### Creating dataframe with only required columns and dropping unwanted columns

In [16]:
# drop unwanted columns
taxi.drop(columns=['unique_id','date_time_of_pickup',"longitude_of_dropoff","longitude_of_pickup","latitude_of_dropoff","latitude_of_pickup"],inplace= True)

In [17]:
# final analysis of required dataframe 
taxi.head()

Unnamed: 0,amount,no_of_passenger,year,month,date,day_of_week,hour,diff_long,diff_lat
0,4.5,1,2009,6,15,0,17,0.002701,0.009041
1,16.9,1,2010,1,5,1,16,0.03678,0.070701
2,5.7,2,2011,8,18,3,0,0.008504,0.010708
3,7.7,1,2012,4,21,5,4,0.004437,0.024949
4,5.3,1,2010,3,9,1,7,0.01144,0.015754


In [18]:
# Final check for the datatypes of required dataframe 
taxi.dtypes

amount             float64
no_of_passenger      int64
year                 int64
month                int64
date                 int64
day_of_week          int64
hour                 int64
diff_long          float64
diff_lat           float64
dtype: object

#### Dividing the data into train and test 

In [19]:
# train test split
trainx, testx, trainy,testy = train_test_split(taxi.drop("amount",1),
                                              taxi.amount,
                                              test_size= 0.2)

### Model1

In [20]:
# model building
model1 = RandomForestRegressor()
model1.fit(trainx,trainy)

RandomForestRegressor()

In [23]:
# predictions 
pred1 = model1.predict(testx)

# Evaluation
mse1 = mean_squared_error(testy,pred1)
r1= r2_score(testy,pred1)
print("model Random Forest Regressor \n\tmse = {}, \n\trmse = {}, \n\tr2_score= {}".format(round(mse1,2),round(np.sqrt(mse1),2),r1))

model Random Forest Regressor 
	mse = 20.1, 
	rmse = 4.48, 
	r2_score= 0.7726795663989037


* This Baseline model gives a r2_score of 77%

### Model 2 
We will build this model by hyper parameter tuning using gridSearchCV

In [24]:
# using girdSearchCV
# Defining the parameters 

params = ({'criterion':['squared_error', 'mse'],
          'max_depth':np.arange(4,10),
          'min_samples_split':np.arange(2,10)})

In [25]:
# run the gridSearchCV
rfreg = RandomForestRegressor()
grid = GridSearchCV(rfreg, param_grid = params, scoring = 'neg_mean_squared_error', 
                    cv= 3, n_jobs=-1).fit(trainx,trainy)

In [26]:
# best params 
bp = grid.best_params_
bp

{'criterion': 'mse', 'max_depth': 9, 'min_samples_split': 9}

In [28]:
# build the model using best params 

model2 = RandomForestRegressor(criterion = 'mse',
                          max_depth = bp['max_depth'],
                          min_samples_split= bp['min_samples_split']).fit(trainx,trainy)

In [29]:
# predictions 
pred2= model2.predict(testx)

In [30]:
# Evaluation
mse2 = mean_squared_error(testy,pred2)
r2= r2_score(testy,pred2)
print("model Random Forest Regressor \n\tmse = {}, \n\trmse = {}, \n\tr2_score= {}".format(round(mse2,2),round(np.sqrt(mse2),2),r2))

model Random Forest Regressor 
	mse = 17.62, 
	rmse = 4.2, 
	r2_score= 0.8007561961778287


* The model made by using gridSearchCV has improved the r2_score to 80 %

In [72]:
feat_imp= pd.DataFrame(model2.feature_importances_, trainx.columns, columns= ["importance"])

feat_imp.sort_values("importance", ascending = False)

Unnamed: 0,importance
diff_long,0.723064
diff_lat,0.226937
year,0.023631
hour,0.010481
month,0.005861
date,0.005128
day_of_week,0.003445
no_of_passenger,0.001453


* We can see that the diff_long and diff_lat have higher importance than all other features. 
* This means that price is highly dependent on distance travelled 

### Model 3
We will build this model using AdaBoostRegressor 

In [32]:
model3 = AdaBoostRegressor(RandomForestRegressor(max_depth=1),n_estimators=50,learning_rate= 1).fit(trainx,trainy)

In [33]:
# predictions 
pred3 = model3.predict(testx)

# Evaluation
mse3 = mean_squared_error(testy,pred3)
r3= r2_score(testy,pred3)
print("model Adaboost Regressor \n\tmse = {}, \n\trmse = {}, \n\tr2_score= {}".format(round(mse3,2),round(np.sqrt(mse3),2),r3))

model Adaboost Regressor 
	mse = 45.23, 
	rmse = 6.73, 
	r2_score= 0.4884955033185908


* This model is not performing well 

### Model 4 
We will build this model using Gradient Boosting Regressor

In [35]:
# Building model
model4 = GradientBoostingRegressor()
model4.fit(trainx,trainy)

GradientBoostingRegressor()

In [36]:
# predictions 
pred4 = model4.predict(testx)

# Evaluation
mse4 = mean_squared_error(testy,pred4)
r4= r2_score(testy,pred4)
print("model Gradient Boosting Regressor \n\tmse = {}, \n\trmse = {}, \n\tr2_score= {}".format(round(mse4,2),round(np.sqrt(mse4),2),r4))

model Gradient Boosting Regressor 
	mse = 17.24, 
	rmse = 4.15, 
	r2_score= 0.8050586677646006


* This model is giving a r2_score of 80.5% 

### Model 5
We will build thos model using XGboost Regressor

In [38]:
# Building model
model5 = xg.XGBRegressor()
model5.fit(trainx,trainy)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
             gamma=0, gpu_id=-1, importance_type=None,
             interaction_constraints='', learning_rate=0.300000012,
             max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
             monotone_constraints='()', n_estimators=100, n_jobs=8,
             num_parallel_tree=1, predictor='auto', random_state=0, reg_alpha=0,
             reg_lambda=1, scale_pos_weight=1, subsample=1, tree_method='exact',
             validate_parameters=1, verbosity=None)

In [39]:
# predictions 
pred5 = model5.predict(testx)

# Evaluation
mse5 = mean_squared_error(testy,pred5)
r5= r2_score(testy,pred5)
print("model XGBoost Regressor \n\tmse = {}, \n\trmse = {}, \n\tr2_score= {}".format(round(mse5,2),round(np.sqrt(mse5),2),r5))

model XGBoost Regressor 
	mse = 21.11, 
	rmse = 4.59, 
	r2_score= 0.7612624161163248


* This model gives an r2_score of 76% 

### Feature engineering 

We will converte month, dayof week and hour into categorical data

In [51]:
# make a copy of dataset so that we do not tamper with original dataset 
taxi2= taxi.copy()

#### Converting months into seasons as- fall,winter,spring and summer 

In [52]:
# converting months into season to reduce the levels 
taxi2.month.value_counts()
taxi2.month.astype('str')
taxi2.month = taxi2.month.replace({(9,10,11):'fall',
                   (12,1,2):'winter',
                    (3,4,5):'spring',
                    (6,7,8):'summer'})

#### Converting day of weeek as - weekend and weekday

In [53]:
# converting days of week into weekend and not weekend

taxi2.day_of_week.value_counts()
taxi2.day_of_week = taxi2.day_of_week.replace({(0,1,2,3,4):'weekday',
                                              (5,6):'weekend'})

#### Converting hour into intervals as - night, morning,afternoon and evening 

In [54]:
# splitting hour into 6 hr interval
taxi2.hour.value_counts()
taxi2.hour = taxi2.hour.replace({(23,24,0,1,2,3,4):'night',
                                (5,6,7,8,9,10):'morning',
                                (11,12,13,14,15,16):'afternoon',
                               (17,18,19,20,21,22):'evening'})

In [55]:
# final check on dAtatypes 
taxi2.dtypes

amount             float64
no_of_passenger      int64
year                 int64
month               object
date                 int64
day_of_week         object
hour                object
diff_long          float64
diff_lat           float64
dtype: object

* We can see that month, day_of_week and hour which were earlier int has now been converted into object datatype

### Splitting columns into numeric and categorical 

In [57]:
# split columns into categorical and numeric
fc = taxi2.select_dtypes(include="object").columns.values
nc = taxi2.select_dtypes(exclude = "object").columns.values
fc,nc

(array(['month', 'day_of_week', 'hour'], dtype=object),
 array(['amount', 'no_of_passenger', 'year', 'date', 'diff_long',
        'diff_lat'], dtype=object))

### Creating dummy variables for categorical columns

In [58]:
# create dummy variables 
for c in fc:
    dummy = pd.get_dummies(taxi2[c], drop_first= True, prefix = c)
    taxi2 = taxi2.join(dummy) 
print(taxi2.columns)

Index(['amount', 'no_of_passenger', 'year', 'month', 'date', 'day_of_week',
       'hour', 'diff_long', 'diff_lat', 'month_spring', 'month_summer',
       'month_winter', 'day_of_week_weekend', 'hour_evening', 'hour_morning',
       'hour_night'],
      dtype='object')


In [59]:
# remove old columns 
taxi2.drop(columns=fc, inplace= True)

In [60]:
taxi2.columns

Index(['amount', 'no_of_passenger', 'year', 'date', 'diff_long', 'diff_lat',
       'month_spring', 'month_summer', 'month_winter', 'day_of_week_weekend',
       'hour_evening', 'hour_morning', 'hour_night'],
      dtype='object')

### Splitting data into train and test 

In [61]:
# train test split 
trainx2,testx2, trainy2, testy2 = train_test_split(taxi2.drop("amount",1),
                                                  taxi2.amount,
                                                  test_size= 0.2)

### Model 6
We will build a baseline random forest model with this modified dataset

In [62]:
# model building
model6 = RandomForestRegressor()
model6.fit(trainx2,trainy2)

RandomForestRegressor()

In [69]:
# predictions 
pred6 = model6.predict(testx2)

# Evaluation
mse6 = mean_squared_error(testy2,pred6)
r6= r2_score(testy2,pred6)
print("model Random Forest Regressor \n\tmse = {}, \n\trmse = {}, \n\tr2_score= {}".format(round(mse6,2),round(np.sqrt(mse6),2),r6))

model Random Forest Regressor 
	mse = 22.67, 
	rmse = 4.76, 
	r2_score= 0.7505728357129633


* We can see the r2_score is 75 % which is slightly less than model 1 in which we did not use any feature engineering 

### Model 7 
We us try to build this model by tuning the above baseline model using GridSearchCV to see wether we can achive higher r2_score 

In [64]:
# build model on grid search cv 

params2 = ({'criterion':['squared_error', 'mse'],
          'max_depth':np.arange(2,10),
          'min_samples_split':np.arange(2,10)})

In [65]:
# run the gridSearchCV

rfreg = RandomForestRegressor()
grid2 = GridSearchCV(rfreg, param_grid = params2, scoring = 'neg_mean_squared_error', 
                    cv= 5, n_jobs=-1).fit(trainx2,trainy2)

In [66]:
# best params 
bp2 = grid2.best_params_
bp2

{'criterion': 'mse', 'max_depth': 8, 'min_samples_split': 8}

In [67]:
# build the model using best params 

model7 = RandomForestRegressor(criterion = 'mse',
                          max_depth = bp2['max_depth'],
                          min_samples_split= bp2['min_samples_split']).fit(trainx2,trainy2)

In [68]:
# predictions 
pred7= model7.predict(testx2)

In [70]:
# Evaluation
mse7 = mean_squared_error(testy2,pred7)
r7= r2_score(testy2,pred7)
print("model Random Forest Regressor \n\tmse = {}, \n\trmse = {}, \n\tr2_score= {}".format(round(mse7,2),round(np.sqrt(mse7),2),r7))

model Random Forest Regressor 
	mse = 21.71, 
	rmse = 4.66, 
	r2_score= 0.7611274056782187


* This model gives better r2_score than model 6 but not as good as model 2 

### Results

In [74]:
data= [['Model 1','Random Forest Regression',r1],
       ['Model 2','Random Forest Regression with GridSearchCV',r2],
       ['Model 3','AdaboostRegression',r3],
       ['Model 4','Gradient Boosting Regression',r4],
       ['Model 5','XGBoost Regression',r5],
       ['Model 6','Random Forest Regression with Feature Engineering ',r6],
       ['Model 7','Random Forest Regression with Feature Engineering and GridSearchCV',r7]]
       
df = pd.DataFrame(data,columns=['Alogithrm','Description','Ovearll Accuracy'])
df

Unnamed: 0,Alogithrm,Description,Ovearll Accuracy
0,Model 1,Random Forest Regression,0.77268
1,Model 2,Random Forest Regression with GridSearchCV,0.800756
2,Model 3,AdaboostRegression,0.488496
3,Model 4,Gradient Boosting Regression,0.805059
4,Model 5,XGBoost Regression,0.761262
5,Model 6,Random Forest Regression with Feature Engineer...,0.750573
6,Model 7,Random Forest Regression with Feature Engineer...,0.761127


### Conclusion
* The Gradient Boosting regression and Random Forest with GridSearchCV gives almost same results 
* Feature Engineering used in this model was found to reduce the r2_score slightly 
* Adaboost Regressor is not suitable model for this dataset 
* The best model for this dataset was found to be GradientBoosting