In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import  OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_absolute_error
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore") # to avoid warnings

import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio

In [2]:
# Load the Uber fares sample straight from its public S3 location.
DATASET_URL = "https://full-stack-bigdata-datasets.s3.eu-west-3.amazonaws.com/Machine+Learning+Supervis%C3%A9/Decision+trees/uber.csv"
dataset = pd.read_csv(DATASET_URL)
# Preview the first rows to check that the file was parsed as expected.
dataset.head()

Unnamed: 0.1,Unnamed: 0,key,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,48462598,2015-05-07 10:24:44.0000004,13.0,2015-05-07 10:24:44 UTC,-73.971664,40.797035,-73.958939,40.777649,1
1,6637611,2014-07-09 09:14:04.0000002,5.5,2014-07-09 09:14:04 UTC,-73.991635,40.749855,-73.98825,40.741341,2
2,8357193,2013-11-11 18:51:00.000000240,8.5,2013-11-11 18:51:00 UTC,-73.982352,40.777042,-73.995912,40.759757,1
3,40466112,2014-05-22 01:54:00.00000069,19.0,2014-05-22 01:54:00 UTC,-73.991455,40.7517,-73.936357,40.812327,1
4,35405035,2011-06-21 23:37:33.0000002,7.7,2011-06-21 23:37:33 UTC,-73.974749,40.756255,-73.952276,40.778332,1


In [3]:
# Structural overview of the raw dataset: each entry pairs the heading that is
# printed with the summary object that is rendered right below it.
overview = [
    # (rows, columns)
    ("The shape of the dataset is :", dataset.shape),
    # column labels
    ("The columns of the dataset :", dataset.columns),
    # dtype of every column
    ("The Type of columns of the dataset :", dataset.dtypes),
    # descriptive statistics for numeric and non-numeric columns alike
    (" Some statistical information about the dataset :", dataset.describe(include="all")),
    # percentage of missing values per column
    (" The pourcentage of missing value in the columns of the dataset:",
     100 * dataset.isnull().sum() / dataset.shape[0]),
]
for heading, summary in overview:
    print(heading)
    display(summary)

The shape of the dataset is :


(20000, 9)

The columns of the dataset :


Index(['Unnamed: 0', 'key', 'fare_amount', 'pickup_datetime',
       'pickup_longitude', 'pickup_latitude', 'dropoff_longitude',
       'dropoff_latitude', 'passenger_count'],
      dtype='object')

The Type of columns of the dataset :


Unnamed: 0             int64
key                   object
fare_amount          float64
pickup_datetime       object
pickup_longitude     float64
pickup_latitude      float64
dropoff_longitude    float64
dropoff_latitude     float64
passenger_count        int64
dtype: object

 Some statistical information about the dataset :


Unnamed: 0.1,Unnamed: 0,key,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
count,20000.0,20000,20000.0,20000,20000.0,20000.0,20000.0,20000.0,20000.0
unique,,20000,,19967,,,,,
top,,2015-05-07 10:24:44.0000004,,2012-08-28 14:03:00 UTC,,,,,
freq,,1,,2,,,,,
mean,27679490.0,,11.35815,,-72.490431,39.918498,-72.459891,39.923345,1.69015
std,16011230.0,,9.89199,,10.461597,6.051561,10.564266,6.90152,1.311384
min,3949.0,,-23.7,,-75.419276,-74.00619,-75.423067,-73.991765,0.0
25%,13834760.0,,6.0,,-73.992075,40.734733,-73.991423,40.734105,1.0
50%,27697240.0,,8.5,,-73.981904,40.752554,-73.980305,40.752997,1.0
75%,41480820.0,,12.5,,-73.967229,40.767075,-73.963509,40.768348,2.0


 The pourcentage of missing value in the columns of the dataset:


Unnamed: 0           0.0
key                  0.0
fare_amount          0.0
pickup_datetime      0.0
pickup_longitude     0.0
pickup_latitude      0.0
dropoff_longitude    0.0
dropoff_latitude     0.0
passenger_count      0.0
dtype: float64

#### The `fare_amount` column contains negative values (minimum: -23.7), which cannot be valid fares.

#### We drop the useless columns and the rows whose fare is not strictly positive.

In [4]:
# Identifier-like columns that are not used as features
useless_cols = ["Unnamed: 0", "key"]
dataset = dataset.drop(columns=useless_cols)
# Keep only strictly positive fares: this removes the negative fares and also any
# zero fare, for which a logarithm of the target would not be finite.
mask = dataset['fare_amount'] > 0
# .copy() makes the filtered frame independent of the original one, so the column
# assignments performed in later cells do not act on a slice (SettingWithCopy).
dataset = dataset.loc[mask, :].copy()
print('Negative rows of fare_amount column and useless columns are dropped of the dataset ...')

Negative rows of fare_amount column and useless columns are dropped of the dataset ...


In [18]:
# Histogram of the target variable, used to check how skewed it is
fig = px.histogram(
    x=dataset['fare_amount'],
    nbins=120,
    title="Distribution of fare_amount ",
)
fig.show()

In [20]:
# Replace the target by its base-10 logarithm.
# NOTE: this overwrites fare_amount, so running the cell a second time applies the
# logarithm twice; re-run the notebook from the top instead.
dataset = dataset.assign(fare_amount=np.log10(dataset['fare_amount']))

In [23]:
# Distribution of the target after the log10 transform applied in the previous cell.
# The title states the log scale so this figure is not mistaken for the raw-fare
# histogram plotted earlier.
fig = px.histogram(x = dataset['fare_amount'], nbins = 120, title = "Distribution of log10(fare_amount)")
fig.show()

#### Convert `pickup_datetime` to datetime format and extract the year, month, day and weekday:

In [8]:
# Parse the raw pickup timestamps once, then derive the calendar features from them.
pickup_ts = pd.to_datetime(dataset["pickup_datetime"])

# Series.dt.weekday numbers the days from 0 (Monday) to 6 (Sunday).
weekdays_dict = dict(enumerate(
    ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
))

# Append year / month / day / weekday and drop the original timestamp column,
# which is not used as a feature itself.
dataset = (
    dataset
    .assign(
        year=pickup_ts.dt.year,
        month=pickup_ts.dt.month,
        day=pickup_ts.dt.day,
        weekday=pickup_ts.dt.weekday.map(weekdays_dict),
    )
    .drop('pickup_datetime', axis = 1)
)
dataset.head()

Unnamed: 0,fare_amount,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,year,month,day,weekday
0,13.0,-73.971664,40.797035,-73.958939,40.777649,1,2015,5,7,Thursday
1,5.5,-73.991635,40.749855,-73.98825,40.741341,2,2014,7,9,Wednesday
2,8.5,-73.982352,40.777042,-73.995912,40.759757,1,2013,11,11,Monday
3,19.0,-73.991455,40.7517,-73.936357,40.812327,1,2014,5,22,Thursday
4,7.7,-73.974749,40.756255,-73.952276,40.778332,1,2011,6,21,Tuesday


#### Computing the ride distance from the GPS coordinates

In [9]:
def haversine(lon_1, lon_2, lat_1, lat_2, radius_km=6371.0):
    """Great-circle distance in kilometres between two points given in degrees.

    Note the argument order: both longitudes come first, then both latitudes.
    Scalars, NumPy arrays and pandas Series are all accepted; array-like inputs
    are processed element-wise.

    Parameters
    ----------
    lon_1, lon_2 : longitude of the first and of the second point, in degrees.
    lat_1, lat_2 : latitude of the first and of the second point, in degrees.
    radius_km : sphere radius in kilometres (default: 6371, mean Earth radius).

    Returns
    -------
    Distance(s) along the sphere surface, in kilometres.
    """
    # Trigonometric functions work in radians
    lon_1, lon_2, lat_1, lat_2 = map(np.radians, [lon_1, lon_2, lat_1, lat_2])

    diff_lon = lon_2 - lon_1
    diff_lat = lat_2 - lat_1

    # Haversine term: mathematically within [0, 1] for valid coordinates
    hav = np.sin(diff_lat / 2.0) ** 2 + np.cos(lat_1) * np.cos(lat_2) * np.sin(diff_lon / 2.0) ** 2
    # Floating-point rounding can push the term marginally above 1 for (nearly)
    # antipodal points, which would make arcsin return NaN; clamp the upper bound.
    # Negative values (only reachable with invalid latitudes) are left untouched
    # so that bad input still surfaces as NaN.
    hav = np.minimum(hav, 1.0)

    distance_km = 2 * radius_km * np.arcsin(np.sqrt(hav))

    return distance_km

In [10]:
# Create the ride_distance column (km) from the pickup / dropoff GPS coordinates.
# haversine only uses NumPy element-wise operations, so it is called once on whole
# columns instead of once per row through DataFrame.apply(axis=1).
dataset['ride_distance'] = haversine(
    dataset['pickup_longitude'], dataset['dropoff_longitude'],
    dataset['pickup_latitude'], dataset['dropoff_latitude'],
)
dataset.head()

Unnamed: 0,fare_amount,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,year,month,day,weekday,ride_distance
0,13.0,-73.971664,40.797035,-73.958939,40.777649,1,2015,5,7,Thursday,2.407225
1,5.5,-73.991635,40.749855,-73.98825,40.741341,2,2014,7,9,Wednesday,0.988729
2,8.5,-73.982352,40.777042,-73.995912,40.759757,1,2013,11,11,Monday,2.235651
3,19.0,-73.991455,40.7517,-73.936357,40.812327,1,2014,5,22,Thursday,8.183379
4,7.7,-73.974749,40.756255,-73.952276,40.778332,1,2011,6,21,Tuesday,3.099698


In [11]:
# Separate target variable Y from features X
# (Y is kept as a one-column DataFrame, X holds every other column)
X = dataset.loc[:, dataset.columns != 'fare_amount']
Y = dataset.loc[:, dataset.columns == 'fare_amount']
print("Labels Separated from features...")

# Detect numerical / categorical columns from their dtype name.
# Series.items() is used because Series.iteritems() was removed in pandas 2.0.
numeric_features = []
categorical_features = []
for i, t in X.dtypes.items():
    if ('float' in str(t)) or ('int' in str(t)):
        numeric_features.append(i)
    else:
        categorical_features.append(i)

print('Found numeric features:', numeric_features)
print('Found categorical features:', categorical_features)

# Hold out 20% of the rows for testing; the seed is fixed for reproducibility
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=0)

# Need to standardize features because we'll first use a linear regression as baseline model
numeric_transformer = StandardScaler()
# No missing values in categorical data, so we only need the one-hot encoding
categorical_transformer = OneHotEncoder(drop='first')
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Fit the preprocessing on the training set only, then apply it unchanged to the test set
X_train = preprocessor.fit_transform(X_train)
X_test = preprocessor.transform(X_test)


Labels Separated from features...
Found numeric features: ['pickup_longitude', 'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude', 'passenger_count', 'year', 'month', 'day', 'ride_distance']
Found categorical features: ['weekday']


### Train a baseline model 

In [12]:
# Baseline: ordinary least-squares regression on the preprocessed features
print("Train model...")
regressor = LinearRegression().fit(X_train, Y_train)  # fit() returns the fitted estimator
print("...Done.")

Train model...
...Done.


In [13]:
# R^2 of the current regressor on the training set and on the held-out test set
r2_train = regressor.score(X_train, Y_train)
r2_test = regressor.score(X_test, Y_test)
print("R2 score on training set : ", r2_train)
print("R2 score on test set : ", r2_test)

R2 score on training set :  0.02419859579741468
R2 score on test set :  0.017058651115981927


#### The baseline model performs very poorly (R² ≈ 0.02 on both sets), so we try a random forest regressor next.

In [14]:
print("Random Forest with default hyperparameters...")
# The target is continuous, so a regressor (not a classifier) is required.
# random_state is fixed (same seed as the train/test split) so that the fitted
# forest and its scores are reproducible from one run to the next.
regressor = RandomForestRegressor(random_state=0)
# Y_train is a one-column DataFrame; flatten it to the 1-D target shape that
# scikit-learn expects instead of relying on an implicit conversion.
regressor.fit(X_train, Y_train.values.ravel())

Random Forest with default hyperparameters...


RandomForestRegressor()

In [15]:
# R^2 of the random forest on the training set and on the held-out test set
r2_train = regressor.score(X_train, Y_train)
r2_test = regressor.score(X_test, Y_test)
print("R2 score on training set : ", r2_train)
print("R2 score on test set : ", r2_test)

R2 score on training set :  0.9668283551958815
R2 score on test set :  0.7776814323695133


In [16]:
# Hyperparameter search for the random forest.
# random_state is fixed (same seed as the train/test split) so that the search
# result and the reported scores are reproducible.
regressor = RandomForestRegressor(random_state=0)
params={
    'max_depth':[10,12,14],
    'min_samples_split':[4,8],
    'n_estimators':[60,80,100]
}
# 3-fold cross-validation over the 18 parameter combinations; with no `scoring`
# argument the estimator's own score (R^2 for a regressor) is used.
gridsearch = GridSearchCV(regressor, param_grid = params, cv = 3, verbose = 2)
# Y_train is a one-column DataFrame; flatten it to the 1-D target shape that
# scikit-learn expects instead of relying on an implicit conversion.
gridsearch.fit(X_train, Y_train.values.ravel())
print("Best hyperparameters : ", gridsearch.best_params_)
# best_score_ is the mean cross-validated R^2 of the best candidate (not an accuracy)
print("Best validation R2 score : ", gridsearch.best_score_)
# Print R^2 scores
print("R2 score on training set : ", gridsearch.score(X_train, Y_train))
print("R2 score on test set : ", gridsearch.score(X_test, Y_test))

# Predictions on training set
print("Predictions on training set...")
Y_train_pred = gridsearch.predict(X_train)
print("...Done.")
print(Y_train_pred)
print()

# Predictions on test set
print("Predictions on test set...")
Y_test_pred = gridsearch.predict(X_test)
print("...Done.")
print(Y_test_pred)
print()

# Print MAE
# NOTE(review): when the notebook is run from top to bottom, fare_amount has been
# replaced by its log10 before this point, so the errors, means and standard
# deviation below are expressed in log10 units rather than in fare units.
print("Mean Absolute Error on training set : ", mean_absolute_error(Y_train, Y_train_pred))
print("Mean Fare on training set : ", Y_train.mean())
print()
print("Mean Absolute Error on test set : ", mean_absolute_error(Y_test, Y_test_pred))
print("Mean Fare on test set : ", Y_test.mean())
print("Standard-deviation on test set : ", Y_test.std())

Fitting 3 folds for each of 18 candidates, totalling 54 fits
[CV] END .max_depth=10, min_samples_split=4, n_estimators=60; total time=   1.5s
[CV] END .max_depth=10, min_samples_split=4, n_estimators=60; total time=   1.5s
[CV] END .max_depth=10, min_samples_split=4, n_estimators=60; total time=   1.5s
[CV] END .max_depth=10, min_samples_split=4, n_estimators=80; total time=   2.0s
[CV] END .max_depth=10, min_samples_split=4, n_estimators=80; total time=   2.0s
[CV] END .max_depth=10, min_samples_split=4, n_estimators=80; total time=   2.0s
[CV] END max_depth=10, min_samples_split=4, n_estimators=100; total time=   2.5s
[CV] END max_depth=10, min_samples_split=4, n_estimators=100; total time=   2.5s
[CV] END max_depth=10, min_samples_split=4, n_estimators=100; total time=   2.5s
[CV] END .max_depth=10, min_samples_split=8, n_estimators=60; total time=   1.5s
[CV] END .max_depth=10, min_samples_split=8, n_estimators=60; total time=   1.5s
[CV] END .max_depth=10, min_samples_split=8, n_e

In [17]:
# Feature importance
# Rebuild the names of the preprocessed columns, in the order produced by the
# ColumnTransformer, so each importance can be matched with its feature.
column_names = []
for block_name, transformer, input_columns in preprocessor.transformers_:
    # Numeric columns keep their names; the one-hot encoder reports its own output names.
    output_columns = input_columns if block_name == 'num' else transformer.get_feature_names_out()
    column_names.extend(output_columns)

print("Names of columns corresponding to each coefficient: ", column_names)

# Importances of the best estimator found by the grid search, sorted in ascending order
feature_importance = pd.DataFrame(
    data = gridsearch.best_estimator_.feature_importances_,
    index = column_names,
    columns = ["feature_importances"],
).sort_values(by = 'feature_importances')

# Horizontal bar chart of the importances
fig = px.bar(feature_importance, orientation = 'h')
fig.update_layout(
    showlegend = False,
    margin = dict(l = 120),  # to avoid cropping of column names
)
fig.show()



Names of columns corresponding to each coefficient:  ['pickup_longitude', 'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude', 'passenger_count', 'year', 'month', 'day', 'ride_distance', 'weekday_Monday', 'weekday_Saturday', 'weekday_Sunday', 'weekday_Thursday', 'weekday_Tuesday', 'weekday_Wednesday']
