Install Required Libraries

In [None]:
# pip install xgboost

In [None]:
# pip install pandas-gbq

In [None]:
# pip install google-cloud-bigquery

Import Required Libraries

In [1]:
from google.cloud import bigquery
import pandas as pd
import pandas_gbq
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error
import numpy as np
import xgboost as xgb

Set Environment variable for Google Credentials

In [2]:
import os
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = os.getcwd() + "/Key.json"

Create Big Query Client

In [3]:
bg_client = bigquery.Client(project='bigquery-public-data')

Get the data set and display all the tables from the given dataset

In [4]:
data_set_ref = bg_client.dataset('chicago_taxi_trips', project='bigquery-public-data')
data_set = bg_client.get_dataset(data_set_ref)
for tab in bg_client.list_tables(data_set):
    print(tab.table_id)

taxi_trips


List all the columns and the corresponding details for the given table

In [5]:
tab = bg_client.get_table(data_set.table('taxi_trips'))
tab.schema

[SchemaField('unique_key', 'STRING', 'REQUIRED', 'Unique identifier for the trip.', (), None),
 SchemaField('taxi_id', 'STRING', 'REQUIRED', 'A unique identifier for the taxi.', (), None),
 SchemaField('trip_start_timestamp', 'TIMESTAMP', 'NULLABLE', 'When the trip started, rounded to the nearest 15 minutes.', (), None),
 SchemaField('trip_end_timestamp', 'TIMESTAMP', 'NULLABLE', 'When the trip ended, rounded to the nearest 15 minutes.', (), None),
 SchemaField('trip_seconds', 'INTEGER', 'NULLABLE', 'Time of the trip in seconds.', (), None),
 SchemaField('trip_miles', 'FLOAT', 'NULLABLE', 'Distance of the trip in miles.', (), None),
 SchemaField('pickup_census_tract', 'INTEGER', 'NULLABLE', 'The Census Tract where the trip began. For privacy, this Census Tract is not shown for some trips.', (), None),
 SchemaField('dropoff_census_tract', 'INTEGER', 'NULLABLE', 'The Census Tract where the trip ended. For privacy, this Census Tract is not shown for some trips.', (), None),
 SchemaField('

In [6]:
tab.num_rows

194069509

Create a Sample query and assign the data into Pandas Dataframe

In [7]:
QUERY = """
    SELECT EXTRACT(YEAR FROM trip_start_timestamp) AS YEAR, COUNT(unique_key) as TRIPS
    FROM `bigquery-public-data.chicago_taxi_trips.taxi_trips`
    GROUP BY YEAR
        """
df=pd.read_gbq(QUERY)
df

Downloading: 100%|██████████| 8/8 [00:00<00:00, 37.07rows/s]


Unnamed: 0,YEAR,TRIPS
0,2013,27217716
1,2017,24988003
2,2018,20732088
3,2016,31759339
4,2020,3113687
5,2019,16477365
6,2015,32385875
7,2014,37395436


# Get Data

Lets take the data corresponding to Feb-2020 to build a base model

In [8]:
QUERY = """
    SELECT *
    FROM `bigquery-public-data.chicago_taxi_trips.taxi_trips`
    WHERE EXTRACT(YEAR FROM trip_start_timestamp) = 2020
    AND EXTRACT(MONTH FROM trip_start_timestamp) = 2
    """

df=pd.read_gbq(QUERY)
df.head(10)

Downloading: 100%|██████████| 1122124/1122124 [05:31<00:00, 3384.17rows/s]


Unnamed: 0,unique_key,taxi_id,trip_start_timestamp,trip_end_timestamp,trip_seconds,trip_miles,pickup_census_tract,dropoff_census_tract,pickup_community_area,dropoff_community_area,...,extras,trip_total,payment_type,company,pickup_latitude,pickup_longitude,pickup_location,dropoff_latitude,dropoff_longitude,dropoff_location
0,81d77e9c651ca437ee16526eddfca2668e84ecbc,d0653a3c1a990641559a9e6e7c0e07db3765fe11e48ad7...,2020-02-01 00:00:00+00:00,2020-02-01 00:00:00+00:00,360.0,1.5,,,,,...,0.0,9.0,Credit Card,Choice Taxi Association,,,,,,
1,018a88115fcf57843a50a9ce0dd8abb2201b7ac4,7fd2bbba58b9c27f54afededbaa4f5fd4618e36e28feea...,2020-02-01 00:00:00+00:00,2020-02-01 00:00:00+00:00,360.0,0.0,,,32.0,8.0,...,0.0,8.75,Credit Card,Taxi Affiliation Services,41.878866,-87.625192,POINT (-87.6251921424 41.8788655841),41.899602,-87.633308,POINT (-87.6333080367 41.899602111)
2,23c0fad208015aa6f9e9bbdb6ad413209fcda715,07f6fa4e48d169ab189829ad0db1693534a26518982d79...,2020-02-01 00:00:00+00:00,2020-02-01 00:30:00+00:00,1740.0,12.66,,,56.0,8.0,...,4.0,46.2,Credit Card,Flash Cab,41.792592,-87.769615,POINT (-87.7696154528 41.7925923603),41.899602,-87.633308,POINT (-87.6333080367 41.899602111)
3,2df0c96d6b1ba75ff23d7a3fd5668a2eb73c320a,f5cd98dbe13bb8346311b0c44d0c9e9bf36e52125a8f65...,2020-02-01 00:00:00+00:00,2020-02-01 00:15:00+00:00,683.0,2.52,,,8.0,28.0,...,0.0,10.25,Cash,Nova Taxi Affiliation Llc,41.899602,-87.633308,POINT (-87.6333080367 41.899602111),41.874005,-87.663518,POINT (-87.6635175498 41.874005383)
4,0e195601e86785f1ec2216e4b679650b88d8019d,35057a271731c5b976bda25efe85aa0c1901d0a5fc9ba2...,2020-02-01 00:00:00+00:00,2020-02-01 00:00:00+00:00,420.0,1.2,,,8.0,32.0,...,1.0,9.75,Credit Card,Chicago Independents,41.899602,-87.633308,POINT (-87.6333080367 41.899602111),41.878866,-87.625192,POINT (-87.6251921424 41.8788655841)
5,1eb8af867ae06ad5be17db0d8ca873c4ad562687,dd44fc34c754eef9524e1f608dce6eea15b6f7208d08c4...,2020-02-01 00:00:00+00:00,2020-02-01 00:15:00+00:00,678.0,3.05,,,28.0,7.0,...,0.0,14.5,Credit Card,Chicago Carriage Cab Corp,41.874005,-87.663518,POINT (-87.6635175498 41.874005383),41.922686,-87.649489,POINT (-87.6494887289 41.9226862843)
6,b4fdf6c68de6d53c7cc337b4d2e3323324f11796,6bc3520aa3e5055fdd502ad4db2b419e40a2579182d817...,2020-02-01 00:00:00+00:00,2020-02-01 00:15:00+00:00,1118.0,4.17,,,6.0,6.0,...,0.0,13.25,Cash,Sun Taxi,41.944227,-87.655998,POINT (-87.6559981815 41.9442266014),41.944227,-87.655998,POINT (-87.6559981815 41.9442266014)
7,ed52441bc28f5f2d22515ba048a69e9066aa9b53,19bdccbc936f2f02882b69fd9e71ad97b9e783ba96a8c3...,2020-02-01 00:00:00+00:00,2020-02-01 00:00:00+00:00,212.0,0.4,,,6.0,6.0,...,0.0,4.5,Cash,Flash Cab,41.944227,-87.655998,POINT (-87.6559981815 41.9442266014),41.944227,-87.655998,POINT (-87.6559981815 41.9442266014)
8,ba8e167fd614d76bec89951efc3c2439e321e767,f1ed8794238deb1c8f7c3759aa096ae20e3e769cc4c9ad...,2020-02-01 00:00:00+00:00,2020-02-01 00:15:00+00:00,593.0,2.8,,,76.0,,...,4.0,17.7,Credit Card,Metro Jet Taxi A,41.980264,-87.913625,POINT (-87.913624596 41.9802643146),,,
9,af1e6f3a4b3b0a2d981fd6493d80fcc3fd6f0ffd,51d13e9ec1a04be36d87a3746e0e4f8c68ca5ed59e6f7a...,2020-02-01 00:00:00+00:00,2020-02-01 00:00:00+00:00,710.0,1.8,,,8.0,8.0,...,1.0,9.75,Cash,City Service,41.899602,-87.633308,POINT (-87.6333080367 41.899602111),41.899602,-87.633308,POINT (-87.6333080367 41.899602111)


In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1122124 entries, 0 to 1122123
Data columns (total 23 columns):
 #   Column                  Non-Null Count    Dtype              
---  ------                  --------------    -----              
 0   unique_key              1122124 non-null  object             
 1   taxi_id                 1122124 non-null  object             
 2   trip_start_timestamp    1122124 non-null  datetime64[ns, UTC]
 3   trip_end_timestamp      1122076 non-null  datetime64[ns, UTC]
 4   trip_seconds            1121943 non-null  float64            
 5   trip_miles              1122118 non-null  float64            
 6   pickup_census_tract     711601 non-null   float64            
 7   dropoff_census_tract    709129 non-null   float64            
 8   pickup_community_area   1042701 non-null  float64            
 9   dropoff_community_area  1021866 non-null  object             
 10  fare                    1122014 non-null  float64            
 11  tips       

For Model purpose, lets choose only the below fields
1. trip_start_timestamp
2. pickup_latitude, pickup_longitude
3. dropoff_latitude, dropoff_longitude
4. compare
5. fare - This field will be our label to predict

In [10]:
df_for_model = df[['trip_start_timestamp','pickup_latitude','pickup_longitude','dropoff_latitude','dropoff_longitude','company','fare']]
df_for_model.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1122124 entries, 0 to 1122123
Data columns (total 7 columns):
 #   Column                Non-Null Count    Dtype              
---  ------                --------------    -----              
 0   trip_start_timestamp  1122124 non-null  datetime64[ns, UTC]
 1   pickup_latitude       1042806 non-null  float64            
 2   pickup_longitude      1042806 non-null  float64            
 3   dropoff_latitude      1024316 non-null  float64            
 4   dropoff_longitude     1024316 non-null  float64            
 5   company               1122124 non-null  object             
 6   fare                  1122014 non-null  float64            
dtypes: datetime64[ns, UTC](1), float64(5), object(1)
memory usage: 59.9+ MB


In [11]:
df_for_model=df_for_model.dropna()
df_for_model.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1010127 entries, 1 to 1122123
Data columns (total 7 columns):
 #   Column                Non-Null Count    Dtype              
---  ------                --------------    -----              
 0   trip_start_timestamp  1010127 non-null  datetime64[ns, UTC]
 1   pickup_latitude       1010127 non-null  float64            
 2   pickup_longitude      1010127 non-null  float64            
 3   dropoff_latitude      1010127 non-null  float64            
 4   dropoff_longitude     1010127 non-null  float64            
 5   company               1010127 non-null  object             
 6   fare                  1010127 non-null  float64            
dtypes: datetime64[ns, UTC](1), float64(5), object(1)
memory usage: 61.7+ MB


Set Jupyter notbook to display all fields

In [12]:
pd.options.display.max_columns = None

Find if we have any records that have fare value of less than minimum taxi fare in Chicago, which is $2.70 and remove those rows as well 

In [13]:
df_for_model = df_for_model.loc[df_for_model['fare'] >= 2.70]
df_for_model.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1009336 entries, 1 to 1122123
Data columns (total 7 columns):
 #   Column                Non-Null Count    Dtype              
---  ------                --------------    -----              
 0   trip_start_timestamp  1009336 non-null  datetime64[ns, UTC]
 1   pickup_latitude       1009336 non-null  float64            
 2   pickup_longitude      1009336 non-null  float64            
 3   dropoff_latitude      1009336 non-null  float64            
 4   dropoff_longitude     1009336 non-null  float64            
 5   company               1009336 non-null  object             
 6   fare                  1009336 non-null  float64            
dtypes: datetime64[ns, UTC](1), float64(5), object(1)
memory usage: 61.6+ MB


Add new columns to indicate day of the week and hour of the day columns

In [14]:
df_for_model['trip_day_of_week']=df_for_model['trip_start_timestamp'].dt.dayofweek
df_for_model['trip_hour_of_day']=df_for_model['trip_start_timestamp'].dt.hour
df_for_model=df_for_model.drop(['trip_start_timestamp'],axis=1)
df_for_model.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1009336 entries, 1 to 1122123
Data columns (total 8 columns):
 #   Column             Non-Null Count    Dtype  
---  ------             --------------    -----  
 0   pickup_latitude    1009336 non-null  float64
 1   pickup_longitude   1009336 non-null  float64
 2   dropoff_latitude   1009336 non-null  float64
 3   dropoff_longitude  1009336 non-null  float64
 4   company            1009336 non-null  object 
 5   fare               1009336 non-null  float64
 6   trip_day_of_week   1009336 non-null  int64  
 7   trip_hour_of_day   1009336 non-null  int64  
dtypes: float64(5), int64(2), object(1)
memory usage: 69.3+ MB


Company field is a categorical value, so lets add dummy variables

In [15]:
df_dummy = pd.get_dummies(df_for_model['company'],prefix='comp', drop_first=True)
df_for_model = pd.concat([df_for_model,df_dummy],axis=1)
df_for_model=df_for_model.drop('company',axis=1)
df_for_model.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1009336 entries, 1 to 1122123
Data columns (total 51 columns):
 #   Column                                             Non-Null Count    Dtype  
---  ------                                             --------------    -----  
 0   pickup_latitude                                    1009336 non-null  float64
 1   pickup_longitude                                   1009336 non-null  float64
 2   dropoff_latitude                                   1009336 non-null  float64
 3   dropoff_longitude                                  1009336 non-null  float64
 4   fare                                               1009336 non-null  float64
 5   trip_day_of_week                                   1009336 non-null  int64  
 6   trip_hour_of_day                                   1009336 non-null  int64  
 7   comp_24 Seven Taxi                                 1009336 non-null  uint8  
 8   comp_2733 - 74600 Benny Jona                       1009336 non

Split the data into training and test set

In [16]:
X=df_for_model.drop(['fare'],axis=1)
y=df_for_model['fare']

In [17]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.20,random_state=123)
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(807468, 50)
(201868, 50)
(807468,)
(201868,)


For this project, lets try the below models and get the Root Mean Squared error (RMSE) for each model
1. XGBoost
2. Random Forest
3. Linear Regression
4. Gradient Boosting

Since the dataset is huge we dont have to do K-Fold

In [18]:
model=xgb.XGBRegressor(objective='reg:squarederror',learning_rate=0.001, verbosity=1,n_estimators=50)
model.fit(X_train, y_train)
xgb_predict = model.predict(X_test)
xgb_srsme = np.sqrt(mean_squared_error(xgb_predict, y_test))
print('RMSE for XGBoost is ', xgb_srsme)

RMSE for XGBoost is  100.12233575627516


In [19]:
rfr = RandomForestRegressor(n_estimators = 50)
rfr.fit(X_train, y_train)
rfr_predict = rfr.predict(X_test)
rfr_srsme = np.sqrt(mean_squared_error(rfr_predict, y_test))
print('RMSE for Random Forest is ', rfr_srsme)

RMSE for Random Forest is  104.82434396735114


In [20]:
LR = LinearRegression()
LR.fit(X_train, y_train)
LR_predict = LR.predict(X_test)
LR_srsme = np.sqrt(mean_squared_error(LR_predict, y_test))
print('RMSE for Linear Regression is ', LR_srsme)

RMSE for Linear Regression is  96.6185247120466


In [21]:
GBR = GradientBoostingRegressor(n_estimators=50)
GBR.fit(X_train, y_train)
GBR_predict = LR.predict(X_test)
GBR_srsme = np.sqrt(mean_squared_error(GBR_predict, y_test))
print('RMSE for Gradient Boosting is ', GBR_srsme)

RMSE for Gradient Boosting is  96.6185247120466


Lets Calculate the baseline prediction and cross verify it against what we got

In [22]:
avg_fare=round(np.mean(y_train),2)
baseline_pred=np.repeat(avg_fare,y_test.shape[0])
baseline_rmse=np.sqrt(mean_squared_error(baseline_pred, y_test))
print("Basline RMSE of Validation data :",baseline_rmse)

Basline RMSE of Validation data : 99.75063527948782


Based on the above baseline, it seems XG Boost is giving the best prediction.  Lets take that as our prototype