In [32]:
!wget https://nyc-tlc.s3.amazonaws.com/trip+data/fhv_tripdata_2021-01.parquet

--2022-05-24 15:36:50--  https://nyc-tlc.s3.amazonaws.com/trip+data/fhv_tripdata_2021-01.parquet
Resolving nyc-tlc.s3.amazonaws.com (nyc-tlc.s3.amazonaws.com)... 52.217.96.156
Connecting to nyc-tlc.s3.amazonaws.com (nyc-tlc.s3.amazonaws.com)|52.217.96.156|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 11886281 (11M) [binary/octet-stream]
Saving to: ‘fhv_tripdata_2021-01.parquet.2’


2022-05-24 15:36:51 (18.1 MB/s) - ‘fhv_tripdata_2021-01.parquet.2’ saved [11886281/11886281]



In [33]:
!wget https://nyc-tlc.s3.amazonaws.com/trip+data/fhv_tripdata_2021-02.parquet

--2022-05-24 15:36:51--  https://nyc-tlc.s3.amazonaws.com/trip+data/fhv_tripdata_2021-02.parquet
Resolving nyc-tlc.s3.amazonaws.com (nyc-tlc.s3.amazonaws.com)... 52.217.196.249
Connecting to nyc-tlc.s3.amazonaws.com (nyc-tlc.s3.amazonaws.com)|52.217.196.249|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 10645466 (10M) [binary/octet-stream]
Saving to: ‘fhv_tripdata_2021-02.parquet.2’


2022-05-24 15:36:52 (16.1 MB/s) - ‘fhv_tripdata_2021-02.parquet.2’ saved [10645466/10645466]



In [34]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt  

In [35]:
# Preview the first five rows of the January trips.
# NOTE(review): in the visible notebook df_jan is first assigned in a later
# cell (In [37]); on a fresh kernel this preview would raise NameError.
df_jan.head(n=5)

Unnamed: 0,dispatching_base_num,pickup_datetime,dropOff_datetime,PUlocationID,DOlocationID,SR_Flag,Affiliated_base_number
0,B00009,2021-01-01 00:27:00,2021-01-01 00:44:00,,,,B00009
1,B00009,2021-01-01 00:50:00,2021-01-01 01:07:00,,,,B00009
2,B00013,2021-01-01 00:01:00,2021-01-01 01:51:00,,,,B00013
3,B00037,2021-01-01 00:13:09,2021-01-01 00:21:26,,72.0,,B00037
4,B00037,2021-01-01 00:38:31,2021-01-01 00:53:44,,61.0,,B00037


In [36]:
# Preview the first five rows of the February trips.
# NOTE(review): in the visible notebook df_feb is first assigned in a later
# cell (In [37]); on a fresh kernel this preview would raise NameError.
df_feb.head(n=5)

Unnamed: 0,dispatching_base_num,pickup_datetime,dropOff_datetime,PUlocationID,DOlocationID,SR_Flag,Affiliated_base_number
0,B00013,2021-02-01 00:01:00,2021-02-01 01:33:00,,,,B00014
1,B00021,2021-02-01 00:55:40,2021-02-01 01:06:20,173.0,82.0,,B00021
2,B00021,2021-02-01 00:14:03,2021-02-01 00:28:37,173.0,56.0,,B00021
3,B00021,2021-02-01 00:27:48,2021-02-01 00:35:45,82.0,129.0,,B00021
4,B00037,2021-02-01 00:12:50,2021-02-01 00:26:38,,225.0,,B00037


In [37]:
# Load the raw January and February 2021 trip files.
df_jan = pd.read_parquet(path="fhv_tripdata_2021-01.parquet")
df_feb = pd.read_parquet(path="fhv_tripdata_2021-02.parquet")

# (rows, columns) of the raw January frame
df_jan.shape

(1154112, 7)

In [38]:
# Compute trip duration in minutes.
# The vectorized .dt accessor converts the whole timedelta column at once
# instead of calling a Python lambda for each of the ~1.15M rows.
df_jan["duration"] = (
    df_jan["dropOff_datetime"] - df_jan["pickup_datetime"]
).dt.total_seconds() / 60

In [39]:
# Average January trip duration, in minutes (before any outlier filtering)
df_jan["duration"].mean()

19.1672240937939

In [40]:
# (rows, columns) of the January frame now that the duration column has been added
df_jan.shape

(1154112, 8)

In [41]:
# Trips outside the 1-60 minute window, i.e. the records the filter below discards
df_jan_removed = df_jan.loc[
    (df_jan["duration"] < 1) | (df_jan["duration"] > 60)
]
df_jan_removed.shape

(44286, 8)

In [42]:
# Keep only trips lasting between 1 and 60 minutes (both bounds inclusive)
df_jan = df_jan.loc[
    (df_jan["duration"] >= 1) & (df_jan["duration"] <= 60)
]
df_jan.shape

(1109826, 8)

In [43]:
# Number of missing values in each column
df_jan.isnull().sum()

dispatching_base_num            0
pickup_datetime                 0
dropOff_datetime                0
PUlocationID               927008
DOlocationID               147907
SR_Flag                   1109826
Affiliated_base_number        773
duration                        0
dtype: int64

In [44]:
# Share of missing values per column, as a percentage of the remaining rows
miss_val_percent = df_jan.isnull().sum() * 100 / len(df_jan)
miss_val_percent

dispatching_base_num        0.000000
pickup_datetime             0.000000
dropOff_datetime            0.000000
PUlocationID               83.527328
DOlocationID               13.327044
SR_Flag                   100.000000
Affiliated_base_number      0.069651
duration                    0.000000
dtype: float64

In [45]:
# Use -1 as the sentinel for every missing value in the frame
df_jan = df_jan.replace(to_replace=np.nan, value=-1)

In [46]:
# Percentage of trips whose pickup location carries the -1 "missing" sentinel
PUmiss_val_percent = 100 * int((df_jan["PUlocationID"] == -1.0).sum()) / len(df_jan)
PUmiss_val_percent

83.52732770722618

In [47]:
# Percentage of trips whose drop-off location carries the -1 "missing" sentinel
DOmiss_val_percent = 100 * int((df_jan["DOlocationID"] == -1.0).sum()) / len(df_jan)
DOmiss_val_percent

13.327044059158823

In [48]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

In [49]:
def read_dataframe(filename, min_duration=1, max_duration=60):
    """Load a trip parquet file and prepare it for modelling.

    The function computes each trip's duration in minutes, keeps only the
    trips whose duration lies in ``[min_duration, max_duration]`` (both ends
    inclusive), replaces missing values with ``-1`` and casts the pickup and
    drop-off location IDs to strings so they can be used as categorical
    features.

    Parameters
    ----------
    filename : str or path-like
        Parquet file to read; passed unchanged to ``pd.read_parquet``. The
        file must provide ``pickup_datetime``, ``dropOff_datetime``,
        ``PUlocationID`` and ``DOlocationID`` columns.
    min_duration : int or float, default 1
        Shortest trip duration, in minutes, that is kept.
    max_duration : int or float, default 60
        Longest trip duration, in minutes, that is kept.

    Returns
    -------
    pandas.DataFrame
        The filtered trips with an added ``duration`` column (minutes) and
        string-typed ``PUlocationID`` / ``DOlocationID`` columns.
    """
    df = pd.read_parquet(filename)

    # Trip duration in minutes. The vectorized .dt accessor converts the whole
    # column at once instead of calling a Python lambda for every row.
    df["duration"] = (df["dropOff_datetime"] - df["pickup_datetime"]).dt.total_seconds() / 60

    # Keep trips whose duration is within the inclusive bounds. Rows with a
    # missing duration fail both comparisons and are dropped as well.
    df = df[(df["duration"] >= min_duration) & (df["duration"] <= max_duration)]

    # Fill missing values with -1 (applies to every column, not only the IDs)
    df = df.replace(np.nan, -1)

    # Location IDs are the categorical features used by the model
    categorical = ['PUlocationID', 'DOlocationID']

    # Cast the numeric IDs to strings so they are treated as categories
    df[categorical] = df[categorical].astype(str)

    return df

In [50]:
# January is the training set, February the validation set
df_train = read_dataframe(filename="fhv_tripdata_2021-01.parquet")
df_val = read_dataframe(filename="fhv_tripdata_2021-02.parquet")

# Shapes of both prepared frames, training first
(df_train.shape, df_val.shape)

((1109826, 8), (990113, 8))

In [51]:
# Pickup and drop-off location IDs are the categorical features used for modelling
categorical = ["PUlocationID", "DOlocationID"]

# Vectorizer that turns per-row feature dictionaries into a feature matrix
dv = DictVectorizer()

In [52]:
# One dictionary per training row; the vectorizer learns its feature set here
train_dicts = df_train.loc[:, categorical].to_dict(orient="records")
X_train = dv.fit_transform(train_dicts)

# (rows, encoded features)
X_train.shape

(1109826, 525)

In [53]:
# One dictionary per validation row, encoded with the vectorizer fitted on the
# training data (transform only, no refit)
val_dicts = df_val.loc[:, categorical].to_dict(orient="records")
X_val = dv.transform(val_dicts)

In [54]:
# The regression target is the trip duration in minutes
target = "duration"
y_train, y_val = df_train[target].values, df_val[target].values

In [55]:
# Fit a linear regression on the training data and report its training RMSE

# Train the model
lr = LinearRegression()
lr.fit(X_train, y_train)

# Predict on the same data the model was fitted on
y_pred = lr.predict(X_train)

# RMSE computed as sqrt(MSE): the `squared=False` keyword of
# mean_squared_error is deprecated since scikit-learn 1.4 and removed in 1.6,
# so taking the square root explicitly works on old and new versions alike.
rmse_train = float(np.sqrt(mean_squared_error(y_train, y_pred)))
rmse_train

10.528519107213093

In [56]:
# Evaluate the fitted linear regression on the validation data

# Predict on the held-out February trips
y_pred = lr.predict(X_val)

# RMSE computed as sqrt(MSE): the `squared=False` keyword of
# mean_squared_error is deprecated since scikit-learn 1.4 and removed in 1.6,
# so taking the square root explicitly works on old and new versions alike.
rmse_val = float(np.sqrt(mean_squared_error(y_val, y_pred)))

rmse_val

11.014283206231301