In [3]:
import pandas as pd

In [26]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import root_mean_squared_error

In [5]:
def read_dataframe(filename):
    df_original = pd.read_parquet(filename)

    df_original['duration'] = (df_original.tpep_dropoff_datetime - df_original.tpep_pickup_datetime)
    df_original['duration'] = df_original.duration.dt.total_seconds()/60

    df = df_original[(df_original.duration >= 1) & (df_original.duration <= 60) ]
    return df

In [7]:
train_df = read_dataframe('https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2021-01.parquet')
val_df = read_dataframe('https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2021-02.parquet')

In [8]:
train_df

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee,duration
0,1,2021-01-01 00:30:10,2021-01-01 00:36:12,1.0,2.10,1.0,N,142,43,2,8.00,3.00,0.5,0.00,0.00,0.3,11.80,2.5,,6.033333
2,1,2021-01-01 00:43:30,2021-01-01 01:11:06,1.0,14.70,1.0,N,132,165,1,42.00,0.50,0.5,8.65,0.00,0.3,51.95,0.0,,27.600000
3,1,2021-01-01 00:15:48,2021-01-01 00:31:01,0.0,10.60,1.0,N,138,132,1,29.00,0.50,0.5,6.05,0.00,0.3,36.35,0.0,,15.216667
4,2,2021-01-01 00:31:49,2021-01-01 00:48:21,1.0,4.94,1.0,N,68,33,1,16.50,0.50,0.5,4.06,0.00,0.3,24.36,2.5,,16.533333
5,1,2021-01-01 00:16:29,2021-01-01 00:24:30,1.0,1.60,1.0,N,224,68,1,8.00,3.00,0.5,2.35,0.00,0.3,14.15,2.5,,8.016667
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1369763,2,2021-01-31 23:04:00,2021-01-31 23:18:00,,7.74,,,159,259,0,22.15,0.00,0.5,0.00,0.00,0.3,22.95,,,14.000000
1369764,2,2021-01-31 23:03:00,2021-01-31 23:33:00,,8.89,,,229,181,0,27.78,0.00,0.5,7.46,0.00,0.3,38.54,,,30.000000
1369765,2,2021-01-31 23:29:00,2021-01-31 23:51:00,,7.43,,,41,70,0,32.58,0.00,0.5,0.00,6.12,0.3,39.50,,,22.000000
1369766,2,2021-01-31 23:25:00,2021-01-31 23:38:00,,6.26,,,74,137,0,16.85,0.00,0.5,3.90,0.00,0.3,24.05,,,13.000000


In [9]:
val_df

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee,duration
0,1,2021-02-01 00:40:47,2021-02-01 00:48:28,1.0,2.30,1.0,N,141,226,2,8.50,3.00,0.5,0.00,0.00,0.3,12.30,2.5,,7.683333
1,1,2021-02-01 00:07:44,2021-02-01 00:20:31,1.0,1.60,1.0,N,43,263,2,9.50,3.00,0.5,0.00,0.00,0.3,13.30,0.0,,12.783333
2,1,2021-02-01 00:59:36,2021-02-01 01:24:13,1.0,5.30,1.0,N,114,263,2,19.00,3.00,0.5,0.00,0.00,0.3,22.80,2.5,,24.616667
3,2,2021-02-01 00:03:26,2021-02-01 00:16:32,1.0,2.79,1.0,N,236,229,1,11.00,0.50,0.5,2.96,0.00,0.3,17.76,2.5,,13.100000
4,2,2021-02-01 00:20:20,2021-02-01 00:24:03,2.0,0.64,1.0,N,229,140,1,4.50,0.50,0.5,1.66,0.00,0.3,9.96,2.5,,3.716667
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1371703,2,2021-02-28 23:25:41,2021-02-28 23:42:35,,8.84,,,141,160,0,36.83,2.75,0.5,0.00,6.12,0.3,46.50,,,16.900000
1371705,2,2021-02-28 23:27:00,2021-02-28 23:41:00,,4.42,,,68,24,0,17.14,0.00,0.5,4.33,0.00,0.3,24.77,,,14.000000
1371706,2,2021-02-28 23:18:05,2021-02-28 23:26:48,,1.50,,,68,137,0,9.46,0.00,0.5,2.64,0.00,0.3,15.40,,,8.716667
1371707,2,2021-02-28 23:41:07,2021-03-01 00:13:44,,15.30,,,113,254,0,59.15,2.75,0.5,0.00,0.00,0.3,62.70,,,32.616667


In [10]:
dv = DictVectorizer()

In [14]:
categorical = ['PULocationID', 'DOLocationID']
train_df.loc[:, categorical] = train_df[categorical].astype(str)
val_df.loc[:, categorical] = val_df[categorical].astype(str)


  val_df.loc[:, categorical] = val_df[categorical].astype(str)
  val_df.loc[:, categorical] = val_df[categorical].astype(str)


In [13]:
train_df[categorical]

Unnamed: 0,PULocationID,DOLocationID
0,142,43
2,132,165
3,138,132
4,68,33
5,224,68
...,...,...
1369763,159,259
1369764,229,181
1369765,41,70
1369766,74,137


In [15]:
train_df_dicts = train_df[categorical].to_dict(orient='records')
val_df_dicts = val_df[categorical].to_dict(orient='records')


In [16]:
X_train = dv.fit_transform(train_df_dicts)
X_val = dv.fit_transform(val_df_dicts)


In [18]:
X_val

<1340859x518 sparse matrix of type '<class 'numpy.float64'>'
	with 2681718 stored elements in Compressed Sparse Row format>

In [20]:
target = 'duration'
y_train = train_df[target].values
y_val = val_df[target].values

In [28]:
lr = LinearRegression()
lr.fit(X_train, y_train)

In [29]:
y_pred = lr.predict(X_train)
y_pred

array([ 9.44069309, 36.28983788, 30.52815401, ..., 13.56134521,
        8.89557004, 26.79808284])

In [30]:
root_mean_squared_error(y_train, y_pred)

6.8456202458268525

In [32]:
y_val_pred = lr.predict(X_val)
y_val_pred

array([17.41515652,  9.51168649,  9.01362752, ..., 10.14813588,
       29.05700314, 12.89555611])

In [34]:
root_mean_squared_error(y_val, y_val_pred)

7.737343899500185