In [1]:
import pandas as pd
import sklearn
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import root_mean_squared_error
from sklearn.linear_model import LinearRegression
import pickle
import os

In [2]:
# Columns used as model inputs: the numeric trip distance plus the pickup and
# drop-off location IDs, which are handled as categorical features.
numerical = ['trip_distance']
categorical = ['PULocationID', 'DOLocationID']

In [3]:
def read_dataframe(link):
    """Load a parquet file of trip records and prepare it for modelling.

    Parameters
    ----------
    link : str
        Path or URL of the parquet file; passed unchanged to
        ``pd.read_parquet``. The data must contain the columns
        ``tpep_pickup_datetime``, ``tpep_dropoff_datetime``,
        ``PULocationID`` and ``DOLocationID``.

    Returns
    -------
    pandas.DataFrame
        Only the trips lasting between 1 and 60 minutes (both inclusive),
        with an added ``duration`` column in minutes and the two location-ID
        columns converted to strings.
    """
    location_cols = ['PULocationID', 'DOLocationID']

    df = pd.read_parquet(link)

    # Trip length in minutes, derived from the pickup/drop-off timestamps.
    elapsed = df['tpep_dropoff_datetime'] - df['tpep_pickup_datetime']
    df['duration'] = elapsed.dt.total_seconds() / 60.0

    # Keep only trips of 1 to 60 minutes. The explicit copy matters: assigning
    # columns on the bare ``.loc`` slice triggers pandas'
    # SettingWithCopyWarning.
    in_range = (df['duration'] >= 1) & (df['duration'] <= 60)
    df = df.loc[in_range, :].copy()

    # Location IDs are identifiers, not magnitudes; store them as strings.
    df[location_cols] = df[location_cols].astype(str)
    return df

In [4]:
# Train on the 2024-01 file and validate on the 2024-02 file.
train_url = 'https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2024-01.parquet'
val_url = 'https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2024-02.parquet'

train_df = read_dataframe(train_url)
val_df = read_dataframe(val_url)

In [5]:
# Preview the first rows of the prepared training frame as a sanity check.
train_df.head()

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,Airport_fee,duration
0,2,2024-01-01 00:57:55,2024-01-01 01:17:43,1.0,1.72,1.0,N,186,79,2,17.7,1.0,0.5,0.0,0.0,1.0,22.7,2.5,0.0,19.8
1,1,2024-01-01 00:03:00,2024-01-01 00:09:36,1.0,1.8,1.0,N,140,236,1,10.0,3.5,0.5,3.75,0.0,1.0,18.75,2.5,0.0,6.6
2,1,2024-01-01 00:17:06,2024-01-01 00:35:01,1.0,4.7,1.0,N,236,79,1,23.3,3.5,0.5,3.0,0.0,1.0,31.3,2.5,0.0,17.916667
3,1,2024-01-01 00:36:38,2024-01-01 00:44:56,1.0,1.4,1.0,N,79,211,1,10.0,3.5,0.5,2.0,0.0,1.0,17.0,2.5,0.0,8.3
4,1,2024-01-01 00:46:51,2024-01-01 00:52:57,1.0,0.8,1.0,N,211,148,1,7.9,3.5,0.5,3.2,0.0,1.0,16.1,2.5,0.0,6.1


In [7]:
# One dict per training trip, holding only the model's input columns.
train_dicts = train_df[[*categorical, *numerical]].to_dict('records')

In [9]:
# Regression target: the trip duration column as a NumPy array.
y_train = train_df['duration'].values

# Fit the vectorizer on the training records and encode them in one pass;
# the fitted `dv` is reused later to encode the validation records.
dv = DictVectorizer()
X_train = dv.fit_transform(train_dicts)

In [10]:
# Fit a linear-regression baseline on the encoded training features.
lr = LinearRegression()
lr.fit(X_train, y_train)

In [11]:
# In-sample predictions: the model is scored on the same rows it was fitted on.
y_pred = lr.predict(X_train)

In [13]:
# Training-set RMSE, in the same unit as `duration` (minutes).
train_rmse = root_mean_squared_error(y_train, y_pred)
print(train_rmse)

7.952029670782532


In [15]:
# Encode the validation trips with the vectorizer fitted on the training set
# (transform only, no refitting), then score the model on them.
val_dicts = val_df[categorical + numerical].to_dict(orient='records')
X_val = dv.transform(val_dicts)
y_val = val_df['duration']
yval_pred = lr.predict(X_val)

# Pass the arguments in the documented (y_true, y_pred) order, matching the
# training-set evaluation.
print(root_mean_squared_error(y_val, yval_pred))

In [25]:
# Make sure the output directory for the saved model exists. `exist_ok=True`
# keeps the cell safe to re-run and avoids the check-then-create race of a
# separate `os.path.exists` test.
os.makedirs('models', exist_ok=True)

In [27]:
# Persist the fitted vectorizer and model together as a single (dv, lr) tuple
# so both can be restored from one file.
with open('models/lin_reg.bin', 'wb') as model_file:
    pickle.dump((dv, lr), model_file)