In [1]:
import pickle

In [2]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [3]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge

from sklearn.metrics import mean_squared_error

In [4]:
def read_dataframe(filename):
    df = pd.read_parquet(filename)

    df.lpep_dropoff_datetime = pd.to_datetime(df.lpep_dropoff_datetime)
    df.lpep_pickup_datetime = pd.to_datetime(df.lpep_pickup_datetime)

    df['duration'] = df.lpep_dropoff_datetime - df.lpep_pickup_datetime
    df.duration = df.duration.apply(lambda td: td.total_seconds()/60)

    df = df[(df.duration >= 1) & (df.duration <= 60)]

    categorical = ['PULocationID', 'DOLocationID']
    df[categorical] = df[categorical].astype(str)

    return df

In [5]:
df_train = read_dataframe("https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2021-01.parquet")
df_val = read_dataframe("https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2021-02.parquet")

df_train["PU_DO"] = df_train['PULocationID'] + "_" + df_train['DOLocationID']
df_val["PU_DO"] = df_val['PULocationID'] + "_" + df_val['DOLocationID']

In [17]:
categorical = ["PU_DO"]
numerical = ['trip_distance']

X_train = df_train[categorical + numerical]
X_val = df_val[categorical + numerical]

In [15]:
target = "duration"
y_train = df_train[target].values
y_val = df_val[target].values

In [19]:
lr = LinearRegression()
lr.fit(X_train, y_train)
y_pred = lr.predict(X_val)

mean_squared_error(y_val, y_pred)

147.83335148572633

In [21]:
with open('models/lin_reg.bin', 'wb') as f_out:
    pickle.dump(lr, f_out)

In [23]:
la = Lasso(0.01)
la.fit(X_train, y_train)
y_pred = la.predict(X_val)

mean_squared_error(y_val, y_pred)

147.83335183304166

In [24]:
with open('models/lasso.bin', 'wb') as f_out:
    pickle.dump(la, f_out)

In [25]:
rd = Ridge(0.01)
rd.fit(X_train, y_train)
y_pred = rd.predict(X_val)

mean_squared_error(y_val, y_pred)

147.83335148572633

In [26]:
with open('models/ridge.bin', 'wb') as f_out:
    pickle.dump(rd, f_out)