In [1]:
!pip install pyarrow



In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [3]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge

from sklearn.metrics import mean_squared_error

In [4]:
import os
# Show the kernel's working directory as a sanity check of where the notebook runs.
cwd = os.getcwd()
cwd

'/home/Ivan/MLOps_ZoomCamp/notebooks'

In [7]:
def read_dataframe(filename):
    """Load a yellow-taxi trip parquet file and prepare it for modelling.

    Adds a ``duration`` column (trip length in minutes, computed as
    ``tpep_dropoff_datetime - tpep_pickup_datetime``), keeps only trips
    lasting between 1 and 60 minutes inclusive, and casts the pickup and
    drop-off location IDs to strings.

    Parameters
    ----------
    filename : str or path-like
        Location of the parquet file; passed straight to ``pd.read_parquet``.

    Returns
    -------
    pandas.DataFrame
        An independent copy of the filtered rows, so callers can add or
        modify columns without triggering ``SettingWithCopyWarning``.
    """
    df = pd.read_parquet(filename)

    # Vectorised timedelta -> minutes conversion (no per-row Python call).
    trip_time = df["tpep_dropoff_datetime"] - df["tpep_pickup_datetime"]
    df["duration"] = trip_time.dt.total_seconds() / 60

    # .copy() detaches the filtered rows from the unfiltered frame, so the
    # column assignment below (and any later ones by the caller) operates
    # on a real frame rather than on a slice.
    df = df[(df["duration"] >= 1) & (df["duration"] <= 60)].copy()

    categorical = ["PULocationID", "DOLocationID"]
    df[categorical] = df[categorical].astype(str)

    return df

In [8]:
# Train on January 2022 trips, validate on February 2022 trips.
# The data directory is addressed relative to the notebook's working
# directory (the `notebooks/` folder next to `data/`), so the notebook is
# not tied to one user's home directory.
DATA_DIR = os.path.join("..", "data")

df_train = read_dataframe(os.path.join(DATA_DIR, "yellow_tripdata_2022-01.parquet"))
df_val = read_dataframe(os.path.join(DATA_DIR, "yellow_tripdata_2022-02.parquet"))

In [9]:
# Row counts of the training and validation frames after filtering.
df_train.shape[0], df_val.shape[0]

(2421440, 2918187)

In [None]:
# Combine pickup and drop-off zone into a single "route" feature, e.g. "43_151".
# The same transformation is applied to both the training and validation frames.
for frame in (df_train, df_val):
    frame["PU_DO"] = frame["PULocationID"] + "_" + frame["DOLocationID"]

In [None]:
# Peek at the first rows only; the full frame has millions of rows.
df_val.head()

In [10]:
# Model inputs: the combined pickup/drop-off route as the categorical feature
# (alternative: the two raw IDs, ["PULocationID", "DOLocationID"]) plus the
# trip distance as the numerical feature.
categorical = ["PU_DO"]
numerical = ["trip_distance"]
feature_columns = categorical + numerical

# One dict per trip, the input format DictVectorizer expects.
train_dicts = df_train[feature_columns].to_dict(orient="records")
val_dicts = df_val[feature_columns].to_dict(orient="records")

# Fit the vectorizer on the training data only, then reuse it for validation
# so both matrices share the same feature columns.
dv = DictVectorizer()
X_train = dv.fit_transform(train_dicts)
X_val = dv.transform(val_dicts)

In [11]:
# Regression target: trip duration in minutes, as plain NumPy arrays.
target = "duration"
y_train = df_train[target].to_numpy()
y_val = df_val[target].to_numpy()

In [12]:
# Baseline: ordinary least-squares regression fitted on the training matrix.
lr = LinearRegression()
lr.fit(X_train, y_train)

y_pred = lr.predict(X_val)

# Validation RMSE (minutes). Taking the square root of the MSE explicitly
# avoids the `squared=False` argument, which scikit-learn deprecated in 1.4
# and removed in later releases.
np.sqrt(mean_squared_error(y_val, y_pred))

7.786133771493894

In [None]:
# L1-regularised linear regression (alpha=0.01) on the same features.
la = Lasso(alpha=0.01)
la.fit(X_train, y_train)

y_pred = la.predict(X_val)

# Validation RMSE (minutes). Taking the square root of the MSE explicitly
# avoids the `squared=False` argument, which scikit-learn deprecated in 1.4
# and removed in later releases.
np.sqrt(mean_squared_error(y_val, y_pred))

In [None]:
# L2-regularised linear regression (alpha=0.01) on the same features.
lg = Ridge(alpha=0.01)
lg.fit(X_train, y_train)

y_pred = lg.predict(X_val)

# Validation RMSE (minutes). Taking the square root of the MSE explicitly
# avoids the `squared=False` argument, which scikit-learn deprecated in 1.4
# and removed in later releases.
np.sqrt(mean_squared_error(y_val, y_pred))

In [None]:
# Compare the distribution of predicted and actual trip durations.
# `y_pred` holds predictions for the validation matrix (from whichever model
# cell ran last), so it is compared against `y_val`, not the training target.
# `sns.distplot` is deprecated; a density-normalised `histplot` with a KDE
# overlay is the replacement seaborn documents for it.
fig, ax = plt.subplots()
sns.histplot(y_pred, bins=50, stat="density", kde=True, color="C0", label="prediction", ax=ax)
sns.histplot(y_val, bins=50, stat="density", kde=True, color="C1", label="actual", ax=ax)
ax.set(
    xlabel="trip duration (minutes)",
    ylabel="density",
    title="Predicted vs. actual duration (validation set)",
)
ax.legend();