In [4]:
import os
import pandas as pd
import pickle
import seaborn as sns
import matplotlib.pyplot as plt
# Select seaborn's "darkgrid" plot style for figures drawn in this notebook.
sns.set_style("darkgrid")

In [5]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge

from sklearn.metrics import mean_squared_error

In [6]:
# Show which files are present in the local data directory.
data_dir = "data"
os.listdir(data_dir)

['green_tripdata_2021-02.parquet', 'green_tripdata_2021-01.parquet']

In [14]:
def read_dataframe(filename, min_duration=1, max_duration=60):
    """Load a trip-record parquet file and prepare it for modelling.

    The file must contain the columns ``lpep_pickup_datetime``,
    ``lpep_dropoff_datetime``, ``PULocationID`` and ``DOLocationID``.

    Parameters
    ----------
    filename : str or path-like
        Parquet file, passed straight to ``pd.read_parquet``.
    min_duration : float, default 1
        Shortest trip duration (in minutes, inclusive) that is kept.
    max_duration : float, default 60
        Longest trip duration (in minutes, inclusive) that is kept.

    Returns
    -------
    pandas.DataFrame
        A copy of the filtered rows with the two timestamp columns parsed as
        datetimes, an added ``duration`` column (minutes) and the two
        location-ID columns cast to ``str``.
    """
    df = pd.read_parquet(filename)

    pickup = pd.to_datetime(df["lpep_pickup_datetime"])
    dropoff = pd.to_datetime(df["lpep_dropoff_datetime"])
    df["lpep_dropoff_datetime"] = dropoff
    df["lpep_pickup_datetime"] = pickup

    # Trip duration in minutes.
    df["duration"] = (dropoff - pickup).dt.total_seconds() / 60

    # Remove outliers; `between` is inclusive on both ends and drops NaN
    # durations, exactly like the explicit `>=` / `<=` comparison.
    df = df[df["duration"].between(min_duration, max_duration)].copy()

    # Convert categorical features to strings (to get 1-hot encoding).
    categorical = ["PULocationID", "DOLocationID"]
    df[categorical] = df[categorical].astype(str)

    return df

# January is used for fitting, February for validation.
train_file = "data/green_tripdata_2021-01.parquet"
val_file = "data/green_tripdata_2021-02.parquet"

# Load and preprocess both months with the same routine.
df_train, df_val = (read_dataframe(path) for path in (train_file, val_file))

print("Train: ", df_train.shape)
print("Val: ", df_val.shape)

Train:  (73908, 21)
Val:  (61921, 21)


**Create new features**

In [15]:
# Join the pickup and drop-off location IDs (already strings) into one
# combined categorical feature, e.g. "43_151", on both data sets.
for trips in (df_train, df_val):
    trips["PU_DO"] = trips["PULocationID"] + "_" + trips["DOLocationID"]


In [18]:
# Columns fed to the model: one categorical and one numerical feature.
categorical = ["PU_DO"]
numerical = ["trip_distance"]
feature_cols = categorical + numerical

# One {column: value} dict per trip, the input format DictVectorizer expects.
train_dicts = df_train[feature_cols].to_dict(orient="records")
val_dicts = df_val[feature_cols].to_dict(orient="records")

# Fit the vectorizer on the training data only; the validation data is
# transformed with the already-fitted vectorizer.
dv = DictVectorizer()
X_train = dv.fit_transform(train_dicts)
X_val = dv.transform(val_dicts)

In [19]:
# Regression target: the trip duration column, as plain arrays.
target = "duration"
y_train, y_val = (frame[target].values for frame in (df_train, df_val))

**Linear Regression**

In [30]:
# Baseline model: plain linear regression fitted on the training features.
lr = LinearRegression()
lr.fit(X_train, y_train)

y_pred = lr.predict(X_val)

# Validation RMSE. The square root is taken explicitly because the
# `squared=False` argument of mean_squared_error was deprecated in
# scikit-learn 1.4 and removed in 1.6.
mean_squared_error(y_val, y_pred) ** 0.5

7.758715213828063

![Model Registry](imgs/model_registry.png)
After training, the models are saved into the model registry.

In [31]:
# Make sure the output directory exists; on a fresh checkout open() would
# otherwise fail with FileNotFoundError.
os.makedirs("./models", exist_ok=True)

# Persist the fitted vectorizer together with the model so that the same
# feature encoding can be re-applied when the model is loaded again.
with open("./models/lin_reg.bin", "wb") as f_out:
    pickle.dump((dv, lr), f_out)

**Lasso Regression**

Results from training should be saved (**`logged`**) so that they can be tracked. This allows the results of different runs to be compared.

![Experiment Tracking](imgs/experiment_tracker.png)

In [32]:
# Sweep the Lasso regularisation strength over alpha = 0.1**0 ... 0.1**(N_lasso - 1)
# and keep (alpha, validation RMSE, fitted model) for every run.
# NOTE: the loop rebinds `lr`, so after this cell `lr` refers to the last
# Lasso model, no longer to the LinearRegression fitted earlier.
N_lasso = 4
lasso_res = []
print("Lasso Regression")
for n in range(N_lasso):
    alpha = 0.1**n
    lr = Lasso(alpha=alpha)
    lr.fit(X_train, y_train)
    y_pred = lr.predict(X_val)
    # Explicit square root: `squared=False` was deprecated in scikit-learn 1.4
    # and removed in 1.6.
    rmse = mean_squared_error(y_val, y_pred) ** 0.5
    lasso_res.append((alpha, rmse, lr))
    print(f"alpha = {alpha:.4f} | RMSE = {rmse:.2f}")

Lasso Regression
alpha = 1.0000 | RMSE = 12.21
alpha = 0.1000 | RMSE = 12.14
alpha = 0.0100 | RMSE = 11.17
alpha = 0.0010 | RMSE = 9.23


In [33]:
# Sweep the Ridge regularisation strength over alpha = 0.1**0 ... 0.1**(N_ridge - 1)
# and keep (alpha, validation RMSE, fitted model) for every run.
# NOTE: the loop rebinds `lr`, so after this cell `lr` refers to the last
# Ridge model fitted here.
N_ridge = 4
ridge_res = []
print("Ridge Regression")
for n in range(N_ridge):
    alpha = 0.1**n
    lr = Ridge(alpha=alpha)
    lr.fit(X_train, y_train)
    y_pred = lr.predict(X_val)
    # Explicit square root: `squared=False` was deprecated in scikit-learn 1.4
    # and removed in 1.6.
    rmse = mean_squared_error(y_val, y_pred) ** 0.5
    ridge_res.append((alpha, rmse, lr))
    print(f"alpha = {alpha:.4f} | RMSE = {rmse:.2f}")


Ridge Regression
alpha = 1.0000 | RMSE = 7.70
alpha = 0.1000 | RMSE = 7.53
alpha = 0.0100 | RMSE = 7.51
alpha = 0.0010 | RMSE = 7.52


## Tracking the Training and the results

When training a model, the results of each run/experiment are logged with the experiment tracker. Alongside this, the models are saved in the model registry. These steps are usually not carried out inside a Jupyter notebook; instead, so-called "machine learning pipelines" are used. Training should be automated, with humans excluded from the training process as much as possible. This practice is borrowed from DevOps. Each stage of the ML pipeline has its own code:

![ML-Pipeline](imgs/ML_Pipeline.drawio.png)