In [1]:
import os
import pandas as pd
import pickle
import seaborn as sns
import matplotlib.pyplot as plt
sns.set_style("darkgrid")

In [2]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge

from sklearn.metrics import mean_squared_error

In [3]:
import mlflow
# Tracking backend (SQLite file) and the experiment that groups all runs below.
TRACKING_URI = "sqlite:///mlflow.db"
EXPERIMENT_NAME = "nyc-taxi-experiment"

mlflow.set_tracking_uri(TRACKING_URI)
mlflow.set_experiment(EXPERIMENT_NAME)

2023/05/31 13:18:51 INFO mlflow.store.db.utils: Creating initial MLflow database tables...
2023/05/31 13:18:51 INFO mlflow.store.db.utils: Updating database tables
INFO  [alembic.runtime.migration] Context impl SQLiteImpl.
INFO  [alembic.runtime.migration] Will assume non-transactional DDL.
INFO  [alembic.runtime.migration] Running upgrade  -> 451aebb31d03, add metric step
INFO  [alembic.runtime.migration] Running upgrade 451aebb31d03 -> 90e64c465722, migrate user column to tags
INFO  [alembic.runtime.migration] Running upgrade 90e64c465722 -> 181f10493468, allow nulls for metric values
INFO  [alembic.runtime.migration] Running upgrade 181f10493468 -> df50e92ffc5e, Add Experiment Tags Table
INFO  [alembic.runtime.migration] Running upgrade df50e92ffc5e -> 7ac759974ad8, Update run tags with larger limit
INFO  [alembic.runtime.migration] Running upgrade 7ac759974ad8 -> 89d4b8295536, create latest metrics table
INFO  [89d4b8295536_create_latest_metrics_table_py] Migration complete!
INFO  

<Experiment: artifact_location='/media/userl/Ubuntu-DataStora/Learning/MLOps_Zoomcamp/mlops-material/2-Experiment-Tracking/mlruns/1', creation_time=1685531933850, experiment_id='1', last_update_time=1685531933850, lifecycle_stage='active', name='nyc-taxi-experiment', tags={}>

**Download the data**

- [NYC taxi data](https://www1.nyc.gov/site/tlc/about/tlc-trip-record-data.page) (very large datasets, more than 1,000,000 rows each)
    - 2022, Yellow, January
    - 2022, Yellow, February
- OR (much smaller datasets, fewer than 100,000 rows each)
    - 2021, Green, January
    - 2021, Green, February

In [4]:
def read_dataframe(filename):
    """Load one file of taxi trips and prepare it for duration modelling.

    The pickup/dropoff timestamp columns are located by inspecting the columns
    of the loaded frame (``lpep_*`` as used by the green files in this
    notebook, ``tpep_*`` as used by the yellow ones) instead of searching the
    file name for a year, so the result no longer depends on how the file or
    its parent directories are named.

    Parameters
    ----------
    filename : str
        Path of the parquet file to read.

    Returns
    -------
    pandas.DataFrame
        Trips lasting between 1 and 60 minutes (inclusive), with an added
        ``duration`` column (minutes) and ``PULocationID`` / ``DOLocationID``
        converted to strings.

    Raises
    ------
    ValueError
        If neither the ``lpep_*`` nor the ``tpep_*`` timestamp pair is present.
    """
    df = pd.read_parquet(filename)

    # Pick whichever timestamp pair actually exists in the data.
    for prefix in ("lpep", "tpep"):
        pickup_datetime = f"{prefix}_pickup_datetime"
        dropoff_datetime = f"{prefix}_dropoff_datetime"
        if pickup_datetime in df.columns and dropoff_datetime in df.columns:
            break
    else:
        raise ValueError("Invalid dataset")

    df[dropoff_datetime] = pd.to_datetime(df[dropoff_datetime])
    df[pickup_datetime] = pd.to_datetime(df[pickup_datetime])

    # create duration-column (in minutes)
    df["duration"] = df[dropoff_datetime] - df[pickup_datetime]
    df["duration"] = df["duration"].dt.total_seconds() / 60

    # remove outliers: keep trips of 1 to 60 minutes
    df = df[(df["duration"] >= 1) & (df["duration"] <= 60)].copy()

    # convert categorical features to strings (to get 1-hot encoding)
    categorical = ["PULocationID", "DOLocationID"]
    df[categorical] = df[categorical].astype(str)

    return df

# Larger alternative (yellow taxi, 2022):
# train_file = "data/yellow_tripdata_2022-01.parquet"
# val_file = "data/yellow_tripdata_2022-02.parquet"
train_file, val_file = (
    "data/green_tripdata_2021-01.parquet",
    "data/green_tripdata_2021-02.parquet",
)

# January is used for training, February for validation.
df_train = read_dataframe(train_file)
print("Train: ", df_train.shape)

df_val = read_dataframe(val_file)
print("Val: ", df_val.shape)

Train:  (73908, 21)
Val:  (61921, 21)


**Create new features**

In [5]:
# Combine pickup and dropoff location IDs into one "<pickup>_<dropoff>" feature.
for trips in (df_train, df_val):
    trips["PU_DO"] = trips["PULocationID"] + "_" + trips["DOLocationID"]

In [6]:
categorical = ["PU_DO"]
numerical = ["trip_distance"]
feature_columns = categorical + numerical

# One dict per trip, as expected by DictVectorizer.
train_dicts = df_train[feature_columns].to_dict(orient="records")
val_dicts = df_val[feature_columns].to_dict(orient="records")

# Fit the vectorizer on the training data only; the validation data is just transformed.
dv = DictVectorizer()
X_train = dv.fit_transform(train_dicts)
X_val = dv.transform(val_dicts)

In [7]:
# Regression target: the trip duration column created in read_dataframe.
target = "duration"
y_train, y_val = df_train[target].values, df_val[target].values

**Linear Regression**

In [8]:
# Baseline model: plain linear regression on the vectorized features.
lr = LinearRegression()
lr.fit(X_train, y_train)

y_pred = lr.predict(X_val)

# Validation RMSE, computed as sqrt(MSE): the `squared=False` argument is
# deprecated in scikit-learn 1.4 and removed in 1.6, while this form works
# on every version.
mean_squared_error(y_val, y_pred) ** 0.5

7.758715213828063

In [9]:
models_dir = "./models/"
# exist_ok avoids the check-then-create race and makes the cell safe to re-run.
os.makedirs(models_dir, exist_ok=True)

# Persist the vectorizer together with the model: both are needed to score new data.
with open(os.path.join(models_dir, "lin_reg.bin"), "wb") as f_out:
    pickle.dump((dv, lr), f_out)

## Experiment Tracking with MLflow

In [10]:
# Everything logged inside this block is attached to a single MLflow run
with mlflow.start_run():

    # Setting tags/metadata
    mlflow.set_tag("developer", "johannes")
    # log data about the dataset
    mlflow.log_param("train-data-path", train_file)
    mlflow.log_param("valid-data-path", val_file)

    alpha = 0.1
    # logging the alpha parameter
    mlflow.log_param("alpha", alpha)

    lr = Lasso(alpha=alpha)
    lr.fit(X_train, y_train)

    y_pred = lr.predict(X_val)
    # sqrt(MSE) instead of `squared=False`, which is deprecated in
    # scikit-learn 1.4 and removed in 1.6
    rmse = mean_squared_error(y_val, y_pred) ** 0.5
    # Log metric
    mlflow.log_metric("rmse", rmse)

In [13]:
import xgboost as xgb

from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from hyperopt.pyll import scope

# Wrap features and labels in XGBoost's own DMatrix container, as required by xgb.train
train = xgb.DMatrix(data=X_train, label=y_train)
valid = xgb.DMatrix(data=X_val, label=y_val)


def objective(params):
    """Objective function for XGBoost when used with hyperopt.

    Trains one booster on the module-level ``train`` DMatrix inside its own
    MLflow run, logging the hyperparameters and the validation RMSE.

    Parameters
    ----------
    params : dict
        Hyperparameters used in mlflow (logging) and xgboost (training).

    Returns
    -------
    dict
        ``{"loss": <validation RMSE>, "status": STATUS_OK}``, the result
        format consumed by hyperopt's ``fmin``.

    """
    with mlflow.start_run():
        mlflow.set_tag("model", "xgboost")
        mlflow.log_params(params)
        booster = xgb.train(
            params=params,
            dtrain=train,
            num_boost_round=1000,
            evals=[(valid, 'validation')],
            early_stopping_rounds=50
        )
        y_pred = booster.predict(valid)
        # sqrt(MSE) instead of `squared=False`, which is deprecated in
        # scikit-learn 1.4 and removed in 1.6
        rmse = mean_squared_error(y_val, y_pred) ** 0.5
        mlflow.log_metric("rmse", rmse)

    return { "loss": rmse, "status": STATUS_OK }


In [14]:
# Defining the search-space
search_space = {
    "max_depth": scope.int(hp.quniform("max_depth", 4, 100, 1)),  # [4, 100]
    "learning_rate": hp.loguniform("learning_rate", -3, 0),       # [exp(-3),  exp(0)] <~> [0.05, 1]
    "reg_alpha": hp.loguniform("reg_alpha", -5, -1),              # [exp(-5), exp(-1)] <~> [0.0067, 0.367]
    "reg_lambda": hp.loguniform("reg_lambda", -6, -1),            # [exp(-6), exp(-1)] <~> [0.0025, 0.367]
    "min_child_weight": hp.loguniform("min_child_weight", -1, 3), # [exp(-1),  exp(3)] <~> [0.367, 20.09]
    # "reg:linear" is the deprecated name of the squared-error objective
    "objective": "reg:squarederror",
    "seed": 42
}

# Minimise the validation RMSE returned by `objective` using the TPE algorithm.
best_result = fmin(
    fn=objective,
    space=search_space,
    algo=tpe.suggest,
    max_evals=50,
    trials=Trials()
)

[0]	validation-rmse:13.17691                          
[1]	validation-rmse:9.41081                           
[2]	validation-rmse:7.75659                           
[3]	validation-rmse:7.06956                           
[4]	validation-rmse:6.78344                           
[5]	validation-rmse:6.65586                           
[6]	validation-rmse:6.58549                           
[7]	validation-rmse:6.54835                           
[8]	validation-rmse:6.52564                           
[9]	validation-rmse:6.51204                           
[10]	validation-rmse:6.50337                          
[11]	validation-rmse:6.49726                          
[12]	validation-rmse:6.49180                          
[13]	validation-rmse:6.48893                          
[14]	validation-rmse:6.48622                          
[15]	validation-rmse:6.48155                          
[16]	validation-rmse:6.47715                          
[17]	validation-rmse:6.47372                          
[18]	valid

KeyboardInterrupt: 

### Train and save the model with the obtained parameters

In [15]:
# Hyperparameters taken from the hyperparameter search above
params = {
    "learning_rate": 0.2907320120988654,
    "max_depth": 21,
    "min_child_weight": 1.0853986931084512,
    # "reg:linear" is the deprecated name of the squared-error objective
    "objective": "reg:squarederror",
    "reg_alpha": 0.07070755159719935,
    "reg_lambda": 0.015498739698424788,
    "seed": 42,
}

# Using autolog for logging with just 1 line of code
mlflow.xgboost.autolog()

booster = xgb.train(
    params=params,
    dtrain=train,
    num_boost_round=1000,
    evals=[(valid, 'validation')],
    early_stopping_rounds=50
)

2023/05/31 14:09:17 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID 'acebcc059e764080bb27627eceae2131', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current xgboost workflow


[0]	validation-rmse:16.08374
[1]	validation-rmse:12.63889
[2]	validation-rmse:10.37800
[3]	validation-rmse:8.93036
[4]	validation-rmse:8.03154
[5]	validation-rmse:7.47945
[6]	validation-rmse:7.14374
[7]	validation-rmse:6.93426
[8]	validation-rmse:6.80258
[9]	validation-rmse:6.71586
[10]	validation-rmse:6.65976
[11]	validation-rmse:6.62235
[12]	validation-rmse:6.59675
[13]	validation-rmse:6.57607
[14]	validation-rmse:6.56670
[15]	validation-rmse:6.55887
[16]	validation-rmse:6.55261
[17]	validation-rmse:6.54728
[18]	validation-rmse:6.54017
[19]	validation-rmse:6.53673
[20]	validation-rmse:6.53140
[21]	validation-rmse:6.52605
[22]	validation-rmse:6.52327
[23]	validation-rmse:6.51978
[24]	validation-rmse:6.51768
[25]	validation-rmse:6.51525
[26]	validation-rmse:6.51318
[27]	validation-rmse:6.51112
[28]	validation-rmse:6.50780
[29]	validation-rmse:6.50350
[30]	validation-rmse:6.50072
[31]	validation-rmse:6.49888
[32]	validation-rmse:6.49650
[33]	validation-rmse:6.49435
[34]	validation-rmse:

