In [24]:
## Run if it's necessary to install
# !pip install pyarrow

## Import packages

In [25]:
import pickle
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

In [26]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge

from sklearn.metrics import mean_squared_error

In [27]:
import mlflow

# Store run metadata in a local SQLite file and group all runs of this
# notebook under a single named experiment.
TRACKING_URI = "sqlite:///mlflow.db"
EXPERIMENT_NAME = "nyc-taxi-experiment"

mlflow.set_tracking_uri(TRACKING_URI)
mlflow.set_experiment(EXPERIMENT_NAME)

<Experiment: artifact_location='/home/Ivan/MLOps_ZoomCamp/2-Experiment-tracking/notebooks/mlruns/1', creation_time=1687303853945, experiment_id='1', last_update_time=1687303853945, lifecycle_stage='active', name='nyc-taxi-experiment', tags={}>

## Develop a function to load the data

In [28]:
def read_dataframe(filename):
    """Load a trip-record parquet file and prepare it for modelling.

    Reads the file, derives the trip ``duration`` in minutes from the dropoff
    and pickup timestamps, keeps only trips lasting between 1 and 60 minutes
    (both inclusive), and casts the pickup/dropoff location IDs to strings.

    Parameters
    ----------
    filename : str or path-like
        Location of the parquet file; passed straight to ``pd.read_parquet``.

    Returns
    -------
    pandas.DataFrame
        The filtered trips with an added ``duration`` column and string-typed
        ``PULocationID`` / ``DOLocationID`` columns. The frame is an
        independent copy, so callers can add columns to it safely.
    """
    df = pd.read_parquet(filename)

    # Vectorised timedelta -> minutes; avoids one Python-level call per row.
    trip_time = df["tpep_dropoff_datetime"] - df["tpep_pickup_datetime"]
    df["duration"] = trip_time.dt.total_seconds() / 60

    # .copy() detaches the filtered rows from the original frame so the column
    # assignments below (and in later cells) neither raise
    # SettingWithCopyWarning nor write to a temporary view.
    df = df[(df.duration >= 1) & (df.duration <= 60)].copy()

    categorical = ["PULocationID", "DOLocationID"]
    df[categorical] = df[categorical].astype(str)

    return df

### loading the data

In [None]:
# Files live in MLOps_ZoomCamp/data/; the paths are relative to this notebook.
# The 2022-01 file is used for training and the 2022-02 file for validation.
train_path = "../data/yellow_tripdata_2022-01.parquet"
val_path = "../data/yellow_tripdata_2022-02.parquet"

df_train = read_dataframe(train_path)
df_val = read_dataframe(val_path)

In [None]:
# Number of rows in the training and validation frames after filtering.
tuple(map(len, (df_train, df_val)))

In [None]:
# Combine the pickup and dropoff location IDs into one "<PU>_<DO>" feature,
# applied identically to both frames.
for trips in (df_train, df_val):
    trips["PU_DO"] = trips["PULocationID"] + "_" + trips["DOLocationID"]

## Split the data into training and validation data

In [None]:
categorical = ["PU_DO"]  # alternative: ["PULocationID", "DOLocationID"]
numerical = ["trip_distance"]
feature_columns = categorical + numerical

# One dict per trip, restricted to the selected feature columns.
train_dicts = df_train[feature_columns].to_dict(orient="records")
val_dicts = df_val[feature_columns].to_dict(orient="records")

# Fit the vectorizer on the training records only, then reuse the fitted
# vectorizer to transform the validation records.
dv = DictVectorizer()
X_train = dv.fit_transform(train_dicts)
X_val = dv.transform(val_dicts)

### Adjust target Variable


In [None]:
# The regression target is the trip duration column, extracted as plain
# arrays for both frames.
target = "duration"
y_train, y_val = (trips[target].values for trips in (df_train, df_val))

## Fitting some models

### Fit a linear model

In [None]:
# Baseline model: plain linear regression on the vectorized features.
lr = LinearRegression()
lr.fit(X_train, y_train)

y_pred = lr.predict(X_val)

# Validation RMSE. The square root is taken explicitly instead of passing
# `squared=False`, which is deprecated and rejected by newer scikit-learn
# releases; this form works on old and new versions alike.
np.sqrt(mean_squared_error(y_val, y_pred))

### Save the best model

In [None]:
# Persist the fitted vectorizer together with the linear model as one pickle.
# The target directory is created first so the cell also runs on a fresh
# checkout where "models/" does not exist yet.
model_path = Path("models/lin_reg.bin")
model_path.parent.mkdir(parents=True, exist_ok=True)

with open(model_path, "wb") as f_out:
    pickle.dump((dv, lr), f_out)

### See the actual Y and predicted Y

In [None]:
# sns.distplot(y_pred, label = "prediction")
# sns.distplot(y_train, label = "actual")

# plt.legend();

### Fit a lasso model

In [None]:
# la = Lasso(alpha=0.01)
# la.fit(X_train,y_train)

# y_pred = la.predict(X_val)

# mean_squared_error(y_val,y_pred, squared=False)

### Fit a ridge model - tracking with mlflow

In [None]:
# Train a Ridge model inside an MLflow run, recording the data paths, the
# regularisation strength and the validation RMSE.
with mlflow.start_run():
    mlflow.set_tag("developer", "Ivan")

    mlflow.log_param("train-data-path", "../data/yellow_tripdata_2022-01.parquet")
    mlflow.log_param("valid-data-path", "../data/yellow_tripdata_2022-02.parquet")

    alpha = 0.1
    mlflow.log_param("alpha", alpha)

    lg = Ridge(alpha)
    lg.fit(X_train, y_train)

    y_pred = lg.predict(X_val)

    # Explicit square root instead of `squared=False`, which is deprecated and
    # rejected by newer scikit-learn releases.
    rmse = np.sqrt(mean_squared_error(y_val, y_pred))
    mlflow.log_metric("rmse", rmse)

    # NOTE(review): this attaches the pickle written by the linear-regression
    # cell (dv, lr), not the Ridge model fitted in this run - confirm that is
    # intended.
    mlflow.log_artifact(local_path="models/lin_reg.bin", artifact_path="models_pickle")

# Improving the model by applying an XGBoost model

## Hyperparameter Optimization Tracking:

In [None]:
%%time
import xgboost as xgb

import xgboost as xgb

from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from hyperopt.pyll import scope

In [None]:
# Wrap the feature matrices and targets in XGBoost's DMatrix container;
# `valid` is the evaluation set passed to xgb.train in the cells below.
train = xgb.DMatrix(data=X_train, label=y_train)
valid = xgb.DMatrix(data=X_val, label=y_val)

In [26]:
def objective(params):
    """Train one XGBoost model for a hyperopt trial and log it to MLflow.

    Each call opens its own MLflow run, tags it as an xgboost model, logs the
    sampled hyperparameters, trains on the module-level ``train`` DMatrix with
    early stopping against ``valid``, and logs the validation RMSE.

    Parameters
    ----------
    params : dict
        XGBoost training parameters sampled from the search space.

    Returns
    -------
    dict
        ``{'loss': rmse, 'status': STATUS_OK}`` in the format hyperopt's
        ``fmin`` expects; ``loss`` is the validation RMSE.
    """
    with mlflow.start_run():
        mlflow.set_tag("model", "xgboost")
        mlflow.log_params(params)
        booster = xgb.train(
            params=params,
            dtrain=train,
            num_boost_round=1000,
            evals=[(valid, 'validation')],
            early_stopping_rounds=50
        )
        y_pred = booster.predict(valid)
        # Explicit square root instead of `squared=False`, which is deprecated
        # and rejected by newer scikit-learn releases.
        rmse = np.sqrt(mean_squared_error(y_val, y_pred))
        mlflow.log_metric("rmse", rmse)

    return {'loss': rmse, 'status': STATUS_OK}



In [27]:
# Hyperparameter search space sampled by hyperopt.
# hp.loguniform bounds are exponents: values are drawn from
# exp(low) .. exp(high), e.g. learning_rate lies in [exp(-3), exp(0)].
search_space = {
    'max_depth': scope.int(hp.quniform('max_depth', 4, 100, 1)),
    'learning_rate': hp.loguniform('learning_rate', -3, 0),
    'reg_alpha': hp.loguniform('reg_alpha', -5, -1),
    'reg_lambda': hp.loguniform('reg_lambda', -6, -1),
    'min_child_weight': hp.loguniform('min_child_weight', -1, 3),
    # 'reg:linear' is a deprecated alias; 'reg:squarederror' is the current
    # name for the same squared-error regression objective.
    'objective': 'reg:squarederror',
    'seed': 42
}

In [None]:
# Run 50 TPE-guided evaluations of `objective`; every evaluation opens its
# own MLflow run inside `objective`.
trials = Trials()
best_result = fmin(
    fn=objective,
    space=search_space,
    algo=tpe.suggest,
    max_evals=50,
    trials=trials,
)

## Autologging:

Instead of logging the parameters by hand, specifying each logged parameter and passing it explicitly, we can use the autologging feature in MLflow. There are two ways to enable autologging: first, by enabling it globally in the code/notebook using

In [18]:
# Enable MLflow autologging globally (all supported libraries at once) rather
# than through a single framework-specific autologger.
mlflow.autolog()

2023/06/21 00:47:18 INFO mlflow.tracking.fluent: Autologging successfully enabled for xgboost.
2023/06/21 00:47:19 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.
2023/06/21 00:47:20 INFO mlflow.tracking.fluent: Autologging successfully enabled for statsmodels.


or by enabling the framework-specific autologger; ex with XGBoost:

In [None]:
# Fixed hyperparameters used to demonstrate autologging on a short training run.
params = {
    "learning_rate": 0.09726255724350065,
    "max_depth": 51,
    "min_child_weight": 19.788154446519542,
    # 'reg:linear' is a deprecated alias; 'reg:squarederror' is the current
    # name for the same squared-error regression objective.
    "objective": "reg:squarederror",
    "reg_alpha": 0.0802169697770356,
    "reg_lambda": 0.003109431432408728,
    "seed": 42
}

# mlflow.xgboost.autolog()

# NOTE: with only 10 boosting rounds, the 50-round early-stopping patience
# can never trigger; it is kept to mirror the full training configuration.
booster = xgb.train(
    params=params,
    dtrain=train,
    num_boost_round=10,
    evals=[(valid, 'validation')],
    early_stopping_rounds=50,
)

2023/06/21 00:47:20 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '84c726ee884342eab3fff8605afffbd7', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current xgboost workflow


Both must be done before running the experiments.

The autologger not only stores the model parameters for ease of use; it also stores additional files in the model folder (whose name can be specified) inside our experiment's artifact folder. These files include:

- `conda.yaml` and `requirements.txt`: files which define the current environment for use with either conda or pip, respectively
- `MLmodel`: an internal MLflow file for organization
- Other framework-specific files, such as the model itself

## Saving Models:


In [None]:
# Train an XGBoost model with a fixed set of hyperparameters and store the
# parameters, the validation RMSE and the model itself in one MLflow run.
with mlflow.start_run():

    best_params = {
        "learning_rate": 0.09726255724350065,
        "max_depth": 51,
        "min_child_weight": 19.788154446519542,
        # 'reg:linear' is a deprecated alias; 'reg:squarederror' is the
        # current name for the same squared-error regression objective.
        "objective": "reg:squarederror",
        "reg_alpha": 0.0802169697770356,
        "reg_lambda": 0.003109431432408728,
        "seed": 42
    }

    mlflow.log_params(best_params)

    # mlflow.xgboost.autolog()

    # Train with the parameters that were just logged. Previously this used
    # `params` left over from an earlier cell, so the logged and the actually
    # used hyperparameters could silently diverge (and the cell failed on a
    # kernel where that earlier cell had not been run).
    booster = xgb.train(
        params=best_params,
        dtrain=train,
        num_boost_round=10,
        evals=[(valid, 'validation')],
        early_stopping_rounds=50,
    )

    y_pred = booster.predict(valid)
    # Explicit square root instead of `squared=False`, which is deprecated and
    # rejected by newer scikit-learn releases.
    rmse = np.sqrt(mean_squared_error(y_val, y_pred))
    mlflow.log_metric("rmse", rmse)

    mlflow.xgboost.log_model(booster, artifact_path="models_mlflow")