In [1]:
!python -V

Python 3.10.14


In [2]:
import mlflow
import mlflow.sklearn

# Set the tracking URI to an SQLite database file
mlflow.set_tracking_uri("sqlite:///mlflow.db")

# Set the experiment name
mlflow.set_experiment("mlops-zoomcamlp")


<Experiment: artifact_location='/media/nhutpham/Work1/MLOpsZoomcap/mlops-zoomcamp/02-experiment-tracking/mlruns/1', creation_time=1725592297969, experiment_id='1', last_update_time=1725592297969, lifecycle_stage='active', name='mlops-zoomcamlp', tags={}>

In [3]:
import pandas as pd

In [4]:
import pickle

In [5]:
import matplotlib.pyplot as plt

In [6]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge

from sklearn.metrics import root_mean_squared_error

In [13]:
def read_dataframe(filename):
    if filename.endswith('.csv'):
        df = pd.read_csv(filename)

        df.lpep_dropoff_datetime = pd.to_datetime(df.lpep_dropoff_datetime)
        df.lpep_pickup_datetime = pd.to_datetime(df.lpep_pickup_datetime)
    elif filename.endswith('.parquet'):
        df = pd.read_parquet(filename)

    df['duration'] = df.lpep_dropoff_datetime - df.lpep_pickup_datetime
    df.duration = df.duration.apply(lambda td: td.total_seconds() / 60)

    df = df[(df.duration >= 1) & (df.duration <= 60)]

    categorical = ['PULocationID', 'DOLocationID']
    df[categorical] = df[categorical].astype(str)
    
    return df

In [14]:
df_train = read_dataframe('https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2021-01.parquet')
df_val = read_dataframe('https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2021-02.parquet')

In [15]:
len(df_train), len(df_val)

(73908, 61921)

In [16]:
duration_std = df_train['duration'].std() 

In [12]:
duration_std

np.float64(8.86526930029462)

In [13]:
# # Total number of records before removing outliers
# initial_count = len(df_train)

# # Remove outliers by keeping only rows where 'duration' is between 1 and 60 minutes
# df_train_filtered = df_train[(df_train['duration'] >= 1) & (df_train['duration'] <= 60)]

# # Total number of records after removing outliers
# final_count = len(df_train_filtered)

# # Calculate the fraction of records left
# fraction_left = final_count / initial_count

# print(f"Fraction of records left: {fraction_left:.2f}")

In [14]:
# # Cast the IDs to strings
# df_train['PULocationID'] = df_train['PULocationID'].astype(str)
# df_train['DOLocationID'] = df_train['DOLocationID'].astype(str)

# # Turn the DataFrame into a list of dictionaries using only 'PULocationID' and 'DOLocationID'
# data_dicts = df_train[['PULocationID', 'DOLocationID']].to_dict(orient='records')

# # Fit a DictVectorizer
# vectorizer = DictVectorizer(sparse=False)
# feature_matrix = vectorizer.fit_transform(data_dicts)

# # Get the dimensionality of the matrix
# num_columns = feature_matrix.shape[1]

# print(f"The dimensionality of the matrix is: {num_columns} columns")

In [15]:
df_train.keys()

Index(['VendorID', 'tpep_pickup_datetime', 'tpep_dropoff_datetime',
       'passenger_count', 'trip_distance', 'RatecodeID', 'store_and_fwd_flag',
       'PULocationID', 'DOLocationID', 'payment_type', 'fare_amount', 'extra',
       'mta_tax', 'tip_amount', 'tolls_amount', 'improvement_surcharge',
       'total_amount', 'congestion_surcharge', 'airport_fee', 'duration'],
      dtype='object')

In [17]:
df_train['PU_DO'] = df_train['PULocationID'] + '_' + df_train['DOLocationID']
df_val['PU_DO'] = df_val['PULocationID'] + '_' + df_val['DOLocationID']

In [18]:
categorical = ['PU_DO'] #'PULocationID', 'DOLocationID']
numerical = ['trip_distance']

dv = DictVectorizer()

train_dicts = df_train[categorical + numerical].to_dict(orient='records')
X_train = dv.fit_transform(train_dicts)

val_dicts = df_val[categorical + numerical].to_dict(orient='records')
X_val = dv.transform(val_dicts)

In [19]:
df_train[categorical + numerical].head(1)

Unnamed: 0,PU_DO,trip_distance
0,43_151,1.01


In [20]:
target = 'duration'
y_train = df_train[target].values
y_val = df_val[target].values

In [66]:
import xgboost as xgb
from hyperopt import fmin, tpe, hp, Trials, STATUS_OK
from mlflow.models.signature import infer_signature
import mlflow.xgboost
from hyperopt.pyll.base import scope
def objective(params):
    with mlflow.start_run(nested=True): 
        # Log hyperparameters
        mlflow.log_params(params)
        
        # Convert data to DMatrix (XGBoost data structure)
        dtrain = xgb.DMatrix(X_train, label=y_train)
        dval = xgb.DMatrix(X_val, label=y_val)
        
        # Train model
        model = xgb.train(params, 
                          dtrain, 
                          evals=[(dval, 'validation')], 
                          early_stopping_rounds=10,
                          num_boost_round=1000, 
                          verbose_eval=False)

        # Predict on validation set
        y_pred = model.predict(dval)

        # Calculate RMSE and log the metric
        rmse = root_mean_squared_error(y_val, y_pred)
        mlflow.log_metric("rmse", rmse)
        
        # Log model signature and input example
        signature = infer_signature(X_val, y_pred)
        input_example = X_val[0]
        mlflow.xgboost.log_model(model, "xgboost_model", signature=signature, input_example=input_example)

        return {'loss': rmse, 'status': STATUS_OK}

In [62]:
import numpy as np 
# Define search space for Hyperopt
search_space = {
    'objective': 'reg:squarederror',
    'max_depth': scope.int(hp.quniform('max_depth', 3, 10, 1)),
    'learning_rate': hp.uniform('learning_rate', 0.01, 0.3),
    'n_estimators': scope.int(hp.quniform('n_estimators', 100, 1000, 1)),
    'subsample': hp.uniform('subsample', 0.5, 1.0),
    'colsample_bytree': hp.uniform('colsample_bytree', 0.5, 1.0),
    'reg_alpha': hp.loguniform('reg_alpha', np.log(0.01), np.log(10)),
    'reg_lambda': hp.loguniform('reg_lambda', np.log(0.01), np.log(10)),
}

In [67]:
train_data_url = 'https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2021-01.parquet'
val_data_url = 'https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2021-02.parquet'
with mlflow.start_run(): 
    # Log the developer tag
    mlflow.set_tag("model", "xgboost")
    
    # Log data source URLs as tags
    mlflow.set_tag("train_data_url", train_data_url)
    mlflow.set_tag("val_data_url", val_data_url)

    # Read and log datasets
    mlflow.log_param("train_data_shape", df_train.shape)
    mlflow.log_param("validation_data_shape", df_val.shape)


    # Sample the first few rows of the datasets and log them as artifacts
    df_train.head(5).to_csv("train_sample.csv", index=False)
    df_val.head(5).to_csv("val_sample.csv", index=False)
    mlflow.log_artifact("train_sample.csv")
    mlflow.log_artifact("val_sample.csv")
    
    # Tune the XGBoost model using Hyperopt
    trials = Trials()
    best_params = fmin(fn=objective, space=search_space, algo=tpe.suggest, max_evals=50, trials=trials)

    # Log the best hyperparameters
    mlflow.log_params(best_params)

    # Optionally log the full dataset or additional visualizations

  0%|          | 0/50 [00:00<?, ?trial/s, best loss=?]

Parameters: { "n_estimators" } are not used.




  0%|          | 0/50 [06:02<?, ?trial/s, best loss=?]


KeyboardInterrupt: 

In [72]:
best_params = {
    "colsample_bytree": 0.9447724401254488,
    "learning_rate": 0.2576928125052712,
    "max_depth": 10,
    "reg_alpha": 0.01735745977982242,
    "reg_lambda": 0.04766124633013455,
    "subsample": 0.7076825876246899
}
with mlflow.start_run(nested=True):
    # Log hyperparameters
    mlflow.log_params(best_params)
    
    # Convert data to DMatrix (XGBoost data structure)
    dtrain = xgb.DMatrix(X_train, label=y_train)
    dval = xgb.DMatrix(X_val, label=y_val)
    
    # Train model with early stopping
    model = xgb.train(best_params, 
                      dtrain, 
                      evals=[(dval, 'validation')], 
                      early_stopping_rounds=10,
                      num_boost_round=1000, 
                      verbose_eval=False)
    
    # Predict on validation set
    y_pred = model.predict(dval)

    # Calculate RMSE and log the metric
    rmse = root_mean_squared_error(y_val, y_pred)
    mlflow.log_metric("rmse", rmse)
    
    # Log model signature and input example
    signature = infer_signature(X_val, y_pred)
    
    # Provide an input example (1 sample) for model logging
    input_example = X_val[0:1]  # Adjust if needed for correct format
    mlflow.xgboost.log_model(model, "xgboost_model", signature=signature, input_example=input_example)




KeyboardInterrupt: 

In [22]:
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
# Start MLflow run
models = {
    "LinearRegression": LinearRegression(),
    "RandomForest": RandomForestRegressor(),
    "GradientBoosting": GradientBoostingRegressor(),
}
for model_name, model in models.items():
    with mlflow.start_run(run_name=model_name):
        # Enable autologging
        mlflow.sklearn.autolog()

        # Train the model
        model.fit(X_train, y_train)

        # Predict on validation set
        y_pred = model.predict(X_val)

        # Calculate and log RMSE
        rmse = root_mean_squared_error(y_val, y_pred)
        mlflow.log_metric("rmse", rmse)



        print(f"Logged {model_name} with RMSE: {rmse}")



Logged LinearRegression with RMSE: 7.758715203341164




Logged RandomForest with RMSE: 6.913010836412057




Logged GradientBoosting with RMSE: 6.742303328497425


In [None]:
import os
import pickle

# Create the 'models' directory if it doesn't exist
if not os.path.exists('models'):
    os.makedirs('models')

# Save the model to a file
with open('models/lin_reg.bin', 'wb') as f_out:
    pickle.dump((dv, lr), f_out)


In [56]:
import pickle

# Load the model from the file
with open('models/lin_reg.bin', 'rb') as f_in:
    dv, lr = pickle.load(f_in)


In [58]:
y_pred = lr.predict(X_val[0])

# root_mean_squared_error(y_val, y_pred)

In [59]:
y_pred

array([13.99325864])