In [2]:
!python -V

Python 3.10.14


In [1]:
import mlflow
import mlflow.sklearn

# Tracking store: a SQLite database file addressed relative to the working directory.
MLFLOW_TRACKING_URI = "sqlite:///mlflow.db"
# Experiment that runs started later in this notebook are grouped under.
MLFLOW_EXPERIMENT_NAME = "mlops-zoomcamlp"

mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
mlflow.set_experiment(MLFLOW_EXPERIMENT_NAME)


2024/09/06 10:11:26 INFO mlflow.store.db.utils: Creating initial MLflow database tables...
2024/09/06 10:11:27 INFO mlflow.store.db.utils: Updating database tables
INFO  [alembic.runtime.migration] Context impl SQLiteImpl.
INFO  [alembic.runtime.migration] Will assume non-transactional DDL.
INFO  [alembic.runtime.migration] Running upgrade  -> 451aebb31d03, add metric step
INFO  [alembic.runtime.migration] Running upgrade 451aebb31d03 -> 90e64c465722, migrate user column to tags
INFO  [alembic.runtime.migration] Running upgrade 90e64c465722 -> 181f10493468, allow nulls for metric values
INFO  [alembic.runtime.migration] Running upgrade 181f10493468 -> df50e92ffc5e, Add Experiment Tags Table
INFO  [alembic.runtime.migration] Running upgrade df50e92ffc5e -> 7ac759974ad8, Update run tags with larger limit
INFO  [alembic.runtime.migration] Running upgrade 7ac759974ad8 -> 89d4b8295536, create latest metrics table
INFO  [89d4b8295536_create_latest_metrics_table_py] Migration complete!
INFO  

<Experiment: artifact_location='/media/nhutpham/Work1/MLOpsZoomcap/mlops-zoomcamp/02-experiment-tracking/mlruns/1', creation_time=1725592297969, experiment_id='1', last_update_time=1725592297969, lifecycle_stage='active', name='mlops-zoomcamlp', tags={}>

In [3]:
import pandas as pd

In [4]:
import pickle

In [6]:
import matplotlib.pyplot as plt

In [7]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge

from sklearn.metrics import root_mean_squared_error

In [8]:
def read_dataframe(filename):
    """Load a taxi-trip file and prepare it for duration modelling.

    Parameters
    ----------
    filename : str
        Path or URL ending in ``.csv`` or ``.parquet``. The file must contain
        a pickup/dropoff timestamp pair named either ``tpep_*_datetime`` or
        ``lpep_*_datetime``, plus ``PULocationID`` and ``DOLocationID``.

    Returns
    -------
    pandas.DataFrame
        The trips whose duration lies between 1 and 60 minutes (inclusive),
        with an added ``duration`` column in minutes and the two location ID
        columns cast to strings.

    Raises
    ------
    ValueError
        If ``filename`` has an unsupported extension.
    KeyError
        If neither timestamp column pair is present.
    """
    if filename.endswith('.csv'):
        df = pd.read_csv(filename)
    elif filename.endswith('.parquet'):
        df = pd.read_parquet(filename)
    else:
        # Previously this fell through and failed later with an unbound `df`.
        raise ValueError(f"Unsupported file type (expected .csv or .parquet): {filename!r}")

    # Accept whichever timestamp prefix the file uses, so the CSV and parquet
    # paths no longer disagree about the column names.
    prefix = next(
        (
            p for p in ('tpep', 'lpep')
            if f'{p}_pickup_datetime' in df.columns
            and f'{p}_dropoff_datetime' in df.columns
        ),
        None,
    )
    if prefix is None:
        raise KeyError(
            "Expected 'tpep_*' or 'lpep_*' pickup/dropoff datetime columns, "
            f"found: {list(df.columns)}"
        )
    pickup_col = f'{prefix}_pickup_datetime'
    dropoff_col = f'{prefix}_dropoff_datetime'

    # CSV timestamps arrive as strings; this is a no-op for datetime columns.
    df[pickup_col] = pd.to_datetime(df[pickup_col])
    df[dropoff_col] = pd.to_datetime(df[dropoff_col])

    # Vectorised conversion of the trip length to minutes.
    df['duration'] = (df[dropoff_col] - df[pickup_col]).dt.total_seconds() / 60

    # Copy the filtered rows so the assignment below writes to an independent
    # frame rather than a slice of `df` (avoids SettingWithCopyWarning).
    df = df[(df.duration >= 1) & (df.duration <= 60)].copy()

    categorical = ['PULocationID', 'DOLocationID']
    df[categorical] = df[categorical].astype(str)

    return df

In [11]:
# Yellow-taxi trip files: 2021-01 is used for training, 2021-02 for validation.
train_data_url = 'https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2021-01.parquet'
val_data_url = 'https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2021-02.parquet'

df_train = read_dataframe(train_data_url)
df_val = read_dataframe(val_data_url)

In [12]:
# Row counts of the training and validation frames after filtering.
df_train.shape[0], df_val.shape[0]

(1343254, 1340859)

In [13]:
# Spread of the filtered trip durations (minutes) in the training frame.
duration_std = df_train.duration.std()

In [14]:
duration_std

np.float64(8.86526930029462)

In [11]:
# # Total number of records before removing outliers
# initial_count = len(df_train)

# # Remove outliers by keeping only rows where 'duration' is between 1 and 60 minutes
# df_train_filtered = df_train[(df_train['duration'] >= 1) & (df_train['duration'] <= 60)]

# # Total number of records after removing outliers
# final_count = len(df_train_filtered)

# # Calculate the fraction of records left
# fraction_left = final_count / initial_count

# print(f"Fraction of records left: {fraction_left:.2f}")

In [12]:
# # Cast the IDs to strings
# df_train['PULocationID'] = df_train['PULocationID'].astype(str)
# df_train['DOLocationID'] = df_train['DOLocationID'].astype(str)

# # Turn the DataFrame into a list of dictionaries using only 'PULocationID' and 'DOLocationID'
# data_dicts = df_train[['PULocationID', 'DOLocationID']].to_dict(orient='records')

# # Fit a DictVectorizer
# vectorizer = DictVectorizer(sparse=False)
# feature_matrix = vectorizer.fit_transform(data_dicts)

# # Get the dimensionality of the matrix
# num_columns = feature_matrix.shape[1]

# print(f"The dimensionality of the matrix is: {num_columns} columns")

In [13]:
# Show which columns are available on the training frame.
df_train.columns

Index(['VendorID', 'tpep_pickup_datetime', 'tpep_dropoff_datetime',
       'passenger_count', 'trip_distance', 'RatecodeID', 'store_and_fwd_flag',
       'PULocationID', 'DOLocationID', 'payment_type', 'fare_amount', 'extra',
       'mta_tax', 'tip_amount', 'tolls_amount', 'improvement_surcharge',
       'total_amount', 'congestion_surcharge', 'airport_fee', 'duration'],
      dtype='object')

In [15]:
# Combine pickup and dropoff zone IDs into one route feature on both frames.
for frame in (df_train, df_val):
    frame['PU_DO'] = frame['PULocationID'] + '_' + frame['DOLocationID']

In [16]:
# The combined route feature is used in place of the separate
# PULocationID / DOLocationID columns.
categorical = ['PU_DO']
numerical = ['trip_distance']
feature_columns = categorical + numerical

# One dict per trip, the input format DictVectorizer expects.
train_dicts = df_train[feature_columns].to_dict(orient='records')
val_dicts = df_val[feature_columns].to_dict(orient='records')

# Fit the vocabulary on the training data only, then reuse it for validation.
dv = DictVectorizer()
X_train = dv.fit_transform(train_dicts)
X_val = dv.transform(val_dicts)

In [17]:
# Regression target: the trip duration column, as plain arrays.
target = 'duration'
y_train, y_val = df_train[target].values, df_val[target].values

In [23]:
import numpy as np

# Source files of the frames loaded earlier in the notebook; they are only
# recorded as run tags here, not fetched again.
train_data_url = 'https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2021-01.parquet'
val_data_url = 'https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2021-02.parquet'

# Train a Ridge model on the feature matrices built above and record the
# parameters, data samples, predictions, metric and model in one MLflow run.
with mlflow.start_run():
    # Log the developer tag
    mlflow.set_tag("developer", "nhutpham")

    # Log parameters
    alpha = 0.01
    mlflow.log_param("alpha", alpha)

    # Log data source URLs as tags
    mlflow.set_tag("train_data_url", train_data_url)
    mlflow.set_tag("val_data_url", val_data_url)

    # Describe the frames that X_train / X_val were actually built from.
    # Re-reading the files here would download both months a second time and
    # rebind df_train / df_val to frames without the derived PU_DO column,
    # which breaks any re-run of the feature-building cells.
    mlflow.log_param("train_data_shape", df_train.shape)
    mlflow.log_param("validation_data_shape", df_val.shape)

    # Sample the first few rows of the datasets and log them as artifacts
    df_train.head(5).to_csv("train_sample.csv", index=False)
    df_val.head(5).to_csv("val_sample.csv", index=False)
    mlflow.log_artifact("train_sample.csv")
    mlflow.log_artifact("val_sample.csv")

    # Model training
    lr = Ridge(alpha=alpha)
    lr.fit(X_train, y_train)

    # Predict and log the predictions
    y_pred = lr.predict(X_val)
    np.savetxt("predictions.csv", y_pred, delimiter=",")
    mlflow.log_artifact("predictions.csv")

    # Log performance metrics
    rmse = root_mean_squared_error(y_val, y_pred)
    mlflow.log_metric("rmse", rmse)

    # Log model
    mlflow.sklearn.log_model(lr, "ridge_model")

    # Optionally log the full dataset or additional visualizations



In [19]:
import os
import pickle

# Create the 'models' directory if needed. exist_ok=True makes this a single
# call with no gap between checking for the directory and creating it.
os.makedirs('models', exist_ok=True)

# Store the fitted DictVectorizer together with the model so that both can be
# restored from the same file.
with open('models/lin_reg.bin', 'wb') as f_out:
    pickle.dump((dv, lr), f_out)


In [20]:
import pickle

# Restore the (vectorizer, model) pair written by the save cell.
# NOTE: pickle.load executes code embedded in the file; only load files that
# this notebook (or another trusted source) produced.
with open('models/lin_reg.bin', 'rb') as model_file:
    dv, lr = pickle.load(model_file)


In [21]:
# Score the validation set with the current model and report the RMSE as the
# cell's displayed value.
y_pred = lr.predict(X_val)
root_mean_squared_error(y_true=y_val, y_pred=y_pred)

np.float64(4.992299116111529)