In [16]:
import pandas as pd

In [36]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.metrics import root_mean_squared_error

In [18]:
import mlflow 
mlflow.set_tracking_uri("sqlite:///mlruns.db")
mlflow.set_experiment('nyc-taxi-experiment')

<Experiment: artifact_location='/Users/kpedido/code/learning_projects/mlops/2_experimental_tracking/mlruns/1', creation_time=1748290853483, experiment_id='1', last_update_time=1748290853483, lifecycle_stage='active', name='nyc-taxi-experiment', tags={}>

In [19]:
def read_dataframe(filename):
    df_original = pd.read_parquet(filename)

    df_original['duration'] = (df_original.tpep_dropoff_datetime - df_original.tpep_pickup_datetime)
    df_original['duration'] = df_original.duration.dt.total_seconds()/60

    df = df_original[(df_original.duration >= 1) & (df_original.duration <= 60) ]
    return df

In [20]:
train_df = read_dataframe('https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2021-01.parquet')
val_df = read_dataframe('https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2021-02.parquet')

In [21]:
train_df

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee,duration
0,1,2021-01-01 00:30:10,2021-01-01 00:36:12,1.0,2.10,1.0,N,142,43,2,8.00,3.00,0.5,0.00,0.00,0.3,11.80,2.5,,6.033333
2,1,2021-01-01 00:43:30,2021-01-01 01:11:06,1.0,14.70,1.0,N,132,165,1,42.00,0.50,0.5,8.65,0.00,0.3,51.95,0.0,,27.600000
3,1,2021-01-01 00:15:48,2021-01-01 00:31:01,0.0,10.60,1.0,N,138,132,1,29.00,0.50,0.5,6.05,0.00,0.3,36.35,0.0,,15.216667
4,2,2021-01-01 00:31:49,2021-01-01 00:48:21,1.0,4.94,1.0,N,68,33,1,16.50,0.50,0.5,4.06,0.00,0.3,24.36,2.5,,16.533333
5,1,2021-01-01 00:16:29,2021-01-01 00:24:30,1.0,1.60,1.0,N,224,68,1,8.00,3.00,0.5,2.35,0.00,0.3,14.15,2.5,,8.016667
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1369763,2,2021-01-31 23:04:00,2021-01-31 23:18:00,,7.74,,,159,259,0,22.15,0.00,0.5,0.00,0.00,0.3,22.95,,,14.000000
1369764,2,2021-01-31 23:03:00,2021-01-31 23:33:00,,8.89,,,229,181,0,27.78,0.00,0.5,7.46,0.00,0.3,38.54,,,30.000000
1369765,2,2021-01-31 23:29:00,2021-01-31 23:51:00,,7.43,,,41,70,0,32.58,0.00,0.5,0.00,6.12,0.3,39.50,,,22.000000
1369766,2,2021-01-31 23:25:00,2021-01-31 23:38:00,,6.26,,,74,137,0,16.85,0.00,0.5,3.90,0.00,0.3,24.05,,,13.000000


In [22]:
val_df

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee,duration
0,1,2021-02-01 00:40:47,2021-02-01 00:48:28,1.0,2.30,1.0,N,141,226,2,8.50,3.00,0.5,0.00,0.00,0.3,12.30,2.5,,7.683333
1,1,2021-02-01 00:07:44,2021-02-01 00:20:31,1.0,1.60,1.0,N,43,263,2,9.50,3.00,0.5,0.00,0.00,0.3,13.30,0.0,,12.783333
2,1,2021-02-01 00:59:36,2021-02-01 01:24:13,1.0,5.30,1.0,N,114,263,2,19.00,3.00,0.5,0.00,0.00,0.3,22.80,2.5,,24.616667
3,2,2021-02-01 00:03:26,2021-02-01 00:16:32,1.0,2.79,1.0,N,236,229,1,11.00,0.50,0.5,2.96,0.00,0.3,17.76,2.5,,13.100000
4,2,2021-02-01 00:20:20,2021-02-01 00:24:03,2.0,0.64,1.0,N,229,140,1,4.50,0.50,0.5,1.66,0.00,0.3,9.96,2.5,,3.716667
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1371703,2,2021-02-28 23:25:41,2021-02-28 23:42:35,,8.84,,,141,160,0,36.83,2.75,0.5,0.00,6.12,0.3,46.50,,,16.900000
1371705,2,2021-02-28 23:27:00,2021-02-28 23:41:00,,4.42,,,68,24,0,17.14,0.00,0.5,4.33,0.00,0.3,24.77,,,14.000000
1371706,2,2021-02-28 23:18:05,2021-02-28 23:26:48,,1.50,,,68,137,0,9.46,0.00,0.5,2.64,0.00,0.3,15.40,,,8.716667
1371707,2,2021-02-28 23:41:07,2021-03-01 00:13:44,,15.30,,,113,254,0,59.15,2.75,0.5,0.00,0.00,0.3,62.70,,,32.616667


In [23]:
dv = DictVectorizer()

In [24]:
categorical = ['PULocationID', 'DOLocationID']
train_df.loc[:, categorical] = train_df[categorical].astype(str)
val_df.loc[:, categorical] = val_df[categorical].astype(str)


  train_df.loc[:, categorical] = train_df[categorical].astype(str)
  train_df.loc[:, categorical] = train_df[categorical].astype(str)
  val_df.loc[:, categorical] = val_df[categorical].astype(str)
  val_df.loc[:, categorical] = val_df[categorical].astype(str)


In [25]:
train_df[categorical]

Unnamed: 0,PULocationID,DOLocationID
0,142,43
2,132,165
3,138,132
4,68,33
5,224,68
...,...,...
1369763,159,259
1369764,229,181
1369765,41,70
1369766,74,137


In [26]:
train_df_dicts = train_df[categorical].to_dict(orient='records')
val_df_dicts = val_df[categorical].to_dict(orient='records')


In [27]:
X_train = dv.fit_transform(train_df_dicts)
X_val = dv.fit_transform(val_df_dicts)


In [28]:
X_val

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 2681718 stored elements and shape (1340859, 518)>

In [29]:
target = 'duration'
y_train = train_df[target].values
y_val = val_df[target].values

In [30]:
lr = LinearRegression()
lr.fit(X_train, y_train)

In [31]:
y_pred = lr.predict(X_train)
y_pred

array([ 9.44083611, 36.29019304, 30.528434  , ..., 13.56182908,
        8.89576656, 26.79785254], shape=(1343254,))

In [32]:
root_mean_squared_error(y_train, y_pred)

6.845620258734961

In [33]:
y_val_pred = lr.predict(X_val)
y_val_pred

array([17.41505208,  9.51166177,  9.01375596, ..., 10.14819727,
       29.05628312, 12.89625376], shape=(1340859,))

In [34]:
root_mean_squared_error(y_val, y_val_pred)

7.737370478957588

In [37]:
with mlflow.start_run():

    mlflow.set_tag('developer', 'kpedido')

    mlflow.log_param('training-data-path', 'https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2021-01.parquet')
    mlflow.log_param('valid-data-path', 'https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2021-02.parquet')


    
    alpha = 0.01 
    mlflow.log_param('alpha', alpha)

    lr= Lasso(alpha)
    lr.fit(X_train, y_train)

    y_pred = lr.predict(X_val)

    rmse = root_mean_squared_error(y_val, y_pred)

    mlflow.log_metric('rmse', rmse)

In [40]:
df = pd.read_parquet('../data/yellow_tripdata_2023-01.parquet')

In [41]:
df.head()

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee
0,2,2023-01-01 00:32:10,2023-01-01 00:40:36,1.0,0.97,1.0,N,161,141,2,9.3,1.0,0.5,0.0,0.0,1.0,14.3,2.5,0.0
1,2,2023-01-01 00:55:08,2023-01-01 01:01:27,1.0,1.1,1.0,N,43,237,1,7.9,1.0,0.5,4.0,0.0,1.0,16.9,2.5,0.0
2,2,2023-01-01 00:25:04,2023-01-01 00:37:49,1.0,2.51,1.0,N,48,238,1,14.9,1.0,0.5,15.0,0.0,1.0,34.9,2.5,0.0
3,1,2023-01-01 00:03:48,2023-01-01 00:13:25,0.0,1.9,1.0,N,138,7,1,12.1,7.25,0.5,0.0,0.0,1.0,20.85,0.0,1.25
4,2,2023-01-01 00:10:29,2023-01-01 00:21:19,1.0,1.43,1.0,N,107,79,1,11.4,1.0,0.5,3.28,0.0,1.0,19.68,2.5,0.0


In [42]:
import xgboost as xgb 

from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from hyperopt.pyll import scope

In [46]:
train = xgb.DMatrix(X_train, label=y_train)
valid = xgb.DMatrix(X_val, label=y_val)

In [47]:
def objective(params):
    with mlflow.start_run():
        mlflow.set_tag("model", "xgboost")
        mlflow.log_params(params)

        booster = xgb.train(
            params=params,
            dtrain=train,
            num_boost_round=1000,
            evals=[(valid, "validation")],
            early_stopping_rounds=50
        )
        y_pred = booster.predict(valid)
        rmse = root_mean_squared_error(y_val, y_pred)
        mlflow.log_metric("rmse", rmse)

    return {'loss': rmse, 'status': STATUS_OK}

In [None]:
search_space = {
    'max_depth': scope.int(hp.quniform('max_depth', 4, 100, 1)),
    'learning_rate': hp.loguniform('learning_rate', -3, 0),
    'reg_alpha': hp.loguniform('reg_alpha', -5, -1),
    'reg_lambda': hp.loguniform('reg_lambda', -6, -1),
    'min_child_weight': hp.loguniform('min_child_weight', -1, 3),
    'objective': 'reg:linear',
    'seed': 42
}

best_result = fmin(
    fn=objective,
    space=search_space,
    algo=tpe.suggest,
    max_evals=50,
    trials=Trials()
)

In [49]:
params = {
    'learning_rate': 0.6550522874897998,
    'max_depth': 9,
    'min_child_weight': 11.828925955992666,
    'objective': 'reg:linear',
    'reg_alpha': 0.07066490658263495,
    'reg_lambda': 0.0025742386226546544,
    'seed': 42
}

mlflow.xgboost.autolog()

booster = xgb.train(
    params=params,
    dtrain=train,
    num_boost_round=1000,
    evals=[(valid, "validation")],
    early_stopping_rounds=50
)

2025/05/27 00:24:17 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '8f1d6883c2cc475e8928d9708a12935b', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current xgboost workflow


[0]	validation-rmse:8.83639
[1]	validation-rmse:8.63831
[2]	validation-rmse:8.52618
[3]	validation-rmse:8.40978


  self.starting_round = model.num_boosted_rounds()


[4]	validation-rmse:8.32388
[5]	validation-rmse:8.20999
[6]	validation-rmse:8.13036
[7]	validation-rmse:8.08000
[8]	validation-rmse:8.02366
[9]	validation-rmse:7.94534
[10]	validation-rmse:7.88295
[11]	validation-rmse:7.83271
[12]	validation-rmse:7.79073
[13]	validation-rmse:7.73679
[14]	validation-rmse:7.69474
[15]	validation-rmse:7.66687
[16]	validation-rmse:7.63036
[17]	validation-rmse:7.59969
[18]	validation-rmse:7.57757
[19]	validation-rmse:7.54034
[20]	validation-rmse:7.50038
[21]	validation-rmse:7.54781
[22]	validation-rmse:7.52858
[23]	validation-rmse:7.50885
[24]	validation-rmse:7.47057
[25]	validation-rmse:7.43300
[26]	validation-rmse:7.41902
[27]	validation-rmse:7.40324
[28]	validation-rmse:7.37875
[29]	validation-rmse:7.36291
[30]	validation-rmse:7.35454
[31]	validation-rmse:7.33977
[32]	validation-rmse:7.31394
[33]	validation-rmse:7.27978
[34]	validation-rmse:7.27215
[35]	validation-rmse:7.26343
[36]	validation-rmse:7.24669
[37]	validation-rmse:7.22456
[38]	validation-rmse

