In [1]:
!python -V

Python 3.10.18


In [2]:
import os
import numpy as np
import pandas as pd
import pickle
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor
from sklearn.svm import LinearSVR
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import (
    root_mean_squared_error,
    mean_squared_error, 
    mean_absolute_error, 
    r2_score, 
    mean_absolute_percentage_error,
    explained_variance_score,
    max_error,
    median_absolute_error
)
import math
from math import sqrt

In [3]:
import mlflow
mlflow.xgboost.autolog()

mlflow.set_tracking_uri("sqlite:///mlflow.db")
mlflow.set_experiment("nyc-taxi-experiment")

2025/07/13 03:58:30 INFO mlflow.store.db.utils: Creating initial MLflow database tables...
2025/07/13 03:58:30 INFO mlflow.store.db.utils: Updating database tables
INFO  [alembic.runtime.migration] Context impl SQLiteImpl.
INFO  [alembic.runtime.migration] Will assume non-transactional DDL.
INFO  [alembic.runtime.migration] Context impl SQLiteImpl.
INFO  [alembic.runtime.migration] Will assume non-transactional DDL.


<Experiment: artifact_location='/home/ubuntu/mlops-zoomcamp/02-experiment-tracking/mlruns/1', creation_time=1751630361732, experiment_id='1', last_update_time=1751630361732, lifecycle_stage='active', name='nyc-taxi-experiment', tags={}>

In [4]:
def read_dataframe(filename):
    df = pd.read_csv(filename)

    df.lpep_dropoff_datetime = pd.to_datetime(df.lpep_dropoff_datetime)
    df.lpep_pickup_datetime = pd.to_datetime(df.lpep_pickup_datetime)

    df['duration'] = df.lpep_dropoff_datetime - df.lpep_pickup_datetime
    df.duration = df.duration.apply(lambda td: td.total_seconds() / 60)

    df = df[(df.duration >= 1) & (df.duration <= 60)]

    categorical = ['PULocationID', 'DOLocationID']
    df[categorical] = df[categorical].astype(str)
    
    return df

In [5]:
df_train = read_dataframe('./data/green_tripdata_2021-01.csv')
df_val = read_dataframe('./data/green_tripdata_2021-02.csv')

  df = pd.read_csv(filename)


In [6]:
len(df_train), len(df_val)

(73908, 61921)

In [7]:
df_train['PU_DO'] = df_train['PULocationID'] + '_' + df_train['DOLocationID']
df_val['PU_DO'] = df_val['PULocationID'] + '_' + df_val['DOLocationID']

In [8]:
categorical = ['PU_DO'] #'PULocationID', 'DOLocationID']
numerical = ['trip_distance']

dv = DictVectorizer()

train_dicts = df_train[categorical + numerical].to_dict(orient='records')
X_train = dv.fit_transform(train_dicts)

val_dicts = df_val[categorical + numerical].to_dict(orient='records')
X_val = dv.transform(val_dicts)

In [9]:
target = 'duration'
y_train = df_train[target].values
y_val = df_val[target].values

In [10]:
lr = LinearRegression()
lr.fit(X_train, y_train)

y_pred = lr.predict(X_val)

In [11]:
root_mean_squared_error(y_val, y_pred)

7.7587152060111135

In [12]:
with open('models/lin_reg.bin', 'wb') as f_out:
    pickle.dump((dv, lr), f_out)

In [13]:
with mlflow.start_run():

    mlflow.set_tag("developer", "cristian")

    mlflow.log_param("train-data-path", "./data/green_tripdata_2021-01.csv")
    mlflow.log_param("valid-data-path", "./data/green_tripdata_2021-02.csv")

    alpha = 0.1
    mlflow.log_param("alpha", alpha)
    lr = Lasso(alpha)
    lr.fit(X_train, y_train)

    y_pred = lr.predict(X_val)
    rmse = root_mean_squared_error(y_val, y_pred)
    mlflow.log_metric("rmse", rmse)

    os.makedirs("models", exist_ok=True)
    # Anda perlu menyimpan model Lasso ke 'lin_reg.bin' di sini
    # Contoh: import joblib; joblib.dump(lr, "models/lin_reg.bin")
    
    mlflow.log_artifact(local_path="models/lin_reg.bin", artifact_path="models_pickle")

In [14]:
from datetime import datetime
import xgboost as xgb
import gc

In [15]:
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials, space_eval
from hyperopt.pyll import scope
import logging

In [16]:
train = xgb.DMatrix(X_train, label=y_train)
valid = xgb.DMatrix(X_val, label=y_val)

In [17]:
# Setup file untuk menyimpan hasil
results_file = "hyperopt_results.pkl"
checkpoint_file = "hyperopt_checkpoint.pkl"

def objective(params):
    # Cleanup sebelum mulai
    gc.collect()
    
    with mlflow.start_run():
        mlflow.set_tag("model", "xgboost")
        mlflow.log_params(params)
        
            
        booster = xgb.train(
            params=params,
            dtrain=train,  # Pastikan nama parameter benar
            num_boost_round=500,  # Kurangi dari 1000
            evals=[(valid, 'validation')],
            early_stopping_rounds=30,  # Kurangi dari 50
            verbose_eval=False,
            callbacks=None  # Hilangkan callback tambahan
        )
        
        y_pred = booster.predict(valid)
        rmse = root_mean_squared_error(y_val, y_pred)
        mlflow.log_metric("rmse", rmse)

        input_example = train.slice(np.array([0])).get_data()

        mlflow.xgboost.log_model(
            xgb_model=booster,
            name="xgboost_model", # Menggunakan 'name' bukan 'artifact_path'
            input_example=input_example,
            registered_model_name="MyXGBoostModel"
        )
        
        # Cleanup langsung
        del booster, y_pred
        gc.collect()
        
    return {'loss': rmse, 'status': STATUS_OK}

In [18]:
# Search space yang lebih terbatas
search_space = {
    'max_depth': scope.int(hp.quniform('max_depth', 4, 8, 1)),  # Lebih kecil
    'learning_rate': hp.uniform('learning_rate', 0.01, 0.3),  # Uniform lebih cepat
    'reg_alpha': hp.uniform('reg_alpha', 0, 1),
    'reg_lambda': hp.uniform('reg_lambda', 0, 1),
    'min_child_weight': hp.uniform('min_child_weight', 1, 5),
    'objective': 'reg:squarederror',
    'seed': 42,
    'n_jobs': 1,  # Single thread
    'tree_method': 'hist'  # Lebih cepat
}

if os.path.exists(results_file):
    with open(results_file, 'rb') as f:
        saved_results = pickle.load(f)
    print(f"Loaded {len(saved_results)} previous results")
else:
    saved_results = []
"""
# Load checkpoint jika ada
if os.path.exists(checkpoint_file):
    with open(checkpoint_file, 'rb') as f:
        trials = pickle.load(f)
    print(f"Resuming from {len(trials.trials)} trials")
    start_batch = len(trials.trials)
else:
    trials = Trials()
    start_batch = 0
"""
trials = Trials()
start_batch = 0

# Jalankan batch dengan checkpoint
total_batches = 4
batch_size = 2

for batch in range(start_batch // batch_size, total_batches):
    print(f"=== Batch {batch+1}/{total_batches} ===")
    
    # Jalankan hyperopt untuk batch ini
    batch_result = fmin(
        fn=objective,
        space=search_space,
        algo=tpe.suggest,
        max_evals=len(trials.trials) + batch_size,
        trials=trials,
        verbose=True
    )
    
    # Simpan hasil batch
    batch_info = {
        'batch': batch + 1,
        'timestamp': datetime.now().isoformat(),
        'best_params': batch_result,
        'trials_count': len(trials.trials),
        'best_loss': min([t['result']['loss'] for t in trials.trials])
    }
    
    saved_results.append(batch_info)
    
    # Simpan ke file
    with open(results_file, 'wb') as f:
        pickle.dump(saved_results, f)
    
    with open(checkpoint_file, 'wb') as f:
        pickle.dump(trials, f)
    
    print(f"Batch {batch+1} saved. Best RMSE: {batch_info['best_loss']:.4f}")
    print("-" * 50)
    
    # Jeda antar batch
    import time
    time.sleep(5)

Loaded 8 previous results
=== Batch 1/4 ===
  0%|                                                                                                                                             | 0/2 [00:00<?, ?trial/s, best loss=?]




  xgb_model.save_model(model_data_path)

2025/07/13 04:01:12 INFO mlflow.store.db.utils: Creating initial MLflow database tables...

2025/07/13 04:01:12 INFO mlflow.store.db.utils: Updating database tables

INFO  [alembic.runtime.migration] Context impl SQLiteImpl.

INFO  [alembic.runtime.migration] Will assume non-transactional DDL.

Registered model 'MyXGBoostModel' already exists. Creating a new version of this model...
Created version '17' of model 'MyXGBoostModel'.


 50%|█████████████████████████████████████████████████████████▌                                                         | 1/2 [02:24<02:24, 144.42s/trial, best loss: 6.449169310259481]




  xgb_model.save_model(model_data_path)

Registered model 'MyXGBoostModel' already exists. Creating a new version of this model...
Created version '18' of model 'MyXGBoostModel'.


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [04:38<00:00, 139.41s/trial, best loss: 6.449169310259481]
Batch 1 saved. Best RMSE: 6.4492
--------------------------------------------------
=== Batch 2/4 ===
 50%|██████████████████████████████████████████████████████████████████████▌                                                                      | 2/4 [00:00<?, ?trial/s, best loss=?]




  xgb_model.save_model(model_data_path)

Registered model 'MyXGBoostModel' already exists. Creating a new version of this model...
Created version '19' of model 'MyXGBoostModel'.


 75%|██████████████████████████████████████████████████████████████████████████████████████▎                            | 3/4 [02:24<02:24, 144.12s/trial, best loss: 6.449169310259481]




  xgb_model.save_model(model_data_path)

Registered model 'MyXGBoostModel' already exists. Creating a new version of this model...
Created version '20' of model 'MyXGBoostModel'.


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [04:57<00:00, 148.74s/trial, best loss: 6.446593301913683]
Batch 2 saved. Best RMSE: 6.4466
--------------------------------------------------
=== Batch 3/4 ===
 67%|██████████████████████████████████████████████████████████████████████████████████████████████                                               | 4/6 [00:00<?, ?trial/s, best loss=?]




  xgb_model.save_model(model_data_path)

Registered model 'MyXGBoostModel' already exists. Creating a new version of this model...
Created version '21' of model 'MyXGBoostModel'.


 83%|███████████████████████████████████████████████████████████████████████████████████████████████▊                   | 5/6 [02:19<02:19, 139.48s/trial, best loss: 6.446593301913683]




  xgb_model.save_model(model_data_path)

Registered model 'MyXGBoostModel' already exists. Creating a new version of this model...
Created version '22' of model 'MyXGBoostModel'.


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6/6 [04:40<00:00, 140.28s/trial, best loss: 6.446593301913683]
Batch 3 saved. Best RMSE: 6.4466
--------------------------------------------------
=== Batch 4/4 ===
 75%|█████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                   | 6/8 [00:00<?, ?trial/s, best loss=?]




  xgb_model.save_model(model_data_path)

Registered model 'MyXGBoostModel' already exists. Creating a new version of this model...
Created version '23' of model 'MyXGBoostModel'.


 88%|████████████████████████████████████████████████████████████████████████████████████████████████████▋              | 7/8 [02:32<02:32, 152.53s/trial, best loss: 6.446593301913683]




  xgb_model.save_model(model_data_path)

Registered model 'MyXGBoostModel' already exists. Creating a new version of this model...
Created version '24' of model 'MyXGBoostModel'.


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 8/8 [05:09<00:00, 154.88s/trial, best loss: 6.446593301913683]
Batch 4 saved. Best RMSE: 6.4466
--------------------------------------------------


In [19]:
# Tampilkan hasil akhir
print("\n=== FINAL RESULTS ===")
for result in saved_results:
    print(f"Batch {result['batch']}: RMSE = {result['best_loss']:.4f}")

# Hasil terbaik
best_trial = min(trials.trials, key=lambda x: x['result']['loss'])

# Ekstrak parameter secara manual untuk menghindari error space_eval
best_vals = best_trial['misc']['vals']
best_params = {}
for key, value in best_vals.items():
    if isinstance(value, list) and len(value) > 0:
        best_params[key] = value[0]  # Ambil nilai pertama jika berupa list
    else:
        best_params[key] = value


=== FINAL RESULTS ===
Batch 1: RMSE = 6.5201
Batch 2: RMSE = 6.5201
Batch 3: RMSE = 6.5201
Batch 4: RMSE = 6.4757
Batch 1: RMSE = 6.6112
Batch 2: RMSE = 6.4672
Batch 3: RMSE = 6.4576
Batch 4: RMSE = 6.4518
Batch 1: RMSE = 6.4492
Batch 2: RMSE = 6.4466
Batch 3: RMSE = 6.4466
Batch 4: RMSE = 6.4466


In [20]:
# Tambahkan parameter yang tidak dioptimasi
best_params['objective'] = 'reg:squarederror'
best_params['seed'] = 42
best_params['n_jobs'] = 1
best_params['tree_method'] = 'hist'

print(f"\nBest parameters: {best_params}")
print(f"Best RMSE: {best_trial['result']['loss']:.4f}")


Best parameters: {'learning_rate': np.float64(0.27283707400899754), 'max_depth': np.float64(7.0), 'min_child_weight': np.float64(3.6559755218812566), 'reg_alpha': np.float64(0.29806420649486387), 'reg_lambda': np.float64(0.5308607957707008), 'objective': 'reg:squarederror', 'seed': 42, 'n_jobs': 1, 'tree_method': 'hist'}
Best RMSE: 6.4466


In [27]:
with mlflow.start_run():
    
    best_params = {
        'learning_rate': 0.05442667102703115,
        'max_depth': 6,
        'min_child_weight': 1.4483212277452986,
        'n_jobs': 1,
        'objective': 'reg:squarederror',
        'reg_alpha': 0.5356786455472234,
        'reg_lambda': 0.09836262674462193,
        'seed': 42,
        'tree_method': 'hist'
    }
    
    mlflow.log_params(best_params)
    
    booster = xgb.train(
        params=best_params,
        dtrain=train,
        num_boost_round=500,
        evals=[(valid, 'validation')],
        early_stopping_rounds=30,
        verbose_eval=False,
        callbacks=None
    )
    
    y_pred = booster.predict(valid)
    rmse = root_mean_squared_error(y_val, y_pred)
    mlflow.log_metric("rmse", rmse)
    input_example = train.slice(np.array([0])).get_data()
    mlflow.xgboost.log_model(booster, artifact_path="models_mlflow", input_example=input_example)

  xgb_model.save_model(model_data_path)


In [45]:
mlflow.xgboost.autolog(disable=True)

In [46]:
# Buat DMatrix di luar mlflow.start_run()
train = xgb.DMatrix(X_train, label=y_train)
valid = xgb.DMatrix(X_val, label=y_val)

In [48]:
# Pastikan folder models ada
os.makedirs("models", exist_ok=True)

with mlflow.start_run():
    
    best_params = {
        'learning_rate': 0.05442667102703115,
        'max_depth': 6,
        'min_child_weight': 1.4483212277452986,
        'n_jobs': 1,
        'objective': 'reg:squarederror',
        'reg_alpha': 0.5356786455472234,
        'reg_lambda': 0.09836262674462193,
        'seed': 42,
        'tree_method': 'hist'
    }
    
    mlflow.log_params(best_params)
    
    booster = xgb.train(
        params=best_params,
        dtrain=train,
        num_boost_round=500,
        evals=[(valid, 'validation')],
        early_stopping_rounds=30,
        verbose_eval=False,
        callbacks=None
    )
    
    y_pred = booster.predict(valid)
    rmse = root_mean_squared_error(y_val, y_pred)
    mlflow.log_metric("rmse", rmse)
    
    with open("models/preprocessor.b", 'wb') as f_out:
        pickle.dump(dv, f_out)
        
    mlflow.log_artifact("models/preprocessor.b", artifact_path="preprocessor")
    
    # Log model tanpa input_example
    mlflow.xgboost.log_model(booster, artifact_path="models_mlflow")
    
    run_id = mlflow.active_run().info.run_id
    print(f"Run ID: {run_id}")

  xgb_model.save_model(model_data_path)


Run ID: 88e60341de4a4d628f04a1b19aa02885


In [38]:
logged_model = 'runs:/433039a7a89147358000fae4961164db/models_mlflow'
loaded_model = mlflow.pyfunc.load_model(logged_model)

Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

In [39]:
loaded_model

mlflow.pyfunc.loaded_model:
  artifact_path: /home/ubuntu/mlops-zoomcamp/02-experiment-tracking/mlruns/1/models/m-cd88f4397a70467eb668c80d3c4ac20d/artifacts
  flavor: mlflow.xgboost
  run_id: 433039a7a89147358000fae4961164db

In [40]:
xgboost_model = mlflow.xgboost.load_model(logged_model)

Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

In [41]:
xgboost_model

<xgboost.core.Booster at 0x7be313f32140>

In [42]:
y_pred = xgboost_model.predict(valid)

In [43]:
y_pred[:10]

array([17.508398 ,  7.1751266, 20.776585 , 24.40614  , 10.17512  ,
       17.121979 , 14.032827 ,  9.695784 ,  8.902082 , 16.678173 ],
      dtype=float32)

In [44]:
# Set ke experiment yang sudah ada
mlflow.set_experiment("nyc-taxi-experiment")

<Experiment: artifact_location='/home/ubuntu/mlops-zoomcamp/02-experiment-tracking/mlruns/1', creation_time=1751630361732, experiment_id='1', last_update_time=1751630361732, lifecycle_stage='active', name='nyc-taxi-experiment', tags={}>

----

In [31]:
# =============================================================================
# 1. RANDOM FOREST REGRESSOR
# =============================================================================

def train_random_forest(X_train, y_train, X_val, y_val, dv, best_params=None):
    """Train Random Forest dengan MLflow logging"""
    
    if best_params is None:
        best_params = {
            'n_estimators': 100,
            'max_depth': 10,
            'min_samples_split': 5,
            'min_samples_leaf': 2,
            'max_features': 'sqrt',
            'random_state': 42,
            'n_jobs': -1
        }
    
    with mlflow.start_run():
        mlflow.set_tag("model", "random_forest")
        mlflow.log_params(best_params)
        
        # Train model
        rf_model = RandomForestRegressor(**best_params)
        rf_model.fit(X_train, y_train)
        
        # Predict and evaluate
        y_pred = rf_model.predict(X_val)
        rmse = root_mean_squared_error(y_val, y_pred)
        mlflow.log_metric("rmse", rmse)
        
        # Save preprocessor
        os.makedirs("models", exist_ok=True)
        with open("models/preprocessor.b", 'wb') as f_out:
            pickle.dump(dv, f_out)
        mlflow.log_artifact("models/preprocessor.b", artifact_path="preprocessor")
        
        # Log model
        input_example = X_train[:1]  # First sample as example
        mlflow.sklearn.log_model(
            sk_model=rf_model,
            artifact_path="models_mlflow",
            input_example=input_example,
            registered_model_name="MyRandomForestModel"
        )
        
        print(f"Random Forest RMSE: {rmse:.4f}")
        return rf_model, rmse

# Hyperparameter optimization untuk Random Forest
def objective_rf(params):
    gc.collect()
    
    with mlflow.start_run():
        mlflow.set_tag("model", "random_forest")
        mlflow.log_params(params)
        
        rf_model = RandomForestRegressor(**params)
        rf_model.fit(X_train, y_train)
        
        y_pred = rf_model.predict(X_val)
        rmse = root_mean_squared_error(y_val, y_pred)
        mlflow.log_metric("rmse", rmse)
        
        input_example = X_train[:1]
        mlflow.sklearn.log_model(
            sk_model=rf_model,
            artifact_path="models_mlflow",
            input_example=input_example,
            registered_model_name="MyRandomForestModel"
        )
        
        del rf_model, y_pred
        gc.collect()
        
    return {'loss': rmse, 'status': STATUS_OK}

# Search space untuk Random Forest
search_space_rf = {
    'n_estimators': scope.int(hp.quniform('n_estimators', 50, 200, 10)),
    'max_depth': scope.int(hp.quniform('max_depth', 5, 20, 1)),
    'min_samples_split': scope.int(hp.quniform('min_samples_split', 2, 10, 1)),
    'min_samples_leaf': scope.int(hp.quniform('min_samples_leaf', 1, 5, 1)),
    'max_features': hp.choice('max_features', ['sqrt', 'log2', None]),
    'random_state': 42,
    'n_jobs': -1
}

In [32]:
# =============================================================================
# 2. EXTRA TREES REGRESSOR
# =============================================================================

def train_extra_trees(X_train, y_train, X_val, y_val, dv, best_params=None):
    """Train Extra Trees dengan MLflow logging"""
    
    if best_params is None:
        best_params = {
            'n_estimators': 100,
            'max_depth': 10,
            'min_samples_split': 5,
            'min_samples_leaf': 2,
            'max_features': 'sqrt',
            'random_state': 42,
            'n_jobs': -1
        }
    
    with mlflow.start_run():
        mlflow.set_tag("model", "extra_trees")
        mlflow.log_params(best_params)
        
        # Train model
        et_model = ExtraTreesRegressor(**best_params)
        et_model.fit(X_train, y_train)
        
        # Predict and evaluate
        y_pred = et_model.predict(X_val)
        rmse = root_mean_squared_error(y_val, y_pred)
        mlflow.log_metric("rmse", rmse)
        
        # Save preprocessor
        os.makedirs("models", exist_ok=True)
        with open("models/preprocessor.b", 'wb') as f_out:
            pickle.dump(dv, f_out)
        mlflow.log_artifact("models/preprocessor.b", artifact_path="preprocessor")
        
        # Log model
        input_example = X_train[:1]
        mlflow.sklearn.log_model(
            sk_model=et_model,
            artifact_path="models_mlflow",
            input_example=input_example,
            registered_model_name="MyExtraTreesModel"
        )
        
        print(f"Extra Trees RMSE: {rmse:.4f}")
        return et_model, rmse

# Hyperparameter optimization untuk Extra Trees
def objective_et(params):
    gc.collect()
    
    with mlflow.start_run():
        mlflow.set_tag("model", "extra_trees")
        mlflow.log_params(params)
        
        et_model = ExtraTreesRegressor(**params)
        et_model.fit(X_train, y_train)
        
        y_pred = et_model.predict(X_val)
        rmse = root_mean_squared_error(y_val, y_pred)
        mlflow.log_metric("rmse", rmse)
        
        input_example = X_train[:1]
        mlflow.sklearn.log_model(
            sk_model=et_model,
            artifact_path="models_mlflow",
            input_example=input_example,
            registered_model_name="MyExtraTreesModel"
        )
        
        del et_model, y_pred
        gc.collect()
        
    return {'loss': rmse, 'status': STATUS_OK}

# Search space untuk Extra Trees (sama dengan Random Forest)
search_space_et = search_space_rf.copy()

In [33]:
# =============================================================================
# 3. LINEAR SVR
# =============================================================================

def train_linear_svr(X_train, y_train, X_val, y_val, dv, best_params=None):
    """Train Linear SVR dengan MLflow logging"""
    
    if best_params is None:
        best_params = {
            'epsilon': 0.1,
            'C': 1.0,
            'loss': 'epsilon_insensitive',
            'random_state': 42,
            'max_iter': 1000
        }
    
    with mlflow.start_run():
        mlflow.set_tag("model", "linear_svr")
        mlflow.log_params(best_params)
        
        # Train model
        svr_model = LinearSVR(**best_params)
        svr_model.fit(X_train, y_train)
        
        # Predict and evaluate
        y_pred = svr_model.predict(X_val)
        rmse = root_mean_squared_error(y_val, y_pred)
        mlflow.log_metric("rmse", rmse)
        
        # Save preprocessor
        os.makedirs("models", exist_ok=True)
        with open("models/preprocessor.b", 'wb') as f_out:
            pickle.dump(dv, f_out)
        mlflow.log_artifact("models/preprocessor.b", artifact_path="preprocessor")
        
        # Log model
        input_example = X_train[:1]
        mlflow.sklearn.log_model(
            sk_model=svr_model,
            artifact_path="models_mlflow",
            input_example=input_example,
            registered_model_name="MyLinearSVRModel"
        )
        
        print(f"Linear SVR RMSE: {rmse:.4f}")
        return svr_model, rmse

# Hyperparameter optimization untuk Linear SVR
def objective_svr(params):
    gc.collect()
    
    with mlflow.start_run():
        mlflow.set_tag("model", "linear_svr")
        mlflow.log_params(params)
        
        svr_model = LinearSVR(**params)
        svr_model.fit(X_train, y_train)
        
        y_pred = svr_model.predict(X_val)
        rmse = root_mean_squared_error(y_val, y_pred)
        mlflow.log_metric("rmse", rmse)
        
        input_example = X_train[:1]
        mlflow.sklearn.log_model(
            sk_model=svr_model,
            artifact_path="models_mlflow",
            input_example=input_example,
            registered_model_name="MyLinearSVRModel"
        )
        
        del svr_model, y_pred
        gc.collect()
        
    return {'loss': rmse, 'status': STATUS_OK}

# Search space untuk Linear SVR
search_space_svr = {
    'epsilon': hp.uniform('epsilon', 0.01, 0.5),
    'C': hp.uniform('C', 0.1, 10.0),
    'loss': hp.choice('loss', ['epsilon_insensitive', 'squared_epsilon_insensitive']),
    'random_state': 42,
    'max_iter': 2000
}

In [34]:
# =============================================================================
# FUNGSI UNTUK MENJALANKAN HYPERPARAMETER OPTIMIZATION
# =============================================================================

def run_hyperopt_for_model(model_name, objective_func, search_space, 
                          results_file, checkpoint_file, 
                          total_batches=4, batch_size=2):
    """Generic function untuk menjalankan hyperopt dengan checkpoint"""
    
    # Load previous results
    if os.path.exists(results_file):
        with open(results_file, 'rb') as f:
            saved_results = pickle.load(f)
        print(f"Loaded {len(saved_results)} previous results for {model_name}")
    else:
        saved_results = []
    
    # Load checkpoint
    if os.path.exists(checkpoint_file):
        with open(checkpoint_file, 'rb') as f:
            trials = pickle.load(f)
        print(f"Resuming {model_name} from {len(trials.trials)} trials")
        start_batch = len(trials.trials)
    else:
        trials = Trials()
        start_batch = 0
    
    # Run batches
    for batch in range(start_batch // batch_size, total_batches):
        print(f"=== {model_name} Batch {batch+1}/{total_batches} ===")
        
        batch_result = fmin(
            fn=objective_func,
            space=search_space,
            algo=tpe.suggest,
            max_evals=len(trials.trials) + batch_size,
            trials=trials,
            verbose=True
        )
        
        batch_info = {
            'model': model_name,
            'batch': batch + 1,
            'timestamp': datetime.now().isoformat(),
            'best_params': batch_result,
            'trials_count': len(trials.trials),
            'best_loss': min([t['result']['loss'] for t in trials.trials])
        }
        
        saved_results.append(batch_info)
        
        # Save files
        with open(results_file, 'wb') as f:
            pickle.dump(saved_results, f)
        
        with open(checkpoint_file, 'wb') as f:
            pickle.dump(trials, f)
        
        print(f"{model_name} Batch {batch+1} saved. Best RMSE: {batch_info['best_loss']:.4f}")
        print("-" * 50)
        
        import time
        time.sleep(5)

In [35]:
# Assuming you have X_train, y_train, X_val, y_val, dv ready

# Train Random Forest
rf_model, rf_rmse = train_random_forest(X_train, y_train, X_val, y_val, dv)

# Train Extra Trees
et_model, et_rmse = train_extra_trees(X_train, y_train, X_val, y_val, dv)

# Train Linear SVR
svr_model, svr_rmse = train_linear_svr(X_train, y_train, X_val, y_val, dv)


# Contoh hyperparameter optimization

# Random Forest hyperopt
run_hyperopt_for_model(
    model_name="RandomForest",
    objective_func=objective_rf,
    search_space=search_space_rf,
    results_file="rf_hyperopt_results.pkl",
    checkpoint_file="rf_hyperopt_checkpoint.pkl"
)

# Extra Trees hyperopt
run_hyperopt_for_model(
    model_name="ExtraTrees", 
    objective_func=objective_et,
    search_space=search_space_et,
    results_file="et_hyperopt_results.pkl",
    checkpoint_file="et_hyperopt_checkpoint.pkl"
)

# Linear SVR hyperopt
run_hyperopt_for_model(
    model_name="LinearSVR",
    objective_func=objective_svr,
    search_space=search_space_svr,
    results_file="svr_hyperopt_results.pkl",
    checkpoint_file="svr_hyperopt_checkpoint.pkl"
)

2025/07/12 09:35:42 INFO mlflow.store.db.utils: Creating initial MLflow database tables...
2025/07/12 09:35:43 INFO mlflow.store.db.utils: Updating database tables
INFO  [alembic.runtime.migration] Context impl SQLiteImpl.
INFO  [alembic.runtime.migration] Will assume non-transactional DDL.
Registered model 'MyRandomForestModel' already exists. Creating a new version of this model...
Created version '12' of model 'MyRandomForestModel'.


Random Forest RMSE: 11.7325


Registered model 'MyExtraTreesModel' already exists. Creating a new version of this model...
Created version '12' of model 'MyExtraTreesModel'.


Extra Trees RMSE: 12.0590




Linear SVR RMSE: 833.8257
Loaded 4 previous results for RandomForest
Resuming RandomForest from 8 trials
Loaded 4 previous results for ExtraTrees
Resuming ExtraTrees from 8 trials
Loaded 4 previous results for LinearSVR
Resuming LinearSVR from 8 trials


Registered model 'MyLinearSVRModel' already exists. Creating a new version of this model...
Created version '13' of model 'MyLinearSVRModel'.
