In [14]:
import mlflow
import pandas as pd
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.tree import DecisionTreeClassifier

In [4]:
# Load the Iris bunch once, then unpack its feature array and target array.
iris = load_iris()
X, y = iris.data, iris.target

In [9]:
# Tracking backend (local SQLite file) and the experiment all runs below are filed under.
TRACKING_URI = 'sqlite:///mlflow.db'
EXPERIMENT_NAME = 'lab2-experiment'

mlflow.set_tracking_uri(TRACKING_URI)
# Kept as the cell's final expression so the notebook still displays the Experiment object.
mlflow.set_experiment(EXPERIMENT_NAME)

2025/03/24 23:43:21 INFO mlflow.tracking.fluent: Experiment with name 'lab2-experiment' does not exist. Creating a new experiment.


<Experiment: artifact_location='/Users/katelynvuong/Desktop/mlops/mlruns/3', creation_time=1742885001346, experiment_id='3', last_update_time=1742885001346, lifecycle_stage='active', name='lab2-experiment', tags={}>

In [10]:
# Baseline run: a depth-limited decision tree fitted on the full dataset.
with mlflow.start_run():
    mlflow.set_tags({"Model": "decision-tree", "Train Data": "all-data"})

    # Hyperparameters. The seed is pinned so re-running the cell rebuilds the
    # same tree and logs the same metric.
    tree_depth = 5
    random_state = 42
    mlflow.log_params({"max_depth": tree_depth, "random_state": random_state})

    dt = DecisionTreeClassifier(max_depth=tree_depth, random_state=random_state)
    dt.fit(X, y)

    # NOTE: predictions are made on the same rows the tree was fitted on, so
    # this is training accuracy, not an estimate of generalisation.
    acc = accuracy_score(y, dt.predict(X))
    mlflow.log_metric("accuracy", acc)
# No explicit mlflow.end_run() needed: leaving the `with` block closes the run.

In [11]:
# Random forest fitted on the full dataset and scored with its out-of-bag estimate.
with mlflow.start_run():
    mlflow.set_tags({"Model": "random-forest", "Train Data": "all-data"})

    ntree = 1000
    mtry = 4
    # Pin the forest's randomness so the OOB score is reproducible across re-runs.
    random_state = 42

    mlflow.log_params({
        'n_estimators': ntree,
        'max_features': mtry,
        'random_state': random_state,
    })

    rf = RandomForestClassifier(
        n_estimators=ntree,
        max_features=mtry,
        oob_score=True,
        random_state=random_state,
    )
    rf.fit(X, y)

    # The value logged under 'accuracy' is the out-of-bag score, i.e. each row
    # is scored only by trees that did not see it during fitting.
    acc = rf.oob_score_
    mlflow.log_metric('accuracy', acc)
# No explicit mlflow.end_run() needed: leaving the `with` block closes the run.

In [15]:
# Grid of regularisation strengths and solvers, one MLflow run per combination.
#
# The previous version logged `alpha` and `solver` but fitted a plain
# LinearRegression, which accepts neither, so every run trained the same model
# under different (misleading) logged hyperparameters. Ridge takes both, so the
# logged values now describe the model that was actually fitted.
alphas = [0.1, 1.0, 10.0, 100.0]
# 'normal' is not a solver Ridge accepts; 'lsqr' is used in its place.
fits = ['lsqr', 'svd', 'cholesky']

for alpha in alphas:
    for solver in fits:
        with mlflow.start_run():
            mlflow.set_tags({"Model": "ridge-regression", "Train Data": "all-data"})

            mlflow.log_params({
                'alpha': alpha,
                'fit_intercept': True,
                'solver': solver
            })

            lr = Ridge(alpha=alpha, fit_intercept=True, solver=solver)
            lr.fit(X, y)

            # NOTE: metrics are computed on the rows used for fitting (in-sample).
            y_pred = lr.predict(X)
            mse = mean_squared_error(y, y_pred)
            r2 = r2_score(y, y_pred)

            mlflow.log_metric('mean_squared_error', mse)
            mlflow.log_metric('r2_score', r2)
        # The `with` block has already closed the run; no mlflow.end_run() needed.


In [16]:
# Hold out 20% of the rows. The seed is fixed so the same rows land in each
# partition on every Restart & Run All.
# NOTE(review): the objective defined after this line cross-validates on the
# full X, y and does not use this split — confirm whether that is intended.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, shuffle=True, random_state=42
)

def objective(params, random_state=42):
    """Hyperopt objective: cross-validate one sampled classifier and log it to MLflow.

    Args:
        params: Sampled configuration. Must contain a 'type' key ('dt' for a
            decision tree, 'rf' for a random forest); every other key is
            passed to the classifier constructor as a keyword argument.
        random_state: Seed given to the classifier unless `params` already
            supplies one, so a given configuration always scores the same.

    Returns:
        A dict with 'loss' (the negated mean cross-validation accuracy, since
        hyperopt minimises) and 'status'. An unrecognised 'type' yields a loss
        of 0 and no MLflow run.

    Raises:
        KeyError: if `params` has no 'type' entry.
    """
    # Work on a copy so the caller's dict keeps its 'type' entry.
    hyperparams = dict(params)
    classifier_type = hyperparams.pop('type')

    estimators = {'dt': DecisionTreeClassifier, 'rf': RandomForestClassifier}
    if classifier_type not in estimators:
        # Checked before opening a run so an unknown type does not leave an
        # empty run behind. Loss 0 can never beat a real trial's -accuracy.
        return {'loss': 0, 'status': STATUS_OK}

    hyperparams.setdefault('random_state', random_state)

    with mlflow.start_run():
        clf = estimators[classifier_type](**hyperparams)
        acc = cross_val_score(clf, X, y).mean()

        mlflow.set_tag("Model", classifier_type)
        mlflow.log_params(hyperparams)
        mlflow.log_metric("accuracy", acc)
    # The `with` block closes the run; an explicit mlflow.end_run() is not needed.
    return {'loss': -acc, 'status': STATUS_OK}

# Conditional search space: first choose the model family, then sample only
# that family's hyperparameters. Each hp label must be unique across the space;
# labels are prefixed with the family name to keep them unambiguous.
search_space = hp.choice('classifier_type', [
    {
        'type': 'dt',
        'criterion': hp.choice('dtree_criterion', ['gini', 'entropy']),
        # Either unlimited depth (None) or an integer depth in [1, 10).
        'max_depth': hp.choice('dtree_max_depth', [None, hp.randint('dtree_max_depth_int', 1, 10)]),
        'min_samples_split': hp.randint('dtree_min_samples_split', 2, 10)
    },
    {
        'type': 'rf',
        'n_estimators': hp.randint('rf_n_estimators', 20, 500),
        # hp.randint's upper bound is exclusive. Cap at the number of columns
        # in X: a split cannot consider more features than the data has, so
        # the previous bound of 9 sampled values that are invalid or redundant.
        'max_features': hp.randint('rf_max_features', 2, X.shape[1] + 1),
        'criterion': hp.choice('rf_criterion', ['gini', 'entropy'])
    },
])

# Tree-structured Parzen Estimator search; `trials` records every evaluation.
algo = tpe.suggest
trials = Trials()

In [17]:
# Run 32 evaluations of `objective` over `search_space` using the configured
# suggestion algorithm; every evaluation is recorded in `trials`.
best_result = fmin(objective, search_space, algo=algo, max_evals=32, trials=trials)

100%|██████████| 32/32 [00:12<00:00,  2.64trial/s, best loss: -0.9733333333333334]


In [18]:
# Display the best point returned by fmin. For hp.choice dimensions the values
# are indices into the option lists, not the options themselves (e.g.
# 'classifier_type': 0 refers to the first entry of the search space).
# hyperopt.space_eval(search_space, best_result) would decode them.
best_result

{'classifier_type': 0,
 'dtree_criterion': 0,
 'dtree_max_depth': 1,
 'dtree_max_depth_int': 3,
 'dtree_min_samples_split': 8}