In [1]:
import os
import warnings
import sys
import pathlib
import numpy as np
import pandas as pd
import matplotlib.pylab as plt
import plotly
import seaborn as sns
from scipy import stats
import statsmodels.api as sm
import statsmodels.formula.api as smf
import random
from random import choices
from string import ascii_lowercase, digits
import datetime
from pathlib import Path
from functools import partial
from itertools import starmap
from dotenv import load_dotenv
import requests

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import ElasticNet
from sklearn.impute import SimpleImputer

import mlflow
import optuna
from mlflow import MlflowClient
import mlflow.sklearn
from mlflow.models import infer_signature
from mlflow.models import Model
from mlflow.data.pandas_dataset import PandasDataset

# Set the MLflow tracking URI. A non-empty MLFLOW_TRACKING_URI environment
# variable takes precedence so the notebook can point at another server without
# editing this cell; otherwise the local development server is used.
mlflow.set_tracking_uri(
    uri=os.environ.get("MLFLOW_TRACKING_URI") or "http://127.0.0.1:5000"
)

# Override Optuna's default logging so only errors are reported.
optuna.logging.set_verbosity(optuna.logging.ERROR)

In [2]:
# Directory layout, all relative to the notebook's working directory.
this_dir = pathlib.Path()
parent_dir = this_dir.resolve().parent
data_dir = this_dir / "data"

# Create the results directory if it does not exist yet. exist_ok=True replaces
# the separate os.path.exists() check, which could race with another process
# creating the same directory between the check and the makedirs call.
cwd = os.getcwd()
results_dir = os.path.join(cwd, "results")
os.makedirs(results_dir, exist_ok=True)

# Model training

In [3]:
# MODEL EVALUATION FUNCTIONS #

def eval_metrics(actual, pred):
    """Score predictions against the observed values.

    Args:
        actual: Observed target values.
        pred: Predicted target values.

    Returns:
        A ``(rmse, mae, r2)`` tuple: the square root of
        ``mean_squared_error``, then ``mean_absolute_error``, then
        ``r2_score``, each evaluated on ``(actual, pred)``.
    """
    mse = mean_squared_error(actual, pred)
    return (
        np.sqrt(mse),
        mean_absolute_error(actual, pred),
        r2_score(actual, pred),
    )

In [4]:
def get_best_model(experiment_id=None):
    """Print the run with the lowest logged ``rmse`` metric.

    The previous version read an ``experiment_id`` name that was never
    defined at module level and printed ``best_run.info`` -- the bound
    ``DataFrame.info`` method -- instead of the run itself.

    Args:
        experiment_id: ID of the experiment to search. When ``None`` no
            experiment IDs are passed to ``mlflow.search_runs``, which then
            searches the currently active experiment.

    Returns:
        None. Loading the model of the best run is not implemented yet; for
        now the best run is only printed.
    """
    # search_runs expects a list of experiment IDs, not a bare ID.
    experiment_ids = None if experiment_id is None else [experiment_id]

    # Opt. for now: get best run (lowest RMSE first).
    best_run = mlflow.search_runs(
        experiment_ids, order_by=["metrics.rmse ASC"], max_results=1
    )

    if best_run.empty:
        print("No runs found.")
    else:
        print(best_run.iloc[0].to_string())

    return None
    

In [5]:
## VISUALIZATION FUNCTIONS ##
def plot_residuals(y_test, y_pred, style="seaborn", plot_size=(10, 8)):
    """Build a residual plot with a LOWESS trend line.

    Source: https://mlflow.org/docs/latest/traditional-ml/hyperparameter-tuning-with-child-runs/notebooks/logging-plots-in-mlflow.html

    Args:
        y_test: Observed target values.
        y_pred: Predicted target values; must support ``y_test - y_pred``.
        style: Matplotlib style name applied while drawing.
        plot_size: Figure size passed to ``plt.subplots(figsize=...)``.

    Returns:
        The Matplotlib figure. It is closed before being returned so it is not
        rendered implicitly, but it can still be saved or logged.
    """
    residuals = y_test - y_pred

    # Matplotlib 3.6 renamed its bundled seaborn styles to "seaborn-v0_8*" and
    # later releases no longer accept the old names. Translate a legacy name
    # only when it is unavailable and the renamed style exists.
    if isinstance(style, str) and style not in plt.style.available:
        renamed_style = style.replace("seaborn", "seaborn-v0_8", 1)
        if renamed_style in plt.style.available:
            style = renamed_style

    with plt.style.context(style=style):
        fig, ax = plt.subplots(figsize=plot_size)
        sns.residplot(
            x=y_pred,
            y=residuals,
            lowess=True,
            ax=ax,
            line_kws={"color": "red", "lw": 1},
        )

        ax.axhline(y=0, color="black", linestyle="--")
        ax.set_title("Residual Plot", fontsize=14)
        ax.set_xlabel("Predicted values", fontsize=12)
        ax.set_ylabel("Residuals", fontsize=12)

        for label in ax.get_xticklabels() + ax.get_yticklabels():
            label.set_fontsize(10)

        plt.tight_layout()

    plt.close(fig)
    return fig
    
    
def plot_prediction_error(y_test, y_pred, style="seaborn", plot_size=(10, 8)):
    """Build a scatter plot of prediction errors against predicted values.

    Source: https://mlflow.org/docs/latest/traditional-ml/hyperparameter-tuning-with-child-runs/notebooks/logging-plots-in-mlflow.html

    Args:
        y_test: Observed target values.
        y_pred: Predicted target values; must support ``y_test - y_pred``.
        style: Matplotlib style name applied while drawing.
        plot_size: Figure size passed to ``plt.subplots(figsize=...)``.

    Returns:
        The Matplotlib figure. It is closed before being returned so it is not
        rendered implicitly, but it can still be saved or logged.
    """
    # Matplotlib 3.6 renamed its bundled seaborn styles to "seaborn-v0_8*" and
    # later releases no longer accept the old names. Translate a legacy name
    # only when it is unavailable and the renamed style exists.
    if isinstance(style, str) and style not in plt.style.available:
        renamed_style = style.replace("seaborn", "seaborn-v0_8", 1)
        if renamed_style in plt.style.available:
            style = renamed_style

    with plt.style.context(style=style):
        fig, ax = plt.subplots(figsize=plot_size)
        ax.scatter(y_pred, y_test - y_pred)
        ax.axhline(y=0, color="red", linestyle="--")
        ax.set_title("Prediction Error Plot", fontsize=14)
        ax.set_xlabel("Predicted Values", fontsize=12)
        ax.set_ylabel("Errors", fontsize=12)
        plt.tight_layout()
    plt.close(fig)
    return fig

def plot_qq(y_test, y_pred, style="seaborn", plot_size=(10, 8)):
    """Build a normal QQ plot of the residuals.

    Source: https://mlflow.org/docs/latest/traditional-ml/hyperparameter-tuning-with-child-runs/notebooks/logging-plots-in-mlflow.html

    Args:
        y_test: Observed target values.
        y_pred: Predicted target values; must support ``y_test - y_pred``.
        style: Matplotlib style name applied while drawing.
        plot_size: Figure size passed to ``plt.subplots(figsize=...)``.

    Returns:
        The Matplotlib figure. It is closed before being returned so it is not
        rendered implicitly, but it can still be saved or logged.
    """
    # Flatten to 1-D so single-column 2-D inputs (e.g. an (n, 1) DataFrame of
    # targets) are handed to probplot as one sample.
    residuals = np.ravel(y_test - y_pred)

    # Matplotlib 3.6 renamed its bundled seaborn styles to "seaborn-v0_8*" and
    # later releases no longer accept the old names. Translate a legacy name
    # only when it is unavailable and the renamed style exists.
    if isinstance(style, str) and style not in plt.style.available:
        renamed_style = style.replace("seaborn", "seaborn-v0_8", 1)
        if renamed_style in plt.style.available:
            style = renamed_style

    with plt.style.context(style=style):
        fig, ax = plt.subplots(figsize=plot_size)
        stats.probplot(residuals, dist="norm", plot=ax)
        ax.set_title("QQ Plot", fontsize=14)
        plt.tight_layout()
    plt.close(fig)
    return fig

In [6]:
# Future methods
# Loading models: https://mlflow.org/docs/latest/python_api/mlflow.models.html#mlflow.models.Model.load

In [7]:
## DEFINE MODEL TRAINING FUNCTIONS##
# TODO: Separate preprocessing and training/evaluation

def train_lr(target, experiment_id, in_alpha=0.5, in_l1_ratio=0.5):
    """Train and evaluate an ElasticNet model inside one MLflow run.

    Reads ``training_data_w_timestep.csv`` from ``data_dir``, drops the
    ``run_uuid`` and ``queue_seconds`` columns, turns infinities into NaN,
    splits the rows into train/test sets, mean-imputes missing values, fits an
    ElasticNet (no intercept) on every remaining column except ``target`` and
    logs the seed, test metrics and diagnostic plots to MLflow.

    Args:
        target: Name of the column to predict, e.g. ``"cpu_usage_total"`` or
            ``"mem_usage_total"``.
        experiment_id: MLflow experiment the run is recorded under.
        in_alpha: ElasticNet ``alpha``; ``None`` falls back to 0.5.
        in_l1_ratio: ElasticNet ``l1_ratio``; ``None`` falls back to 0.5.

    Returns:
        None. Results are recorded in MLflow.

    Raises:
        ValueError: If ``target`` is not a column of the preprocessed data.
    """
    mlflow.autolog()  # enable autologging
    mlflow.sklearn.autolog()

    # Resolve hyperparameters up front. The earlier `float(x) is not None`
    # test was always true and crashed on None instead of using a default.
    alpha = 0.5 if in_alpha is None else float(in_alpha)
    l1_ratio = 0.5 if in_l1_ratio is None else float(in_l1_ratio)

    # Draw a fresh seed per run; it is logged below and drives both the
    # train/test split and the estimator.
    np_max_int = np.iinfo(np.int32).max
    seed = np.random.randint(np_max_int)
    np.random.seed(seed)

    # Read the input data; it is logged once the run has started.
    data_filepath = data_dir / 'training_data_w_timestep.csv'
    data = pd.read_csv(data_filepath)
    data_artifact = mlflow.data.from_pandas(data)

    ## DATA PREPROCESSING STEPS ##

    # Remove noisy features, then replace infinite values with NaN.
    data = data.drop(columns=['run_uuid', 'queue_seconds'])
    data = data.replace(['inf', np.inf, -np.inf], np.nan)

    # Fail early with a clear message instead of hitting undefined
    # train/test variables further down.
    if target not in data.columns:
        raise ValueError(
            f"Unknown target {target!r}; expected one of {list(data.columns)}"
        )

    # Split the data into training and testing (.75, .25) sets, tied to the
    # logged seed so a run's split can be reproduced.
    train, test = train_test_split(data, random_state=seed)

    # Impute NaN values with column means learned from the training rows only,
    # so no test-set information leaks into the fill values.
    imp_mean = SimpleImputer(missing_values=np.nan, strategy='mean')
    train = pd.DataFrame(
        imp_mean.fit_transform(train), columns=train.columns, index=train.index
    )
    test = pd.DataFrame(
        imp_mean.transform(test), columns=test.columns, index=test.index
    )

    # Every column except the target is used as a feature.
    train_x = train.drop(columns=[target])
    test_x = test.drop(columns=[target])
    train_y = train[[target]]
    test_y = test[[target]]

    ## storage settings ##
    model_type = 'Elasticnet'
    now = datetime.datetime.now().strftime("%Y_%m_%d_%I%M%S%p")
    run_name = model_type + '_' + now

    ## MLFLOW RUN ##
    # nested=True lets this be called while a parent run is active. The `with`
    # block closes the run on exit, so there is no explicit mlflow.end_run()
    # afterwards: that call would close an enclosing parent run.
    with mlflow.start_run(experiment_id=experiment_id, run_name=run_name, nested=True, log_system_metrics=True):

        # Log original input data and the pre-defined seed
        mlflow.log_input(data_artifact, "input")
        mlflow.log_param("seed", seed)

        # Execute ElasticNet
        lr = ElasticNet(alpha=alpha, l1_ratio=l1_ratio, random_state=seed, fit_intercept=False)
        lr.fit(train_x, train_y)

        # Evaluate on the held-out rows and log the metrics under the names
        # that get_best_model() sorts by ("metrics.rmse").
        pred_y = lr.predict(test_x)
        (rmse, mae, r2) = eval_metrics(test_y, pred_y)
        mlflow.log_metrics({"rmse": rmse, "mae": mae, "r2": r2})

        # Give the predictions the same (n, 1) shape as test_y so the
        # element-wise subtraction in the plot helpers lines up.
        if test_y.shape != pred_y.shape:
            pred_y = pred_y.reshape(test_y.shape[0], 1)

        # Visualize and log plots
        fig_residuals = plot_residuals(test_y, pred_y)
        fig_error = plot_prediction_error(test_y, pred_y)

        mlflow.log_figure(fig_residuals, "residuals_plot.png")
        mlflow.log_figure(fig_error, "error_plot.png")

        # Create model artifact directory (named after the run, in the cwd)
        cwd = os.getcwd()
        artifacts_dir = os.path.join(cwd, run_name)
        os.makedirs(artifacts_dir, exist_ok=True)

        # Downloading artifacts locally and logging the model by hand are
        # intentionally left out: file upload is not supported in NDP
        # JupyterHub.

# Run experiments

In [8]:
## EXPERIMENT FUNCTION ##
def run_experiment(target="mem_usage_total", num_runs=1, experiment_name=None, train_func=train_lr):
    """Run ``train_func`` ``num_runs`` times under one MLflow experiment.

    Args:
        target: Name of the column to predict; forwarded to ``train_func``.
        num_runs: Number of training runs to execute.
        experiment_name: Experiment to log to. When ``None`` a name is
            generated from the target plus a random suffix of two lowercase
            letters and three digits.
        train_func: Callable invoked as ``train_func(target, experiment_id)``.

    Returns:
        None.
    """
    # Generate experiment name
    if experiment_name is None:
        random_suffix = "".join(choices(ascii_lowercase, k=2) + choices(digits, k=3))
        known_prefixes = {
            "mem_usage_total": 'bp3d_mem_linreg_',
            "cpu_usage_total": 'bp3d_cpu_linreg_',
        }
        # Targets without a short prefix still get a valid name rather than
        # leaving experiment_name as None.
        prefix = known_prefixes.get(target, f"bp3d_{target}_linreg_")
        experiment_name = prefix + random_suffix

    # set_experiment creates the experiment when it does not exist yet and
    # returns it, so no create_experiment call wrapped in a bare `except` is
    # needed and real errors (e.g. unreachable server) are not swallowed.
    experiment = mlflow.set_experiment(experiment_name)
    experiment_id = experiment.experiment_id

    for _ in range(num_runs):
        train_func(target, experiment_id)

    return None

In [9]:
run_experiment(target="mem_usage_total", num_runs=1)

2024/07/30 10:48:08 INFO mlflow.tracking.fluent: Autologging successfully enabled for statsmodels.
2024/07/30 10:48:09 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.
2024/07/30 10:48:09 INFO mlflow.system_metrics.system_metrics_monitor: Started monitoring system metrics.
  model = cd_fast.enet_coordinate_descent(
2024/07/30 10:48:13 INFO mlflow.system_metrics.system_metrics_monitor: Stopping system metrics monitoring...
2024/07/30 10:48:13 INFO mlflow.system_metrics.system_metrics_monitor: Successfully terminated system metrics monitoring!


In [10]:
run_experiment(target="cpu_usage_total", num_runs=1)

2024/07/30 10:48:13 INFO mlflow.tracking.fluent: Autologging successfully enabled for statsmodels.
2024/07/30 10:48:13 INFO mlflow.system_metrics.system_metrics_monitor: Started monitoring system metrics.
  model = cd_fast.enet_coordinate_descent(
2024/07/30 10:48:15 INFO mlflow.system_metrics.system_metrics_monitor: Stopping system metrics monitoring...
2024/07/30 10:48:15 INFO mlflow.system_metrics.system_metrics_monitor: Successfully terminated system metrics monitoring!
