# Base


## Setup

* Reload modules automatically
* Load environvent variables from .env
* Imports
* Load configs
* Set up logging

In [1]:
%load_ext autoreload
%load_ext dotenv

In [2]:
%autoreload

In [3]:
%dotenv ../.env

In [4]:
import logging
import sys

import mlflow
from omegaconf import OmegaConf
from sklearn.datasets import load_wine
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

In [5]:
cfg = OmegaConf.load("../config.yaml")

In [6]:
logger = logging.getLogger(__name__)
logging.basicConfig(
    stream=sys.stdout,
    format=cfg.logging.format,
    datefmt=cfg.logging.date_format,
    level=cfg.logging.level
)

## Load data

In [7]:
input_data = load_wine(as_frame=True)

## Prepare data

In [8]:
X = input_data.frame.copy()
y = X.pop(cfg.train.target_name)

In [9]:
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=cfg.train.test_size,
    random_state=cfg.train.random_state,
    stratify=y
)

## Define model

In [10]:
pipeline = make_pipeline(
    StandardScaler(),
    RandomForestClassifier(random_state=cfg.train.random_state, n_jobs=-1)
)

## Set up experiment tracking

In [11]:
if cfg.train.experiment_name:
    experiment = mlflow.get_experiment_by_name(name=cfg.train.experiment_name)
    if not experiment:
        mlflow.create_experiment(name=cfg.train.experiment_name)
    mlflow.set_experiment(experiment_name=cfg.train.experiment_name)

mlflow.autolog()

2023/09/19 18:00:53 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.


## Train & evaluate

In [12]:
with mlflow.start_run() as run:
    run_info = run.info
    logger.info("Running experiment with id: %s", run_info.run_id)

    logger.info("Fitting model.")
    pipeline.fit(X_train, y_train)

    logger.info("Evaluating trained model.")
    model_path = mlflow.get_artifact_uri(artifact_path="model")
    test_data = X_test.copy()
    test_data["target"] = y_test
    mlflow.evaluate(
        model=model_path,
        data=test_data,
        targets="target",
        model_type=cfg.train.model_type
    )
    logger.info("Finished experiment run %s", run_info)

[2023-09-19 18:00:53] Running experiment with id: d8db31d36ca14eb8b488989f35b7f6d5
[2023-09-19 18:00:53] Fitting model.




[2023-09-19 18:01:00] Evaluating trained model.


  string_columns = trimmed_df.columns[(df.applymap(type) == str).all(0)]
  data = data.applymap(_hash_array_like_element_as_bytes)
  data = data.applymap(_hash_array_like_element_as_bytes)
  return _infer_schema(self._df)
2023/09/19 18:01:01 INFO mlflow.models.evaluation.base: Evaluating the model with the default evaluator.
2023/09/19 18:01:01 INFO mlflow.tracking.fluent: Autologging successfully enabled for xgboost.
2023/09/19 18:01:01 INFO mlflow.models.evaluation.default_evaluator: The evaluation dataset is inferred as multiclass dataset, number of classes is inferred as 3


[2023-09-19 18:01:04] Finished experiment run <RunInfo: artifact_uri='file:///home/julia/Code/mlruns/873066992355515183/d8db31d36ca14eb8b488989f35b7f6d5/artifacts', end_time=None, experiment_id='873066992355515183', lifecycle_stage='active', run_id='d8db31d36ca14eb8b488989f35b7f6d5', run_name='chill-frog-555', run_uuid='d8db31d36ca14eb8b488989f35b7f6d5', start_time=1695139253944, status='RUNNING', user_id='julia'>


<Figure size 1050x700 with 0 Axes>