In [27]:
import sys
import pickle
import pandas as pd
from typing import Tuple

from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

In [28]:
def get_data(test_size:float=0.3):
    """
    Load the iris dataset and split it into train and test sets.

    :param test_size: Fraction of the dataset (between 0 and 1) to hold out
                      as the test set.

    :returns:         X_train, X_test, y_train, y_test
    """
    X, y = load_iris(return_X_y=True, as_frame=True)
    # Forward the caller's test_size. It used to be hard-coded to 0.3 here,
    # which silently ignored the argument. random_state is fixed so the split
    # is reproducible, and the labels are passed as `stratify`.
    return train_test_split(
        X, y, test_size=test_size, random_state=42, stratify=y
    )

In [29]:
def str_to_class(str:str):
    """
    Turns a string into Python class. Used to dynamically load Sklearn model classes.
    Note: Desired class must be imported at top of script.

    Only plain (optionally dotted) names such as ``"LogisticRegression"`` are
    accepted; anything else is rejected before it reaches ``eval``.

    :param str: Name to dynamically load as Python class.

    :returns:   Python object (normally a class) the name resolves to.

    :raises TypeError:  If the argument is not a string.
    :raises ValueError: If the string is not a dotted identifier.
    :raises NameError:  If the name is not defined/imported in this module.
    """
    # NOTE: the parameter is called `str` and shadows the builtin inside this
    # function, so the builtin type is obtained via type("") instead.
    if not isinstance(str, type("")):
        raise TypeError(f"Expected a class name string, got {type(str).__name__}")

    # SECURITY: eval() executes arbitrary code. Restrict the input to
    # identifiers separated by dots so that calls, operators, subscripts,
    # etc. can never be evaluated, even if the config comes from outside.
    if not all(part.isidentifier() for part in str.split(".")):
        raise ValueError(f"Not a valid class name: {str!r}")

    # Evaluate against module globals only, so this function's own locals
    # (including the `str` parameter) cannot be picked up by the lookup.
    return eval(str, globals())

In [30]:
def build_model(model_class:str, model_params:dict):
    """
    Instantiate an Sklearn model from its class name and constructor arguments.

    :param model_class:  Name of the Sklearn class to instantiate.
    :param model_params: Keyword arguments forwarded to the class constructor.

    :returns:            Freshly constructed (untrained) model instance.
    """
    # Resolve the name to the actual class object first ...
    estimator_cls = str_to_class(model_class)
    # ... then construct it with the configured parameters.
    estimator = estimator_cls(**model_params)
    return estimator

In [31]:
def train_model(
    model,
    hyperparameters:dict,
    X_train: pd.DataFrame,
    y_train: pd.Series,
    n_iter:int=10,
    random_state:int=0
):
    """
    Train Sklearn model with random hyperparameter search.

    :param model:           Sklearn model to train.
    :param hyperparameters: Hyperparameter grid to search.
    :param X_train:         Training data.
    :param y_train:         Training labels.
    :param n_iter:          Number of parameter settings to sample. The
                            default of 10 equals RandomizedSearchCV's own
                            default, so existing callers are unaffected.
    :param random_state:    Seed for the parameter sampling. Defaults to 0,
                            the value that was previously hard-coded.

    :returns:               Best trained Sklearn model.
    """
    search = RandomizedSearchCV(
        model,
        hyperparameters,
        n_iter=n_iter,
        random_state=random_state,
    )
    # fit() runs the search and returns the fitted search object; its
    # best_estimator_ attribute is what gets handed back to the caller.
    fitted_search = search.fit(X_train, y_train)
    return fitted_search.best_estimator_

In [32]:
def evaluate_model(
    model,
    X_test: pd.DataFrame,
    y_test: pd.Series,
    average: str = "micro"
) -> dict:
    """
    Evaluates trained SKlearn model with common metrics.

    :param model:   Trained Sklearn model.
    :param X_test:  Test data to evaluate with.
    :param y_test:  True labels for the test data.
    :param average: Averaging mode passed to f1/precision/recall.
                    Defaults to "micro", the value that was previously
                    hard-coded. In the recorded run of this notebook all
                    four metrics came out identical with "micro"; pass
                    e.g. "macro" to get per-class-averaged scores instead.

    :returns:       Dict of evaluation metrics with keys
                    "accuracy", "f1", "precision" and "recall".
    """
    y_pred = model.predict(X_test)
    return {
        "accuracy" : accuracy_score(y_test, y_pred),
        "f1" : f1_score(y_test, y_pred, average=average),
        "precision" : precision_score(y_test, y_pred, average=average),
        "recall" : recall_score(y_test, y_pred, average=average),
    }

In [36]:
def main(model_config: dict):
    """
    Main training function. Loads data, trains models using specified
    classes/parameters/hyperparameters, evaluates models, and exports
    models to disk.

    For every entry the tuned model is pickled to ``<name>.pkl`` in the
    current working directory, where ``<name>`` is the entry's key.

    :param model_config: Dict of model classes and corresponding parameters
                         and hyperparameters to use while training. Each key
                         is a model class name; each value is a dict with
                         the keys "params" and "hyperparameters".
    """
    # Get datasets
    X_train, X_test, y_train, y_test = get_data()

    # For all models in config
    for name, config in model_config.items():

        # Build base model
        print(f"Building: {name}")
        model = build_model(model_class=name, model_params=config["params"])

        # Train model with hyperparameter tuning
        print(f"Training: {name}")
        model = train_model(model, config["hyperparameters"], X_train, y_train)
        print(f"Best parameters: {model.get_params()}")

        # Evaluate model
        print(f"Evaluating: {name}")
        metrics = evaluate_model(model, X_test, y_test)
        print(f"Evaluation metrics: {metrics}")

        # Export model to disk. The context manager guarantees the file is
        # flushed and closed even if pickling raises; previously the handle
        # returned by open() was never explicitly closed.
        print(f"Saving: {name}")
        with open(f"{name}.pkl", 'wb') as model_file:
            pickle.dump(model, model_file)
        print()

In [37]:
# Training configuration, keyed by the name of the model class to build.
#   "params"          -> keyword arguments used to construct the base model
#   "hyperparameters" -> candidate values explored by the randomized search
model_config = {
    "LogisticRegression": {
        "params": {
            "solver": "saga",
            "tol": 0.01,
            "max_iter": 200,
            "random_state": 0,
        },
        "hyperparameters": {
            "penalty": ["l2", "l1"],
            "C": [0.6, 0.8, 1.0, 1.1, 1.2],
        },
    },
    "RandomForestClassifier": {
        "params": {
            "max_depth": 2,
            "random_state": 0,
        },
        "hyperparameters": {
            "n_estimators": [10, 50, 100, 200],
            "criterion": ["gini", "entropy"],
        },
    },
}

In [38]:
# Run the whole pipeline (build, tune, evaluate, pickle) for every entry in model_config.
main(model_config)

Building: LogisticRegression
Training: LogisticRegression
Best parameters: {'C': 0.6, 'class_weight': None, 'dual': False, 'fit_intercept': True, 'intercept_scaling': 1, 'l1_ratio': None, 'max_iter': 200, 'multi_class': 'auto', 'n_jobs': None, 'penalty': 'l2', 'random_state': 0, 'solver': 'saga', 'tol': 0.01, 'verbose': 0, 'warm_start': False}
Evaluating: LogisticRegression
Evaluation metrics: {'accuracy': 0.9555555555555556, 'f1': 0.9555555555555556, 'precision': 0.9555555555555556, 'recall': 0.9555555555555556}
Saving: LogisticRegression

Building: RandomForestClassifier
Training: RandomForestClassifier




Best parameters: {'bootstrap': True, 'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'gini', 'max_depth': 2, 'max_features': 'auto', 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_impurity_split': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 50, 'n_jobs': None, 'oob_score': False, 'random_state': 0, 'verbose': 0, 'warm_start': False}
Evaluating: RandomForestClassifier
Evaluation metrics: {'accuracy': 0.9333333333333333, 'f1': 0.9333333333333333, 'precision': 0.9333333333333333, 'recall': 0.9333333333333333}
Saving: RandomForestClassifier

