In [177]:
import numpy as np
import itertools
import optuna
import pandas as pd

In [178]:
# Only show Optuna log messages at WARNING level or above.
optuna.logging.set_verbosity(optuna.logging.WARNING)

In [179]:
import yaml

In [180]:
from time_series import time_series_models
from time_series import kernels
from time_series import evaluators

# Name -> object registries used to resolve the strings found in the experiment
# config. Every module attribute whose name contains no underscore is included.
model_library = {
    attr_name: attr
    for attr_name, attr in vars(time_series_models).items()
    if "_" not in attr_name
}
kernel_library = {
    attr_name: attr
    for attr_name, attr in vars(kernels).items()
    if "_" not in attr_name
}
evaluator_library = {
    attr_name: attr
    for attr_name, attr in vars(evaluators).items()
    if "_" not in attr_name
}

In [181]:
from tqdm import tqdm

In [182]:
class TimeSeriesData:
    """Hold a series (and optional targets) and slice it into train/val/test parts.

    ``train_val_test_split`` is indexed as three fractions of ``N = len(X)``:

    * train:      ``[0, split[0] * N)``
    * validation: ``[split[1] * N - lag, split[2] * N)``
    * test:       ``[split[2] * N - lag, N)``

    Parameters
    ----------
    X : indexable by an integer index array (e.g. ``numpy.ndarray``)
        Inputs.
    y : same kind as ``X``, or None
        Targets. When None, every accessor returns ``None`` as its second item.
    train_val_test_split : sequence of three numbers, optional
        Split fractions as described above. It must be set before any of the
        ``*_data`` methods is called.
    **kwargs
        Stored verbatim as instance attributes (e.g. ``dataset_name``). They are
        applied first, so ``X``, ``y``, ``N``, ``indices`` and ``tvt_split``
        always win over a keyword of the same name.
    """

    def __init__(self, X, y=None, train_val_test_split=None, **kwargs):
        self.__dict__.update(kwargs)
        self.X = X
        self.y = y

        self.N = len(X)
        self.indices = np.arange(self.N)
        self.tvt_split = train_val_test_split

    def _boundary(self, position, lag=0):
        """Return the sample index for split fraction ``position``, moved back by ``lag``.

        The result is clamped at 0: a negative slice start would make NumPy
        count from the end of the series and silently return the wrong window.
        """
        return max(int(self.tvt_split[position] * self.N) - lag, 0)

    def _select(self, idx):
        """Return ``(X[idx], y[idx])``, or ``(X[idx], None)`` when there are no targets."""
        if self.y is None:
            return self.X[idx], None

        return self.X[idx], self.y[idx]

    def train_data(self):
        """Return the training slice ``[0, split[0] * N)`` as ``(X, y)``."""
        return self._select(self.indices[:self._boundary(0)])

    def val_data(self, lag=0):
        """Return the validation slice as ``(X, y)``.

        ``lag`` extends the slice backwards by that many samples (presumably so
        lag-based models have history available - confirm with callers).
        """
        return self._select(self.indices[self._boundary(1, lag):self._boundary(2)])

    def test_data(self, lag=0):
        """Return the test slice as ``(X, y)``; ``lag`` extends it backwards."""
        return self._select(self.indices[self._boundary(2, lag):])

In [183]:
class KernelContainer:
    """Deferred kernel: a kernel class plus the keyword arguments to build it with.

    Parameters
    ----------
    kernel_name : str
        Identifier of this kernel.
    kernel_class : callable
        Called as ``kernel_class(**kernel_parameters)`` by ``build_kernel``.
    kernel_parameters : mapping or None
        Constructor keyword arguments. The mapping is copied so that later
        updates never mutate the caller's object; None is treated as empty.
    hyperparameters : mapping, optional
        Search-space description for tunable parameters; stored as given.
    """

    def __init__(
        self,
        kernel_name,
        kernel_class,
        kernel_parameters,
        hyperparameters=None
    ):
        self.kernel_name = kernel_name
        self.kernel_class = kernel_class
        self.kernel_parameters = dict(kernel_parameters) if kernel_parameters else {}
        self.hyperparameters = hyperparameters

    def update_parameters(self, *args, **update_params):
        """Merge new values into the stored kernel parameters.

        Accepts positional mappings (``update_parameters({"a": 1})``, the form
        ModelContainer uses) as well as keywords (``update_parameters(a=1)``).
        Positional mappings are applied first, in order, then the keywords.
        """
        for mapping in args:
            self.kernel_parameters.update(mapping)
        self.kernel_parameters.update(update_params)

    def build_kernel(self):
        """Instantiate ``kernel_class`` with the current parameters."""
        return self.kernel_class(**self.kernel_parameters)
    

In [184]:
class ModelContainer:
    """Deferred model: a model class, its keyword arguments and its kernel containers.

    Parameters
    ----------
    model_name : str
        Identifier of this model.
    model_class : callable
        Model constructor; stored on ``self.model``.
    model_parameters : mapping or None
        Constructor keyword arguments. The mapping is copied so that tuning one
        container never changes a dict shared with another; None is treated as
        empty.
    model_kernels : list
        Kernel containers whose built kernels are handed to the model.
    hyperparameters : mapping, optional
        Search-space description for tunable parameters; stored as given.
    """

    def __init__(
        self,
        model_name,
        model_class,
        model_parameters,
        model_kernels,
        hyperparameters=None
    ):
        self.model_name = model_name
        self.model = model_class
        self.parameters = dict(model_parameters) if model_parameters else {}
        self.model_kernels = model_kernels
        self.hyperparameters = hyperparameters

    def update_parameters(self, dct=None, **update_params):
        """Merge ``dct`` and then any keyword arguments into the stored parameters."""
        if dct:
            self.parameters.update(dct)
        self.parameters.update(update_params)

    def __repr__(self):
        return f"{str(self.model)}: {str(self.parameters)}"

In [None]:
class SubExperiment:
    def __init__(
        self, 
        experiment_name,
        model_container,
        dataset,
        evaluators,
        kernels,
        hyperparameter_evaluator=None
    ):
        self.experiment_name = experiment_name
        self.model_container = model_container
        self.dataset = dataset
        self.evaluators = evaluators
        self.kernels = kernels
        self.hyperparameter_evaluator = hyperparameter_evaluator

        name = f"{self.dataset.dataset_name} - {self.model_container.model_name}"

        self.results = dict(exp_name=name)        

    def build_model(self):
        model_class = self.model_container.model
        model_parameters = self.model_container.parameters
        model_kernels = self.model_container.model_kernels

        # build_model
        model = model_class(
            kernels=[k.build_kernel() for k in model_kernels], 
            **model_parameters
        )

        return model
    
    def tune_parameters(self):
        model_mapping = {self.model_container.model_name:self.model_container}
        kernel_mapping = {k_name:kernel for k_name, kernel in self.kernels.items()}

        object_mapping = dict()
        object_mapping.update(model_mapping)
        object_mapping.update(kernel_mapping)

        model_hparams = self.model_container.hyperparameters if self.model_container.hyperparameters else []
        X_train, y_train = self.dataset.train_data()
        X_val, y_val = self.dataset.val_data()

        evaluator = self.hyperparameter_evaluator()

        def objective(trial):
            # Update model parameters
            if self.model_container.hyperparameters:
                model_parameters = {}
                for hparam, hparam_conf in model_hparams.items():
                    if hparam_conf["type"] == "float":
                        model_parameters[hparam] = trial.suggest_float(
                            self.model_container.model_name + " " + hparam, 
                            hparam_conf["min"], 
                            hparam_conf["max"]
                        )
                    elif hparam_conf["type"] == "int":
                        model_parameters[hparam] = trial.suggest_int(
                            self.model_container.model_name + " " + hparam, 
                            hparam_conf["min"], 
                            hparam_conf["max"]
                        )
                    else:
                        raise ValueError("Expecting float or int")

                self.model_container.update_parameters(model_parameters)

            # Update kernel parameters
            for k_name, kernel in self.kernels.items():
                if kernel.hyperparameters:
                    params = {}
                    for hparam, hparam_conf in kernel.hyperparameters.items():
                        if hparam_conf["type"] == "float":
                            params[hparam] = trial.suggest_float(
                                k_name + " " + hparam, 
                                hparam_conf["min"], 
                                hparam_conf["max"]
                            )
                        elif hparam_conf["type"] == "int":
                            params[hparam] = trial.suggest_int(
                                k_name + " " + hparam, 
                                hparam_conf["min"], 
                                hparam_conf["max"]
                            )
                        else:
                            raise ValueError("Expecting float or int")
                        
                    kernel.update_parameters(params)

            model = self.build_model()
            model.fit(X_train, y_train)

            y_pred = model.predict(X_val)
            return evaluator(y_val, y_pred)

        study = optuna.create_study()
        study.optimize(
            objective, 
            n_trials=30, 
            timeout=60*20, # Optimise for n_trails or timeout seconds
        )

        best_params = study.best_params
        # Need to map best_params back to model and objects
        ## Also, different kernels with the same hparram name (ie bandwidth) will raise an error. Need to differentiate.

        for param, value in best_params.items():
            obj_name, param_name = param.split()
            object_mapping[obj_name].update_parameters({param_name:value})
                            
    def run_experiment(self):
        X_train, y_train = self.dataset.train_data()
        X_test, y_test = self.dataset.test_data()

        # Tune hyperparameters
        tune_hparams = False
        if self.model_container.hyperparameters:
            tune_hparams = True
        for kernel in self.model_container.model_kernels:
            if kernel.hyperparameters:
                tune_hparams = True
        
        if tune_hparams:
            self.tune_parameters()

        model = self.build_model()

        model.fit(X_train, y_train)

        y_pred = model.predict(X_test)

        for evaluator_name, evaluator in self.evaluators.items():
            self.results[evaluator_name] = evaluator(y_test, y_pred)

    def get_results(self):
        return self.results

    def __repr__(self):
        rep = f"Experiment:\n {str(self.model_container)} \n {str(self.dataset)}"

        return rep

In [186]:
class Experiment:
    """Run every (dataset, model) combination described in a YAML config file.

    ``sub_experiments`` is the lazy generator returned by ``parse_config``: the
    config is read and datasets are loaded only while ``run_experiments``
    iterates it, and it can be consumed once.
    """

    def __init__(self, filepath):
        self.filepath = filepath

        self.sub_experiments = self.parse_config(filepath)

        self.completed_experiments = []

    def load_dataset(self, filepath):
        """Read a CSV (first column as index) and return its values as an array."""
        return pd.read_csv(filepath, index_col=0).values

    def parse_config(self, config_path):
        """Yield one SubExperiment per (dataset, model) pair of each experiment.

        Pairs are produced dataset-major (all models for the first dataset,
        then the next dataset). Inputs are all rows but the last and targets
        are all rows but the first, i.e. a one-step-ahead setup.
        """
        with open(config_path, "r") as file:
            config = yaml.safe_load(file)

        for experiment, experiment_confs in config.items():
            models = experiment_confs["models"]
            datasets = experiment_confs["datasets"]
            metrics = experiment_confs["metrics"]

            # The tuning metric stays a class (instantiated by the sub-experiment);
            # the evaluation metrics are instantiated here.
            hparam_evaluator = evaluator_library[metrics["hyperparameter_tuning"]]
            metric_evaluators = {i: evaluator_library[i]() for i in metrics["evaluation"]}

            # Resolve every declared kernel up front so an unknown kernel name or
            # a missing "parameters" key fails even for kernels no model uses.
            # NOTE(review): a "hyperparameters" entry under a kernel is not read
            # here, so kernel containers never carry a search space and kernel
            # tuning never triggers - confirm whether that is intended.
            kernel_specs = {
                k_name: (kernel_library[k_conf["kernel"]], k_conf["parameters"])
                for k_name, k_conf in experiment_confs["kernels"].items()
            }

            for dataset_name, dataset_conf in datasets.items():
                # Parse each CSV once per dataset instead of once per model.
                data = self.load_dataset(dataset_conf["filepath"])
                X, y = data[:-1], data[1:]
                tsp = dataset_conf["train_test_split"]

                for model_name, model_conf in models.items():
                    model_class = model_library[model_conf["model"]]
                    # Copy so tuning one sub-experiment cannot leak parameter
                    # values into another one using the same model entry.
                    model_params = dict(model_conf["parameters"] or {})
                    kernel_names = model_conf["kernels"]
                    model_hparams = model_conf.get("hyperparameters")

                    # Fresh kernel containers per sub-experiment, for the same reason.
                    sub_kernels = {
                        k_name: KernelContainer(
                            kernel_name=k_name,
                            kernel_class=k_class,
                            kernel_parameters=dict(k_params or {}),
                        )
                        for k_name, (k_class, k_params) in kernel_specs.items()
                        if k_name in kernel_names
                    }

                    yield SubExperiment(
                        experiment_name=experiment,
                        dataset=TimeSeriesData(
                            X,
                            y,
                            train_val_test_split=tsp,
                            dataset_name=dataset_name
                        ),
                        model_container=ModelContainer(
                            model_name=model_name,
                            model_class=model_class,
                            model_parameters=model_params,
                            model_kernels=[sub_kernels[k] for k in kernel_names],
                            hyperparameters=model_hparams
                        ),
                        evaluators=metric_evaluators,
                        hyperparameter_evaluator=hparam_evaluator,
                        kernels=sub_kernels
                    )

    def run_experiments(self):
        """Run the pending sub-experiments in order, showing a progress bar.

        A sub-experiment is recorded as completed only after it finishes; an
        exception propagates and leaves the failed one out of the results.
        """
        for sub_exp in tqdm(self.sub_experiments):
            sub_exp.run_experiment()

            self.completed_experiments.append(sub_exp)

    def get_results(self):
        """Return one row per completed sub-experiment, indexed by ``exp_name``.

        Returns an empty frame (with the same index name) when nothing has
        completed yet.
        """
        records = [sub_exp.get_results() for sub_exp in self.completed_experiments]
        if not records:
            return pd.DataFrame(index=pd.Index([], name="exp_name"))

        return pd.DataFrame(records).set_index("exp_name")

In [187]:
# Point the experiment at its YAML config; parse_config is a generator, so the
# config is only read once the sub-experiments are iterated.
experiment = Experiment("experiment.yaml")

In [188]:
# Run every sub-experiment in sequence (tqdm shows the progress).
experiment.run_experiments()

4it [00:04,  1.01s/it]


In [190]:
# One row per sub-experiment recorded by run_experiments, indexed by "exp_name".
experiment.get_results()

Unnamed: 0_level_0,MeanSquaredError
exp_name,Unnamed: 1_level_1
dataset1 - model_krr1,0.23877
dataset1 - model_krr2,0.000507
dataset2 - model_krr1,0.055354
dataset2 - model_krr2,0.000507
