## Benchmark models using the missingness simulator

In [1]:
from sklearn.datasets import load_diabetes
from hyperimpute.plugins.imputers import Imputers
from hyperimpute.utils.benchmarks import compare_models

# Build the "hyperimpute" plugin with an explicit model-search configuration.
imputer = Imputers().get(
    "hyperimpute",  # the name of the imputation method.
    # The remaining kwargs are specific to this method.
    # optimizer: str. The optimizer to use: simple, hyperband, bayesian
    optimizer="hyperband",
    # classifier_seed: list. Model search pool for categorical columns.
    classifier_seed=["logistic_regression", "catboost", "xgboost", "random_forest"],
    # regression_seed: list. Model search pool for continuous columns.
    regression_seed=[
        "linear_regression",
        "catboost_regressor",
        "xgboost_regressor",
        "random_forest_regressor",
    ],
    # class_threshold: int. Maximum number of unique values a column may have and still be treated as categorical.
    class_threshold=5,
    # imputation_order: int. 0 - ascending, 1 - descending, 2 - random
    imputation_order=2,
    # n_inner_iter: int. Number of imputation iterations.
    n_inner_iter=10,
    # select_model_by_column: bool. If true, selects a different model for each column. Else, it reuses the model chosen for the first column.
    select_model_by_column=True,
    # select_model_by_iteration: bool. If true, selects new models for each iteration. Else, it reuses the models chosen in the first iteration.
    select_model_by_iteration=True,
    # select_lazy: bool. If false, starts the optimizer on every column unless other restrictions apply. Else, if a trend appears in the current iteration (at least two columns of the same type got the same model from the optimizer), it reuses that model class for all the columns without starting the optimizer.
    select_lazy=True,
    # select_patience: int. How many iterations to wait without objective function improvement.
    select_patience=5,
)

# Load baseline dataset: keep the feature DataFrame, discard the target.
X, _ = load_diabetes(as_frame=True, return_X_y=True)

# Run benchmarks: evaluate `imputer` on X next to the reference methods.
# The return value is bound to `_`, i.e. it is not used afterwards.
# NOTE(review): the per-argument notes below are inferred from the argument
# names — confirm against the compare_models documentation.
_ = compare_models(
    name="example",
    evaluated_model=imputer,
    X_raw=X,
    ref_methods=["sklearn_ice"],  # baseline imputer(s) to compare with
    scenarios=["MAR"],  # presumably "missing at random" — TODO confirm
    miss_pct=[0.3, 0.5],  # presumably the fractions of values to mask — TODO confirm
    n_iter=2,
    n_jobs=1,
)

In [1]:
from hyperimpute.plugins.imputers import Imputers

imputers = Imputers()

# Names of the available imputation plugins.
methods_pool = imputers.list()

# An assignment is not echoed by the notebook, so evaluate the name as the
# cell's last expression to display the list.
methods_pool

['median',
 'softimpute',
 'gain',
 'sklearn_missforest',
 'sinkhorn',
 'miwae',
 'most_frequent',
 'mice',
 'missforest',
 'sklearn_ice',
 'EM',
 'hyperimpute',
 'miracle',
 'nop',
 'ice',
 'mean']

In [6]:
import pandas as pd
import numpy as np
from hyperimpute.plugins.imputers import Imputers

# Toy frame in which the second row is missing its last two values.
X = pd.DataFrame([[1, 1, 1, 1], [4, 5, np.nan, np.nan], [3, 3, 9, 9], [2, 2, 2, 2]])

# Name of the imputation plugin to look up.
method = "gain"

# Fetch the plugin by name and run it on a copy of X.
plugin = Imputers().get(method)
out = plugin.fit_transform(X.copy())
# Print the result as a raw array, then the method name followed by the full result.
print(out.values)
print(method, out)

[[1.         1.         1.         1.        ]
 [4.         5.         7.28479147 7.64277411]
 [3.         3.         9.         9.        ]
 [2.         2.         2.         2.        ]]
gain      0    1         2         3
0  1.0  1.0  1.000000  1.000000
1  4.0  5.0  7.284791  7.642774
2  3.0  3.0  9.000000  9.000000
3  2.0  2.0  2.000000  2.000000
