# Imputation Plugins

Missing data is a crucial issue when applying machine learning algorithms to real-world datasets.

**HyperImpute** provides a set of default imputation plugins and can be extended with any number of other plugins.

### Setup

In [1]:
import sys
import warnings
import time
from tqdm import tqdm
from math import sqrt

import numpy as np
import pandas as pd

from sklearn.datasets import load_breast_cancer
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, root_mean_squared_error


from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

from hyperimpute.plugins.utils.metrics import RMSE
from hyperimpute.plugins.utils.simulate import simulate_nan
from hyperimpute.utils.serialization import load, save

import xgboost as xgb

from IPython.display import HTML, display
import tabulate

if not sys.warnoptions:
    warnings.simplefilter("ignore")

### Loading the Imputation plugins

Make sure that you have installed HyperImpute in your workspace.

You can do that by running `pip install .` in the root of the repository.

In [2]:
from hyperimpute.plugins.imputers import Imputers, ImputerPlugin

imputers = Imputers()

### List the existing plugins

In [3]:
imputers.list()

['softimpute',
 'hyperimpute',
 'mean',
 'missforest',
 'sklearn_missforest',
 'sklearn_ice',
 'ice',
 'EM',
 'gain',
 'mice',
 'median',
 'miracle',
 'miwae',
 'nop',
 'most_frequent',
 'sinkhorn']

### Adding a new Imputation plugin

By default, HyperImpute automatically loads the imputation plugins with the pattern `hyperimpute/plugins/imputers/plugin_*`. 

Alternatively, you can call `Imputers().add(<name>, <ImputerPlugin derived class>)` at runtime.

Next, we show two examples of custom Imputation plugins.

In [None]:
custom_ice_plugin = "custom_ice"


class NewPlugin(ImputerPlugin):
    def __init__(self):
        super().__init__()
        lr = LinearRegression()
        self._model = IterativeImputer(
            estimator=lr, max_iter=500, tol=1e-10, imputation_order="roman"
        )

    @staticmethod
    def name():
        return custom_ice_plugin

    @staticmethod
    def hyperparameter_space():
        return []

    def _fit(self, *args, **kwargs) -> "NewPlugin":
        self._model.fit(*args, **kwargs)
        return self

    def _transform(self, *args, **kwargs):
        return self._model.transform(*args, **kwargs)

    def save(self) -> bytes:
        raise NotImplemented("placeholder")

    @classmethod
    def load(cls, buff: bytes) -> "NewPlugin":
        raise NotImplemented("placeholder")


imputers.add(custom_ice_plugin, NewPlugin)

assert imputers.get(custom_ice_plugin) is not None

### List the existing plugins

Now we should see the new plugins loaded.

In [3]:
imputers.list()

['gain',
 'mean',
 'miracle',
 'sinkhorn',
 'hyperimpute',
 'mice',
 'nop',
 'sklearn_ice',
 'median',
 'ice',
 'miwae',
 'sklearn_missforest',
 'missforest',
 'EM',
 'softimpute',
 'most_frequent']

### Testing the performance

We simulate some testing datasets using 3 amputation strategies:
- **Missing Completely At Random** (MCAR) if the probability of being missing is the same for all observations
- **Missing At Random** (MAR) if the probability of being missing only depends on observed values.
- **Missing Not At Random** (MNAR) if the unavailability of the data depends on both observed and unobserved data such as its value itself.

#### Load the dataset

In [4]:
from sklearn.preprocessing import MinMaxScaler
import pandas as pd



preproc = MinMaxScaler()


def dataset():
    X, y = load_breast_cancer(return_X_y=True)
    X = pd.DataFrame(preproc.fit_transform(X, y))
    y = pd.Series(y)

    return train_test_split(X, y, test_size=0.2)


def ampute(x, mechanism, p_miss):
    x_simulated = simulate_nan(np.asarray(x), p_miss, mechanism)

    mask = x_simulated["mask"]
    x_miss = x_simulated["X_incomp"]

    return pd.DataFrame(x), pd.DataFrame(x_miss), pd.DataFrame(mask)

In [None]:
train_datasets = {}
test_datasets = {}

headers = ["Plugin"]

pct = 0.3

mechanisms = ["MAR", "MNAR", "MCAR"]
percentages = [pct]

plugins = [
    "gain", 
    "miracle",
    "hyperimpute"
    ]  # imputers.list()  # default plugins

X_train, X_test, y_train, y_test = dataset()

# print(X_test, y_test)
for ampute_mechanism in mechanisms:
    for p_miss in percentages:
        if ampute_mechanism not in train_datasets:
            train_datasets[ampute_mechanism] = {}

        headers.append(ampute_mechanism + "-" + str(p_miss))
        train_datasets[ampute_mechanism][p_miss] = ampute(X_train, ampute_mechanism, p_miss)

        if ampute_mechanism not in test_datasets:
            test_datasets[ampute_mechanism] = {}

        headers.append(ampute_mechanism + "-" + str(p_miss))
        test_datasets[ampute_mechanism][p_miss] = ampute(X_test, ampute_mechanism, p_miss)



#### Evaluation

We compare the methods in terms of root mean squared error (RMSE) to the initial dataset.

In [11]:
train_results = []
test_results = []
train_duration = []
test_duration = []


for plugin in tqdm(plugins):
    train_plugin_results = [plugin]
    train_plugin_duration = [plugin]

    test_plugin_results = [plugin]
    test_plugin_duration = [plugin]

    for ampute_mechanism in mechanisms:
        for p_miss in percentages:
            ctx = imputers.get(plugin)
            train_x_true, train_x_miss, mask = train_datasets[ampute_mechanism][p_miss]
            
            # print(train_x_true)

            start1 = time.time() 
            # Fit on training data
            train_x_imp = ctx.fit_transform(train_x_miss)

            # ctx.fit(train_x_miss)
            train_mse = mean_squared_error(train_x_true.values, train_x_imp.values)  # MSE
            train_rmse = root_mean_squared_error(train_x_true.values, train_x_imp.values)  # RMSE
            buff = save(ctx)  # get the model as bytes
            # print(f"Train imputed: {train_x_imp}")
            # print(f"Train mse: {train_mse}, RMSE: {train_rmse}")

            end1 = time.time()

            train_plugin_duration.append(round(time.time() * 1000 - start1, 4))
            train_plugin_results.append(train_rmse)
#  =========== start test ==============
            start2 = time.time() * 1000
            # x_imp = ctx.fit_transform(x_miss)
            test_x_true, test_x_miss, mask = test_datasets[ampute_mechanism][p_miss]
            # print(test_x_true)

            # start = time.time() * 1000
            # Fit on training data
            test_x_imp = ctx.transform(test_x_miss)

            test_mse = mean_squared_error(test_x_true.values, test_x_imp.values)  # MSE
            test_rmse = root_mean_squared_error(test_x_true.values, test_x_imp.values)  # RMSE
            # buff = save(ctx)  # get the model as bytes
            # print(f"TEST imputed: {test_x_imp}")
            print(f"{ampute_mechanism}, {p_miss}, TEST mse: {test_mse}, RMSE: {test_rmse}")
            # print(test_mse)
            test_plugin_duration.append(round(time.time() * 1000 - start2, 4))
            test_plugin_results.append(test_rmse)

            # print(plugin_results)
    train_results.append(train_plugin_results)
    train_duration.append(train_plugin_duration)
    
    test_results.append(test_plugin_results)
    test_duration.append(test_plugin_duration)


  0%|          | 0/1 [00:00<?, ?it/s]

MAR, 0.3, TEST mse: 0.0009260710420521995, RMSE: 0.0196453152782611
MNAR, 0.3, TEST mse: 0.0015821174530861162, RMSE: 0.03688286178111923


100%|██████████| 1/1 [00:28<00:00, 28.95s/it]

MCAR, 0.3, TEST mse: 0.0014857501581170308, RMSE: 0.03438636737391567





### Reconstruction error(RMSE)

__Interpretation__ : The following table shows the reconstruction error -  the __Root Mean Square Error(RMSE)__ for each method applied on the original full dataset and the imputed dataset.

In [12]:
display(HTML(tabulate.tabulate(train_results, headers=headers, tablefmt="html")))
display(HTML(tabulate.tabulate(test_results, headers=headers, tablefmt="html")))

Plugin,MAR-0.3,MAR-0.3.1,MNAR-0.3
hyperimpute,0.0173651,0.0319501,0.0264947


Plugin,MAR-0.3,MAR-0.3.1,MNAR-0.3
hyperimpute,0.0196453,0.0368829,0.0343864


### XGBoost test score after imputation

__Interpretation__ The following table shows different metrics on the test set for an XGBoost classifier, after imputing the dataset with each method.
Metrics:
 - accuracy

In [None]:
from sklearn import metrics


def get_metrics(X_train, y_train, X_test, y_test):
    xgb_clf = xgb.XGBClassifier(verbosity=0)
    xgb_clf = xgb_clf.fit(X_train, y_train)

    y_pred = xgb_clf.predict(X_test)

    score = xgb_clf.score(X_test, y_test)

    fpr, tpr, thresholds = metrics.roc_curve(y_test, y_pred)
    auroc = metrics.auc(fpr, tpr)

    prec, recall, thresholds = metrics.precision_recall_curve(y_test, y_pred)
    aurpc = metrics.auc(recall, prec)

    return score, auroc, aurpc


metrics_headers = ["Plugin", "Accuracy", "AUROC", "AURPC"]
xgboost_test_score = []


x, x_miss, mask = datasets["MAR"][pct]

xgboost_test_score.append(
    ["original dataset", *get_metrics(X_train, y_train, X_test, y_test)]
)

for plugin in plugins:
    X_train_imp = imputers.get(plugin).fit_transform(x_miss.copy())

    score, auroc, aurpc = get_metrics(X_train_imp, y_train, X_test, y_test)

    xgboost_test_score.append([plugin, score, auroc, aurpc])

In [None]:
display(
    HTML(
        tabulate.tabulate(xgboost_test_score, headers=metrics_headers, tablefmt="html")
    )
)

### Duration(ms) results

__Info__ : Here we measure the duration of imputing the dataset with each method.

In [None]:
display(HTML(tabulate.tabulate(duration, headers=headers, tablefmt="html")))

## Debugging

HyperImpute supports **debug** logging. __WARNING__: Don't use it for release builds. 

In [None]:
from hyperimpute import logger

imputers = Imputers()

logger.add(sink=sys.stderr, level="DEBUG")

x, x_miss, mask = datasets["MAR"][pct]

x_imp = imputers.get("EM").fit_transform(x)

imputers.get("softimpute").fit_transform(x_miss)