In [1]:
# On the Diabetes Dataset

from typing import List, Type

import pandas as pd
import numpy as np

from microimpute.comparisons import *
from microimpute.evaluations import *
from microimpute.config import RANDOM_STATE
from microimpute.models import *
from sklearn.datasets import load_diabetes
from sklearn.model_selection import train_test_split

import warnings

warnings.filterwarnings("ignore")

# 1. Prepare data
diabetes = load_diabetes()
df = pd.DataFrame(diabetes.data, columns=diabetes.feature_names)

# Add random boolean variable
np.random.seed(RANDOM_STATE)
df["bool"] = np.random.choice([True, False], size=len(df))

# Add random categorical variable with three categories
df["categorical"] = np.random.choice(["one", "two", "three"], size=len(df))

df.head()

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6,bool,categorical
0,0.038076,0.05068,0.061696,0.021872,-0.044223,-0.034821,-0.043401,-0.002592,0.019907,-0.017646,True,one
1,-0.001882,-0.044642,-0.051474,-0.026328,-0.008449,-0.019163,0.074412,-0.039493,-0.068332,-0.092204,False,three
2,0.085299,0.05068,0.044451,-0.00567,-0.045599,-0.034194,-0.032356,-0.002592,0.002861,-0.02593,True,three
3,-0.089063,-0.044642,-0.011595,-0.036656,0.012191,0.024991,-0.036038,0.034309,0.022688,-0.009362,True,two
4,0.005383,-0.044642,-0.036385,0.021872,0.003935,0.015596,0.008142,-0.002592,-0.031988,-0.046641,True,three


In [2]:
X_train, X_test, dummy_info = preprocess_data(
    df,
)

X_train.head()

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6,bool,categorical_one,categorical_three,categorical_two
17,0.070769,0.05068,0.012117,0.056301,0.034206,0.049416,-0.039719,0.034309,0.027364,-0.001078,0.0,0.0,0.0,1.0
66,-0.009147,0.05068,-0.018062,-0.033213,-0.020832,0.012152,-0.072854,0.07121,0.000272,0.019633,0.0,1.0,0.0,0.0
137,0.005383,-0.044642,0.04984,0.097615,-0.015328,-0.016345,-0.006584,-0.002592,0.017036,-0.013504,1.0,0.0,1.0,0.0
245,-0.02731,-0.044642,-0.035307,-0.02977,-0.056607,-0.05862,0.030232,-0.039493,-0.049872,-0.129483,0.0,0.0,0.0,1.0
31,-0.023677,-0.044642,-0.065486,-0.081413,-0.03872,-0.05361,0.059685,-0.076395,-0.037129,-0.042499,1.0,0.0,0.0,1.0


In [3]:
print(dummy_info)

{'original_dtypes': {'bool': 'bool', 'categorical': 'categorical'}, 'column_mapping': {'bool': ['bool'], 'categorical': ['categorical_one', 'categorical_three', 'categorical_two']}, 'original_categories': {'categorical': ['one', 'three', 'two']}}


In [None]:
predictors = ["age", "sex", "bmi", "bp"]
imputed_variables = ["categorical"]

Y_test: pd.DataFrame = X_train[imputed_variables]

# 2. Run imputation methods
ols = OLS()
fitted_ols = ols.fit(X_train, predictors, imputed_variables)
ols_predictions = fitted_ols.predict(X_test)

In [5]:
imputations = postprocess_imputations(ols_predictions, dummy_info)

imputations[0.5].head()

No dummy columns found for categorical variable categorical


Unnamed: 0,bool
287,True
211,True
72,True
321,True
73,False
