In [16]:
from sklearn.datasets import load_diabetes  # to load data
from julearn.pipeline import PipelineCreator
from julearn import run_cross_validation
from julearn.inspect import preprocess
import numpy as np

# Load the diabetes regression data (features and target as pandas objects)
df_features, target = load_diabetes(return_X_y=True, as_frame=True)

# Introduce missing values randomly in the dataset
rng = np.random.default_rng(seed=42)  # Reproducible results
missing_rate = 0.1  # Fraction of cells drawn for masking
n_missing_samples = int(np.floor(missing_rate * df_features.size))

# Draw the (row, column) positions to blank out. Rows are drawn before columns
# so the random stream is consumed in a fixed order. Positions are sampled
# with replacement, so duplicate draws can make the realised share of missing
# cells slightly smaller than `missing_rate`.
missing_rows = rng.integers(0, df_features.shape[0], n_missing_samples)
missing_cols = rng.integers(0, df_features.shape[1], n_missing_samples)

# Build an explicit boolean mask and let pandas insert the NaNs. Assigning
# through `df_features.values[...]` only works when `.values` happens to be a
# writable view of the frame's storage; with pandas Copy-on-Write the array is
# read-only and that assignment raises instead of modifying the frame.
missing_mask = np.zeros(df_features.shape, dtype=bool)
missing_mask[missing_rows, missing_cols] = True
df_features = df_features.mask(missing_mask)

# Column names handed to julearn: the features, the target, and feature types
X = ["age", "sex", "bmi", "bp", "s1", "s2", "s3", "s4", "s5", "s6"]
y = "target"
# Every feature is declared as "continuous"; copy the list so X_types does not
# alias X.
X_types = {"continuous": list(X)}


In [17]:
# Quick sanity check: first rows of the features (NaNs should be visible)
# and summary statistics of the target.
features_preview = df_features.head()
target_summary = target.describe()
print("Features:\n", features_preview)
print("Target:\n", target_summary)

Features:
         age       sex       bmi        bp        s1        s2        s3  \
0  0.038076  0.050680  0.061696  0.021872 -0.044223 -0.034821 -0.043401   
1 -0.001882 -0.044642 -0.051474 -0.026328 -0.008449 -0.019163  0.074412   
2  0.085299       NaN  0.044451 -0.005670 -0.045599 -0.034194 -0.032356   
3       NaN -0.044642 -0.011595 -0.036656  0.012191  0.024991 -0.036038   
4  0.005383 -0.044642 -0.036385  0.021872  0.003935  0.015596  0.008142   

         s4        s5        s6  
0 -0.002592  0.019907 -0.017646  
1 -0.039493 -0.068332 -0.092204  
2 -0.002592  0.002861 -0.025930  
3  0.034309  0.022688 -0.009362  
4       NaN -0.031988 -0.046641  
Target:
 count    442.000000
mean     152.133484
std       77.093005
min       25.000000
25%       87.000000
50%      140.500000
75%      211.500000
max      346.000000
Name: target, dtype: float64


In [18]:
# Combine features and target into one frame; `assign` returns a new
# DataFrame, so `df_features` itself is left without the "target" column.
data = df_features.assign(target=target)


In [30]:
# Univariate feature imputation
# SimpleImputer: missing entries are filled using the "mean" strategy
pipeline_creator_1 = PipelineCreator(problem_type="regression")
pipeline_creator_1.add("impute_simple", apply_to="*", strategy="mean")
pipeline_creator_1.add("ridge")


# Cross-validate the imputer + ridge pipeline; "final" additionally returns
# the estimator alongside the per-fold scores.
scores_1, model_1 = run_cross_validation(
    X=X,
    y=y,
    data=data,
    X_types=X_types,
    model=pipeline_creator_1,
    scoring="neg_mean_absolute_error",
    return_estimator="final",
)
print(scores_1)
# One-number summary across folds (negative MAE, so closer to 0 is better).
# The recorded output of this cell ends with this value.
print(scores_1["test_score"].mean())

   fit_time  score_time  test_score  n_train  n_test  repeat  fold  \
0  0.010266    0.003275  -48.331823      353      89       0     0   
1  0.007122    0.002846  -49.530916      353      89       0     1   
2  0.006144    0.002516  -53.480182      354      88       0     2   
3  0.005812    0.002509  -46.746818      354      88       0     3   
4  0.005509    0.002593  -52.019808      354      88       0     4   

                           cv_mdsum  
0  b10eef89b4192178d482d7a1587a248a  
1  b10eef89b4192178d482d7a1587a248a  
2  b10eef89b4192178d482d7a1587a248a  
3  b10eef89b4192178d482d7a1587a248a  
4  b10eef89b4192178d482d7a1587a248a  
-50.02190930609343


In [24]:
# Multivariate feature imputation (IterativeImputer)
# The iterative imputer step is followed by a ridge regressor.
pipeline_creator_2 = PipelineCreator(problem_type="regression")
pipeline_creator_2.add(
    "impute_iterative",
    apply_to="*",
    max_iter=100,
    random_state=0,  # fixed seed for the imputer
)
pipeline_creator_2.add("ridge")


# Same data, feature types and scoring as the other imputation strategies.
scores_2, model_2 = run_cross_validation(
    model=pipeline_creator_2,
    data=data,
    X=X,
    y=y,
    X_types=X_types,
    scoring="neg_mean_absolute_error",
    return_estimator="final",
)
print(scores_2)

   fit_time  score_time  test_score  n_train  n_test  repeat  fold  \
0  0.333696    0.017555  -48.017559      353      89       0     0   
1  0.387783    0.021610  -48.473483      353      89       0     1   
2  0.411441    0.021787  -53.032081      354      88       0     2   
3  0.438956    0.024860  -46.196489      354      88       0     3   
4  0.347118    0.019651  -51.038093      354      88       0     4   

                           cv_mdsum  
0  b10eef89b4192178d482d7a1587a248a  
1  b10eef89b4192178d482d7a1587a248a  
2  b10eef89b4192178d482d7a1587a248a  
3  b10eef89b4192178d482d7a1587a248a  
4  b10eef89b4192178d482d7a1587a248a  


In [27]:
# Nearest neighbors imputation
# KNN imputer step (2 neighbours, uniform weights) followed by a ridge
# regressor.
pipeline_creator_3 = PipelineCreator(problem_type="regression")
pipeline_creator_3.add(
    "impute_knn",
    apply_to="*",
    n_neighbors=2,
    weights="uniform",
)
pipeline_creator_3.add("ridge")


# Same data, feature types and scoring as the other imputation strategies.
scores_3, model_3 = run_cross_validation(
    model=pipeline_creator_3,
    data=data,
    X=X,
    y=y,
    X_types=X_types,
    scoring="neg_mean_absolute_error",
    return_estimator="final",
)
print(scores_3)

   fit_time  score_time  test_score  n_train  n_test  repeat  fold  \
0  0.022339    0.006216  -48.032922      353      89       0     0   
1  0.017084    0.004863  -48.568661      353      89       0     1   
2  0.014643    0.006140  -53.357545      354      88       0     2   
3  0.015991    0.008623  -47.376895      354      88       0     3   
4  0.013672    0.004129  -51.612284      354      88       0     4   

                           cv_mdsum  
0  b10eef89b4192178d482d7a1587a248a  
1  b10eef89b4192178d482d7a1587a248a  
2  b10eef89b4192178d482d7a1587a248a  
3  b10eef89b4192178d482d7a1587a248a  
4  b10eef89b4192178d482d7a1587a248a  
