In [25]:
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer, IterativeImputer, KNNImputer
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn import linear_model
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.metrics import make_scorer
from sklearn.base import RegressorMixin, BaseEstimator
from tqdm import tqdm

In [18]:
df_dev_set = pd.read_csv("genentech-404-challenge/dev_set.csv").set_index(
    ["RID_HASH", "VISCODE"]
)
df_dev_1 = pd.read_csv("genentech-404-challenge/dev_1.csv").set_index(
    ["RID_HASH", "VISCODE"]
)
df_dev_2 = pd.read_csv("genentech-404-challenge/dev_2.csv").set_index(
    ["RID_HASH", "VISCODE"]
)
df_dev_3 = pd.read_csv("genentech-404-challenge/dev_3.csv").set_index(
    ["RID_HASH", "VISCODE"]
)
df_test_a = pd.read_csv("genentech-404-challenge/test_A.csv").set_index(
    ["RID_HASH", "VISCODE"]
)
df_test_b = pd.read_csv("genentech-404-challenge/test_B.csv").set_index(
    ["RID_HASH", "VISCODE"]
)


In [19]:
scaler = MinMaxScaler()


In [20]:
df_dev_set_standardized = pd.DataFrame(
    scaler.fit_transform(df_dev_set), columns=df_dev_set.columns, index=df_dev_set.index
)
df_dev_1_standardized = pd.DataFrame(
    scaler.transform(df_dev_1), columns=df_dev_1.columns, index=df_dev_1.index
)
df_dev_2_standardized = pd.DataFrame(
    scaler.transform(df_dev_2), columns=df_dev_2.columns, index=df_dev_2.index
)
df_dev_3_standardized = pd.DataFrame(
    scaler.transform(df_dev_3), columns=df_dev_3.columns, index=df_dev_3.index
)
df_test_a_standardized = pd.DataFrame(
    scaler.transform(df_test_a), columns=df_test_a.columns, index=df_test_a.index
)
df_test_b_standardized = pd.DataFrame(
    scaler.transform(df_test_b), columns=df_test_b.columns, index=df_test_b.index
)


In [21]:
df_x, df_y = [], []
for i, df in enumerate(
    [df_dev_1_standardized, df_dev_2_standardized, df_dev_3_standardized]
):
    mask = df.isnull().sum(axis=1) > 0
    df_temp = df[mask].reset_index()
    df_temp["pattern"] = i
    df_x.append(df_temp)
    df_y.append(df_dev_set_standardized[mask])
df_x = pd.concat(df_x).set_index(["RID_HASH", "VISCODE", "pattern"])
df_y = pd.concat(df_y)


In [22]:
def get_mse(
    y_true: np.ndarray, y_pred: np.ndarray, *, multioutput: str = "uniform_average"
):
    assert y_true.shape == y_pred.shape
    return mean_squared_error(y_true, y_pred, multioutput=multioutput)


In [23]:
class ImputerRegressor(BaseEstimator, RegressorMixin):
    def __init__(self, imputer) -> None:
        self.imputer = imputer

    def fit(self, x, y):
        self.imputer.fit(x, y)

    def predict(self, x):
        return self.imputer.transform(x)


In [31]:
X_train, X_test, y_train, y_test = train_test_split(
    df_x.values,
    df_y.values,
    test_size=0.2,
    random_state=42,
    stratify=df_x.reset_index().pattern,
)

pipelines = [
    ImputerRegressor(SimpleImputer()),
    ImputerRegressor(KNNImputer()),
    ImputerRegressor(IterativeImputer()),
]

param_si = [{"imputer__strategy": ["mean", "median", "most_frequent"]}]

param_knn = [
    {
        "imputer__n_neighbors": [5, 10, 32, 64, 128, 256, 512, 1024, 2048],
        "imputer__weights": ["uniform", "distance"],
    }
]

param_it = [
    {
        "imputer__estimator": [linear_model.BayesianRidge()],
        "imputer__max_iter": [100],
        "imputer__imputation_order": ["ascending", "descending", "random"],
    },
    # {
    #     "imputer__estimator": [linear_model.HuberRegressor()],
    #     "imputer__estimator__epsilon": [1.01, 1.2, 1.35, 1.5, 2.0],
    #     "imputer__estimator__alpha": [0.00001, 0.0001, 0.001, 0.01],
    #     "imputer__max_iter": [500],
    #     "imputer__imputation_order": ["ascending", "descending", "random"],
    # },
    {
        "imputer__estimator": [GradientBoostingRegressor(random_state=0)],
        "imputer__estimator__loss": ["squared_error", "huber"],
        "imputer__estimator__learning_rate": [0.5, 0.1, 0.05, 0.01, 0.005],
        "imputer__estimator__max_depth": [1, 3, 5],
        "imputer__max_iter": [500],
        "imputer__imputation_order": ["ascending", "descending", "random"],
    },
]
params = [param_si, param_knn, param_it]
names = ["simple_imputer", "knn", "iterative_imputer"]

searchcvs = {}
inner_cv = KFold(n_splits=2, shuffle=True, random_state=4)

for pgrid, est, name in zip(
    params[2:],
    pipelines[2:],
    names[2:],
):
    # gcv = GridSearchCV(
    #     estimator=est,
    #     param_grid=pgrid,
    #     scoring=make_scorer(get_mse, greater_is_better=False),
    #     n_jobs=1,
    #     cv=inner_cv,
    #     verbose=0,
    #     refit=True,
    # )
    gcv = RandomizedSearchCV(
        estimator=est,
        param_distributions=pgrid,
        # scoring=make_scorer(get_mse, greater_is_better=False),
        scoring="neg_mean_squared_error",
        n_jobs=1,
        cv=inner_cv,
        verbose=0,
        refit=True,
        n_iter=2,
    )

    searchcvs[name] = gcv


In [32]:
outer_cv = KFold(n_splits=3, shuffle=True, random_state=5)
outer_scores = {}

for name, gs_est in tqdm(sorted(searchcvs.items())):
    nested_score = cross_val_score(
        gs_est,
        X=X_train,
        y=y_train,
        cv=outer_cv,
        scoring="neg_mean_squared_error",
        n_jobs=-1,
    )
    outer_scores[name] = nested_score
    print(f"{name}: outer mse {-nested_score.mean():.5f} +/- {nested_score.std():.5f}")


  0%|          | 0/1 [00:00<?, ?it/s]

In [None]:
algo_best = 'knn'
algo = searchcvs[algo_best]
algo.fit(X_train, y_train)
train_perf = mean_squared_error(y_true=y_train, y_pred=algo.predict(X_train))
test_perf = mean_squared_error(y_true=y_test, y_pred=algo.predict(X_test))

# evaluate performance and compare to cross-validation results
print(f'MSE (mean cross-vaidated score of the best_estimator): {-algo.best_score_:.5f}')
print(f'Best Parameters: {searchcvs[algo_best].best_params_}')

print(f'Training MSE: {train_perf:.5f}')
print(f'Test MSE: {test_perf:.5f}')

MSE (mean cross-vaidated score of the best_estimator): -0.01935
Best Parameters: {'imputer__weights': 'uniform', 'imputer__n_neighbors': 1024}
Training MSE: 0.01883
Test MSE: 0.01908
