In [None]:
from icu_experiments.load_data import load_data_for_prediction
from icu_experiments.preprocessing import make_feature_preprocessing, make_anchor_preprocessing
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, RegressorMixin
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import mean_squared_error
from lightgbm import LGBMRegressor, Booster
from sklearn.linear_model import LinearRegression
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm
import itertools
from ivmodels import AnchorRegression

# Name of the outcome forwarded to load_data_for_prediction via load_data().
# NOTE(review): "hr" presumably stands for heart rate — confirm against the data loader.
outcome = "hr"

def load_data(name, outcome):
    """Load one data source and return its two splits.

    Calls ``load_data_for_prediction`` for the single source ``name`` with
    ``log_transform=True`` and returns the tuple
    ``(splits['train'], splits['test'])`` for that source, in that order.
    """
    splits = load_data_for_prediction([name], outcome=outcome, log_transform=True)[name]
    return splits['train'], splits['test']

# eICU is the source domain: the first split returned by load_data is used for training.
Xy_train, Xy_test = load_data('eicu', outcome)

# Target domains: for each one, the first split returned by load_data is used as the
# evaluation set and the second split as the pool of rows available for tuning.
Xy_test_hirid, Xy_tuning_hirid = load_data('hirid', outcome)
Xy_test_mimic, Xy_tuning_mimic = load_data('mimic', outcome)
Xy_test_miiv, Xy_tuning_miiv = load_data('miiv', outcome)

# Feature preprocessing; the ColumnTransformer allows subsets of the columns to be
# preprocessed differently and is configured to emit pandas DataFrames.
preprocessing_steps = make_feature_preprocessing(missing_indicator=True)
preprocessor = ColumnTransformer(transformers=preprocessing_steps)
preprocessor.set_output(transform="pandas")

# The anchor model's preprocessor places the anchor-column steps in front of
# the regular feature steps.
anchor_columns = ['hospital_id']
anchor_preprocessing_steps = make_anchor_preprocessing(anchor_columns)
anchor_preprocessor = ColumnTransformer(
    transformers=anchor_preprocessing_steps + preprocessing_steps
)
anchor_preprocessor.set_output(transform="pandas")


def _make_pipeline(preprocessing, model):
    """Chain ``preprocessing`` and ``model`` under the step names 'preprocessing' and 'model'."""
    return Pipeline(steps=[('preprocessing', preprocessing), ('model', model)])


# p1 and p2 are both LightGBM regressors (tuned later as 'gbdt' and 'rf' respectively),
# p3 is ordinary least squares and p4 is anchor regression.
p1 = _make_pipeline(preprocessor, LGBMRegressor())
p2 = _make_pipeline(preprocessor, LGBMRegressor())
p3 = _make_pipeline(preprocessor, LinearRegression())
p4 = _make_pipeline(anchor_preprocessor, AnchorRegression())

## Compare Performance from Training to Target Data - Parameters chosen via GridSearchCV on Training Data

In [3]:
# Search space for the gradient-boosted LightGBM pipeline. The `model__` prefix
# addresses parameters of the pipeline step named 'model'.
param_grid_lgbm = dict(
    model__boosting_type=['gbdt'],
    model__learning_rate=[0.01, 0.1, 0.3],
    model__n_estimators=[100, 800],
    model__num_leaves=[50, 200, 1024],
    model__feature_fraction=[0.5, 0.9],
)

# The random-forest variant searches the same space; only the boosting type differs.
param_grid_rf = {**param_grid_lgbm, 'model__boosting_type': ['rf']}

# Cross-validated grid search that only ever sees the training data. No `scoring`
# is passed, so the printed CV score is the estimator's own default score.
#
# refit=False: by default GridSearchCV refits a *clone* of the pipeline on the full
# training data once the search is done. That clone was never used here, because
# p1/p2 themselves are refitted with the winning parameters below, so the automatic
# refit was one redundant full-data fit per search. best_score_ and best_params_
# remain available for a single-metric search.
search = GridSearchCV(p1, param_grid=param_grid_lgbm, refit=False)
search.fit(Xy_train, Xy_train['outcome'])
print("Best parameter (CV score=%0.3f):" % search.best_score_)
grid_search_params_lgbm = search.best_params_
print(search.best_params_)

search = GridSearchCV(p2, param_grid=param_grid_rf, refit=False)
search.fit(Xy_train, Xy_train['outcome'])
print("Best parameter (CV score=%0.3f):" % search.best_score_)
grid_search_params_rf = search.best_params_
print(search.best_params_)

# Fit the actual pipeline objects with the selected hyperparameters; these fitted
# p1/p2 produce the baseline MSEs on the target data sets.
p1.set_params(**grid_search_params_lgbm)
p1.fit(Xy_train, Xy_train['outcome'])
p2.set_params(**grid_search_params_rf)
p2.fit(Xy_train, Xy_train['outcome'])

def _target_mse(pipeline, Xy_target):
    """MSE of ``pipeline``'s predictions on ``Xy_target`` against its 'outcome' column."""
    return mean_squared_error(Xy_target['outcome'], pipeline.predict(Xy_target))


# Baselines: the pipelines tuned by cross-validation on the training data,
# evaluated as-is on each target data set (p1 = LGBM, p2 = RF).
mse_grid_lgbm_hirid = _target_mse(p1, Xy_test_hirid)
mse_grid_rf_hirid = _target_mse(p2, Xy_test_hirid)

mse_grid_lgbm_mimic = _target_mse(p1, Xy_test_mimic)
mse_grid_rf_mimic = _target_mse(p2, Xy_test_mimic)

mse_grid_lgbm_miiv = _target_mse(p1, Xy_test_miiv)
mse_grid_rf_miiv = _target_mse(p2, Xy_test_miiv)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.013656 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 12253
[LightGBM] [Info] Number of data points in the train set: 58145, number of used features: 99
[LightGBM] [Info] Start training from score 84.924568
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.014302 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 12228
[LightGBM] [Info] Number of data points in the train set: 58145, number of used features: 99
[LightGBM] [Info] Start training from score 84.928896
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.005306 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 12230
[LightGBM] [Info] Number of data points in the trai

## Compare Performance from Training to Target Data - Parameters chosen via Evaluation on Target Data

Approach:
- Train each model on the training set once for every hyperparameter combination
- Evaluate each trained model on the first n rows of the fine-tuning data from the target set
- Choose the hyperparameters with the lowest MSE on those n rows
- Repeat this for every tuning-set size n (25, 50, 100, 200, 400, 800, 1600)

In [4]:
# Search spaces keyed by the model's own parameter names (no `model__` prefix):
# they are applied directly to the pipeline's 'model' step.
param_grid_lgbm = dict(
    boosting_type=['gbdt'],
    learning_rate=[0.01, 0.1, 0.3],     # gradient learning rate
    n_estimators=[100, 800],            # number of boosting iterations
    num_leaves=[50, 200, 1024],         # tree structure: max. number of leaves per tree (num_leaves < 2^max depth)
    feature_fraction=[0.5, 0.9],        # share of features sampled when training each tree
)

# Random-forest mode: identical search space, only the boosting type changes.
param_grid_rf = {**param_grid_lgbm, 'boosting_type': ['rf']}

# Search space for the anchor regression model.
param_grid_anchor = dict(
    gamma=[1, 10, 10000],
    instrument_regex=['anchor'],
    alpha=[0.00001, 0.001, 0.1],
)

def find_best_parameters(Xy_train, Xy_tuning_data, p, param_grid,
                         sample_sizes=(25, 50, 100, 200, 400, 800, 1600)):
    """Pick, for every tuning-set size, the hyperparameters with the lowest tuning MSE.

    Every combination in ``param_grid`` is applied to the 'model' step of ``p``
    and the pipeline is fitted on ``Xy_train`` (target: its 'outcome' column).
    The fitted pipeline is then scored on the first ``n`` rows of
    ``Xy_tuning_data`` for each ``n`` in ``sample_sizes``.

    The fit does not depend on ``n``, so it is performed once per parameter
    combination and reused for all sizes instead of being repeated per size.

    Parameters
    ----------
    Xy_train : frame holding the training rows, including an 'outcome' column.
    Xy_tuning_data : frame holding the tuning rows, including an 'outcome'
        column; only its first ``n`` rows (``.head(n)``) are used for size ``n``.
    p : pipeline exposing ``named_steps['model']``, ``fit`` and ``predict``.
        It is modified in place and is left fitted with the last combination.
    param_grid : dict mapping a model parameter name to its candidate values.
    sample_sizes : iterable of tuning-set sizes ``n`` to evaluate.

    Returns
    -------
    list of dict
        One ``{'n', 'best_params', 'best_mse'}`` entry per size, in the order of
        ``sample_sizes``. On ties the combination encountered first wins.
        ``best_params`` stays ``None`` if no combination produced an MSE below
        infinity.
    """
    results_for_n = [
        {'n': n, 'best_params': None, 'best_mse': float('inf')}
        for n in sample_sizes
    ]

    # Iterate over all possible combinations of hyperparameters
    for param_set in itertools.product(*param_grid.values()):
        params = dict(zip(param_grid.keys(), param_set))

        p.named_steps['model'].set_params(**params)
        p.fit(Xy_train, Xy_train['outcome'])

        # Score the single fitted pipeline against every tuning-set size.
        for entry in results_for_n:
            n = entry['n']
            y_pred = p.predict(Xy_tuning_data.head(n))
            mse = mean_squared_error(Xy_tuning_data['outcome'].head(n), y_pred)

            if mse < entry['best_mse']:
                entry['best_mse'] = mse
                # Copy so entries for different sizes never share one dict object.
                entry['best_params'] = dict(params)

    return results_for_n

# Every target is searched with the same three (pipeline, grid) pairs, in this order:
# LGBM (p1), RF (p2) and anchor regression (p4).
_searches = ((p1, param_grid_lgbm), (p2, param_grid_rf), (p4, param_grid_anchor))

results_p1_hirid, results_p2_hirid, results_p4_hirid = [
    find_best_parameters(Xy_train, Xy_tuning_hirid, pipeline, grid)
    for pipeline, grid in _searches
]

results_p1_mimic, results_p2_mimic, results_p4_mimic = [
    find_best_parameters(Xy_train, Xy_tuning_mimic, pipeline, grid)
    for pipeline, grid in _searches
]

results_p1_miiv, results_p2_miiv, results_p4_miiv = [
    find_best_parameters(Xy_train, Xy_tuning_miiv, pipeline, grid)
    for pipeline, grid in _searches
]

def calculate_mse(X_train, y_train, X_test, y_test, p, results):
    """Refit ``p`` with each selected parameter set and report its test MSE.

    For every entry of ``results`` the entry's 'best_params' are applied to the
    'model' step of ``p``, the pipeline is fitted on ``(X_train, y_train)`` and
    the mean squared error of its predictions on ``X_test`` against ``y_test``
    is recorded.

    The tuning-set size is read from each entry's own 'n' key rather than from
    a separate hard-coded list matched by position, so ``results`` may have any
    length and labels cannot drift out of sync with the parameters.

    Parameters
    ----------
    X_train, y_train : training inputs and targets passed to ``p.fit``.
    X_test, y_test : evaluation inputs and targets.
    p : pipeline exposing ``named_steps['model']``, ``fit`` and ``predict``;
        it is modified in place.
    results : iterable of dicts with the keys 'n' and 'best_params'.

    Returns
    -------
    list of dict
        One ``{'n', 'mse'}`` entry per element of ``results``, in order.
    """
    mse_for_n = []
    for entry in results:
        p.named_steps['model'].set_params(**entry['best_params'])
        p.fit(X_train, y_train)
        mse = mean_squared_error(y_test, p.predict(X_test))
        mse_for_n.append({'n': entry['n'], 'mse': mse})
    return mse_for_n

def _transfer_mses(Xy_target, tuned):
    """Run calculate_mse (trained on Xy_train) on ``Xy_target`` for each (pipeline, results) pair."""
    return [
        calculate_mse(Xy_train, Xy_train['outcome'], Xy_target, Xy_target['outcome'], pipeline, results)
        for pipeline, results in tuned
    ]


# Test MSE per tuning-set size, for the parameters selected on each target's tuning rows.
mse_eicu_to_hirid_p1, mse_eicu_to_hirid_p2, mse_eicu_to_hirid_p4 = _transfer_mses(
    Xy_test_hirid,
    [(p1, results_p1_hirid), (p2, results_p2_hirid), (p4, results_p4_hirid)],
)

mse_eicu_to_mimic_p1, mse_eicu_to_mimic_p2, mse_eicu_to_mimic_p4 = _transfer_mses(
    Xy_test_mimic,
    [(p1, results_p1_mimic), (p2, results_p2_mimic), (p4, results_p4_mimic)],
)

mse_eicu_to_miiv_p1, mse_eicu_to_miiv_p2, mse_eicu_to_miiv_p4 = _transfer_mses(
    Xy_test_miiv,
    [(p1, results_p1_miiv), (p2, results_p2_miiv), (p4, results_p4_miiv)],
)

# OLS MSE Calculation
p3.fit(Xy_train, Xy_train['outcome'])

# Dummy baseline: predict the training-set mean of the target for every row.
# The mean is taken from the 'outcome' column — the same column every model is
# trained and evaluated on — rather than from the column named by `outcome`.
_train_outcome_mean = Xy_train['outcome'].mean()


def _dummy_mse(y_true):
    """MSE of predicting the training-set mean outcome for every element of ``y_true``."""
    # np.full with an explicit float dtype: np.full_like would inherit the dtype
    # of y_true and truncate the mean if the outcome were stored as integers.
    constant_prediction = np.full(len(y_true), _train_outcome_mean, dtype=float)
    return mean_squared_error(y_true, constant_prediction)


mse_eicu_to_hirid_p3 = mean_squared_error(Xy_test_hirid['outcome'], p3.predict(Xy_test_hirid))
mse_eicu_to_hirid_dummy_prediction = _dummy_mse(Xy_test_hirid['outcome'])

mse_eicu_to_mimic_p3 = mean_squared_error(Xy_test_mimic['outcome'], p3.predict(Xy_test_mimic))
mse_eicu_to_mimic_dummy_prediction = _dummy_mse(Xy_test_mimic['outcome'])

mse_eicu_to_miiv_p3 = mean_squared_error(Xy_test_miiv['outcome'], p3.predict(Xy_test_miiv))
mse_eicu_to_miiv_dummy_prediction = _dummy_mse(Xy_test_miiv['outcome'])

def plotting(mse_p1, mse_p2, mse_p3, mse_p4, mse_baseline1, mse_baseline2, train, target):
    """Plot test MSE against the number of target rows used for parameter selection.

    Parameters
    ----------
    mse_p1, mse_p2, mse_p4 : lists of ``{'n', 'mse'}`` dicts for the LGBM, RF
        and anchor models; each is drawn as a curve. The x values are read from
        each entry's own 'n', so the curves may have any length.
    mse_p3 : scalar MSE drawn as the horizontal 'OLS Baseline' line.
    mse_baseline1, mse_baseline2 : scalar MSEs drawn as the horizontal
        'LGBM Baseline' and 'RF Baseline' lines.
    train : not used by the function; kept so existing calls keep working.
    target : name of the target data set, appended to the title.

    Returns
    -------
    None. The figure is shown with ``plt.show()``.
    """
    # A fresh figure per call, so successive calls never draw onto the same axes.
    _, ax = plt.subplots()

    for series, label in ((mse_p1, 'LGBM'), (mse_p2, 'RF'), (mse_p4, 'Anchor')):
        sizes = [item['n'] for item in series]
        errors = [item['mse'] for item in series]
        ax.plot(sizes, errors, marker='o', linestyle='-', label=label)

    ax.axhline(y=mse_p3, color='black', linestyle='-', label='OLS Baseline')
    ax.axhline(y=mse_baseline1, color='green', linestyle='-', label='LGBM Baseline')
    ax.axhline(y=mse_baseline2, color='purple', linestyle='-', label='RF Baseline')

    ax.set_title(f'Parameters for Model chosen with evaluation on n Data Points from Target Distribution {target}')
    ax.set_xlabel('Number of Data Points (n)')
    ax.set_ylabel('Mean Squared Error (MSE)')
    ax.legend()
    ax.grid(True)
    plt.show()

# One figure per target data set; the tuple order matches plotting()'s leading
# arguments: LGBM curve, RF curve, OLS MSE, anchor curve, LGBM baseline, RF baseline.
_plot_inputs = {
    'Hirid': (mse_eicu_to_hirid_p1, mse_eicu_to_hirid_p2, mse_eicu_to_hirid_p3,
              mse_eicu_to_hirid_p4, mse_grid_lgbm_hirid, mse_grid_rf_hirid),
    'Mimic': (mse_eicu_to_mimic_p1, mse_eicu_to_mimic_p2, mse_eicu_to_mimic_p3,
              mse_eicu_to_mimic_p4, mse_grid_lgbm_mimic, mse_grid_rf_mimic),
    'Miiv': (mse_eicu_to_miiv_p1, mse_eicu_to_miiv_p2, mse_eicu_to_miiv_p3,
             mse_eicu_to_miiv_p4, mse_grid_lgbm_miiv, mse_grid_rf_miiv),
}
for _target_name, _mses in _plot_inputs.items():
    plotting(*_mses, 'Eicu', _target_name)

n = 25:   0%|          | 0/36 [00:00<?, ?it/s]

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004013 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 12251
[LightGBM] [Info] Number of data points in the train set: 72682, number of used features: 99
[LightGBM] [Info] Start training from score 84.940655


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004791 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 12251
[LightGBM] [Info] Number of data points in the train set: 72682, number of used features: 99
[LightGBM] [Info] Start training from score 84.940655
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004146 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 12251
[LightGBM] [Info] Number of data points in the train set: 72682, number of used features: 99
[LightGBM] [Info] Start training from score 84.940655
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004773 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not e

n = 25:   0%|          | 0/36 [06:29<?, ?it/s]






[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004377 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 12251
[LightGBM] [Info] Number of data points in the train set: 72682, number of used features: 99
[LightGBM] [Info] Start training from score 84.940655
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004815 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 12251
[LightGBM] [Info] Number of data points in the train set: 72682, number of used features: 99
[LightGBM] [Info] Start training from score 84.940655
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004096 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not e

n = 50:   0%|          | 0/36 [06:27<?, ?it/s]]






[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004423 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 12251
[LightGBM] [Info] Number of data points in the train set: 72682, number of used features: 99
[LightGBM] [Info] Start training from score 84.940655
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004819 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 12251
[LightGBM] [Info] Number of data points in the train set: 72682, number of used features: 99
[LightGBM] [Info] Start training from score 84.940655
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004098 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not e

n = 100:   0%|          | 0/36 [06:28<?, ?it/s]






[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004143 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 12251
[LightGBM] [Info] Number of data points in the train set: 72682, number of used features: 99
[LightGBM] [Info] Start training from score 84.940655
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004770 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 12251
[LightGBM] [Info] Number of data points in the train set: 72682, number of used features: 99
[LightGBM] [Info] Start training from score 84.940655
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004192 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not e

n = 200:   0%|          | 0/36 [06:31<?, ?it/s]






[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.006467 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 12251
[LightGBM] [Info] Number of data points in the train set: 72682, number of used features: 99
[LightGBM] [Info] Start training from score 84.940655
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.020355 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 12251
[LightGBM] [Info] Number of data points in the train set: 72682, number of used features: 99
[LightGBM] [Info] Start training from score 84.940655
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.006360 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Tota

n = 400:   0%|          | 0/36 [06:28<?, ?it/s]






[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004168 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 12251
[LightGBM] [Info] Number of data points in the train set: 72682, number of used features: 99
[LightGBM] [Info] Start training from score 84.940655
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004792 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 12251
[LightGBM] [Info] Number of data points in the train set: 72682, number of used features: 99
[LightGBM] [Info] Start training from score 84.940655
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004157 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not e

n = 800:   0%|          | 0/36 [06:24<?, ?it/s]]


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004133 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 12251
[LightGBM] [Info] Number of data points in the train set: 72682, number of used features: 99
[LightGBM] [Info] Start training from score 84.940655
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004939 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 12251
[LightGBM] [Info] Number of data points in the train set: 72682, number of used features: 99
[LightGBM] [Info] Start training from score 84.940655
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004142 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not e

n = 1600:   0%|          | 0/36 [06:30<?, ?it/s]




n = 25:   0%|          | 0/36 [00:00<?, ?it/s]

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004199 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 12251
[LightGBM] [Info] Number of data points in the train set: 72682, number of used features: 99
[LightGBM] [Info] Start training from score 84.940655
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004846 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 12251
[LightGBM] [Info] Number of data points in the train set: 72682, number of used features: 99
[LightGBM] [Info] Start training from score 84.940655
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004169 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not e

n = 25:   0%|          | 0/36 [07:43<?, ?it/s]






[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004067 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 12251
[LightGBM] [Info] Number of data points in the train set: 72682, number of used features: 99
[LightGBM] [Info] Start training from score 84.940655
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004848 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 12251
[LightGBM] [Info] Number of data points in the train set: 72682, number of used features: 99
[LightGBM] [Info] Start training from score 84.940655
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004112 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not e

n = 50:   0%|          | 0/36 [07:39<?, ?it/s]]






[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004290 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 12251
[LightGBM] [Info] Number of data points in the train set: 72682, number of used features: 99
[LightGBM] [Info] Start training from score 84.940655
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004874 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 12251
[LightGBM] [Info] Number of data points in the train set: 72682, number of used features: 99
[LightGBM] [Info] Start training from score 84.940655
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004156 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not e

n = 100:   0%|          | 0/36 [07:38<?, ?it/s]






[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004546 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 12251
[LightGBM] [Info] Number of data points in the train set: 72682, number of used features: 99
[LightGBM] [Info] Start training from score 84.940655
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004806 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 12251
[LightGBM] [Info] Number of data points in the train set: 72682, number of used features: 99
[LightGBM] [Info] Start training from score 84.940655
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004187 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not e

n = 200:   0%|          | 0/36 [07:30<?, ?it/s]






[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004583 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 12251
[LightGBM] [Info] Number of data points in the train set: 72682, number of used features: 99
[LightGBM] [Info] Start training from score 84.940655
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.023178 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 12251
[LightGBM] [Info] Number of data points in the train set: 72682, number of used features: 99
[LightGBM] [Info] Start training from score 84.940655
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.006187 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Tota

n = 400:   0%|          | 0/36 [07:40<?, ?it/s]






[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004483 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 12251
[LightGBM] [Info] Number of data points in the train set: 72682, number of used features: 99
[LightGBM] [Info] Start training from score 84.940655
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004748 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 12251
[LightGBM] [Info] Number of data points in the train set: 72682, number of used features: 99
[LightGBM] [Info] Start training from score 84.940655
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004162 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not e

n = 800:   0%|          | 0/36 [07:39<?, ?it/s]]


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.018752 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 12251
[LightGBM] [Info] Number of data points in the train set: 72682, number of used features: 99
[LightGBM] [Info] Start training from score 84.940655
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.022321 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 12251
[LightGBM] [Info] Number of data points in the train set: 72682, number of used features: 99
[LightGBM] [Info] Start training from score 84.940655
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.019391 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 12251
[LightGBM] [Info] Number of data points in the train set: 72682, number of used features: 99
[LightGBM] [Info] Star

n = 1600:   0%|          | 0/36 [07:28<?, ?it/s]




  if pd.api.types.is_categorical_dtype(dtype) and (column in df)
  if pd.api.types.is_categorical_dtype(dtype) and (column in df)
  if pd.api.types.is_categorical_dtype(dtype) and (column in df)
  if pd.api.types.is_categorical_dtype(dtype) and (column in df)
  if pd.api.types.is_categorical_dtype(dtype) and (column in df)
  if pd.api.types.is_categorical_dtype(dtype) and (column in df)
  if pd.api.types.is_categorical_dtype(dtype) and (column in df)
  if pd.api.types.is_categorical_dtype(dtype) and (column in df)
  if pd.api.types.is_categorical_dtype(dtype) and (column in df)
n = 25:   0%|          | 0/9 [00:21<?, ?it/s]
  if pd.api.types.is_categorical_dtype(dtype) and (column in df)
  if pd.api.types.is_categorical_dtype(dtype) and (column in df)
  if pd.api.types.is_categorical_dtype(dtype) and (column in df)
  if pd.api.types.is_categorical_dtype(dtype) and (column in df)
  if pd.api.types.is_categorical_dtype(dtype) and (column in df)
  if pd.api.types.is_categorical_dtype(dtype

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.005989 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 12251
[LightGBM] [Info] Number of data points in the train set: 72682, number of used features: 99
[LightGBM] [Info] Start training from score 84.940655
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.017556 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 12251
[LightGBM] [Info] Number of data points in the train set: 72682, number of used features: 99
[LightGBM] [Info] Start training from score 84.940655
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.006765 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Tota

n = 25:   0%|          | 0/36 [06:29<?, ?it/s]






[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004597 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 12251
[LightGBM] [Info] Number of data points in the train set: 72682, number of used features: 99
[LightGBM] [Info] Start training from score 84.940655
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004964 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 12251
[LightGBM] [Info] Number of data points in the train set: 72682, number of used features: 99
[LightGBM] [Info] Start training from score 84.940655
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.005900 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not e

n = 50:   0%|          | 0/36 [06:28<?, ?it/s]]






[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004229 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 12251
[LightGBM] [Info] Number of data points in the train set: 72682, number of used features: 99
[LightGBM] [Info] Start training from score 84.940655
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004944 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 12251
[LightGBM] [Info] Number of data points in the train set: 72682, number of used features: 99
[LightGBM] [Info] Start training from score 84.940655
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004172 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not e

n = 100:   0%|          | 0/36 [06:26<?, ?it/s]






[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004534 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 12251
[LightGBM] [Info] Number of data points in the train set: 72682, number of used features: 99
[LightGBM] [Info] Start training from score 84.940655
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004830 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 12251
[LightGBM] [Info] Number of data points in the train set: 72682, number of used features: 99
[LightGBM] [Info] Start training from score 84.940655
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004146 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not e

n = 200:   0%|          | 0/36 [06:29<?, ?it/s]






[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.006483 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 12251
[LightGBM] [Info] Number of data points in the train set: 72682, number of used features: 99
[LightGBM] [Info] Start training from score 84.940655
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004831 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 12251
[LightGBM] [Info] Number of data points in the train set: 72682, number of used features: 99
[LightGBM] [Info] Start training from score 84.940655
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004160 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not e

n = 400:   0%|          | 0/36 [06:27<?, ?it/s]






[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004096 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 12251
[LightGBM] [Info] Number of data points in the train set: 72682, number of used features: 99
[LightGBM] [Info] Start training from score 84.940655
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004791 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 12251
[LightGBM] [Info] Number of data points in the train set: 72682, number of used features: 99
[LightGBM] [Info] Start training from score 84.940655
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004121 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not e

n = 800:   0%|          | 0/36 [06:29<?, ?it/s]]


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004132 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 12251
[LightGBM] [Info] Number of data points in the train set: 72682, number of used features: 99
[LightGBM] [Info] Start training from score 84.940655
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004821 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 12251
[LightGBM] [Info] Number of data points in the train set: 72682, number of used features: 99
[LightGBM] [Info] Start training from score 84.940655
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004072 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not e

n = 1600:   0%|          | 0/36 [06:25<?, ?it/s]




n = 25:   0%|          | 0/36 [00:00<?, ?it/s]

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004190 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 12251
[LightGBM] [Info] Number of data points in the train set: 72682, number of used features: 99
[LightGBM] [Info] Start training from score 84.940655
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004773 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 12251
[LightGBM] [Info] Number of data points in the train set: 72682, number of used features: 99
[LightGBM] [Info] Start training from score 84.940655
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004277 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not e

n = 25:   0%|          | 0/36 [07:34<?, ?it/s]






[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004166 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 12251
[LightGBM] [Info] Number of data points in the train set: 72682, number of used features: 99
[LightGBM] [Info] Start training from score 84.940655
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004855 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 12251
[LightGBM] [Info] Number of data points in the train set: 72682, number of used features: 99
[LightGBM] [Info] Start training from score 84.940655
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004202 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not e

n = 50:   0%|          | 0/36 [07:39<?, ?it/s]]






[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.006071 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 12251
[LightGBM] [Info] Number of data points in the train set: 72682, number of used features: 99
[LightGBM] [Info] Start training from score 84.940655
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004835 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 12251
[LightGBM] [Info] Number of data points in the train set: 72682, number of used features: 99
[LightGBM] [Info] Start training from score 84.940655
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004157 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not e

n = 100:   0%|          | 0/36 [07:33<?, ?it/s]






[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004701 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 12251
[LightGBM] [Info] Number of data points in the train set: 72682, number of used features: 99
[LightGBM] [Info] Start training from score 84.940655
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004825 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 12251
[LightGBM] [Info] Number of data points in the train set: 72682, number of used features: 99
[LightGBM] [Info] Start training from score 84.940655
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004127 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not e

n = 200:   0%|          | 0/36 [07:40<?, ?it/s]






[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004256 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 12251
[LightGBM] [Info] Number of data points in the train set: 72682, number of used features: 99
[LightGBM] [Info] Start training from score 84.940655
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004779 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 12251
[LightGBM] [Info] Number of data points in the train set: 72682, number of used features: 99
[LightGBM] [Info] Start training from score 84.940655
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004149 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not e

n = 400:   0%|          | 0/36 [07:33<?, ?it/s]






[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004373 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 12251
[LightGBM] [Info] Number of data points in the train set: 72682, number of used features: 99
[LightGBM] [Info] Start training from score 84.940655
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004844 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 12251
[LightGBM] [Info] Number of data points in the train set: 72682, number of used features: 99
[LightGBM] [Info] Start training from score 84.940655
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004054 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not e

n = 800:   0%|          | 0/36 [07:32<?, ?it/s]]






[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.014442 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 12251
[LightGBM] [Info] Number of data points in the train set: 72682, number of used features: 99
[LightGBM] [Info] Start training from score 84.940655
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004806 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 12251
[LightGBM] [Info] Number of data points in the train set: 72682, number of used features: 99
[LightGBM] [Info] Start training from score 84.940655
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004157 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Tota

n = 1600:   0%|          | 0/36 [07:31<?, ?it/s]




  if pd.api.types.is_categorical_dtype(dtype) and (column in df)
  if pd.api.types.is_categorical_dtype(dtype) and (column in df)
  if pd.api.types.is_categorical_dtype(dtype) and (column in df)
  if pd.api.types.is_categorical_dtype(dtype) and (column in df)
  if pd.api.types.is_categorical_dtype(dtype) and (column in df)
  if pd.api.types.is_categorical_dtype(dtype) and (column in df)
  if pd.api.types.is_categorical_dtype(dtype) and (column in df)
  if pd.api.types.is_categorical_dtype(dtype) and (column in df)
  if pd.api.types.is_categorical_dtype(dtype) and (column in df)
n = 25:   0%|          | 0/9 [00:21<?, ?it/s]
  if pd.api.types.is_categorical_dtype(dtype) and (column in df)
  if pd.api.types.is_categorical_dtype(dtype) and (column in df)
  if pd.api.types.is_categorical_dtype(dtype) and (column in df)
  if pd.api.types.is_categorical_dtype(dtype) and (column in df)
  if pd.api.types.is_categorical_dtype(dtype) and (column in df)
  if pd.api.types.is_categorical_dtype(dtype

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.006106 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 12251
[LightGBM] [Info] Number of data points in the train set: 72682, number of used features: 99
[LightGBM] [Info] Start training from score 84.940655
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004820 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 12251
[LightGBM] [Info] Number of data points in the train set: 72682, number of used features: 99
[LightGBM] [Info] Start training from score 84.940655
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004146 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not e

n = 25:   0%|          | 0/36 [06:20<?, ?it/s]






[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004169 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 12251
[LightGBM] [Info] Number of data points in the train set: 72682, number of used features: 99
[LightGBM] [Info] Start training from score 84.940655
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004999 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 12251
[LightGBM] [Info] Number of data points in the train set: 72682, number of used features: 99
[LightGBM] [Info] Start training from score 84.940655
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004123 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not e

n = 50:   0%|          | 0/36 [06:26<?, ?it/s]]






[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004526 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 12251
[LightGBM] [Info] Number of data points in the train set: 72682, number of used features: 99
[LightGBM] [Info] Start training from score 84.940655
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004757 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 12251
[LightGBM] [Info] Number of data points in the train set: 72682, number of used features: 99
[LightGBM] [Info] Start training from score 84.940655
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004184 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not e

n = 100:   0%|          | 0/36 [06:28<?, ?it/s]






[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004166 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 12251
[LightGBM] [Info] Number of data points in the train set: 72682, number of used features: 99
[LightGBM] [Info] Start training from score 84.940655
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004817 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 12251
[LightGBM] [Info] Number of data points in the train set: 72682, number of used features: 99
[LightGBM] [Info] Start training from score 84.940655
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004095 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not e

n = 200:   0%|          | 0/36 [06:28<?, ?it/s]






[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004163 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 12251
[LightGBM] [Info] Number of data points in the train set: 72682, number of used features: 99
[LightGBM] [Info] Start training from score 84.940655
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.005842 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 12251
[LightGBM] [Info] Number of data points in the train set: 72682, number of used features: 99
[LightGBM] [Info] Start training from score 84.940655
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004235 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not e

n = 400:   0%|          | 0/36 [06:28<?, ?it/s]






[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.014607 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 12251
[LightGBM] [Info] Number of data points in the train set: 72682, number of used features: 99
[LightGBM] [Info] Start training from score 84.940655
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004831 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 12251
[LightGBM] [Info] Number of data points in the train set: 72682, number of used features: 99
[LightGBM] [Info] Start training from score 84.940655
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004125 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Tota

n = 800:   0%|          | 0/36 [06:29<?, ?it/s]]


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.005529 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 12251
[LightGBM] [Info] Number of data points in the train set: 72682, number of used features: 99
[LightGBM] [Info] Start training from score 84.940655
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004732 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 12251
[LightGBM] [Info] Number of data points in the train set: 72682, number of used features: 99
[LightGBM] [Info] Start training from score 84.940655
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004025 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not e

n = 1600:   0%|          | 0/36 [06:28<?, ?it/s]




n = 25:   0%|          | 0/36 [00:00<?, ?it/s]

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004236 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 12251
[LightGBM] [Info] Number of data points in the train set: 72682, number of used features: 99
[LightGBM] [Info] Start training from score 84.940655
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004853 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 12251
[LightGBM] [Info] Number of data points in the train set: 72682, number of used features: 99
[LightGBM] [Info] Start training from score 84.940655
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004290 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not e

n = 25:   0%|          | 0/36 [07:38<?, ?it/s]






[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004136 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 12251
[LightGBM] [Info] Number of data points in the train set: 72682, number of used features: 99
[LightGBM] [Info] Start training from score 84.940655
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004795 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 12251
[LightGBM] [Info] Number of data points in the train set: 72682, number of used features: 99
[LightGBM] [Info] Start training from score 84.940655
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004176 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not e

n = 50:   0%|          | 0/36 [07:33<?, ?it/s]]






[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004777 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 12251
[LightGBM] [Info] Number of data points in the train set: 72682, number of used features: 99
[LightGBM] [Info] Start training from score 84.940655
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004807 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 12251
[LightGBM] [Info] Number of data points in the train set: 72682, number of used features: 99
[LightGBM] [Info] Start training from score 84.940655
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004227 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not e

n = 100:   0%|          | 0/36 [07:37<?, ?it/s]






[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004612 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 12251
[LightGBM] [Info] Number of data points in the train set: 72682, number of used features: 99
[LightGBM] [Info] Start training from score 84.940655
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004828 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 12251
[LightGBM] [Info] Number of data points in the train set: 72682, number of used features: 99
[LightGBM] [Info] Start training from score 84.940655
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004122 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not e

n = 200:   0%|          | 0/36 [07:33<?, ?it/s]






[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004149 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 12251
[LightGBM] [Info] Number of data points in the train set: 72682, number of used features: 99
[LightGBM] [Info] Start training from score 84.940655
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004853 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 12251
[LightGBM] [Info] Number of data points in the train set: 72682, number of used features: 99
[LightGBM] [Info] Start training from score 84.940655
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004163 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not e

n = 400:   0%|          | 0/36 [07:37<?, ?it/s]






[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004191 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 12251
[LightGBM] [Info] Number of data points in the train set: 72682, number of used features: 99
[LightGBM] [Info] Start training from score 84.940655
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004863 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 12251
[LightGBM] [Info] Number of data points in the train set: 72682, number of used features: 99
[LightGBM] [Info] Start training from score 84.940655
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004178 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not e

n = 800:   0%|          | 0/36 [07:39<?, ?it/s]]






[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004558 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 12251
[LightGBM] [Info] Number of data points in the train set: 72682, number of used features: 99
[LightGBM] [Info] Start training from score 84.940655
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004887 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 12251
[LightGBM] [Info] Number of data points in the train set: 72682, number of used features: 99
[LightGBM] [Info] Start training from score 84.940655
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004132 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not e

n = 1600:   0%|          | 0/36 [07:39<?, ?it/s]




  if pd.api.types.is_categorical_dtype(dtype) and (column in df)
  if pd.api.types.is_categorical_dtype(dtype) and (column in df)
  if pd.api.types.is_categorical_dtype(dtype) and (column in df)
  if pd.api.types.is_categorical_dtype(dtype) and (column in df)
  if pd.api.types.is_categorical_dtype(dtype) and (column in df)
  if pd.api.types.is_categorical_dtype(dtype) and (column in df)
  if pd.api.types.is_categorical_dtype(dtype) and (column in df)
  if pd.api.types.is_categorical_dtype(dtype) and (column in df)
  if pd.api.types.is_categorical_dtype(dtype) and (column in df)
n = 25:   0%|          | 0/9 [00:21<?, ?it/s]
  if pd.api.types.is_categorical_dtype(dtype) and (column in df)
  if pd.api.types.is_categorical_dtype(dtype) and (column in df)
  if pd.api.types.is_categorical_dtype(dtype) and (column in df)
  if pd.api.types.is_categorical_dtype(dtype) and (column in df)
  if pd.api.types.is_categorical_dtype(dtype) and (column in df)
  if pd.api.types.is_categorical_dtype(dtype

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004798 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 12251
[LightGBM] [Info] Number of data points in the train set: 72682, number of used features: 99
[LightGBM] [Info] Start training from score 84.940655
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004107 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 12251
[LightGBM] [Info] Number of data points in the train set: 72682, number of used features: 99
[LightGBM] [Info] Start training from score 84.940655
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004792 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not e

  if pd.api.types.is_categorical_dtype(dtype) and (column in df)
  if pd.api.types.is_categorical_dtype(dtype) and (column in df)
  if pd.api.types.is_categorical_dtype(dtype) and (column in df)
  if pd.api.types.is_categorical_dtype(dtype) and (column in df)
  if pd.api.types.is_categorical_dtype(dtype) and (column in df)
  if pd.api.types.is_categorical_dtype(dtype) and (column in df)
  if pd.api.types.is_categorical_dtype(dtype) and (column in df)


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.008024 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 12251
[LightGBM] [Info] Number of data points in the train set: 72682, number of used features: 99
[LightGBM] [Info] Start training from score 84.940655
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004716 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 12251
[LightGBM] [Info] Number of data points in the train set: 72682, number of used features: 99
[LightGBM] [Info] Start training from score 84.940655
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.005089 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not e

  if pd.api.types.is_categorical_dtype(dtype) and (column in df)
  if pd.api.types.is_categorical_dtype(dtype) and (column in df)
  if pd.api.types.is_categorical_dtype(dtype) and (column in df)
  if pd.api.types.is_categorical_dtype(dtype) and (column in df)
  if pd.api.types.is_categorical_dtype(dtype) and (column in df)
  if pd.api.types.is_categorical_dtype(dtype) and (column in df)
  if pd.api.types.is_categorical_dtype(dtype) and (column in df)


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004772 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 12251
[LightGBM] [Info] Number of data points in the train set: 72682, number of used features: 99
[LightGBM] [Info] Start training from score 84.940655
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.005003 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 12251
[LightGBM] [Info] Number of data points in the train set: 72682, number of used features: 99
[LightGBM] [Info] Start training from score 84.940655
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004129 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not e

  if pd.api.types.is_categorical_dtype(dtype) and (column in df)
  if pd.api.types.is_categorical_dtype(dtype) and (column in df)
  if pd.api.types.is_categorical_dtype(dtype) and (column in df)
  if pd.api.types.is_categorical_dtype(dtype) and (column in df)
  if pd.api.types.is_categorical_dtype(dtype) and (column in df)
  if pd.api.types.is_categorical_dtype(dtype) and (column in df)
  if pd.api.types.is_categorical_dtype(dtype) and (column in df)


## Store results

In [12]:
import pickle
from pathlib import Path

# Search results produced by earlier cells, collected under one dictionary.
# Keys follow "<pipeline>_<target>"; the "*_baseline" entries hold the two
# grid-search objects/params computed before the per-target experiments.
# NOTE(review): 'p2_baseline' is fed from `grid_search_params_rf` although p2
# is declared as an LGBMRegressor pipeline at the top of the notebook - confirm
# the variable name is just a leftover and not the wrong object.
results_dict = {
    'p1_baseline': grid_search_params_lgbm,
    'p2_baseline': grid_search_params_rf,
    'p1_hirid': results_p1_hirid,
    'p2_hirid': results_p2_hirid,
    'p4_hirid': results_p4_hirid,
    'p1_mimic': results_p1_mimic,
    'p2_mimic': results_p2_mimic,
    'p4_mimic': results_p4_mimic,
    'p1_miiv': results_p1_miiv,
    'p2_miiv': results_p2_miiv,
    'p4_miiv': results_p4_miiv,
}

# MSE values computed by earlier cells, keyed "eicu_to_<target>_<pipeline>".
# The "dummy_prediction" entries come last, matching the original key order.
mse_dict = {
    'eicu_to_hirid_p1_baseline': mse_grid_lgbm_hirid,
    'eicu_to_hirid_p1': mse_eicu_to_hirid_p1,
    'eicu_to_hirid_p2': mse_eicu_to_hirid_p2,
    'eicu_to_hirid_p3': mse_eicu_to_hirid_p3,
    'eicu_to_hirid_p4': mse_eicu_to_hirid_p4,
    'eicu_to_mimic_p1_baseline': mse_grid_lgbm_mimic,
    'eicu_to_mimic_p1': mse_eicu_to_mimic_p1,
    'eicu_to_mimic_p2': mse_eicu_to_mimic_p2,
    'eicu_to_mimic_p3': mse_eicu_to_mimic_p3,
    'eicu_to_mimic_p4': mse_eicu_to_mimic_p4,
    'eicu_to_miiv_p1_baseline': mse_grid_lgbm_miiv,
    'eicu_to_miiv_p1': mse_eicu_to_miiv_p1,
    'eicu_to_miiv_p2': mse_eicu_to_miiv_p2,
    'eicu_to_miiv_p3': mse_eicu_to_miiv_p3,
    'eicu_to_miiv_p4': mse_eicu_to_miiv_p4,
    'eicu_to_hirid_dummy_prediction': mse_eicu_to_hirid_dummy_prediction,
    'eicu_to_mimic_dummy_prediction': mse_eicu_to_mimic_dummy_prediction,
    'eicu_to_miiv_dummy_prediction': mse_eicu_to_miiv_dummy_prediction,
}

# Define the file paths to save the dictionaries (kept as plain strings so the
# confirmation messages below print exactly these paths).
results_file_path = 'Woche5/parameters_dict.pkl'
mse_file_path = 'Woche5/mse_dict.pkl'

# Persist both dictionaries. The output directory is created first so that a
# missing folder cannot throw away the results of the long-running searches.
for target_path, payload in (
    (results_file_path, results_dict),
    (mse_file_path, mse_dict),
):
    Path(target_path).parent.mkdir(parents=True, exist_ok=True)
    with open(target_path, 'wb') as pickle_file:
        pickle.dump(payload, pickle_file)

# Confirm where the files were written.
print(f"Results saved to {results_file_path}")
print(f"MSE values saved to {mse_file_path}")

Results saved to Woche5/parameters_dict.pkl
MSE values saved to Woche5/mse_dict.pkl


## Custom Anchor

In [None]:
class CustomizedAnchor(BaseEstimator, RegressorMixin):
    """Two-stage regressor: an AnchorRegression plus an LGBMRegressor on its residuals.

    ``fit`` trains an ``AnchorRegression`` on ``(X, y)`` and then trains an
    ``LGBMRegressor`` on the same ``X`` against the anchor model's training
    residuals. ``predict`` returns the sum of both models' predictions.

    Parameters
    ----------
    anchor_params : dict or None, default=None
        Keyword arguments forwarded to ``AnchorRegression``. ``None`` means
        "no keyword arguments".
    lgbm_params : dict or None, default=None
        Keyword arguments forwarded to ``LGBMRegressor``. ``None`` means
        "no keyword arguments".
    """

    def __init__(self, anchor_params=None, lgbm_params=None):
        # Store the constructor arguments untouched (scikit-learn convention, so
        # that get_params/set_params/clone round-trip). The None -> {} default is
        # resolved in fit(), which also makes set_params(anchor_params=None) safe.
        self.anchor_params = anchor_params
        self.lgbm_params = lgbm_params

    def fit(self, X, y):
        """Fit the anchor model on (X, y), then the booster on the residuals.

        Returns
        -------
        self
        """
        anchor_kwargs = {} if self.anchor_params is None else self.anchor_params
        lgbm_kwargs = {} if self.lgbm_params is None else self.lgbm_params

        # Stage 1: anchor regression on the raw target
        self.anchor_model = AnchorRegression(**anchor_kwargs)
        self.anchor_model.fit(X, y)

        # What the anchor model could not explain on the training data
        residuals = y - self.anchor_model.predict(X)

        # Stage 2: boosting on the residuals.
        # NOTE(review): the booster is fitted on the full X, i.e. including any
        # anchor columns produced by the preprocessing - confirm this is intended.
        self.lgbm_model = LGBMRegressor(**lgbm_kwargs)
        self.lgbm_model.fit(X, residuals)

        return self

    def predict(self, X):
        """Return anchor prediction plus residual (LGBM) prediction for X.

        Raises
        ------
        AttributeError
            If ``fit`` has not been called yet.
        """
        # Check if fit has been called
        if not hasattr(self, 'anchor_model') or not hasattr(self, 'lgbm_model'):
            raise AttributeError("Models have not been fitted. Call fit() first.")

        anchor_predictions = self.anchor_model.predict(X)
        lgbm_predictions = self.lgbm_model.predict(X)

        # Combine predictions
        return anchor_predictions + lgbm_predictions
    
# Pipeline p5: anchor-aware preprocessing followed by the two-stage CustomizedAnchor model.
p5 = Pipeline(
    steps=[
        ('preprocessing', anchor_preprocessor),
        ('model', CustomizedAnchor()),
    ]
)


def find_custom_parameters(Xy_train, Xy_tuning_data, p, param1_grid, param2_grid,
                           n_values=(25, 50, 100, 200, 400, 800, 1600)):
    """Select, for each tuning-set size n, the parameter pair with the lowest tuning MSE.

    Every combination of ``param1_grid`` x ``param2_grid`` is set on the
    ``'model'`` step of ``p`` (as ``anchor_params`` / ``lgbm_params``), the
    pipeline is fitted on ``Xy_train`` once, and the fitted pipeline is scored
    on the first ``n`` rows of ``Xy_tuning_data`` for every ``n``. The fit does
    not depend on ``n``, so it is not repeated per ``n``.

    Parameters
    ----------
    Xy_train : training data; must support ``Xy_train['outcome']``.
    Xy_tuning_data : tuning data; must support ``.head(n)`` and ``['outcome']``.
    p : pipeline with a ``named_steps['model']`` that accepts
        ``set_params(anchor_params=..., lgbm_params=...)``.
    param1_grid : dict mapping parameter name -> list of candidate values
        (passed to the model as ``anchor_params``).
    param2_grid : dict mapping parameter name -> list of candidate values
        (passed to the model as ``lgbm_params``).
    n_values : iterable of int, optional
        Tuning-set sizes to evaluate. Defaults to the sizes used throughout
        this notebook.

    Returns
    -------
    list of dict
        One entry per ``n`` (in the order of ``n_values``) with the keys
        ``'n'``, ``'best_params set 1'``, ``'best_params set 2'`` and
        ``'best_mse'``. Ties keep the first combination encountered.
    """
    results_for_n = [
        {'n': n, 'best_params set 1': None, 'best_params set 2': None, 'best_mse': float('inf')}
        for n in n_values
    ]

    # Same visiting order as two nested loops: param1 outer, param2 inner.
    combinations = list(itertools.product(
        itertools.product(*param1_grid.values()),
        itertools.product(*param2_grid.values()),
    ))

    for param1_set, param2_set in tqdm(combinations, desc="parameter combinations"):
        param1 = dict(zip(param1_grid.keys(), param1_set))
        param2 = dict(zip(param2_grid.keys(), param2_set))

        p.named_steps['model'].set_params(anchor_params=param1, lgbm_params=param2)
        p.fit(Xy_train, Xy_train['outcome'])

        # Score the single fitted pipeline on every tuning-set size.
        for entry in results_for_n:
            n = entry['n']
            y_pred = p.predict(Xy_tuning_data.head(n))
            mse = mean_squared_error(Xy_tuning_data['outcome'].head(n), y_pred)

            if mse < entry['best_mse']:
                entry['best_mse'] = mse
                entry['best_params set 1'] = param1
                entry['best_params set 2'] = param2

    return results_for_n

# Search space for the LightGBM stage (fitted on the anchor residuals).
# NOTE(review): this rebinds param_grid_lgbm / param_grid_anchor; any cell executed
# afterwards that uses these names sees the grids defined here - confirm intended.
param_grid_lgbm = dict(
    boosting_type=['gbdt'],
    learning_rate=[0.01, 0.1, 0.3],  # gradient learning rate
    n_estimators=[100, 800],  # number of boosting iterations
    num_leaves=[50, 1024],  # tree structure: max. number of leaves per tree (num_leaves < 2^max depth)
    feature_fraction=[0.5, 0.9],  # fraction of features sampled when training each tree
)

# Search space for the AnchorRegression stage.
param_grid_anchor = dict(
    gamma=[1, 10, 10000],
    instrument_regex=['anchor'],
    alpha=[0.00001, 0.001, 0.1],
)

# Per tuning-set size n, pick the best (anchor, lgbm) parameter pair on the HiRID tuning data.
results_p5_hirid = find_custom_parameters(
    Xy_train,
    Xy_tuning_hirid,
    p5,
    param_grid_anchor,
    param_grid_lgbm,
)
#results_p5_mimic = find_custom_parameters(Xy_train, Xy_tuning_mimic, p5, param_grid_anchor, param_grid_lgbm)
#results_p5_miiv = find_custom_parameters(Xy_train, Xy_tuning_miiv, p5, param_grid_anchor, param_grid_lgbm)

In [None]:
def calculate_mse(X_train, y_train, X_test, y_test, p, results):
    """Refit ``p`` with each selected parameter pair and report its MSE on the test data.

    Parameters
    ----------
    X_train, y_train : training data passed to ``p.fit``.
    X_test, y_test : evaluation data; ``p.predict(X_test)`` is compared with ``y_test``.
    p : pipeline whose ``named_steps['model']`` accepts
        ``set_params(anchor_params=..., lgbm_params=...)``.
    results : list of dict
        Entries with the keys ``'n'``, ``'best_params set 1'`` and
        ``'best_params set 2'``, as produced by ``find_custom_parameters``.

    Returns
    -------
    list of dict
        ``{'n': ..., 'mse': ...}`` for every entry of ``results``, in order.
        ``'n'`` is taken from the entry itself, so labels always match the
        parameters that were evaluated and ``results`` may have any length.

    Notes
    -----
    NOTE(review): this definition replaces any ``calculate_mse`` defined earlier
    in the notebook. It only works for models that take ``anchor_params`` /
    ``lgbm_params``; later cells that call ``calculate_mse`` with p1/p2/p4 and
    single-grid results need the earlier variant - verify.
    """
    mse_for_n = []
    for entry in results:
        p.named_steps['model'].set_params(
            anchor_params=entry['best_params set 1'],
            lgbm_params=entry['best_params set 2'],
        )
        p.fit(X_train, y_train)
        y_pred = p.predict(X_test)
        mse = mean_squared_error(y_test, y_pred)
        mse_for_n.append({'n': entry['n'], 'mse': mse})
    return mse_for_n

# Evaluate p5 on the HiRID test set with the parameters selected per tuning-set size n.
mse_eicu_to_hirid_p5 = calculate_mse(
    Xy_train,
    Xy_train['outcome'],
    Xy_test_hirid,
    Xy_test_hirid['outcome'],
    p5,
    results_p5_hirid,
)
#mse_eicu_to_mimic_p5 = calculate_mse(Xy_train, Xy_train['outcome'], Xy_test_mimic, Xy_test_mimic['outcome'], p5, results_p5_mimic)
# The MIIV variant must be evaluated on the MIIV test set (it previously pointed at Xy_test_mimic).
#mse_eicu_to_miiv_p5 = calculate_mse(Xy_train, Xy_train['outcome'], Xy_test_miiv, Xy_test_miiv['outcome'], p5, results_p5_miiv)

  if pd.api.types.is_categorical_dtype(dtype) and (column in df)


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.026594 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 12611
[LightGBM] [Info] Number of data points in the train set: 72682, number of used features: 279
[LightGBM] [Info] Start training from score -0.000000


  if pd.api.types.is_categorical_dtype(dtype) and (column in df)




  if pd.api.types.is_categorical_dtype(dtype) and (column in df)


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.031160 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 12611
[LightGBM] [Info] Number of data points in the train set: 72682, number of used features: 279
[LightGBM] [Info] Start training from score -0.000000


  if pd.api.types.is_categorical_dtype(dtype) and (column in df)




  if pd.api.types.is_categorical_dtype(dtype) and (column in df)


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.027669 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 12611
[LightGBM] [Info] Number of data points in the train set: 72682, number of used features: 279
[LightGBM] [Info] Start training from score -0.000000


  if pd.api.types.is_categorical_dtype(dtype) and (column in df)




  if pd.api.types.is_categorical_dtype(dtype) and (column in df)


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.028505 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 12611
[LightGBM] [Info] Number of data points in the train set: 72682, number of used features: 279
[LightGBM] [Info] Start training from score -0.000000


  if pd.api.types.is_categorical_dtype(dtype) and (column in df)




  if pd.api.types.is_categorical_dtype(dtype) and (column in df)


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.030820 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 12611
[LightGBM] [Info] Number of data points in the train set: 72682, number of used features: 279
[LightGBM] [Info] Start training from score -0.000000


  if pd.api.types.is_categorical_dtype(dtype) and (column in df)




  if pd.api.types.is_categorical_dtype(dtype) and (column in df)


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.040782 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 12611
[LightGBM] [Info] Number of data points in the train set: 72682, number of used features: 279
[LightGBM] [Info] Start training from score -0.000000


  if pd.api.types.is_categorical_dtype(dtype) and (column in df)




  if pd.api.types.is_categorical_dtype(dtype) and (column in df)


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.044403 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 12611
[LightGBM] [Info] Number of data points in the train set: 72682, number of used features: 279
[LightGBM] [Info] Start training from score -0.000000


  if pd.api.types.is_categorical_dtype(dtype) and (column in df)


# Observations eICU to X

```markdown
The hyperparameters were selected to minimize the Mean Squared Error (MSE) on a fine-tuning dataset drawn from the target distribution. We consider fine-tuning sets of several sizes: n = 25, 50, 100, 200, 400, 800, and 1600 data points from the target distribution. Initially, we randomly selected 1600 data points from the target data and named them Xy_tuning_data; this set is distinct from the final evaluation dataset, called Xy_test_new, which is used to compute the plotted MSE after model training.
```
**Evaluation Process:**
```markdown
Our evaluation process follows these steps:

1. For each combination of the parameters, we train the model on the training data.
2. Next, we calculate the MSE on the fine-tuning data from the target distribution.
3. For each n value, we select the parameter combination that minimizes the MSE on the fine-tuning data.

We have four distinct pipelines for our models:

- LGBM pipeline: p1
- Random Forest pipeline: p2
- OLS pipeline: p3
- Anchor pipeline: p4

For OLS, we follow a slightly different approach. We train the model on the training data and evaluate it directly on the target data.

In a subsequent step, we repeat the parameter selection process on the training data and calculate the MSE on the target data - we call this approach the Baseline. The plot displays the model's performance along with the Baseline.
```

**Model Performance:**
```markdown
Interestingly, none of the models managed to substantially outperform the Baselines on any dataset.

eICU --> Hirid:
- p1/LGBM: 
    - Overall, LGBM improves its precision with increasing n; however, it still does not reach the precision of the Baseline
    - Similar to RF, it drastically restricts the number of leaves
- p2/RF: 
    - The same parameters have been chosen every time
    - Its parameters coincide with those chosen by GridCV for n < 1600
    - Its performance decreases when allowing 1600 fine-tuning data points, i.e. as soon as the distributional shift becomes noticeable
    - Interestingly, it chooses a small number of leaves compared to the size of the available fine-tuning dataset
- p4/Anchor and p3/OLS:
    - Surprisingly, Anchor fails to identify a significant distributional shift: it consistently chooses gamma = 1, i.e. it coincides with OLS
    - Unsurprisingly, it performs almost identically to OLS; only the regularization influences the performance
    - The more fine-tuning data we allow, the less regularization it chooses
    - Not able to beat the LGBM Baseline

eICU --> Mimic:
- p1/LGBM: 
    - Overall, LGBM improves its precision with increasing n, and it consistently beats its Baseline
    - It is able to improve its performance significantly by restricting itself to a small number of leaves
- p2/RF: 
    - The same parameters have been chosen for every n 
    - Its parameters coincide with those chosen by GridCV, and so does its performance
    - It does not seem to notice a distributional shift
    - Interestingly, it chooses a small number of leaves compared to the size of the available fine-tuning dataset
- p4/Anchor and p3/OLS:
    - Surprisingly, Anchor fails to identify a significant distributional shift: it consistently chooses gamma = 1, i.e. it coincides with OLS
    - Best performing model 

eICU --> Miiv:
- p1/LGBM: 
    - Overall, LGBM improves its precision with increasing n, but it is not able to consistently beat its Baseline
    - It is able to improve its performance significantly by restricting itself to a small number of leaves
- p2/RF: 
    - It is able to adapt itself to the baseline parameters and coincides most of the time with the Baseline
    - It does not seem to notice a distributional shift
    - Interestingly, it chooses a small number of leaves compared to the size of the available fine-tuning dataset
- p4/Anchor and p3/OLS:
    - Surprisingly, Anchor identifies a distributional shift in the beginning, but fails to identify it consistently
    - However, Anchor is able to outperform OLS by a margin, most likely due to the regularization (OLS seems to suffer from highly correlated features that destroy its prediction)
    - Not able to beat the LGBM Baseline

The evaluation mse on the fine-tuning data when performing parameter selection does not seem to have any predictive power of the outcome of the mse on the target data

This observation could be attributed to the limited available hyperparameters. It would be intriguing to investigate whether the models can surpass their Baseline when provided with more possibilities. A potential follow-up question is whether predictive performance improves with n=2000 (Hypothesis: Yes, as the prediction benefits from more accurate data).

Remarkably, all models outperformed the average prediction of the training data by a substantial margin

## Parameter Grid

```markdown
The hyperparameters were chosen from three distinct parameter grids:
```

**LightGBM (param_grid_lgbm):**
```python
param_grid_lgbm = {
    'boosting_type': ['gbdt'],
    'learning_rate': [0.01, 0.1, 0.3], # Gradient learning rate
    'n_estimators': [100, 800], # number of boosting iterations
    'num_leaves': [50, 200, 1024], # Control tree structure - max. number of leaves in tree (num_leaves < 2^max depth)
    'feature_fraction': [0.5, 0.9] # % of features to sample when training each tree
}
```

**RF (param_grid_rf):**
```python
param_grid_rf = {
    'boosting_type': ['rf'],
    'learning_rate': [0.01, 0.1, 0.3],
    'n_estimators': [100, 800], 
    'num_leaves': [50, 200, 1024], 
    'feature_fraction': [0.5, 0.9]
}
```

**Anchor (param_grid_anchor):**
```python
param_grid_anchor = {
    'gamma': [1, 10, 10000],
    'instrument_regex': ['anchor'],
    'alpha': [0.00001, 0.001, 0.1]
}
```

**Custom Anchor:**
```python
param_grid_lgbm = {
    'boosting_type': ['gbdt'],
    'learning_rate': [0.01, 0.3], # Gradient learning rate
    'n_estimators': [100, 800], # number of boosting iterations
    'num_leaves': [50, 1024], # Control tree structure - max. number of leaves in tree (num_leaves < 2^max depth)
    'feature_fraction': [0.5, 0.9] # % of features to sample when training each tree
}
param_grid_anchor = {
    'gamma': [1, 10],
    'instrument_regex': ['anchor'],
    'alpha': [0.001, 0.1]
}
```

### Conclusion

We see that for both LGBM and RF, the algorithm chooses the same set of parameters every time when evaluated on the tuning set from the target data. Hence, unsurprisingly, the MSE is constant. Similarly, for Anchor we choose almost the same set of parameters every time and do not improve the MSE.

For LGBM: The set of parameters chosen by grid search outperforms the parameters chosen by evaluation on the target. 

For RF: The set of parameters is the same, i.e. same performance 

For Anchor: No CV on train

However, Anchor is again able to beat the predictive performance from OLS with the available parameters. 

For CustomAnchor (Anchor + LGBM Boosting): The set of parameters improves when increasing the evaluation data from the target. This method outperforms all other methods when having 1600 fine-tuning data points available.

We conclude by noting that it is of utmost importance to include hyperparameters that prevent overfitting of the tree methods (compare the results from parameter set 1 and parameter set 2), and we are curious whether the performance of CustomAnchor can also be improved when allowing these kinds of parameters.

# Comparison Mimic without children

In [None]:
# Restrict the MIMIC evaluation and tuning sets to patients strictly older than 18.
# NOTE(review): use >= 18 instead if 18-year-olds should count as adults.
Xy_test_mimic_no_children = Xy_test_mimic[Xy_test_mimic['age'] > 18]
Xy_tuning_mimic_no_children = Xy_tuning_mimic[Xy_tuning_mimic['age'] > 18]

# Baselines: p1/p2 are used exactly as they were last fitted by earlier cells.
# NOTE(review): this cell does not fit p1/p2 itself - confirm they still hold the
# grid-search baseline parameters at this point.
mse_grid_lgbm_mimic_nC = mean_squared_error(
    Xy_test_mimic_no_children['outcome'], p1.predict(Xy_test_mimic_no_children)
)
mse_grid_rf_mimic_nC = mean_squared_error(
    Xy_test_mimic_no_children['outcome'], p2.predict(Xy_test_mimic_no_children)
)

# Parameter selection on the adult-only tuning data.
# NOTE(review): param_grid_lgbm / param_grid_anchor were rebound in the
# "Custom Anchor" section; verify these are the grids intended for p1 and p4.
results_p1_mimic_nC = find_best_parameters(Xy_train, Xy_tuning_mimic_no_children, p1, param_grid_lgbm)
results_p2_mimic_nC = find_best_parameters(Xy_train, Xy_tuning_mimic_no_children, p2, param_grid_rf)
results_p4_mimic_nC = find_best_parameters(Xy_train, Xy_tuning_mimic_no_children, p4, param_grid_anchor)

# NOTE(review): calculate_mse was redefined in the "Custom Anchor" section to call
# set_params(anchor_params=..., lgbm_params=...), which the models in p1/p2/p4 are
# not shown to accept; these calls need the single-grid variant - verify.
mse_eicu_to_mimic_p1_nC = calculate_mse(
    Xy_train, Xy_train['outcome'],
    Xy_test_mimic_no_children, Xy_test_mimic_no_children['outcome'],
    p1, results_p1_mimic_nC,
)
mse_eicu_to_mimic_p2_nC = calculate_mse(
    Xy_train, Xy_train['outcome'],
    Xy_test_mimic_no_children, Xy_test_mimic_no_children['outcome'],
    p2, results_p2_mimic_nC,
)
mse_eicu_to_mimic_p4_nC = calculate_mse(
    Xy_train, Xy_train['outcome'],
    Xy_test_mimic_no_children, Xy_test_mimic_no_children['outcome'],
    p4, results_p4_mimic_nC,
)

mse_eicu_to_mimic_p3_nC = mean_squared_error(
    Xy_test_mimic_no_children['outcome'], p3.predict(Xy_test_mimic_no_children)
)

# Dummy baseline: always predict the mean of the training target. The target lives
# in the 'outcome' column (as in every fit/score call above), and np.full builds a
# float array regardless of the target's dtype.
mse_eicu_to_mimic_dummy_prediction_nC = mean_squared_error(
    Xy_test_mimic_no_children['outcome'],
    np.full(len(Xy_test_mimic_no_children), Xy_train['outcome'].mean()),
)

plotting(
    mse_eicu_to_mimic_p1_nC,
    mse_eicu_to_mimic_p2_nC,
    mse_eicu_to_mimic_p3_nC,
    mse_eicu_to_mimic_p4_nC,
    mse_grid_lgbm_mimic_nC,
    mse_grid_rf_mimic_nC,
    'Eicu',
    'Mimic no Children',
)


# ToDo:

- Malte Fragen beantworten: 
    - peak: Hab mit falschen Parametern getestet
    - Sowohl RF wie LGBM mit tuning auf target distr. data sind schlechter (nie besser) als die Baselines, egal wie gross “n” ist. Wieso?
    - Dein MSE der OLS baseline eICU -> MIMIC III ist signifikant besser als das was ich in dem pdf das ich dir mal geschickt hatte habe (~175). Was machst du anders? ################## das ist eICU -> Hirid
- Refit implementieren und anschauen
- Euler
- Connect to ADA and run jobs - muss ich dann icu & iv neu installieren --- wie?
- ML Flow lernen 