In [None]:
from icu_experiments.load_data import load_data_for_prediction
from icu_experiments.preprocessing import make_feature_preprocessing, make_anchor_preprocessing
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, RegressorMixin
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import mean_squared_error
from lightgbm import LGBMRegressor, Booster
from sklearn.linear_model import LinearRegression
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm
import itertools
from ivmodels import AnchorRegression
from plotting import plot_tuning

outcome = "hr"

sources = ['eicu', 'hirid', 'mimic', 'miiv']
regressors = ['lgbm', 'rf', 'ols', 'anchor']
_data = load_data_for_prediction(sources,  outcome=outcome, log_transform=True)

preprocessor_ols = ColumnTransformer(transformers=make_feature_preprocessing(missing_indicator=True)).set_output(transform="pandas") # Allow to preprocess subbsets of data differently


#### LGBM Categorical
preprocessor_lgbm = ColumnTransformer(transformers=make_feature_preprocessing(missing_indicator=False)).set_output(transform="pandas") # Allow to preprocess subbsets of data differently


anchor_columns = ['hospital_id']
anchor_preprocessor = ColumnTransformer(
        make_anchor_preprocessing(anchor_columns) + make_feature_preprocessing(missing_indicator=True) #preprocessing_steps
    ).set_output(transform="pandas")

pipeline_lgbm = Pipeline(steps=[
    ('preprocessing', preprocessor_lgbm),
    ('model', LGBMRegressor())
])
pipeline_rf = Pipeline(steps=[
    ('preprocessing', preprocessor_lgbm),
    ('model', LGBMRegressor())
])
pipeline_ols = Pipeline(steps=[
    ('preprocessing', preprocessor_ols),
    ('model', LinearRegression())
])

pipeline_anchor = Pipeline(steps=[
    ('preprocessing', anchor_preprocessor),
    ('model', AnchorRegression())
])

pipelines = {'lgbm': pipeline_lgbm,
             'rf': pipeline_rf,
             'ols': pipeline_ols,
             'anchor': pipeline_anchor}

In [75]:
params = {}
params['lgbm'] = {
    'boosting_type': ['gbdt'],
    'learning_rate': [0.01, 0.1, 0.3],
    'n_estimators': [100, 800],
    'num_leaves': [50, 200, 1024],
    'feature_fraction': [0.5, 0.9],
    'verbose': [-1]
}
params['rf'] = {
    'boosting_type': ['gbdt'],
    'learning_rate': [0.01, 0.1, 0.3],
    'n_estimators': [100, 800],
    'num_leaves': [50, 200, 1024],
    'feature_fraction': [0.5, 0.9],
    'verbose': [-1]
}
params['anchor'] = {
    'gamma': [1, 10, 10000],
    'instrument_regex': ['anchor'],
    'alpha': [0.00001, 0.001, 0.1]
}

In [28]:
params['lgbm']

{'boosting_type': ['gbdt'],
 'learning_rate': [0.01, 0.1, 0.3],
 'n_estimators': [100, 800],
 'num_leaves': [50, 200, 1024],
 'feature_fraction': [0.5, 0.9],
 'verbose': [0]}

In [22]:
_data['hirid']['train']

Unnamed: 0,source,stay_id,cai,ptt,ast,be,bili,urine,mg,ca,...,year,urgency,ethnic,icu_adm_dow,hosp_adm_dow,adm_caregiver,adm_provider,numbedscategory,teachingstatus,region
3774,hirid,15177,0.141433,,4.999626,6.716667,-1.047533,5.623972,0.552218,,...,,,,Thursday,Thursday,[-1],[missing],,,
4551,hirid,18350,0.133510,,2.833213,-1.466667,-1.229855,5.734066,0.556503,,...,,,,Monday,Monday,[-1],[missing],,,
5796,hirid,23413,0.123886,3.543854,,-2.540000,,5.560046,,,...,,,,Sunday,Sunday,[-1],[missing],,,
3345,hirid,13514,0.131028,,,0.300000,,5.808582,,,...,,,,Tuesday,Tuesday,[-1],[missing],,,
6057,hirid,24392,,,,,,5.306136,,,...,,,,Monday,Monday,[-1],[missing],,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4373,hirid,17659,0.133817,4.264840,5.017280,-3.537500,0.492912,5.624400,0.377477,,...,,,,Thursday,Thursday,[-1],[missing],,,
7891,hirid,31946,0.143076,,,-6.620000,,4.917566,0.737480,,...,,,,Monday,Monday,[-1],[missing],,,
4859,hirid,19693,0.256127,,2.564949,-2.020000,-1.452998,4.216416,0.760469,,...,,,,Monday,Monday,[-1],[missing],,,
3264,hirid,13171,0.130990,,,-1.583333,,5.084565,,,...,,,,Tuesday,Tuesday,[-1],[missing],,,


## Compare Performance from Training to Target Data - Parameters chosen via GridCV on Training Data

In [41]:
{'model__' + key : value for key, value in params['lgbm'].items()}

{'model__boosting_type': ['gbdt'],
 'model__learning_rate': [0.01, 0.1, 0.3],
 'model__n_estimators': [100, 800],
 'model__num_leaves': [50, 200, 1024],
 'model__feature_fraction': [0.5, 0.9],
 'model__verbose': [-1]}

In [None]:
mse_grid_search = {}

for name, pipe in pipelines.items():
    print(name)
    if name not in ['ols', 'anchor']:
        search = GridSearchCV(pipe, param_grid= {'model__' + key : value for key, value in params[name].items()})
        search.fit(_data['eicu']['train'], _data['eicu']['train']['outcome'])
        print('finsihed GCV')
        pipe.set_params(**search.best_params_)
    
    pipe.fit(_data['eicu']['train'], _data['eicu']['train']['outcome'])
    
    for source in sources: 
        print(source)
        if source != 'eicu':
            if name not in ['ols', 'anchor']:
                mse_grid_search[name] = {'parameters': search.best_params_,
                'MSE on {source}' : mean_squared_error(_data[source]['train']['outcome'], pipe.predict(_data[source]['train']))}
            else: 
                mse_grid_search[name] = {'parameters': None,
                'MSE on {source}' : mean_squared_error(_data[source]['train']['outcome'], pipe.predict(_data[source]['train']))}
        print(f'Completed {name} run on {source}')

## Compare Performance from Training to Target Data - Parameters chosen via Evaluation on Target Data

Approach: 
- Train Data with different parameters on Training set
- Evaluate Train Data on fine tuning data from target set 
- choose the best performing parameters
- do this for all possible n from the fine tuning data set

In [63]:
for comb, param_set in enumerate(itertools.product(*params['lgbm'].values())):
    print(comb, param_set)

0 ('gbdt', 0.01, 100, 50, 0.5, -1)
1 ('gbdt', 0.01, 100, 50, 0.9, -1)
2 ('gbdt', 0.01, 100, 200, 0.5, -1)
3 ('gbdt', 0.01, 100, 200, 0.9, -1)
4 ('gbdt', 0.01, 100, 1024, 0.5, -1)
5 ('gbdt', 0.01, 100, 1024, 0.9, -1)
6 ('gbdt', 0.01, 800, 50, 0.5, -1)
7 ('gbdt', 0.01, 800, 50, 0.9, -1)
8 ('gbdt', 0.01, 800, 200, 0.5, -1)
9 ('gbdt', 0.01, 800, 200, 0.9, -1)
10 ('gbdt', 0.01, 800, 1024, 0.5, -1)
11 ('gbdt', 0.01, 800, 1024, 0.9, -1)
12 ('gbdt', 0.1, 100, 50, 0.5, -1)
13 ('gbdt', 0.1, 100, 50, 0.9, -1)
14 ('gbdt', 0.1, 100, 200, 0.5, -1)
15 ('gbdt', 0.1, 100, 200, 0.9, -1)
16 ('gbdt', 0.1, 100, 1024, 0.5, -1)
17 ('gbdt', 0.1, 100, 1024, 0.9, -1)
18 ('gbdt', 0.1, 800, 50, 0.5, -1)
19 ('gbdt', 0.1, 800, 50, 0.9, -1)
20 ('gbdt', 0.1, 800, 200, 0.5, -1)
21 ('gbdt', 0.1, 800, 200, 0.9, -1)
22 ('gbdt', 0.1, 800, 1024, 0.5, -1)
23 ('gbdt', 0.1, 800, 1024, 0.9, -1)
24 ('gbdt', 0.3, 100, 50, 0.5, -1)
25 ('gbdt', 0.3, 100, 50, 0.9, -1)
26 ('gbdt', 0.3, 100, 200, 0.5, -1)
27 ('gbdt', 0.3, 100, 200, 0

In [71]:
results = {}

for name, pipe in pipelines.items():
    if name not in ['ols']:
        results[name] = {}
        print(name)
        for comb, param_set in enumerate(itertools.product(*params[name].values())):
            para = dict(zip(params[name].keys(), param_set))
            pipe.named_steps['model'].set_params(**para)
            pipe.fit(_data['eicu']['train'], _data['eicu']['train']['outcome'])
            results[name][comb] = {}
            for source in sources: 
                results[name][comb][source] = {}
                if source != 'eicu':
                    for n in [25, 50, 100, 200, 400, 800, 1600]:
                        y_pred_eval = pipe.predict(_data[source]['test'].head(n))
                        y_pred_test = pipe.predict(_data[source]['train'])
                        
                        mse_eval = mean_squared_error(_data[source]['test']['outcome'].head(n), y_pred_eval)
                        mse_test = mean_squared_error(_data[source]['train']['outcome'], y_pred_test)
                        
                        results[name][comb][source][n] = {
                            'params': para,
                            'MSE on Eval. Set from Target': mse_eval,
                            'MSE on Target': mse_test
                        }
                print(f'finished {comb} on source {source}')
        clear_output()

lgbm
finished 0 on source eicu
finished 0 on source hirid
finished 0 on source mimic
finished 0 on source miiv
finished 1 on source eicu
finished 1 on source hirid
finished 1 on source mimic
finished 1 on source miiv
finished 2 on source eicu
finished 2 on source hirid
finished 2 on source mimic
finished 2 on source miiv
finished 3 on source eicu
finished 3 on source hirid
finished 3 on source mimic
finished 3 on source miiv
finished 4 on source eicu
finished 4 on source hirid
finished 4 on source mimic
finished 4 on source miiv
finished 5 on source eicu
finished 5 on source hirid
finished 5 on source mimic
finished 5 on source miiv
finished 6 on source eicu
finished 6 on source hirid
finished 6 on source mimic
finished 6 on source miiv
finished 7 on source eicu
finished 7 on source hirid
finished 7 on source mimic
finished 7 on source miiv
finished 8 on source eicu
finished 8 on source hirid
finished 8 on source mimic
finished 8 on source miiv
finished 9 on source eicu
finished 9 on s

NameError: name 'clear_output' is not defined

In [None]:
def calculate_mse(X_train, y_train, X_test, y_test, p, results):
    mse_for_n = []
    i = 0
    for n in [25, 50, 100, 200, 400, 800, 1600]:
        p.named_steps['model'].set_params(**results[i]['best_params'])
        p.fit(X_train, y_train)
        y_pred = p.predict(X_test)
        mse = mean_squared_error(y_test, y_pred)
        mse_for_n.append({'n': n, 'mse': mse})
        i += 1
    return mse_for_n

mse_eicu_to_hirid_p1 = calculate_mse(Xy_train, Xy_train['outcome'], Xy_test_hirid, Xy_test_hirid['outcome'], p1, results_p1_hirid)
mse_eicu_to_hirid_p2 = calculate_mse(Xy_train, Xy_train['outcome'], Xy_test_hirid, Xy_test_hirid['outcome'], p2, results_p2_hirid)
mse_eicu_to_hirid_p4 = calculate_mse(Xy_train, Xy_train['outcome'], Xy_test_hirid, Xy_test_hirid['outcome'], p4, results_p4_hirid)

mse = {}

for source in sources: 
    for name, pipe in pipelines.items():
        
        for n in [25, 50, 100, 200, 400, 800, 1600]:
            p.named_steps['model'].set_params(**results[i]['best_params'])
            p.fit(X_train, y_train)
            y_pred = p.predict(X_test)
            mse = mean_squared_error(y_test, y_pred)
            mse_for_n.append({'n': n, 'mse': mse})

    mse[source] = 

In [None]:
def find_best_parameters(Xy_train, Xy_tuning_data, p, param_grid):
    results_for_n = []

    for n in [25, 50, 100, 200, 400, 800, 1600]:
        best_params = None
        best_mse = float('inf')  # Initialize with a large value

        # Iterate over all possible combinations of hyperparameters
        for param_set in itertools.product(*param_grid.values()):
            params = dict(zip(param_grid.keys(), param_set))
        
            p.named_steps['model'].set_params(**params)
            p.fit(Xy_train, Xy_train['outcome'])
            y_pred = p.predict(Xy_tuning_data.head(n))
            mse = mean_squared_error(Xy_tuning_data['outcome'].head(n), y_pred)

            if mse < best_mse:
                best_mse = mse
                best_params = params

        results_for_n.append({'n': n, 'best_params': best_params, 'best_mse': best_mse})

    return results_for_n

results_p1_hirid = find_best_parameters(Xy_train, Xy_tuning_hirid, p1, param_grid_lgbm)
results_p2_hirid = find_best_parameters(Xy_train, Xy_tuning_hirid, p2, param_grid_rf)
results_p4_hirid = find_best_parameters(Xy_train, Xy_tuning_hirid, p4, param_grid_anchor)

results_p1_mimic = find_best_parameters(Xy_train, Xy_tuning_mimic, p1, param_grid_lgbm)
results_p2_mimic = find_best_parameters(Xy_train, Xy_tuning_mimic, p2, param_grid_rf)
results_p4_mimic = find_best_parameters(Xy_train, Xy_tuning_mimic, p4, param_grid_anchor)

results_p1_miiv = find_best_parameters(Xy_train, Xy_tuning_miiv, p1, param_grid_lgbm)
results_p2_miiv = find_best_parameters(Xy_train, Xy_tuning_miiv, p2, param_grid_rf)
results_p4_miiv = find_best_parameters(Xy_train, Xy_tuning_miiv, p4, param_grid_anchor)

def calculate_mse(X_train, y_train, X_test, y_test, p, results):
    mse_for_n = []
    i = 0
    for n in [25, 50, 100, 200, 400, 800, 1600]:
        p.named_steps['model'].set_params(**results[i]['best_params'])
        p.fit(X_train, y_train)
        y_pred = p.predict(X_test)
        mse = mean_squared_error(y_test, y_pred)
        mse_for_n.append({'n': n, 'mse': mse})
        i += 1
    return mse_for_n

mse_eicu_to_hirid_p1 = calculate_mse(Xy_train, Xy_train['outcome'], Xy_test_hirid, Xy_test_hirid['outcome'], p1, results_p1_hirid)
mse_eicu_to_hirid_p2 = calculate_mse(Xy_train, Xy_train['outcome'], Xy_test_hirid, Xy_test_hirid['outcome'], p2, results_p2_hirid)
mse_eicu_to_hirid_p4 = calculate_mse(Xy_train, Xy_train['outcome'], Xy_test_hirid, Xy_test_hirid['outcome'], p4, results_p4_hirid)

mse_eicu_to_mimic_p1 = calculate_mse(Xy_train, Xy_train['outcome'], Xy_test_mimic, Xy_test_mimic['outcome'], p1, results_p1_mimic)
mse_eicu_to_mimic_p2 = calculate_mse(Xy_train, Xy_train['outcome'], Xy_test_mimic, Xy_test_mimic['outcome'], p2, results_p2_mimic)
mse_eicu_to_mimic_p4 = calculate_mse(Xy_train, Xy_train['outcome'], Xy_test_mimic, Xy_test_mimic['outcome'], p4, results_p4_mimic)

mse_eicu_to_miiv_p1 = calculate_mse(Xy_train, Xy_train['outcome'], Xy_test_miiv, Xy_test_miiv['outcome'], p1, results_p1_miiv)
mse_eicu_to_miiv_p2 = calculate_mse(Xy_train, Xy_train['outcome'], Xy_test_miiv, Xy_test_miiv['outcome'], p2, results_p2_miiv)
mse_eicu_to_miiv_p4 = calculate_mse(Xy_train, Xy_train['outcome'], Xy_test_miiv, Xy_test_miiv['outcome'], p4, results_p4_miiv)

# OLS MSE Calculation
p3.fit(Xy_train, Xy_train['outcome'])
mse_eicu_to_hirid_p3 = mean_squared_error(Xy_test_hirid['outcome'], p3.predict(Xy_test_hirid))
mse_eicu_to_hirid_dummy_prediction = mean_squared_error(Xy_test_hirid['outcome'], np.full_like(Xy_test_hirid['outcome'],Xy_train[outcome].mean()))

mse_eicu_to_mimic_p3 = mean_squared_error(Xy_test_mimic['outcome'], p3.predict(Xy_test_mimic))
mse_eicu_to_mimic_dummy_prediction = mean_squared_error(Xy_test_mimic['outcome'], np.full_like(Xy_test_mimic['outcome'],Xy_train[outcome].mean()))

mse_eicu_to_miiv_p3 = mean_squared_error(Xy_test_miiv['outcome'], p3.predict(Xy_test_miiv))
mse_eicu_to_miiv_dummy_prediction = mean_squared_error(Xy_test_miiv['outcome'], np.full_like(Xy_test_miiv['outcome'],Xy_train[outcome].mean()))

def plotting(mse_p1, mse_p2, mse_p3, mse_p4, mse_baseline1, mse_baseline2, train, target):
    n = [25, 50, 100, 200, 400, 800, 1600]
    plt.plot(n, ([item['mse'] for item in mse_p1]), marker='o', linestyle='-', label = 'LGBM')
    plt.plot(n, [item['mse'] for item in mse_p2], marker='o', linestyle='-', label = 'RF')
    plt.plot(n, [item['mse'] for item in mse_p4], marker='o', linestyle='-', label = 'Anchor')
    plt.axhline(y=mse_p3, color='black', linestyle='-', label='OLS Baseline')
    plt.axhline(y=mse_baseline1, color='green', linestyle='-', label='LGBM Baseline')
    plt.axhline(y=mse_baseline2, color='purple', linestyle='-', label='RF Baseline')
    #plt.axhline(y = mean_squared_error(Xy_test_new['outcome'], np.full_like(Xy_test_new['outcome'],Xy_train[outcome].mean())), color = 'black', label='Train Average')
    plt.title(f'Parameters for Model chosen with evaluation on n Data Points from Target Distribution {target}')
    plt.xlabel('Number of Data Points (n)')
    plt.ylabel('Mean Squared Error (MSE)')
    plt.legend()
    plt.grid(True)
    plt.show()

plotting(mse_eicu_to_hirid_p1, mse_eicu_to_hirid_p2, mse_eicu_to_hirid_p3, mse_eicu_to_hirid_p4, mse_grid_lgbm_hirid, mse_grid_rf_hirid, 'Eicu', 'Hirid')
plotting(mse_eicu_to_mimic_p1, mse_eicu_to_mimic_p2, mse_eicu_to_mimic_p3, mse_eicu_to_mimic_p4, mse_grid_lgbm_mimic, mse_grid_rf_mimic, 'Eicu', 'Mimic')
plotting(mse_eicu_to_miiv_p1, mse_eicu_to_miiv_p2, mse_eicu_to_miiv_p3, mse_eicu_to_miiv_p4, mse_grid_lgbm_miiv, mse_grid_rf_miiv, 'Eicu', 'Miiv')

## Store results

In [12]:
results_dict = {}
mse_dict = {}

results_dict['p1_baseline'] = grid_search_params_lgbm
results_dict['p2_baseline'] = grid_search_params_rf
results_dict['p1_hirid'] = results_p1_hirid
results_dict['p2_hirid'] = results_p2_hirid
results_dict['p4_hirid'] = results_p4_hirid

results_dict['p1_mimic'] = results_p1_mimic
results_dict['p2_mimic'] = results_p2_mimic
results_dict['p4_mimic'] = results_p4_mimic

results_dict['p1_miiv'] = results_p1_miiv
results_dict['p2_miiv'] = results_p2_miiv
results_dict['p4_miiv'] = results_p4_miiv

mse_dict['eicu_to_hirid_p1_baseline'] = mse_grid_lgbm_hirid
mse_dict['eicu_to_hirid_p1'] = mse_eicu_to_hirid_p1
mse_dict['eicu_to_hirid_p2'] = mse_eicu_to_hirid_p2
mse_dict['eicu_to_hirid_p3'] = mse_eicu_to_hirid_p3
mse_dict['eicu_to_hirid_p4'] = mse_eicu_to_hirid_p4

mse_dict['eicu_to_mimic_p1_baseline'] = mse_grid_lgbm_mimic
mse_dict['eicu_to_mimic_p1'] = mse_eicu_to_mimic_p1
mse_dict['eicu_to_mimic_p2'] = mse_eicu_to_mimic_p2
mse_dict['eicu_to_mimic_p3'] = mse_eicu_to_mimic_p3
mse_dict['eicu_to_mimic_p4'] = mse_eicu_to_mimic_p4

mse_dict['eicu_to_miiv_p1_baseline'] = mse_grid_lgbm_miiv
mse_dict['eicu_to_miiv_p1'] = mse_eicu_to_miiv_p1
mse_dict['eicu_to_miiv_p2'] = mse_eicu_to_miiv_p2
mse_dict['eicu_to_miiv_p3'] = mse_eicu_to_miiv_p3
mse_dict['eicu_to_miiv_p4'] = mse_eicu_to_miiv_p4

mse_dict['eicu_to_hirid_dummy_prediction'] = mse_eicu_to_hirid_dummy_prediction
mse_dict['eicu_to_mimic_dummy_prediction'] = mse_eicu_to_mimic_dummy_prediction
mse_dict['eicu_to_miiv_dummy_prediction'] = mse_eicu_to_miiv_dummy_prediction

import pickle

# Define the file paths to save the dictionaries
results_file_path = 'Woche5/parameters_dict.pkl'
mse_file_path = 'Woche5/mse_dict.pkl'

# Save the results dictionary to a file
with open(results_file_path, 'wb') as results_file:
    pickle.dump(results_dict, results_file)

# Save the MSE dictionary to a file
with open(mse_file_path, 'wb') as mse_file:
    pickle.dump(mse_dict, mse_file)

# Optionally, you can print a message to confirm the saving process
print(f"Results saved to {results_file_path}")
print(f"MSE values saved to {mse_file_path}")

Results saved to Woche5/parameters_dict.pkl
MSE values saved to Woche5/mse_dict.pkl


## Custom Anchor

In [None]:
class CustomizedAnchor(BaseEstimator, RegressorMixin):
    def __init__(self, anchor_params=None, lgbm_params=None):
        # Initialize parameters
        self.anchor_params = anchor_params if anchor_params is not None else {}
        self.lgbm_params = lgbm_params if lgbm_params is not None else {}

    def fit(self, X, y):
        # Initialize and fit the Anchor Regression model
        self.anchor_model = AnchorRegression(**self.anchor_params)
        self.anchor_model.fit(X, y)

        # Calculate residuals
        residuals = y - self.anchor_model.predict(X)

        # Initialize and fit the LGBMRegressor with residuals
        self.lgbm_model = LGBMRegressor(**self.lgbm_params)
        self.lgbm_model.fit(X, residuals)

        return self

    def predict(self, X):
        # Check if fit has been called
        if not hasattr(self, 'anchor_model') or not hasattr(self, 'lgbm_model'):
            raise AttributeError("Models have not been fitted. Call fit() first.")

        # Make predictions
        anchor_predictions = self.anchor_model.predict(X)
        lgbm_predictions = self.lgbm_model.predict(X)

        # Combine predictions
        return anchor_predictions + lgbm_predictions
    
p5 = Pipeline(steps=[
    ('preprocessing', anchor_preprocessor),
    ('model', CustomizedAnchor())
])


def find_custom_parameters(Xy_train, Xy_tuning_data, p, param1_grid, param2_grid):
    results_for_n = []

    for n in [25, 50, 100, 200, 400, 800, 1600]:
        best_params1 = None
        best_params2 = None
        best_mse = float('inf') 

        param1_combinations = list(itertools.product(*param1_grid.values()))
        param1_combinations_bar = tqdm(param1_combinations, desc=f"n = {n}")

        for param1_set in itertools.product(*param1_grid.values()):
            param1 = dict(zip(param1_grid.keys(), param1_set))

            for param2_set in itertools.product(*param2_grid.values()):
                param2 = dict(zip(param2_grid.keys(), param2_set))
                

                p.named_steps['model'].set_params(anchor_params= param1, lgbm_params=param2)
                p.fit(Xy_train, Xy_train['outcome'])
                y_pred = p.predict(Xy_tuning_data.head(n))
                mse = mean_squared_error(Xy_tuning_data['outcome'].head(n), y_pred)

                if mse < best_mse:
                    best_mse = mse
                    best_params1 = param1
                    best_params2 = param2

        results_for_n.append({'n': n, 'best_params set 1': best_params1, 'best_params set 2': best_params2, 'best_mse': best_mse})

    return results_for_n

param_grid_lgbm = {
    'boosting_type': ['gbdt'],
    'learning_rate': [0.01, 0.1, 0.3], # Gradient learning rate
    'n_estimators': [100, 800], # number of boosting iterations
    'num_leaves': [50, 1024], # Control tree structure - max. number of leaves in tree (num_leaves < 2^max depth)
    'feature_fraction': [0.5, 0.9] # % of features to sample when training each tree
}
param_grid_anchor = {
    'gamma': [1, 10, 10000],
    'instrument_regex': ['anchor'],
    'alpha': [0.00001, 0.001, 0.1]
}

results_p5_hirid = find_custom_parameters(Xy_train, Xy_tuning_hirid, p5, param_grid_anchor, param_grid_lgbm)
#results_p5_mimic = find_custom_parameters(Xy_train, Xy_tuning_mimic, p5, param_grid_anchor, param_grid_lgbm)
#results_p5_miiv = find_custom_parameters(Xy_train, Xy_tuning_miiv, p5, param_grid_anchor, param_grid_lgbm)

In [None]:
def calculate_mse(X_train, y_train, X_test, y_test, p, results):
    mse_for_n = []
    i = 0
    for n in [25, 50, 100, 200, 400, 800, 1600]:
        p.named_steps['model'].set_params(anchor_params= results[i]['best_params set 1'], lgbm_params= results[i]['best_params set 2'])
        p.fit(X_train, y_train)
        y_pred = p.predict(X_test)
        mse = mean_squared_error(y_test, y_pred)
        mse_for_n.append({'n': n, 'mse': mse})
        i += 1
    return mse_for_n

mse_eicu_to_hirid_p5 = calculate_mse(Xy_train, Xy_train['outcome'], Xy_test_hirid, Xy_test_hirid['outcome'], p5, results_p5_hirid)
#mse_eicu_to_mimic_p5 = calculate_mse(Xy_train, Xy_train['outcome'], Xy_test_mimic, Xy_test_mimic['outcome'], p5, results_p5_mimic)
#mse_eicu_to_miiv_p5 = calculate_mse(Xy_train, Xy_train['outcome'], Xy_test_mimic, Xy_test_mimic['outcome'], p5, results_p5_miiv)

# Observations eICU to X

```markdown
The hyperparameters were selected to minimize the Mean Squared Error (MSE) on the fine-tuning dataset of the target distribution. This fine-tuning dataset consists of various sizes, including n = 25, 50, 100, 200, 400, 800, and 1600 data points from the target distribution. Initially, we randomly selected 1600 data points from the target data and named it Xy_tuning_data, which is distinct from the final evaluation dataset used to generate the plotted MSE after model training, called Xy_test_new. 
```
**Evaluation Process:**
```markdown
Our evaluation process follows these steps:

1. For each combination of the parameters, we train the model on the training data.
2. Next, we calculate the MSE on the fine-tuning data from the training distribution.
3. For each n value, we select the parameter combination that minimizes the MSE on the fine-tuning data.

We have four distinct pipelines for our models:

- LGBM pipeline: p1
- Random Forest pipeline: p2
- OLS pipeline: p3
- Anchor pipeline: p4

For OLS, we follow a slightly different approach. We train the model on the training data and evaluate it directly on the target data.

In a subsequent step, we repeat the parameter selection process on the training data and calculate the MSE on the target data - we call this approach the Baseline. The plot displays the model's performance along with the Baseline.
```

**Model Performance:**
```markdown
Interestingly, none of the models managed to substentially outperform the Baselines on any dataset. 

eICU --> Hirid:
- p1/LGBM: 
    - In an overall trend, LGBM is able improve its precision with increasing n, however, it is still not able to get the same precision as the Baseline
    - Similar to the RF, it restricts the number of leafs drastically
- p2/RF: 
    - The same parameters have been chosen every time
    - Its parameters coincide with those choosen by GridCV for n < 1600 
    - Its performance decreses when allowing 1600 fine-tuning datapoints, aka as soon as the distr. shift becomes noticable
    - Interestengly, it choses a small number of leaves compared to the size of the available fine-tuning dataset
- p4/Anchor and p3/OLS:
    - Surprisingly, Anchor fails to identify a significant distributional shift, aka. it chooses consistently gamma = 1, i.e. it coincides with OLS
    - Unsurprisingly, it performs almost identical to OLS, only the regularization influences the performance
    - The more fine-tuning data we allow, the less regularization it choses
    - Not able to beat the LGBM Baseline

eICU --> Mimic:
- p1/LGBM: 
    - In an overall trend, LGBM is able improve its precision with increasing n, and it is able to consistently beat its Baseline
    - It is able to improve its performance significantly by restricting itself to a small number of leafs
- p2/RF: 
    - The same parameters have been chosen for every n 
    - Its parameters coincide with those choosen by GridCV and the performance too
    - It does not seem to notice a distr. shift
    - Interestengly, it choses a small number of leaves compared to the size of the available fine-tuning dataset
- p4/Anchor and p3/OLS:
    - Surprisingly, Anchor fails to identify a significant distributional shift, aka. it chooses consistently gamma = 1, i.e. it coincides with OLS
    - Best performing model 

eICU --> Miiv:
- p1/LGBM: 
    - In an overall trend, LGBM is able improve its precision with increasing n, but not able to consistently beat its Baseline
    - It is able to improve its performance significantly by restricting itself to a small number of leafs
- p2/RF: 
    - It is able to adapt itself to the baseline parameters and coincides most of the time with the Baseline
    - It does not seem to notice a distr. shift
    - Interestengly, it choses a small number of leaves compared to the size of the available fine-tuning dataset
- p4/Anchor and p3/OLS:
    - Surprisingly, Anchor a distributional shift in the beginning, but fails to identify it consistently
    - However, Anchor is able to outperform OLS by a margin, most likely due to the regularization / OLS seems to have highly correlated features that destroy its prediction
    - Not able to beat the LGBM Baseline

The evaluation mse on the fine-tuning data when performing parameter selection does not seem to have any predictive power of the outcome of the mse on the target data

This observation could be attributed to the limited available hyperparameters. It would be intriguing to investigate whether the models can surpass their Baseline when provided with more possibilities. A potential follow-up question is whether predictive performance improves with n=2000 (Hypothesis: Yes, as the prediction benefits from more accurate data).

Remarkably, all models outperformed the average prediction of the training data by a substantial margin

## Parameter Grid

```markdown
The hyperparameters were chosen from three distinct parameter grids:
```

**LightGBM (param_grid_lgbm):**
```python
param_grid_lgbm = {
    'boosting_type': ['gbdt'],
    'learning_rate': [0.01, 0.1, 0.3], # Gradient learning rate
    'n_estimators': [100, 800], # number of boosting iterations
    'num_leaves': [50, 200, 1024], # Control tree structure - max. number of leaves in tree (num_leaves < 2^max depth)
    'feature_fraction': [0.5, 0.9] # % of features to sample when training each tree
}
```

**RF (param_grid_rf):**
```python
param_grid_rf = {
    'boosting_type': ['rf'],
    'learning_rate': [0.01, 0.1, 0.3],
    'n_estimators': [100, 800], 
    'num_leaves': [50, 200, 1024], 
    'feature_fraction': [0.5, 0.9]
}
```

**Anchor (param_grid_anchor):**
```python
param_grid_anchor = {
    'gamma': [1, 10, 10000],
    'instrument_regex': ['anchor'],
    'alpha': [0.00001, 0.001, 0.1]
}
```

**Custom Anchor:**
```python
param_grid_lgbm = {
    'boosting_type': ['gbdt'],
    'learning_rate': [0.01, 0.3], # Gradient learning rate
    'n_estimators': [100, 800], # number of boosting iterations
    'num_leaves': [50, 1024], # Control tree structure - max. number of leaves in tree (num_leaves < 2^max depth)
    'feature_fraction': [0.5, 0.9] # % of features to sample when training each tree
}
param_grid_anchor = {
    'gamma': [1, 10],
    'instrument_regex': ['anchor'],
    'alpha': [0.001, 0.1]
}
```

### Conclusion

We see that for both, LGBM and RF the algorithm chooses the same set of parameters when evaluated on the tuning set from the target date every time. Hence unsurprisingly, the mse is constant. Similar for Anchor, we choose alomst every time the same set of parameters and do not improve the mse.

For LGBM: The set of parameters chosen by grid search outperforms the parameters chosen by evaluation on the target. 

For RF: The set of parameters is the same, i.e. same performance 

For Anchor: No CV on train

However, Anchor is again able to beat the predictive performance from OLS with the available parameters. 

For CustomAnchor (Anchor + LGBM Boosting): The set of parameters improves when increasing the evaluation data from the target. This method outperforms all other methods when having 1600 fine-tuning data points available.

We conclude by noting that it is of utmost importance to include hyperparameters that prevent overfitting of the tree methods (compare results from parameterset 1 and parameterset 2) and are curious if the performance of CustomAnchor can be improved too when allowing these kind of parameters. 

# Comparison Mimic without children

In [None]:
Xy_test_mimic_no_children, Xy_tuning_mimic_no_children = Xy_test_mimic[Xy_test_mimic['age'] > 18], Xy_tuning_mimic[Xy_tuning_mimic['age'] > 18]

mse_grid_lgbm_mimic_nC = mean_squared_error(Xy_test_mimic_no_children['outcome'], p1.predict(Xy_test_mimic_no_children))
mse_grid_rf_mimic_nC = mean_squared_error(Xy_test_mimic_no_children['outcome'], p2.predict(Xy_test_mimic_no_children))

results_p1_mimic_nC = find_best_parameters(Xy_train, Xy_tuning_mimic_no_children, p1, param_grid_lgbm)
results_p2_mimic_nC = find_best_parameters(Xy_train, Xy_tuning_mimic_no_children, p2, param_grid_rf)
results_p4_mimic_nC = find_best_parameters(Xy_train, Xy_tuning_mimic_no_children, p4, param_grid_anchor)

mse_eicu_to_mimic_p1_nC = calculate_mse(Xy_train, Xy_train['outcome'], Xy_test_mimic_no_children, Xy_test_mimic_no_children['outcome'], p1, results_p1_mimic_nC)
mse_eicu_to_mimic_p2_nC = calculate_mse(Xy_train, Xy_train['outcome'], Xy_test_mimic_no_children, Xy_test_mimic_no_children['outcome'], p2, results_p2_mimic_nC)
mse_eicu_to_mimic_p4_nC = calculate_mse(Xy_train, Xy_train['outcome'], Xy_test_mimic_no_children, Xy_test_mimic_no_children['outcome'], p4, results_p4_mimic_nC)

mse_eicu_to_mimic_p3_nC = mean_squared_error(Xy_test_mimic_no_children['outcome'], p3.predict(Xy_test_mimic_no_children))
mse_eicu_to_mimic_dummy_prediction_nC = mean_squared_error(Xy_test_mimic_no_children['outcome'], np.full_like(Xy_test_mimic_no_children['outcome'],Xy_train[outcome].mean()))

plotting(mse_eicu_to_mimic_p1_nC, mse_eicu_to_mimic_p2_nC, mse_eicu_to_mimic_p3_nC, mse_eicu_to_mimic_p4_nC, mse_grid_lgbm_mimic_nC, mse_grid_rf_mimic_nC, 'Eicu', 'Mimic no Children')


# ToDo:

- Malte Fragen beantworten: 
    - peak: Hab mit falschen Parametern getestet
    - Sowohl RF wie LGBM mit tuning auf target distr. data sind schlechter (nie besser) als die Baselines, egal wie gross “n” ist. Wieso?
    - Dein MSE der OLS baseline eICU -> MIMIC III ist signifikant besser als das was ich in dem pdf das ich dir mal geschickt hatte habe (~175). Was machst du anders? ################## das ist eICU -> Hirid
- Refit implementieren und anschauen
- Euler
- Connect to ADA and run jobs - muss ich dann icu & iv neu installieren --- wie?
- ML Flow lernen 

- Evaluation auf hold out test set von trainingsdaten für lgbm vs ols 
- Preprocessing LGBM anpassen - lgbm kann mit categorical umgehen 
- andere rs für rf auf hirid
- Mimic3: optimale Gammas testen
- EV XtX und l2 reg. für OLS
- Python skript lernen
- Umgekehrt denken: erst organisieren, dann implementieren ---> Reusability im Kopf haben 
- Git clone auf Euler