## Regression

In [4]:
import re
import sys
import warnings
from collections import Counter, defaultdict
from federated_eval_helper_functions import *
import matplotlib as mpl
import matplotlib.font_manager as fm
import numpy as np
import pandas as pd
import scipy.stats as st
import statsmodels.stats.api as sms
from imblearn.over_sampling import  RandomOverSampler
from IPython.display import Audio
from matplotlib import pyplot as plt
from pylab import cm
from scipy.stats import ttest_ind
from sklearn import set_config
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import BayesianRidge
from sklearn.metrics import (
   mean_squared_error,mean_absolute_error
)
from sklearn.model_selection import (
    GridSearchCV,
    RepeatedKFold,
    train_test_split,
)
import statistics
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import (
    FunctionTransformer,
    LabelBinarizer,
    LabelEncoder,
    OneHotEncoder,
    OrdinalEncoder,
    StandardScaler,
)
from sklearn.preprocessing import label_binarize

from itertools import compress
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeRegressor
import glob
import pickle
import json

from sklearn.linear_model import  SGDRegressor
from sklearn.linear_model import BayesianRidge
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.neural_network import MLPRegressor

# import treated data

In [5]:
# Presumably the number of experiment repetitions and of cross-validation folds.
# NOTE(review): the experiment cells below pass repeats=10 / cv=10 as literals
# instead of reading these two constants.
REPS = 10
CV_NUMBER = 10


In [6]:
# Names of every font known to matplotlib's font manager (useful when picking a family).
font_names = [font.name for font in fm.fontManager.ttflist]
# print(font_names)

# Global plot styling: font family, font size and axes line width.
mpl.rcParams.update(
    {
        "font.family": "Avenir",
        "font.size": 10,
        "axes.linewidth": 2,
    }
)

# Two colours sampled from the 'tab10' colormap.
colors = cm.get_cmap("tab10", 2)

# Audio widget with autoplay; the long-running cells end with `alarm` so it is displayed.
# Alternative sound: https://www.soundjay.com/buttons/sounds/button-1.mp3
sound_file = "https://www.soundjay.com/buttons/button-09a.wav"
alarm = Audio(sound_file, autoplay=True)

In [7]:
# Load every imputed silo CSV (first column used as index) into a list of DataFrames.
# The paths are sorted because glob returns matches in arbitrary order, and the
# "silo<N>" labels in the results are derived from the position in this list.
# NOTE: the sort is lexicographic, so "silo10" comes before "silo2".
silo_imputed = [
    pd.read_csv(file, index_col=0)
    for file in sorted(
        glob.glob("/Users/joaoalmeida/Desktop/tese_local/Obscare Giovana/imputed/silo*.csv")
    )
]

In [8]:
# Read the column-type mapping and pull out the "int" and "cat" column lists.
with open("col_types.json", "r") as infile:
    col_types = json.load(infile)
int_cols, cat_cols = col_types["int"], col_types["cat"]

In [9]:
# Columns used, one at a time, as regression targets in the experiments below.
target_int_cols = [
    "IDADE_MATERNA",
    "PESO_INICIAL",
    "IMC",
    "NUMERO_CONSULTAS_PRE_NATAL",
    "IDADE_GESTACIONAL_ADMISSAO",
    "SEMANAS_GESTACAO_PARTO",
]

# Modeling

In [10]:
def define_weights(grid_list):
    """
    Turn the best CV scores of several fitted searches into ensemble weights.

    Each element of ``grid_list`` must expose ``best_score_``. The weight of an
    element is ``1 / (best_score_ / sum_of_all_best_scores)``, so the returned
    list has one weight per element, in the same order.
    """
    scores = [search.best_score_ for search in grid_list]
    score_sum = sum(scores)
    return [1 / (score / score_sum) for score in scores]

In [11]:
def predict_global_model(models,X):
    """
    Predict with the weighted ensemble stored in ``models``.

    ``models["global_model"]`` is the list of fitted estimators and
    ``models["weigths"]`` the matching list of weights. Every estimator's
    prediction is scaled by its weight divided by the sum of all weights, and
    the scaled predictions are added up element-wise.
    """
    weights = models["weigths"]
    weight_sum = np.sum(weights)
    contributions = [
        estimator.predict(X) * weights[position] / weight_sum
        for position, estimator in enumerate(models["global_model"])
    ]
    return np.asarray(contributions).sum(axis=0)
        

In [12]:
def evaluate_distributed_model_regression(
    silos, target, metrics, cv,int_cols,cat_cols, tuned_parameters, model,  model_type="myvoting",debug_mode=False
):
    """
    Train one tuned regressor per silo and compare it with the weighted ensemble.

    For every silo an 80/20 train/test split is drawn and a ``GridSearchCV``
    (repeated k-fold with 2 repeats, scored with ``neg_mean_squared_error``) is
    fitted on the training part; its best estimator is that silo's local model.
    All local models form the "global" model, whose prediction is computed by
    ``predict_global_model`` with the weights returned by ``define_weights``.
    The local model of each silo and the global model are then scored on that
    silo's held-out test split.

    Parameters
    ----------
    silos : iterable of DataFrame-like objects, each containing column ``target``.
    target : name of the column to predict.
    metrics : iterable of metric names; only "rmse" and "mae" produce results.
    cv : number of folds of the repeated k-fold used for tuning.
    int_cols, cat_cols : not used by this function; kept for interface compatibility.
    tuned_parameters : parameter grid passed to ``GridSearchCV``.
    model : estimator to tune. When it has a ``random_state`` parameter, a new
        value is drawn for every silo (this mutates the passed estimator).
    model_type : not used by this function; kept for interface compatibility.
    debug_mode : when True, prints the silo index and a random draw.

    Returns
    -------
    tuple ``(result, global_models)`` where ``result`` maps
    ``"silo<N>_<metric>_local"`` / ``"silo<N>_<metric>_global"`` to the score and
    ``global_models`` is ``{"global_model": [estimators], "weigths": [weights]}``.
    """
    # The context manager guarantees the log file is closed even if training fails later.
    with open("logs/regression_" + str(type(model).__name__) + ".txt", "a") as f:
        log_to_file(f, ["start", type(model).__name__])

    grid_list = []
    models = []
    test_sets = []
    for idx, silo in enumerate(silos):
        if debug_mode:
            print("silo",str(idx))
            # NOTE: this draw is only printed; it is not the seed set below,
            # but it does advance the global NumPy random state.
            print(np.random.randint(1,20))
        if "random_state" in model.get_params().keys():
            model.set_params(random_state=np.random.randint(1, 20))

        clf = GridSearchCV(
            model,
            tuned_parameters,
            cv=RepeatedKFold(n_splits=cv, n_repeats=2),
            n_jobs=-3,
            scoring="neg_mean_squared_error",
        )
        y = silo[target]
        X = silo.drop(columns=[target])
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

        test_sets.append((y_test, X_test))
        models.append(clf.fit(X_train, y_train).best_estimator_)
        grid_list.append(clf)

    # Weights are derived from each search's best CV score relative to the sum of all scores.
    w = define_weights(grid_list)
    global_models = {"global_model": models, "weigths": w}

    result = {}
    for idx, (y_test, X_test) in enumerate(test_sets):
        prefix = "silo" + str(idx + 1)
        y_pred_l = models[idx].predict(X_test)
        y_pred_g = predict_global_model(global_models, X_test)
        for metric in metrics:
            if metric == "rmse":
                # RMSE is the square root of the MSE; previously the raw MSE was
                # stored under the "rmse" keys.
                result[prefix + "_rmse_local"] = np.sqrt(mean_squared_error(y_test, y_pred_l))
                result[prefix + "_rmse_global"] = np.sqrt(mean_squared_error(y_test, y_pred_g))
            if metric == "mae":
                result[prefix + "_mae_local"] = mean_absolute_error(y_test, y_pred_l)
                result[prefix + "_mae_global"] = mean_absolute_error(y_test, y_pred_g)

    return result,global_models

In [13]:
def evalute_full_method_regression(
    targets,
    silos,
    metrics,
    tuned_parameters,
    cv,int_cols,cat_cols,
    model=None,
    repeats=2,
    model_type="myvoting",debug_mode=False
):
    """
    Repeat the distributed (per-silo + ensemble) evaluation for every target.

    For each target, ``evaluate_distributed_model_regression`` is run ``repeats``
    times and the scores of every run are collected. The ensemble produced by
    the last run of a target is stored with ``save_zipped_model``.

    Parameters
    ----------
    targets : iterable of target column names.
    silos : iterable of per-silo DataFrames (iterated several times).
    metrics : metric names forwarded to the evaluation ("rmse", "mae").
    tuned_parameters : parameter grid forwarded to the evaluation.
    cv : number of CV folds forwarded to the evaluation.
    int_cols, cat_cols : forwarded unchanged to the evaluation.
    model : estimator to tune. ``None`` (the default) means a fresh
        ``SGDRegressor()``; the previous default was a classifier instance
        created once at definition time and shared between calls.
    repeats : number of evaluation runs per target.
    model_type : label forwarded to the evaluation and to ``save_zipped_model``.
    debug_mode : forwarded to the evaluation.

    Returns
    -------
    dict mapping each target to ``{"silo<N>_<metric>_local" / "_global": [score per repeat]}``.
    """
    if model is None:
        model = SGDRegressor()

    total = {}
    # Fixed seed so the random splits / seeds drawn downstream are reproducible.
    np.random.seed(42)
    for target in targets:
        print("evaluating " + target + "... ")
        score_keys = [
            "silo" + str(silonr + 1) + "_" + metric + suffix
            for metric in metrics
            for silonr, _ in enumerate(silos)
            for suffix in ("_local", "_global")
        ]
        total[target] = {key: [] for key in score_keys}

        global_models = None
        for _ in range(repeats):
            scores, global_models = evaluate_distributed_model_regression(
                silos=silos,
                target=target,
                metrics=metrics,
                tuned_parameters=tuned_parameters,
                cv=cv,
                int_cols=int_cols,
                cat_cols=cat_cols,
                model=model,
                model_type=model_type,
                debug_mode=debug_mode,
            )
            for key in score_keys:
                total[target][key].append(scores[key])

        # Only the ensemble of the last repeat is persisted; with repeats == 0
        # there is nothing to save (this used to raise a NameError).
        if global_models is not None:
            save_zipped_model(target, model, model_type, global_models)

    return total

## SGD

In [21]:
%%time
# 9 min
# Distributed evaluation (per-silo models + weighted ensemble) of an SGD regressor
# with an adaptive learning rate, over all regression targets.
total = evalute_full_method_regression(
    targets=target_int_cols,
    silos=silo_imputed,
    metrics=["rmse", "mae"],
    tuned_parameters=[
        {
            "alpha": [1e-15, 1e-10, 1e-5, 1e-2],
            "tol": [40, 50, 30, 20, 12],
            "eta0": [1e-6, 1e-5, 1e-4],
            "epsilon": [0.1, 0.01],
            "penalty": ["l2", "l1"],
            "max_iter": [10000],
        }
    ],
    cv=10,
    int_cols=int_cols,
    cat_cols=cat_cols,
    model=SGDRegressor(learning_rate="adaptive"),
    repeats=10,
    model_type="ensemble",
    debug_mode=False,
)
# Last expression of the cell: displays the Audio widget.
alarm

evaluating IDADE_MATERNA... 
evaluating PESO_INICIAL... 
evaluating IMC... 
evaluating NUMERO_CONSULTAS_PRE_NATAL... 
evaluating IDADE_GESTACIONAL_ADMISSAO... 
evaluating SEMANAS_GESTACAO_PARTO... 
CPU times: user 51.3 s, sys: 15.8 s, total: 1min 7s
Wall time: 2min 17s


In [26]:
# Convert the SGD results with the project helper and save them under an
# hour-resolution timestamp.
df_sgd = from_dict_to_df_raw(total, "SGD")
now = datetime.now()
date_time = now.strftime("%Y%m%d%H")
df_sgd.to_csv(f"results/reg_sgd{date_time}.csv")

## Decision Trees

In [12]:
%%time
#3h22
warnings.filterwarnings("ignore")

# Distributed evaluation of a decision-tree regressor over all regression targets.
# max_features=None replaces the former "auto": for tree regressors both mean
# "use all features", but "auto" was deprecated in scikit-learn 1.1 and removed in 1.3.
total_dt = evalute_full_method_regression(
    targets=target_int_cols,
    metrics=["rmse", "mae"],
    silos=silo_imputed,
    cv=10,
    int_cols=int_cols,
    cat_cols=cat_cols,
    tuned_parameters=[
        {"criterion": ["squared_error", "absolute_error"], "max_features": ["log2", None]}
    ],
    model=DecisionTreeRegressor(),
    repeats=10,
    model_type="ensemble",
)
# Last expression of the cell: displays the Audio widget.
alarm

evaluating IDADE_MATERNA... 
evaluating PESO_INICIAL... 
evaluating IMC... 
evaluating NUMERO_CONSULTAS_PRE_NATAL... 
evaluating IDADE_GESTACIONAL_ADMISSAO... 
evaluating SEMANAS_GESTACAO_PARTO... 
CPU times: user 12min 39s, sys: 12.2 s, total: 12min 51s
Wall time: 3h 22min 59s


In [13]:
# Convert the decision-tree results with the project helper and save them under
# an hour-resolution timestamp. The file now goes to "results/" like every other
# result file of this notebook (it used to be written to the working directory).
# The former mid-cell `df_dt.head()` displayed nothing and was dropped.
df_dt = from_dict_to_df_raw(total_dt, "decisionTree")
now = datetime.now()
date_time = now.strftime("%Y%m%d%H")
df_dt.to_csv("results/reg_dt" + date_time + ".csv")

## Bayesian Ridge (results saved under the `nb` label)

In [None]:
%%time
#Wall time: 8min 27s
# Distributed evaluation of a Bayesian ridge regressor over all regression targets.
total_nb = evalute_full_method_regression(
    targets=target_int_cols,
    silos=silo_imputed,
    metrics=["rmse", "mae"],
    tuned_parameters=[
        {
            "tol": [1e-3, 1e-4],
            "alpha_1": [1e-6, 1e-5, 1e-4],
            "alpha_2": [1e-6, 1e-5, 1e-4],
        }
    ],
    cv=10,
    int_cols=int_cols,
    cat_cols=cat_cols,
    model=BayesianRidge(),
    repeats=10,
    model_type="ensemble",
)
# Last expression of the cell: displays the Audio widget.
alarm

evaluating IDADE_MATERNA... 
evaluating PESO_INICIAL... 
evaluating IMC... 
evaluating NUMERO_CONSULTAS_PRE_NATAL... 
evaluating IDADE_GESTACIONAL_ADMISSAO... 
evaluating SEMANAS_GESTACAO_PARTO... 
CPU times: user 17min 5s, sys: 7min 1s, total: 24min 7s
Wall time: 8min 52s


In [27]:
from datetime import datetime

# Convert the Bayesian ridge results with the project helper and save them.
df_nb = from_dict_to_df_raw(total_nb, "BayesianRidge")
# Take a fresh timestamp: this cell used to reuse whatever `now` an earlier cell
# had left behind, so the file name could carry a stale hour (or fail if no
# earlier cell had defined `now`).
now = datetime.now()
date_time = now.strftime("%Y%m%d%H")
df_nb.to_csv("results/reg_nb" + date_time + ".csv")

## KNN

In [15]:
%%time
#Wall time: 1h4min

# Distributed evaluation of a k-nearest-neighbours regressor over all regression targets.
total_knn = evalute_full_method_regression(
    targets=target_int_cols,
    silos=silo_imputed,
    metrics=["rmse", "mae"],
    tuned_parameters={"n_neighbors": [5, 7, 10], "p": [1, 2]},
    cv=10,
    int_cols=int_cols,
    cat_cols=cat_cols,
    model=KNeighborsRegressor(),
    repeats=10,
    model_type="ensemble",
)
# Last expression of the cell: displays the Audio widget.
alarm

evaluating IDADE_MATERNA... 
evaluating PESO_INICIAL... 
evaluating IMC... 
evaluating NUMERO_CONSULTAS_PRE_NATAL... 
evaluating IDADE_GESTACIONAL_ADMISSAO... 
evaluating SEMANAS_GESTACAO_PARTO... 
CPU times: user 45min 54s, sys: 53min 33s, total: 1h 39min 27s
Wall time: 1h 4min 59s


In [16]:
# Convert the KNN results with the project helper and save them under an
# hour-resolution timestamp.
df_knn = from_dict_to_df_raw(total_knn, "KNN")
df_knn.head()  # not the last expression of the cell, so nothing is displayed
now = datetime.now()
date_time = now.strftime("%Y%m%d%H")
df_knn.to_csv(f"results/reg_knn{date_time}.csv")

## ADABOOST

In [12]:
%%time
#Wall time: 35m 1.27s
# Distributed evaluation of an AdaBoost regressor over all regression targets.
total_adaboost = evalute_full_method_regression(
    targets=target_int_cols,
    silos=silo_imputed,
    metrics=["rmse", "mae"],
    tuned_parameters={"learning_rate": [1, 2, 0.5], "n_estimators": [25, 50]},
    cv=10,
    int_cols=int_cols,
    cat_cols=cat_cols,
    model=AdaBoostRegressor(),
    repeats=10,
    model_type="ensemble",
)
# Last expression of the cell: displays the Audio widget.
alarm

evaluating IDADE_MATERNA... 
evaluating PESO_INICIAL... 
evaluating IMC... 
evaluating NUMERO_CONSULTAS_PRE_NATAL... 
evaluating IDADE_GESTACIONAL_ADMISSAO... 
evaluating SEMANAS_GESTACAO_PARTO... 
CPU times: user 3min 18s, sys: 26.9 s, total: 3min 45s
Wall time: 37min 27s


In [13]:
# Convert the AdaBoost results with the project helper and save them under an
# hour-resolution timestamp.
df_adaboost = from_dict_to_df_raw(total_adaboost, "ADABOOST")
now = datetime.now()
date_time = now.strftime("%Y%m%d%H")
df_adaboost.to_csv(f"results/reg_adaboost{date_time}.csv")

## NN

In [14]:
%%time
#Wall time: 27min 37s

# Distributed evaluation of a multi-layer perceptron regressor over all regression targets.
total_nn = evalute_full_method_regression(
    targets=target_int_cols,
    silos=silo_imputed,
    metrics=["rmse", "mae"],
    tuned_parameters={
        "solver": ["lbfgs"],
        "learning_rate_init": [0.001, 1e-4],
        "max_iter": [10000, 500],
        "hidden_layer_sizes": [(100,)],
        "alpha": [1e-5, 1e-4],
        "learning_rate": ["adaptive"],
        "tol": [10, 20],
    },
    cv=10,
    int_cols=int_cols,
    cat_cols=cat_cols,
    model=MLPRegressor(),
    repeats=10,
    model_type="ensemble",
)
# Last expression of the cell: displays the Audio widget.
alarm

evaluating IDADE_MATERNA... 
evaluating PESO_INICIAL... 
evaluating IMC... 
evaluating NUMERO_CONSULTAS_PRE_NATAL... 
evaluating IDADE_GESTACIONAL_ADMISSAO... 
evaluating SEMANAS_GESTACAO_PARTO... 
CPU times: user 27min 27s, sys: 4min 2s, total: 31min 30s
Wall time: 27min 31s


In [16]:
# Convert the neural-network results with the project helper and save them under
# an hour-resolution timestamp.
df_nn = from_dict_to_df_raw(total_nn, "NN")
now = datetime.now()
date_time = now.strftime("%Y%m%d%H")
df_nn.to_csv(f"results/reg_nn{date_time}.csv")

## compile all DFs

In [81]:
# Stack the per-model result frames row-wise and save the combined table.
now = datetime.now()
date_time = now.strftime("%Y%m%d%H")
result = pd.concat(
    [df_sgd, df_dt, df_nb, df_knn, df_adaboost, df_nn],
    axis=0,
)
result.to_csv(f"results/reg_result_df{date_time}.csv")

# Total

In [72]:
def create_total_model_and_evaluate(
    silos,
    targets,
    parameters,
    global_model_name,cat_cols,int_cols,
    model,
    cv=10,
    nr_repeats=10,samplingsilo=False
):
    """
    Train and score a centralised model on the concatenation of all silos.

    All silos are concatenated into one table. For every target the procedure
    below is repeated ``nr_repeats`` times: draw an 80/20 train/test split, tune
    ``model`` with ``GridSearchCV`` (repeated k-fold, 2 repeats) on the training
    part and score the best estimator on the test part. The search object of the
    last repeat is stored with ``save_zipped_model`` under the "centralised" label.

    Parameters
    ----------
    silos : iterable of DataFrames.
    targets : iterable of target column names.
    parameters : parameter grid passed to ``GridSearchCV``.
    global_model_name : object whose type name is used in the log file name.
    cat_cols, int_cols : not used by this function; kept for interface compatibility.
    model : estimator to tune. When it has a ``random_state`` parameter a new
        value is set on it in every repeat (this mutates the passed estimator).
    cv : number of folds of the repeated k-fold.
    nr_repeats : number of train/test repetitions per target.
    samplingsilo : when True, silos with more than 1500 rows are down-sampled
        to 1500 rows before concatenation.

    Returns
    -------
    dict mapping each target to ``{"total": {"mae": [...], "rmse": [...]},
    "global": {"mae": [], "rmse": []}}``. Only the "total" lists are filled
    here; the "global" lists are created empty and left untouched.
    """
    np.random.seed(42)
    if samplingsilo:
        # Cap every silo at 1500 rows (the original note says KNN is too slow otherwise).
        minisilos = [silo.sample(1500) if len(silo) > 1500 else silo for silo in silos]
    else:
        minisilos = list(silos)

    full_data = pd.concat(minisilos).reset_index(drop=True)
    full_metric = defaultdict(dict)
    has_random_state = "random_state" in model.get_params()

    # The context manager guarantees the log file is closed even if a fit fails.
    with open("logs/reg_centralised_" + str(type(global_model_name).__name__) + ".txt", "w") as f:
        for target in targets:
            print("testing....",target)
            log_to_file(f,["evaluating " , target , "... "])

            full_metric[target] = {
                "total": {"mae": [], "rmse": []},
                "global": {"mae": [], "rmse": []},
            }

            total_clf = None
            for _ in range(nr_repeats):
                # Same draw as before for nr_repeats >= 2; the max() only avoids the
                # "low >= high" error np.random.randint raised for nr_repeats == 1.
                r_s = np.random.randint(1, max(nr_repeats, 2))
                if has_random_state:
                    model.set_params(random_state=r_s)

                # NOTE(review): no `scoring` is given, so GridSearchCV uses the
                # estimator's default scorer, whereas the distributed evaluation
                # tunes on neg_mean_squared_error - confirm this is intended.
                total_clf = GridSearchCV(
                    model,
                    param_grid=parameters,
                    cv=RepeatedKFold(n_splits=cv, n_repeats=2),
                    n_jobs=-2,
                )
                y = full_data[target]
                X = full_data.drop(columns=[target])
                X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

                total_clf.fit(X_train, y_train)
                y_pred_t = total_clf.best_estimator_.predict(X_test)

                full_metric[target]["total"]["mae"].append(
                    mean_absolute_error(y_test, y_pred_t)
                )
                # RMSE is the square root of the MSE; previously the raw MSE was
                # stored under the "rmse" key.
                full_metric[target]["total"]["rmse"].append(
                    np.sqrt(mean_squared_error(y_test, y_pred_t))
                )

            # Only the search of the last repeat is persisted; with nr_repeats == 0
            # there is nothing to save (this used to raise a NameError).
            if total_clf is not None:
                save_zipped_model(target, model, "centralised", total_clf)

    return full_metric
# Centralised results per model label, filled by the cells below.
# NOTE: re-running this cell resets the dictionary.
result_dict={}

In [18]:
# Centralised baselines: model label -> [estimator, hyper-parameter grid].
# Decision tree and KNN are not part of this dictionary; they are run in the
# cells further down.
data_dict = {
    "SGD": [
        SGDRegressor(),
        [
            {
                "alpha": [1e-10, 1e-5, 1e-2],
                "tol": [40, 50, 30, 20, 12],
                "eta0": [1e-6, 1e-5, 1e-4],
                "epsilon": [0.1, 0.01],
                "penalty": ["l2", "l1"],
                "max_iter": [10000],
            }
        ],
    ],
    "NaiveBayes": [
        BayesianRidge(),
        [
            {
                "tol": [1e-3, 1e-4],
                "alpha_1": [1e-6, 1e-5, 1e-4],
                "alpha_2": [1e-6, 1e-5, 1e-4],
            }
        ],
    ],
    "ADABOOST": [
        AdaBoostRegressor(),
        {"learning_rate": [1, 2, 0.5], "n_estimators": [25, 50]},
    ],
    "NN": [
        MLPRegressor(),
        {
            "solver": ["lbfgs"],
            "learning_rate_init": [0.001, 1e-4],
            "max_iter": [10000, 500],
            "hidden_layer_sizes": [(100,)],
            "alpha": [1e-5, 1e-4],
            "learning_rate": ["adaptive"],
            "tol": [10, 20],
        },
    ],
}

# Train and score every centralised baseline on all regression targets.
for label, (estimator, grid) in data_dict.items():
    print(label)
    result_dict[label] = create_total_model_and_evaluate(
        silos=silo_imputed,
        targets=target_int_cols,
        parameters=grid,
        global_model_name=estimator,
        cat_cols=cat_cols,
        int_cols=int_cols,
        model=estimator,
    )

SGD
testing.... IDADE_MATERNA
testing.... PESO_INICIAL
testing.... IMC
testing.... NUMERO_CONSULTAS_PRE_NATAL
testing.... IDADE_GESTACIONAL_ADMISSAO
testing.... SEMANAS_GESTACAO_PARTO
NaiveBayes
testing.... IDADE_MATERNA
testing.... PESO_INICIAL
testing.... IMC
testing.... NUMERO_CONSULTAS_PRE_NATAL
testing.... IDADE_GESTACIONAL_ADMISSAO
testing.... SEMANAS_GESTACAO_PARTO
ADABOOST
testing.... IDADE_MATERNA
testing.... PESO_INICIAL
testing.... IMC
testing.... NUMERO_CONSULTAS_PRE_NATAL
testing.... IDADE_GESTACIONAL_ADMISSAO
testing.... SEMANAS_GESTACAO_PARTO
NN
testing.... IDADE_MATERNA
testing.... PESO_INICIAL
testing.... IMC
testing.... NUMERO_CONSULTAS_PRE_NATAL
testing.... IDADE_GESTACIONAL_ADMISSAO
testing.... SEMANAS_GESTACAO_PARTO


In [20]:
# Centralised decision-tree baseline, run for a single target only.
# max_features=None replaces the former "auto": for tree regressors both mean
# "use all features", but "auto" was deprecated in scikit-learn 1.1 and removed in 1.3.
data_dict={
    "decisionTree":[DecisionTreeRegressor(),[{"criterion": ["squared_error", "absolute_error"], "max_features": ["log2", None]}]],
         }


for k,v in data_dict.items():
    print(k)
    result_dict[k] = create_total_model_and_evaluate(
        silos=silo_imputed,
        targets=["SEMANAS_GESTACAO_PARTO"],
        parameters=v[1],cat_cols=cat_cols,int_cols=int_cols,
        global_model_name=v[0],model=v[0]
    )

decisionTree
testing.... SEMANAS_GESTACAO_PARTO


In [73]:
# Centralised k-nearest-neighbours baseline on all regression targets.
data_dict = {
    "KNN": [
        KNeighborsRegressor(),
        [{"n_neighbors": [5, 7, 10], "p": [1, 2]}],
    ],
}
# 1361
for label, (estimator, grid) in data_dict.items():
    print(label)
    result_dict[label] = create_total_model_and_evaluate(
        silos=silo_imputed,
        targets=target_int_cols,
        parameters=grid,
        global_model_name=estimator,
        cat_cols=cat_cols,
        int_cols=int_cols,
        model=estimator,
    )

KNN
testing.... IDADE_MATERNA
testing.... PESO_INICIAL
testing.... IMC
testing.... NUMERO_CONSULTAS_PRE_NATAL
testing.... IDADE_GESTACIONAL_ADMISSAO
testing.... SEMANAS_GESTACAO_PARTO


In [None]:
# Independent two-sample t-test between the "total" and "global" RMSE samples of
# every model/target pair stored in result_dict; prints the p-value.
for k,v in result_dict.items():
    for k2, v2 in v.items():
        total_rmse = v2["total"]["rmse"]
        global_rmse = v2["global"]["rmse"]
        # A t-test needs observations on both sides; with an empty sample the
        # p-value would only be NaN, so say so explicitly instead.
        if len(total_rmse) == 0 or len(global_rmse) == 0:
            print(k, k2, "skipped: no RMSE values for one of the groups")
            continue
        _, pval = ttest_ind(total_rmse, global_rmse, nan_policy="omit")
        print(k,k2, pval)

# All on silos

In [14]:
def evaluate_models_on_local_regression(
        silos, target, metrics, cv, int_cols, cat_cols, tuned_parameters, model,f,
        debug_mode=False
):
    """
    Score local, distributed and centralised models on each silo's held-out data.

    For every silo an 80/20 train/test split is drawn and a local model is tuned
    with ``GridSearchCV`` (repeated k-fold with 2 repeats, scored with
    ``neg_mean_squared_error``) on the training part. The distributed ensemble
    and the centralised model are loaded with ``load_model_from_zip`` under the
    names ``<target>_<ModelClass>_ensemble`` and ``<target>_<ModelClass>_centralised``
    (presumably the archives written by ``save_zipped_model`` - verify). All
    three are then scored on every silo's test split.

    Parameters
    ----------
    silos : iterable of DataFrame-like objects, each containing column ``target``.
    target : name of the column to predict.
    metrics : iterable of metric names; only "rmse" and "mae" produce results.
    cv : number of folds of the repeated k-fold used for tuning.
    int_cols, cat_cols : not used by this function; kept for interface compatibility.
    tuned_parameters : parameter grid passed to ``GridSearchCV``.
    model : estimator to tune. When it has a ``random_state`` parameter, a new
        value is drawn for every silo (this mutates the passed estimator).
    f : open log file owned by the caller; the start message is written to it
        and it is not closed here. When ``None``, the message goes to
        ``logs/regression_<ModelClass>.txt`` instead.
    debug_mode : when True, prints the silo index and a random draw.

    Returns
    -------
    dict mapping ``"silo<N>_<metric>_local"``, ``"..._distributed"`` and
    ``"..._centralised"`` to the corresponding score.
    """
    start_message = ["start", type(model).__name__]
    if f is None:
        with open("logs/regression_" + str(type(model).__name__) + ".txt", "a") as log_file:
            log_to_file(log_file, start_message)
    else:
        # Previously the caller's handle was shadowed by a newly opened file and
        # therefore never used.
        log_to_file(f, start_message)

    models = []
    test_sets = []
    for idx, silo in enumerate(silos):
        if debug_mode:
            print("silo",str(idx))
            # NOTE: this draw is only printed; it is not the seed set below,
            # but it does advance the global NumPy random state.
            print(np.random.randint(1,20))
        if "random_state" in model.get_params().keys():
            model.set_params(random_state=np.random.randint(1, 20))

        clf = GridSearchCV(
            model,
            tuned_parameters,
            cv=RepeatedKFold(n_splits=cv, n_repeats=2),
            n_jobs=-3,
            scoring="neg_mean_squared_error",
        )
        y = silo[target]
        X = silo.drop(columns=[target])
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

        test_sets.append((y_test, X_test))
        models.append(clf.fit(X_train, y_train).best_estimator_)

    model_name = str(type(model).__name__)
    distributed_model = load_model_from_zip(target + "_" + model_name + "_ensemble")
    centralised_model = load_model_from_zip(target + "_" + model_name + "_centralised")

    result = {}
    for idx, (y_test, X_test) in enumerate(test_sets):
        prefix = "silo" + str(idx + 1)
        predictions = {
            "local": models[idx].predict(X_test),
            "distributed": predict_global_model(distributed_model, X_test),
            # The centralised object is used through `best_estimator_`, i.e. it is
            # expected to be a fitted search object.
            "centralised": centralised_model.best_estimator_.predict(X_test),
        }
        for metric in metrics:
            for kind, y_pred in predictions.items():
                if metric == "rmse":
                    # RMSE is the square root of the MSE; previously the raw MSE
                    # was stored under the "rmse" keys.
                    result[prefix + "_rmse_" + kind] = np.sqrt(mean_squared_error(y_test, y_pred))
                if metric == "mae":
                    result[prefix + "_mae_" + kind] = mean_absolute_error(y_test, y_pred)

    return result

In [15]:
def evaluate_all_on_local_regression(
        targets,
        silos,
        metrics,
        tuned_parameters,
        cv, int_cols, cat_cols, 
        model,
        repeats, debug_mode=False
):
    """
    Repeat the local / distributed / centralised comparison for every target.

    For each target, ``evaluate_models_on_local_regression`` is run ``repeats``
    times and the scores of every run are collected.

    Parameters
    ----------
    targets : iterable of target column names.
    silos : iterable of per-silo DataFrames (iterated several times).
    metrics : metric names forwarded to the evaluation ("rmse", "mae").
    tuned_parameters : parameter grid forwarded to the evaluation.
    cv : number of CV folds forwarded to the evaluation.
    int_cols, cat_cols : forwarded unchanged to the evaluation.
    model : estimator forwarded to the evaluation; its type name is also used
        in the log file name.
    repeats : number of evaluation runs per target.
    debug_mode : forwarded to the evaluation.

    Returns
    -------
    dict mapping each target to ``{"silo<N>_<metric>_local" / "_distributed" /
    "_centralised": [score per repeat]}``.
    """
    total = {}
    # Fixed seed so the random splits / seeds drawn downstream are reproducible.
    np.random.seed(42)
    log_path = "logs/log_" + str(type(model).__name__) + "_final_test_all_reg.txt"
    for target in targets:
        score_keys = [
            "silo" + str(silonr + 1) + "_" + metric + suffix
            for metric in metrics
            for silonr, _ in enumerate(silos)
            for suffix in ("_local", "_distributed", "_centralised")
        ]
        total[target] = {key: [] for key in score_keys}

        # One handle per target, closed as soon as the target is done. Previously a
        # new handle was opened per target but only the last one was closed, and
        # an empty `targets` raised a NameError on the final close.
        with open(log_path, "a") as f:
            log_to_file(f,["evaluating " , target , "... "])

            for _ in range(repeats):
                t = evaluate_models_on_local_regression(
                    silos=silos,
                    target=target,
                    metrics=metrics,
                    tuned_parameters=tuned_parameters,
                    cv=cv, int_cols=int_cols, cat_cols=cat_cols,
                    model=model,f=f, debug_mode=debug_mode
                )
                for key in score_keys:
                    total[target][key].append(t[key])

    return total

In [16]:
#warnings.filterwarnings("ignore")
#145min
# Local vs. distributed vs. centralised comparison for the adaptive SGD regressor.
sgd_all_local = evaluate_all_on_local_regression(
    targets=target_int_cols,
    silos=silo_imputed,
    metrics=["rmse", "mae"],
    tuned_parameters=[
        {
            "alpha": [1e-15, 1e-10, 1e-5, 1e-2],
            "tol": [40, 50, 30, 20, 12],
            "eta0": [1e-6, 1e-5, 1e-4],
            "epsilon": [0.1, 0.01],
            "penalty": ["l2", "l1"],
            "max_iter": [10000],
        }
    ],
    cv=10,
    int_cols=int_cols,
    cat_cols=cat_cols,
    model=SGDRegressor(learning_rate="adaptive"),
    repeats=10,
    debug_mode=False,
)
# Last expression of the cell: displays the Audio widget.
alarm

In [17]:
# Convert the SGD comparison results with the project helper and save them under
# an hour-resolution timestamp.
df_sgd = from_dict_to_df_raw(sgd_all_local, "SGD")
now = datetime.now()
date_time = now.strftime("%Y%m%d%H")
df_sgd.to_csv(f"results/reg_all_in_all_sgd{date_time}.csv")

In [18]:
warnings.filterwarnings("ignore")
#238min
# Local vs. distributed vs. centralised comparison for the decision-tree regressor.
# max_features=None replaces the former "auto": for tree regressors both mean
# "use all features", but "auto" was deprecated in scikit-learn 1.1 and removed in 1.3.
tt_dt = evaluate_all_on_local_regression(
    targets=target_int_cols,
    metrics=["rmse", "mae"],
    silos=silo_imputed,
    cv=10,
    int_cols=int_cols,
    cat_cols=cat_cols,
    tuned_parameters=[
        {"criterion": ["squared_error", "absolute_error"], "max_features": ["log2", None]}
    ],
    model=DecisionTreeRegressor(),
    repeats=10,
)

In [19]:
# Convert the decision-tree comparison results with the project helper and save
# them under an hour-resolution timestamp.
df_dt = from_dict_to_df_raw(tt_dt, "decisionTree")
now = datetime.now()
date_time = now.strftime("%Y%m%d%H")
df_dt.to_csv(f"results/reg_all_in_all_dt{date_time}.csv")

In [20]:
warnings.filterwarnings("ignore")
# 9min
# Local vs. distributed vs. centralised comparison for the Bayesian ridge regressor.
tt_nb = evaluate_all_on_local_regression(
    targets=target_int_cols,
    silos=silo_imputed,
    metrics=["rmse", "mae"],
    tuned_parameters=[
        {
            "tol": [1e-3, 1e-4],
            "alpha_1": [1e-6, 1e-5, 1e-4],
            "alpha_2": [1e-6, 1e-5, 1e-4],
        }
    ],
    cv=10,
    int_cols=int_cols,
    cat_cols=cat_cols,
    model=BayesianRidge(),
    repeats=10,
    debug_mode=False,
)

In [21]:
# Convert the Bayesian ridge comparison results with the project helper and save
# them under an hour-resolution timestamp.
df_nb = from_dict_to_df_raw(tt_nb, "NaiveBayes")
now = datetime.now()
date_time = now.strftime("%Y%m%d%H")
df_nb.to_csv(f"results/reg_all_in_all_nb{date_time}.csv")

In [28]:
warnings.filterwarnings("ignore")
# 38m
# Local vs. distributed vs. centralised comparison for the AdaBoost regressor.
tt_ada = evaluate_all_on_local_regression(
    targets=target_int_cols,
    silos=silo_imputed,
    metrics=["rmse", "mae"],
    tuned_parameters={"learning_rate": [1, 2, 0.5], "n_estimators": [25, 50]},
    cv=10,
    int_cols=int_cols,
    cat_cols=cat_cols,
    model=AdaBoostRegressor(),
    repeats=10,
)

In [30]:
# Convert the AdaBoost comparison results with the project helper and save them
# under an hour-resolution timestamp.
df_adaboost = from_dict_to_df_raw(tt_ada, "ADABOOST")
now = datetime.now()
date_time = now.strftime("%Y%m%d%H")
df_adaboost.to_csv(f"results/reg_all_in_all_adaboost{date_time}.csv")

In [23]:
warnings.filterwarnings("ignore")
# 60m
# Local vs. distributed vs. centralised comparison for the KNN regressor.
tt_knn = evaluate_all_on_local_regression(
    targets=target_int_cols,
    silos=silo_imputed,
    metrics=["rmse", "mae"],
    tuned_parameters={"n_neighbors": [5, 7, 10], "p": [1, 2]},
    cv=10,
    int_cols=int_cols,
    cat_cols=cat_cols,
    model=KNeighborsRegressor(),
    repeats=10,
)

In [24]:
# Convert the KNN comparison results with the project helper and save them under
# an hour-resolution timestamp.
df_knn = from_dict_to_df_raw(tt_knn, "KNN")
now = datetime.now()
date_time = now.strftime("%Y%m%d%H")
df_knn.to_csv(f"results/reg_all_in_all_knn{date_time}.csv")

In [25]:
warnings.filterwarnings("ignore")
# 27min
# Local vs. distributed vs. centralised comparison for the MLP regressor.
tt_nn = evaluate_all_on_local_regression(
    targets=target_int_cols,
    silos=silo_imputed,
    metrics=["rmse", "mae"],
    tuned_parameters={
        "solver": ["lbfgs"],
        "learning_rate_init": [0.001, 1e-4],
        "max_iter": [10000, 500],
        "hidden_layer_sizes": [(100,)],
        "alpha": [1e-5, 1e-4],
        "learning_rate": ["adaptive"],
        "tol": [10, 20],
    },
    cv=10,
    int_cols=int_cols,
    cat_cols=cat_cols,
    model=MLPRegressor(),
    repeats=10,
)

In [26]:
# Convert the neural-network comparison results with the project helper and save
# them under an hour-resolution timestamp.
df_nn = from_dict_to_df_raw(tt_nn, "NN")
now = datetime.now()
date_time = now.strftime("%Y%m%d%H")
df_nn.to_csv(f"results/reg_all_in_all_nn{date_time}.csv")

In [31]:
# Stack the per-model comparison frames row-wise and save the combined table.
now = datetime.now()
date_time = now.strftime("%Y%m%d%H")
result = pd.concat(
    [df_sgd, df_dt, df_nb, df_adaboost, df_nn, df_knn],
    axis=0,
)
result.to_csv(f"results/reg_result_all_in_all_df_{date_time}.csv")