# Start

In [1]:

import glob
import json
import os
import pickle
import statistics
import warnings
import zipfile
from collections import defaultdict
from datetime import datetime
from itertools import compress

import matplotlib as mpl
import matplotlib.font_manager as fm
import numpy as np
import pandas as pd
from imblearn.over_sampling import RandomOverSampler
from IPython.display import Audio, display
from matplotlib import pyplot as plt
from pylab import cm
from scipy.stats import ttest_ind
from sklearn.ensemble import AdaBoostClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import (
    f1_score, average_precision_score,
    roc_auc_score,
)
from sklearn.model_selection import (
    GridSearchCV,
    RepeatedStratifiedKFold,
    train_test_split,
)
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import label_binarize
from sklearn.tree import DecisionTreeClassifier

from federated_eval_helper_functions import *

In [2]:
# Number of times each evaluation is repeated (passed as `repeats=` to evalute_full_method).
REPS = 10
# Number of cross-validation splits (passed as `cv=` and used as `n_splits` in the grid searches).
CV_NUMBER = 10


# import treated data

In [3]:
# Collect all the font names available to matplotlib
font_names = [f.name for f in fm.fontManager.ttflist]
# Edit the font, font size, and axes width.
# "Avenir" is only selected when it is installed; otherwise matplotlib's
# default family is kept instead of requesting a font that is not there.
if "Avenir" in font_names:
    mpl.rcParams["font.family"] = "Avenir"
plt.rcParams["font.size"] = 10
plt.rcParams["axes.linewidth"] = 2
# Generate 2 colors from the 'tab10' colormap.
# pyplot.get_cmap accepts the same (name, lut) arguments as the former
# cm.get_cmap, which is no longer available in recent Matplotlib releases.
colors = plt.get_cmap("tab10", 2)

# Short sound used to signal the end of a long-running cell (needs network access).
sound_file = "https://www.soundjay.com/buttons/button-09a.wav"  # https://www.soundjay.com/buttons/sounds/button-1.mp3
alarm = Audio(sound_file, autoplay=True)

In [4]:
# Folder with the imputed per-silo CSV files. Set the SILO_DATA_DIR environment
# variable to run on another machine; the original location stays the default.
silo_dir = os.environ.get("SILO_DATA_DIR", "/Users/joaoalmeida/Desktop/tese_local/Obscare Giovana/imputed")
# glob returns files in arbitrary order, so sort them: "silo N" in every result
# key must refer to the same file on every run (note: lexicographic order).
silo_files = sorted(glob.glob(os.path.join(silo_dir, "silo*.csv")))
if not silo_files:
    raise FileNotFoundError("no silo*.csv files found in " + silo_dir)
silo_imputed = [pd.read_csv(file, index_col=0) for file in silo_files]

In [5]:
# List the columns of the first silo together with their 1-based position.
for position, column in enumerate(silo_imputed[0].columns, start=1):
    print(column)
    print(position)

IDADE_MATERNA
1
PESO_INICIAL
2
IMC
3
NUMERO_CONSULTAS_PRE_NATAL
4
IDADE_GESTACIONAL_ADMISSAO
5
SEMANAS_GESTACAO_PARTO
6
PESO_ADMISSAO_INTERNAMENTO
7
GS
8
BISHOP_DILATACAO
9
CIGARROS
10
CESARIANAS_ANTERIOR
11
A_PARA
12
EUTOCITO_ANTERIOR
13
APRESENTACAO_NO_PARTO
14
VIGIADA
15
APRESENTACAO_ADMISSAO
16
BISHOP_SCORE
17
TIPO_GRAVIDEZ
18
VIGIADA_PARICULAR
19
DIABETES_GESTACIONAL
20
TRAB_PARTO_NO_PARTO
21
BISHOP_DESCIDA
22
BISHOP_CONSISTENCIA
23
BACIA
24
VIGIADA_NESTE_HOSPITAL
25
RPM
26
FORCEPS_ANTERIOR
27
TIPO_PARTO
28
TRAB_PARTO_ENTRADA_INDUZIDO
29
BISHOP_EXTINCAO
30
TRAB_PARTO_ENTRADA_ESPONTANEO
31
VENTOSAS_ANTERIOR
32
BISHOP_POSICAO
33
A_GESTA
34
VIGIADA_CENTRO_SAUDE
35
GRUPO_ROBSON
36


In [6]:
# Read the column-type description: the "int" and "cat" entries of the JSON
# file become the two column lists used by the rest of the notebook.
with open("col_types.json", "r") as infile:
    col_types = json.load(infile)
int_cols, cat_cols = col_types["int"], col_types["cat"]

In [7]:
# Categorical columns that are used, one at a time, as the prediction target.
target_cat_cols = [
    "GS",
    "A_PARA",
    "A_GESTA",
    "TIPO_GRAVIDEZ",
    "VIGIADA",
    "VIGIADA_CENTRO_SAUDE",
    "VIGIADA_NESTE_HOSPITAL",
    "APRESENTACAO_ADMISSAO",
    "TRAB_PARTO_ENTRADA_ESPONTANEO",
    "TIPO_PARTO",
    "APRESENTACAO_NO_PARTO",
    "TRAB_PARTO_NO_PARTO",
    "GRUPO_ROBSON",
]

# Functions

In [8]:
def get_all_classes(dfs, target):
    """Return every distinct value of column `target` found across `dfs`.

    Values are listed in order of first appearance, scanning the frames in
    the order given and each frame's `unique()` result from start to end.
    """
    seen = []
    for frame in dfs:
        for value in frame[target].unique():
            if value not in seen:
                seen.append(value)
    return seen

In [9]:
# For every categorical column, the full set of classes seen in any silo.
classes_dict = {col: get_all_classes(silo_imputed, col) for col in cat_cols}

In [10]:
def prepare_global_model_init(g_model, X_train, y_train, model_type):
    """Fit the "ensemble" entry of `g_model` on the given training data.

    Args:
        g_model: mapping that holds the global model under the key "ensemble".
        X_train, y_train: training data handed to the ensemble's `fit`.
        model_type: accepted for interface compatibility; not used here.

    Returns:
        The same `g_model` mapping, with its "ensemble" entry fitted.
    """
    ensemble = g_model["ensemble"]
    # Only the ensemble is fitted; other entries of the mapping are left untouched.
    ensemble.fit(X_train, y_train)
    return g_model

In [11]:
def evaluate_variables_and_transform_variables(df, target, cat_cols, int_cols, class_list=None, threshold=12,
                                               nr_rows=25):
    """Split `df` into features and target, padding under-represented classes.

    Args:
        df: data frame containing the column `target`.
        target: name of the column to predict.
        cat_cols, int_cols: column lists forwarded to `dummy_row_creation`.
        class_list: optional list of all expected classes; every class in it
            that is absent from `df` gets rows added via `dummy_row_creation`.
        threshold: classes with at most this many rows are oversampled.
        nr_rows: row count requested for added / oversampled classes.

    Returns:
        `(X, y)`: features and target, randomly oversampled when at least one
        class was at or below `threshold`, otherwise taken straight from `df`.
    """
    if class_list:
        absent = list(set(class_list) - set(df[str(target)].unique()))
        for label in absent:
            df = dummy_row_creation(df, target, label, int_cols, cat_cols, nr_rows)

    is_rare = df[target].value_counts().le(threshold)
    rare_labels = list(is_rare[is_rare].index.values)

    labels = df[target]
    features = df.drop(columns=[target])
    if len(rare_labels) == 0:
        return features, labels

    # Bring every rare class up to nr_rows samples by random duplication.
    strategy = {label: nr_rows for label in rare_labels}
    sampler = RandomOverSampler(random_state=0, sampling_strategy=strategy)
    X_resampled, y_resampled = sampler.fit_resample(features, labels)
    return X_resampled, y_resampled

In [12]:
def auprc_multiclass(y_test, y_score, full_classes):
    """Weighted average precision of `y_score` against binarised `y_test`.

    `y_test` is binarised with `label_binarize` using `full_classes` as the
    class list. If the score cannot be computed, the error and both inputs
    are printed and `np.nan` is returned instead of raising.
    """
    y_test_multilabel = label_binarize(y_test, classes=full_classes)
    try:
        return average_precision_score(y_test_multilabel, y_score, average="weighted")
    except Exception as e:
        for tag, payload in (("y_score --->", y_score), (" y_test--->", y_test_multilabel)):
            print("Error on", "auprc score calculate", e, tag, payload)
        return np.nan

In [16]:
def evaluate_federated_model(
        silos, target, metrics, cv, int_cols, cat_cols, tuned_parameters, model, full_classes, model_type="myvoting",
        debug_mode=False
):
    """
    Train a tuned local model per silo, combine them into a global model and
    score local vs. global predictions on each silo's held-out test split.

    Args:
        silos: list of data frames, one per silo.
        target: name of the target column.
        metrics: any of "auprc", "roc_auc_score", "f1".
        cv: number of splits for the repeated stratified CV of the grid search.
        int_cols, cat_cols: column lists forwarded to
            evaluate_variables_and_transform_variables.
        tuned_parameters: parameter grid for GridSearchCV.
        model: estimator to tune; its random_state (if it has one) is re-drawn per silo.
        full_classes: mapping target -> list of all classes across silos.
        model_type: key of the global model used for predictions.
        debug_mode: print the silo index and the drawn random_state.

    Returns:
        (result, global_model) where result maps
        "silo<N>_<metric>_local" / "silo<N>_<metric>_global" to a score
        (np.nan when a ROC AUC could not be computed).
    """
    grid_list = []
    result = {}
    models = []
    test_sets = []
    class_counts = []
    X_train_list = []
    y_train_list = []
    # The context manager closes the log even if training or scoring raises.
    with open("logs/log_" + str(type(model).__name__) + ".txt", "a") as f:
        f.write(datetime.now().strftime("%Y%m%d - %H:%M") + "\n")
        for idx, silo in enumerate(silos):
            if debug_mode:
                print("silo", str(idx))
            if "random_state" in model.get_params().keys():
                seed = np.random.randint(1, 20)
                if debug_mode:
                    # Print the seed that is actually used; drawing a second
                    # number just for printing would shift the random stream.
                    print(seed)
                model.set_params(random_state=seed)
            clf = GridSearchCV(
                model, tuned_parameters, cv=RepeatedStratifiedKFold(n_splits=cv, n_repeats=2), n_jobs=-2,
                scoring='f1_micro'
            )
            # Keyword arguments: the callee's signature is (..., cat_cols, int_cols, ...),
            # so passing `int_cols, cat_cols` positionally would swap the two lists.
            X, y = evaluate_variables_and_transform_variables(
                silo, target, cat_cols=cat_cols, int_cols=int_cols, class_list=full_classes[target]
            )
            # Number of classes this silo's model is trained on, kept per silo
            # so scoring below does not reuse the value of the last silo.
            class_counts.append(len(np.unique(y)))

            X_train, X_test, y_train, y_test = train_test_split(
                X, y, test_size=0.2, stratify=y
            )

            test_sets.append((y_test, X_test))
            X_train_list.append(X_train)
            y_train_list.append(y_train)
            models.append(clf.fit(X_train, y_train).best_estimator_)
            grid_list.append(clf)
            print(clf.best_params_)

        w = define_weights(grid_list)  # weights based on scores / sum of all scores
        f.write("w," + " " + str(w) + "\n")
        global_model = create_global_model(models, "voting", w)
        # The ensemble is fitted on the first silo's training split so it can be used for prediction.
        global_model = prepare_global_model_init(
            global_model, X_train_list[0], y_train_list[0], model_type
        )

        for idx, (y_test, X_test) in enumerate(test_sets):
            prefix = "silo" + str(idx + 1)
            local_model = models[idx]
            ensemble = global_model[model_type]

            y_pred_l_auc = local_model.predict_proba(X_test)
            y_pred_g_auc = ensemble.predict_proba(X_test)
            y_pred_l = local_model.predict(X_test)
            y_pred_g = ensemble.predict(X_test)
            classes_score = ensemble.classes_
            if class_counts[idx] < 3:
                # Fewer than three classes: score with the predicted labels.
                y_pred_l_auc = y_pred_l
                y_pred_g_auc = y_pred_g

            for metric in metrics:
                if metric == "auprc":
                    result[prefix + "_auprc_local"] = auprc_multiclass(y_test, y_pred_l_auc, classes_score)
                    result[prefix + "_auprc_global"] = auprc_multiclass(y_test, y_pred_g_auc, classes_score)
                if metric == "roc_auc_score":
                    try:
                        result[prefix + "_roc_auc_score_local"] = roc_auc_score(
                            y_true=y_test, y_score=y_pred_l_auc, average="weighted", multi_class="ovr"
                        )
                    except Exception as e:
                        f.write("Error on local-silo" + " " + str(idx + 1) + "\n")
                        f.write("roc score calculate" + " " + str(e) + " --->" + str(y_pred_l_auc) + "\n")
                        result[prefix + "_roc_auc_score_local"] = np.nan
                    try:
                        result[prefix + "_roc_auc_score_global"] = roc_auc_score(
                            y_true=y_test, y_score=y_pred_g_auc, average="weighted", multi_class="ovr"
                        )
                    except Exception as e:
                        f.write("Error on global-silo " + str(idx + 1) + "\n")
                        f.write("roc score calculate " + str(e) + " ---> " + str(y_pred_g_auc) + "\n")
                        result[prefix + "_roc_auc_score_global"] = np.nan
                if metric == "f1":
                    result[prefix + "_f1_local"] = f1_score(
                        y_pred=y_pred_l, y_true=y_test, average="weighted"
                    )
                    result[prefix + "_f1_global"] = f1_score(
                        y_pred=y_pred_g, y_true=y_test, average="weighted"
                    )

    return result, global_model

In [14]:
def evalute_full_method(
        targets,
        silos,
        metrics,
        tuned_parameters,
        cv, int_cols, cat_cols, full_classes,
        model=None,
        repeats=2,
        model_type="myvoting", debug_mode=False
):
    """Run evaluate_federated_model `repeats` times for every target.

    Args:
        targets: target column names to evaluate one after another.
        silos, metrics, tuned_parameters, cv, int_cols, cat_cols, full_classes,
        model_type, debug_mode: forwarded unchanged to evaluate_federated_model.
        model: estimator to tune; a fresh SGDClassifier() when omitted.
        repeats: number of repetitions per target.

    Returns:
        dict: target -> {"models": [], "g_model": [],
        "silo<N>_<metric>_local": [...], "silo<N>_<metric>_global": [...]}
        with one score per repetition in every metric list.

    Side effects:
        Seeds numpy's global RNG with 42 and, per target, saves the global
        model of the last repetition through save_zipped_model.
    """
    if model is None:
        # Created per call: an estimator instance as a default argument would
        # be shared (and mutated through set_params) across calls.
        model = SGDClassifier()
    total = {}
    np.random.seed(42)
    for target in targets:
        print("evaluating " + target + "... ")
        total[target] = {"models": [], "g_model": []}
        score_keys = []
        for metric in metrics:
            for silonr, _ in enumerate(silos):
                for scope in ("_local", "_global"):
                    key = "silo" + str(silonr + 1) + "_" + metric + scope
                    total[target][key] = []
                    score_keys.append(key)

        last_global_model = None
        for _ in range(repeats):
            scores, last_global_model = evaluate_federated_model(
                silos=silos,
                target=target,
                metrics=metrics,
                tuned_parameters=tuned_parameters,
                cv=cv, int_cols=int_cols, cat_cols=cat_cols,
                model=model, full_classes=full_classes,
                model_type=model_type, debug_mode=debug_mode
            )
            for key in score_keys:
                total[target][key].append(scores[key])
        # Nothing to persist when no repetition ran (repeats < 1).
        if last_global_model is not None:
            save_zipped_model(target, model, model_type, last_global_model)

    return total

## SGD

In [18]:
%%time
# Wall time: 1h1m
# stochastic gradient
warnings.filterwarnings("ignore")

total = evalute_full_method(
    repeats=REPS,
    targets=target_cat_cols,
    metrics=["roc_auc_score", "auprc"],
    silos=silo_imputed,
    cv=CV_NUMBER, int_cols=int_cols, cat_cols=cat_cols, full_classes=classes_dict,
    tuned_parameters=[{"alpha": [0.0001, 0.01,0.001], "l1_ratio": [0.05],"loss":["log_loss","modified_huber"]}],
    model_type="ensemble", debug_mode=False
)
# A bare `alarm` in the middle of a cell is never rendered (only the last
# expression of a cell is), so display it explicitly to play the sound.
display(alarm)
with open("raw_classif_sgd.json", "w") as f:
    json.dump(total, f)

evaluating GS... 
{'alpha': 0.001, 'l1_ratio': 0.05, 'loss': 'modified_huber'}
{'alpha': 0.01, 'l1_ratio': 0.05, 'loss': 'modified_huber'}
{'alpha': 0.01, 'l1_ratio': 0.05, 'loss': 'modified_huber'}
{'alpha': 0.0001, 'l1_ratio': 0.05, 'loss': 'modified_huber'}
{'alpha': 0.0001, 'l1_ratio': 0.05, 'loss': 'modified_huber'}
{'alpha': 0.0001, 'l1_ratio': 0.05, 'loss': 'modified_huber'}
{'alpha': 0.001, 'l1_ratio': 0.05, 'loss': 'modified_huber'}
{'alpha': 0.001, 'l1_ratio': 0.05, 'loss': 'modified_huber'}
{'alpha': 0.01, 'l1_ratio': 0.05, 'loss': 'modified_huber'}
{'alpha': 0.01, 'l1_ratio': 0.05, 'loss': 'modified_huber'}
{'alpha': 0.01, 'l1_ratio': 0.05, 'loss': 'modified_huber'}
{'alpha': 0.001, 'l1_ratio': 0.05, 'loss': 'modified_huber'}
{'alpha': 0.0001, 'l1_ratio': 0.05, 'loss': 'modified_huber'}
{'alpha': 0.0001, 'l1_ratio': 0.05, 'loss': 'modified_huber'}
{'alpha': 0.01, 'l1_ratio': 0.05, 'loss': 'modified_huber'}
{'alpha': 0.0001, 'l1_ratio': 0.05, 'loss': 'modified_huber'}
{'alph

In [None]:
# Post-process the raw SGD scores collected in `total`.
# NOTE(review): get_stats is not defined in this notebook (presumably it comes
# from the star-imported helper module) — confirm what it returns.
final = get_stats(total)
#plot_paper_grade_error(final)

In [None]:
# NOTE(review): get_ttest is not defined in this notebook; presumably it
# compares the local and global score lists of the SGD run — confirm.
ttest_data = get_ttest(total)

In [None]:
# Build the SGD results table and write it to a CSV stamped with date and hour.
df_sgd = create_mega_table(total, "SGD")
now = datetime.now()
date_time = now.strftime("%Y%m%d%H")
df_sgd.to_csv("results/classif_sgd" + date_time + ".csv")
# Preview goes last: only the final expression of a cell is rendered.
df_sgd.head()

## Decision Tree

In [None]:
%%time

warnings.filterwarnings("ignore")
# decision tree
# Wall time: 21min

# voting is the only fair option: stacking, which learns from a single X_train,
# does very well on that silo and badly on the others.
# ensemble: not sure yet.
total_dt = evalute_full_method(
    targets=target_cat_cols,
    metrics=["roc_auc_score", "auprc"],
    silos=silo_imputed,
    cv=CV_NUMBER, int_cols=int_cols, cat_cols=cat_cols, full_classes=classes_dict,
    tuned_parameters=[
        # "sqrt" is what "auto" meant for classification trees; "auto" itself
        # is rejected by current scikit-learn releases.
        {"criterion": ["gini", "entropy"], "max_features": ["log2", "sqrt"]}
    ],
    model=DecisionTreeClassifier(),
    repeats=REPS,
    model_type="ensemble",
)
# A bare `alarm` mid-cell is never rendered; display it explicitly.
display(alarm)
with open("raw_classif_dt.json", "w") as f:
    json.dump(total_dt, f)

In [None]:
# Post-process the raw decision-tree scores collected in `total_dt`.
# NOTE(review): get_stats is not defined in this notebook — confirm its output.
final_dt = get_stats(total_dt)
#plot_paper_grade_error(final_dt)

In [None]:
# NOTE(review): get_ttest is not defined in this notebook; presumably it
# compares local and global scores of the decision-tree run — confirm.
tt = get_ttest(total_dt)


In [None]:
# Build the decision-tree results table and write it to a timestamped CSV.
df_dt = create_mega_table(total_dt, "decisionTree")
now = datetime.now()
date_time = now.strftime("%Y%m%d%H")
df_dt.to_csv("results/classif_dt" + date_time + ".csv")
# Preview goes last: only the final expression of a cell is rendered.
df_dt.head()

## Gaussian NB

In [None]:
%%time
#Wall time: 20min

total_nb = evalute_full_method(
    targets=target_cat_cols,
    metrics=["roc_auc_score", "auprc"],
    silos=silo_imputed,
    cv=CV_NUMBER, int_cols=int_cols, cat_cols=cat_cols, full_classes=classes_dict,
    tuned_parameters=[{"var_smoothing": [1e-9, 1e-8, 1e-7]}],
    model=GaussianNB(),
    repeats=REPS,
    model_type="ensemble",
)
# A bare `alarm` mid-cell is never rendered; display it explicitly.
display(alarm)
with open("raw_classif_nb.json", "w") as f:
    json.dump(total_nb, f)

In [None]:
# Post-process the raw naive-Bayes scores collected in `total_nb`.
# NOTE(review): get_stats is not defined in this notebook — confirm its output.
final_nb = get_stats(total_nb)
#plot_paper_grade_error(final_nb)

In [None]:
# Build the naive-Bayes results table and write it to a timestamped CSV.
df_nb = create_mega_table(total_nb, "NaiveBayes")
now = datetime.now()
date_time = now.strftime("%Y%m%d%H")
df_nb.to_csv("results/classif_nb" + date_time + ".csv")
# Preview goes last: only the final expression of a cell is rendered.
df_nb.head()

## KNN

In [None]:
%%time
#3h 40min
warnings.filterwarnings("ignore")

total_knn = evalute_full_method(
    targets=target_cat_cols,
    metrics=["roc_auc_score", "auprc"],
    silos=silo_imputed,
    cv=CV_NUMBER, int_cols=int_cols, cat_cols=cat_cols, full_classes=classes_dict,
    tuned_parameters={
        "n_neighbors": [5, 7, 10],
        "p": [1, 2]
    },
    model=KNeighborsClassifier(),
    repeats=REPS,
    model_type="ensemble",
)
# A bare `alarm` mid-cell is never rendered; display it explicitly.
display(alarm)
with open("raw_classif_knn.json", "w") as f:
    json.dump(total_knn, f)

In [None]:
# Build the KNN results table and write it to a timestamped CSV.
df_knn = create_mega_table(total_knn, "KNN")
now = datetime.now()
date_time = now.strftime("%Y%m%d%H")
df_knn.to_csv("results/classif_knn" + date_time + ".csv")
# Preview goes last: only the final expression of a cell is rendered.
df_knn.head()

## ADABOOST

In [None]:
%%time
#Wall time: 1h 20m
warnings.filterwarnings("ignore")

total_adaboost = evalute_full_method(
    targets=target_cat_cols,
    metrics=["roc_auc_score", "auprc"],
    silos=silo_imputed,
    cv=CV_NUMBER, int_cols=int_cols, cat_cols=cat_cols, full_classes=classes_dict,
    tuned_parameters={
        "learning_rate": [1, 2, 0.5],
        "n_estimators": [25, 50]
    },
    model=AdaBoostClassifier(),
    repeats=REPS,
    model_type="ensemble",
)
# A bare `alarm` mid-cell is never rendered; display it explicitly.
display(alarm)
with open("raw_classif_ada.json", "w") as f:
    json.dump(total_adaboost, f)

In [None]:
# Post-process the raw AdaBoost scores collected in `total_adaboost`.
# NOTE(review): get_stats is not defined in this notebook — confirm its output.
final_adaboost = get_stats(total_adaboost)
#plot_paper_grade_error(final_adaboost)

In [None]:
# Build the AdaBoost results table and write it to a timestamped CSV.
df_adaboost = create_mega_table(total_adaboost, "ADABOOST")
now = datetime.now()
date_time = now.strftime("%Y%m%d%H")
df_adaboost.to_csv("results/classif_adaboost" + date_time + ".csv")
# Preview goes last: only the final expression of a cell is rendered.
df_adaboost.head()

## NN

In [None]:
%%time
#Wall time: 1h13m

total_nn = evalute_full_method(
    targets=target_cat_cols,
    metrics=["roc_auc_score", "auprc"],
    silos=silo_imputed,
    cv=CV_NUMBER, int_cols=int_cols, cat_cols=cat_cols, full_classes=classes_dict,
    tuned_parameters={
        "solver": ["lbfgs"], "learning_rate_init": [0.001, 1e-4], "max_iter": [10000, 500],
        "hidden_layer_sizes": [(100,)],
        "alpha": [1e-5, 1e-4], "learning_rate": ["adaptive"], "tol": [10, 20]
    },
    model=MLPClassifier(),
    repeats=REPS,
    model_type="ensemble",
)
# A bare `alarm` mid-cell is never rendered; display it explicitly.
display(alarm)
with open("raw_classif_nn.json", "w") as f:
    # Dump the neural-network results (the naive-Bayes dict was written here before).
    json.dump(total_nn, f)

In [None]:
# Post-process the raw neural-network scores collected in `total_nn`.
# NOTE(review): get_stats is not defined in this notebook — confirm its output.
final_nn = get_stats(total_nn)
#plot_paper_grade_error(final_nn)

In [None]:
# Build the neural-network results table and write it to a timestamped CSV.
df_nn = create_mega_table(total_nn, "NN")
now = datetime.now()
date_time = now.strftime("%Y%m%d%H")
df_nn.to_csv("results/classif_nn" + date_time + ".csv")
# Preview goes last: only the final expression of a cell is rendered.
df_nn.head()

## compile all DFs

In [None]:
# Stack the per-model result tables row-wise and save them as one CSV whose
# name carries the current date and hour. Requires every model section above
# to have been run in this kernel, since all six frames are referenced.
now = datetime.now()
date_time = now.strftime("%Y%m%d%H")
result = pd.concat([df_sgd, df_dt, df_nb, df_knn, df_adaboost, df_nn], axis=0)
result.to_csv("results/classif_result_df" + date_time + ".csv")

# Total

In [19]:
def create_total_model_and_evaluate(
        silos,
        targets,
        parameters,
        global_model_name, cat_cols, int_cols,
        full_classes,
        model,
        cv=10,
        nr_repeats=10, samplingsilo=False
):
    """Compare a centralised model with the saved federated ensemble.

    All silos are concatenated; for every target and repetition a grid-searched
    centralised model ("total") and the stored ensemble ("global") are scored on
    the same held-out split of the pooled data.

    Args:
        silos: list of data frames, one per silo.
        targets: target column names.
        parameters: parameter grid for GridSearchCV.
        global_model_name: estimator whose class name selects the saved
            "<target>_<ClassName>_ensemble" model and the log file name.
        cat_cols, int_cols: column lists forwarded to
            evaluate_variables_and_transform_variables.
        full_classes: mapping target -> list of all classes.
        model: estimator to tune on the pooled data.
        cv: number of CV splits.
        nr_repeats: repetitions per target.
        samplingsilo: when True, silos with more than 1000 rows are
            down-sampled to 1000 rows before pooling.

    Returns:
        defaultdict: target -> {"total": {...}, "global": {...}}, each holding
        "auc", "f1" and "auprc" lists with one value per repetition.
    """
    np.random.seed(42)
    if samplingsilo:
        minisilos = [silo.sample(1000) if len(silo) > 1000 else silo for silo in silos]  # keep same nr rows
    else:
        # Previously this assignment ran unconditionally and discarded the sample above.
        minisilos = list(silos)
    full_data = pd.concat(minisilos).reset_index(drop=True)
    full_metric = defaultdict(dict)
    has_random_state = "random_state" in model.get_params().keys()
    # The context manager closes the log even if a fit or a metric raises.
    with open("logs/centralised_" + str(type(global_model_name).__name__) + ".txt", "w") as f:
        for target in targets:

            print("testing....", target, ".................." * 3)
            full_metric[target] = {
                "total": {"auc": [], "f1": [], "auprc": []},
                "global": {"auc": [], "f1": [], "auprc": []},
            }
            global_model = load_model_from_zip(target + "_" + str(type(global_model_name).__name__) + "_ensemble")
            # Three or more classes: score with class probabilities; otherwise
            # with predicted labels (same rule as the federated evaluation).
            use_proba = len(full_classes[target]) > 2

            total_clf = None
            for i in range(nr_repeats):
                # Upper bound kept at nr_repeats; max() only avoids an empty range when nr_repeats == 1.
                r_s = np.random.randint(1, max(nr_repeats, 2))
                if has_random_state:
                    model.set_params(random_state=r_s)
                total_clf = GridSearchCV(
                    model,
                    param_grid=parameters,
                    cv=RepeatedStratifiedKFold(n_splits=cv, n_repeats=2),
                    n_jobs=-2,
                )
                X, y = evaluate_variables_and_transform_variables(full_data, target, cat_cols=cat_cols,
                                                                  int_cols=int_cols,
                                                                  class_list=full_classes[target])

                X_train, X_test, y_train, y_test = train_test_split(
                    X, y, test_size=0.2, stratify=y
                )

                total_clf.fit(X_train, y_train)
                g_model = prepare_global_model(
                    global_model["ensemble"], X_train, y_train
                )
                # predictions of the centralised model
                y_pred_t_auc = y_pred_t = total_clf.best_estimator_.predict(X_test)
                # predictions of the distributed (ensemble) model
                y_pred_g_auc = y_pred_g = g_model.predict(X_test)
                classes_score = g_model.classes_
                if use_proba:
                    y_pred_t_auc = total_clf.best_estimator_.predict_proba(X_test)
                    y_pred_g_auc = g_model.predict_proba(X_test)

                full_metric[target]["total"]["auc"].append(
                    roc_auc_score(y_score=y_pred_t_auc, y_true=y_test, multi_class="ovr")
                )

                # Data used for the global AUPRC; replaced by the NaN-free
                # subset when the ensemble produced NaN scores.
                auprc_true, auprc_score = y_test, y_pred_g_auc
                try:
                    full_metric[target]["global"]["auc"].append(
                        roc_auc_score(y_score=y_pred_g_auc, y_true=y_test, multi_class="ovr")
                    )
                except Exception as e:
                    f.write("ERROR: Y_PRED_G_AUC\n")
                    f.write("ERROR: " + str(e) + "\n")
                    nan_mask = np.isnan(y_pred_g_auc)
                    any_nan = np.any(nan_mask)
                    local_nan = np.argwhere(nan_mask)
                    f.write(str(local_nan))
                    f.write(str(any_nan))
                    f.write(str(np.sum(any_nan)))
                    f.write("ERROR: the sum of nan in y_pred_g_auc is:" + str(np.sum(nan_mask)) + "\n")

                    # Drop every test row whose score vector contains a NaN and retry.
                    non_nan_pred = y_pred_g_auc[~nan_mask]
                    non_nan_pred_2 = y_pred_g_auc[~nan_mask.any(axis=1), :]
                    nan_indexes = np.unique(local_nan[:, 0]).tolist()
                    non_nan_test = y_test.reset_index().drop(index=nan_indexes)
                    non_nan_test = non_nan_test.drop(columns=["index"])
                    f.write(str(non_nan_test) + "\n")
                    f.write(str(non_nan_pred.shape) + "\n")
                    f.write(str(non_nan_pred_2.shape) + "\n")

                    non_nan_result = roc_auc_score(y_score=non_nan_pred_2, y_true=non_nan_test, multi_class="ovr")
                    full_metric[target]["global"]["auc"].append(non_nan_result)
                    auprc_true, auprc_score = non_nan_test, non_nan_pred_2

                # f1_score expects (y_true, y_pred); with average="weighted" the
                # weights come from y_true, so the order changes the result.
                full_metric[target]["total"]["f1"].append(
                    f1_score(y_true=y_test, y_pred=y_pred_t, average="weighted")
                )
                full_metric[target]["global"]["f1"].append(
                    f1_score(y_true=y_test, y_pred=y_pred_g, average="weighted")
                )

                full_metric[target]["total"]["auprc"].append(
                    auprc_multiclass(y_test, y_pred_t_auc, classes_score)
                )
                full_metric[target]["global"]["auprc"].append(
                    auprc_multiclass(auprc_true, auprc_score, classes_score)
                )
            # Persist the grid search of the last repetition (nothing to save when nr_repeats < 1).
            if total_clf is not None:
                save_zipped_model(target, model, "centralised", total_clf)

    return full_metric


In [20]:
def create_total_model(
        silos,
        targets,
        parameters,
        global_model_name, cat_cols, int_cols,
        full_classes,
        model,
        cv=10,
        nr_repeats=10, samplingsilo=False
):
    """Train, score and save one centralised model per target on the pooled silos.

    Args:
        silos: list of data frames, one per silo.
        targets: target column names.
        parameters: parameter grid for GridSearchCV.
        global_model_name: estimator whose class name is used for the log file name.
        cat_cols, int_cols: column lists forwarded to
            evaluate_variables_and_transform_variables.
        full_classes: mapping target -> list of all classes.
        model: estimator to tune.
        cv: number of CV splits.
        nr_repeats: only used as the upper bound of the drawn random_state.
        samplingsilo: when True, silos with more than 1000 rows are
            down-sampled to 1000 rows before pooling.

    Returns:
        defaultdict: target -> {"total": {"auc": [...], "f1": [], "auprc": [...]}}
        ("f1" is never filled by this function).
    """
    np.random.seed(42)
    if samplingsilo:
        minisilos = [silo.sample(1000) if len(silo) > 1000 else silo for silo in silos]  # keep same nr rows
    else:
        # Previously this assignment ran unconditionally and discarded the sample above.
        minisilos = list(silos)
    full_data = pd.concat(minisilos).reset_index(drop=True)
    full_metric = defaultdict(dict)
    # The log file is opened in "w" mode (created/truncated) but nothing is
    # written to it; that on-disk effect is kept, the handle is closed at once.
    # NOTE(review): consider dropping this if truncating the log is unintended.
    with open("logs/centralised_" + str(type(global_model_name).__name__) + ".txt", "w"):
        pass
    has_random_state = "random_state" in model.get_params().keys()
    for target in targets:

        print("testing....", target, ".................." * 3)
        full_metric[target] = {
            "total": {"auc": [], "f1": [], "auprc": []}}

        # Upper bound kept at nr_repeats; max() only avoids an empty range when nr_repeats == 1.
        r_s = np.random.randint(1, max(nr_repeats, 2))
        if has_random_state:
            model.set_params(random_state=r_s)
        total_clf = GridSearchCV(
            model,
            param_grid=parameters,
            cv=RepeatedStratifiedKFold(n_splits=cv, n_repeats=2),
            n_jobs=-2,
        )
        X, y = evaluate_variables_and_transform_variables(full_data, target, cat_cols=cat_cols, int_cols=int_cols,
                                                          class_list=full_classes[target])

        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, stratify=y
        )

        total_clf.fit(X_train, y_train)
        y_pred_t_auc = total_clf.best_estimator_.predict(X_test)
        classes_score = total_clf.best_estimator_.classes_
        # Three or more classes: score with class probabilities; otherwise with labels.
        if len(full_classes[target]) > 2:
            y_pred_t_auc = total_clf.best_estimator_.predict_proba(X_test)

        full_metric[target]["total"]["auc"].append(
            roc_auc_score(y_score=y_pred_t_auc, y_true=y_test, multi_class="ovr")
        )

        full_metric[target]["total"]["auprc"].append(
            auprc_multiclass(y_test, y_pred_t_auc, classes_score)
        )
        # Saved inside the loop so every target gets its model, not only the last one.
        save_zipped_model(target, model, "centralised", total_clf)

    return full_metric


In [21]:
# Centralised-evaluation results, keyed by model name; filled by the cells below.
result_dict = {}

In [None]:
#2h 55min without SGD and KNN
# Model name -> [estimator, parameter grid] for the centralised comparison.
data_dict = {
    # "sqrt" is what "auto" meant for classification trees; "auto" itself is
    # rejected by current scikit-learn releases.
    "decisionTree": [DecisionTreeClassifier(), [{"criterion": ["gini", "entropy"], "max_features": ["log2", "sqrt"]}]],
    "NaiveBayes": [GaussianNB(), [{"var_smoothing": [1e-9, 1e-8, 1e-7]}]],
    "ADABOOST": [AdaBoostClassifier(), {"learning_rate": [1, 2, 0.5], "n_estimators": [25, 50]}],
    "NN": [MLPClassifier(), {"solver": ["lbfgs"], "learning_rate_init": [0.001, 1e-4], "max_iter": [10000, 500],
                             "hidden_layer_sizes": [(100,)],
                             "alpha": [1e-5, 1e-4], "learning_rate": ["adaptive"], "tol": [10, 20]}]
}
for k, v in data_dict.items():
    print(k)
    result_dict[k] = create_total_model_and_evaluate(
        silos=silo_imputed,
        targets=target_cat_cols,
        parameters=v[1], cat_cols=cat_cols, int_cols=int_cols,
        global_model_name=v[0], full_classes=classes_dict,
        model=v[0], samplingsilo=False
    )

In [22]:
#2h 46min
# Model name -> [estimator, parameter grid]; SGD is run in its own cell.
data_dict = {
    "SGD": [
        SGDClassifier(),
        [{"alpha": [0.0001, 0.01, 0.001], "l1_ratio": [0.05], "loss": ["log_loss", "modified_huber"]}],
    ]
}

for k, v in data_dict.items():
    print(k)
    estimator, param_grid = v
    result_dict[k] = create_total_model_and_evaluate(
        silos=silo_imputed,
        targets=target_cat_cols,
        parameters=param_grid,
        cat_cols=cat_cols,
        int_cols=int_cols,
        global_model_name=estimator,
        full_classes=classes_dict,
        model=estimator,
        samplingsilo=False,
    )

SGD
testing.... GS ......................................................
testing.... A_PARA ......................................................
testing.... A_GESTA ......................................................
testing.... TIPO_GRAVIDEZ ......................................................
testing.... VIGIADA ......................................................
testing.... VIGIADA_CENTRO_SAUDE ......................................................
testing.... VIGIADA_NESTE_HOSPITAL ......................................................
testing.... APRESENTACAO_ADMISSAO ......................................................
testing.... TRAB_PARTO_ENTRADA_ESPONTANEO ......................................................
testing.... TIPO_PARTO ......................................................
testing.... APRESENTACAO_NO_PARTO ......................................................
testing.... TRAB_PARTO_NO_PARTO ......................................................
tes

In [None]:
data_dict = {"KNN": [KNeighborsClassifier(), {"n_neighbors": [5, 7, 10], "p": [1, 2]}]}

for k, v in data_dict.items():
    # One call per target; merge the per-target results instead of
    # overwriting result_dict[k], which kept only the last target.
    result_dict[k] = {}
    for t in target_cat_cols:
        print(t)
        print(k)
        result_dict[k].update(
            create_total_model(
                silos=silo_imputed,
                targets=[t],
                parameters=v[1], cat_cols=cat_cols, int_cols=int_cols,
                global_model_name=v[0], full_classes=classes_dict,
                model=v[0], samplingsilo=False)
        )

In [None]:
def check_and_remove_nan_output(output,X,y,f):
    """Remove the samples whose model output contains NaN.

    Args:
        output: array of model outputs, one entry (1-D) or one row (2-D) per sample.
        X: features aligned with `output`; only used for logging the affected rows.
        y: targets aligned with `output`.
        f: log handle passed through to `log_to_file`.

    Returns:
        `(output, y)` unchanged when there is no NaN; otherwise numpy arrays
        with every sample that had a NaN in its output removed from both.
    """
    null_mask = np.isnan(output)
    if not null_mask.any():
        return output, y

    # One flag per sample: a 2-D output row is dropped as soon as any of its
    # entries is NaN, so output and y stay aligned.
    if null_mask.ndim > 1:
        row_mask = null_mask.reshape(len(null_mask), -1).any(axis=1)
    else:
        row_mask = null_mask
    null_rows = np.flatnonzero(row_mask)

    # X is only touched when something has to be logged, so a clean output
    # never fails on the type of X.
    corresponding_values = np.asarray(X)[null_rows]
    log_to_file(f,["There is nan in output"])
    log_to_file(f,["nan indices",null_rows])
    log_to_file(f,["corresponding values",corresponding_values])

    keep = ~row_mask
    return np.asarray(output)[keep], np.asarray(y)[keep]


# All on silos

In [23]:
def evaluate_models_on_local(
        silos, target, metrics, cv, int_cols, cat_cols, tuned_parameters, model, full_classes,f,
        debug_mode=False
):
    """
    Tune one local model per silo and score it, next to the stored distributed and
    centralised models, on that silo's held-out test split.

    For every silo the data is transformed with ``evaluate_variables_and_transform_variables``,
    split 80/20 (stratified on the target) and a ``GridSearchCV`` over ``tuned_parameters``
    (``RepeatedStratifiedKFold`` with ``cv`` splits and 2 repeats, ``f1_micro`` scoring) is
    fitted on the training part; its ``best_estimator_`` is the local model.
    The distributed and centralised models are not trained here: they are loaded with
    ``load_model_from_zip`` under the names ``<target>_<ModelClass>_ensemble`` and
    ``<target>_<ModelClass>_centralised``.

    Parameters
    ----------
    silos : iterable of per-silo tables; each must support ``silo[target].unique()``.
    target : name of the target column.
    metrics : iterable of metric names; ``"auprc"`` and ``"roc_auc_score"`` are handled,
        any other name is ignored.
    cv : number of folds of the inner cross-validation.
    int_cols, cat_cols : forwarded to ``evaluate_variables_and_transform_variables``.
    tuned_parameters : parameter grid for ``GridSearchCV``.
    model : estimator to tune. It is mutated: when it exposes ``random_state`` a new value
        is drawn from the global NumPy RNG for every silo.
    full_classes : mapping from target name to the value passed on as the class list.
    f : open text handle; progress and errors are written to it.
    debug_mode : when true, prints one random integer per silo (this draws from the
        global NumPy RNG as well).

    Returns
    -------
    dict
        ``"silo<k>_<metric>_<local|global|centralised>"`` -> score, with ``k`` starting at 1.
        A ROC AUC that cannot be computed is logged to ``f`` and stored as ``np.nan``.
    """
    result = {}
    models = []
    test_sets = []
    # Distinct target values of each silo, kept per silo so that the scoring loop
    # below decides on hard predictions vs. probabilities for the right silo.
    classes_per_silo = []
    now = datetime.now()
    date_time = now.strftime("%Y%m%d - %H:%M")
    f.write(date_time + "\n")
    for idx, silo in enumerate(silos):
        if debug_mode:
            print(np.random.randint(1, 20))
        f.write("silo " + str(idx) + "\n")
        if "random_state" in model.get_params().keys():
            model.set_params(random_state=np.random.randint(1, 20))
        clf = GridSearchCV(
            model, tuned_parameters, cv=RepeatedStratifiedKFold(n_splits=cv, n_repeats=2), n_jobs=-2, scoring='f1_micro'
        )
        classes_per_silo.append(silo[target].unique())
        X, y = evaluate_variables_and_transform_variables(silo, target, int_cols, cat_cols, full_classes[target])

        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, stratify=y
        )

        test_sets.append((y_test, X_test))
        models.append(clf.fit(X_train, y_train).best_estimator_)

    model_name = str(type(model).__name__)
    distributed_model = load_model_from_zip(target + "_" + model_name + "_ensemble")["ensemble"]
    centralised_model = load_model_from_zip(target + "_" + model_name + "_centralised")

    for idx, tests in enumerate(test_sets):
        y_true, X_eval = tests
        y_pred_l_auc = models[idx].predict_proba(X_eval)
        y_pred_g_auc = distributed_model.predict_proba(X_eval)
        y_pred_c_auc = centralised_model.best_estimator_.predict_proba(X_eval)
        f.write("silo " + str(idx) + "\n")
        f.write("The sum of nan in y_pred_l_auc is:" + str(np.sum(np.isnan(y_pred_l_auc))) + "\n")
        f.write("The sum of nan in y_pred_g_auc is:" + str(np.sum(np.isnan(y_pred_g_auc))) + "\n")
        f.write("The sum of nan in y_pred_c_auc is:" + str(np.sum(np.isnan(y_pred_c_auc))) + "\n")
        y_pred_l = models[idx].predict(X_eval)
        y_pred_g = distributed_model.predict(X_eval)
        y_pred_c = centralised_model.best_estimator_.predict(X_eval)

        classes_score = centralised_model.classes_
        f.write(str(classes_score) + "\n")
        # With fewer than 3 distinct target values in THIS silo the hard predictions
        # are used as scores instead of the probability matrix.
        if len(classes_per_silo[idx]) < 3:
            y_pred_l_auc = y_pred_l
            y_pred_g_auc = y_pred_g
            y_pred_c_auc = y_pred_c

        silo_key = "silo" + str(idx + 1)
        # (suffix of the result key, label used in the error log, scores to evaluate)
        scored_models = (
            ("local", "local", y_pred_l_auc),
            ("global", "global", y_pred_g_auc),
            ("centralised", "central", y_pred_c_auc),
        )

        for metric in metrics:
            if metric == "auprc":
                for suffix, _, y_score in scored_models:
                    result[silo_key + "_auprc_" + suffix] = auprc_multiclass(y_true, y_score, classes_score)

            if metric == "roc_auc_score":
                for suffix, log_label, y_score in scored_models:
                    key = silo_key + "_roc_auc_score_" + suffix
                    try:
                        result[key] = roc_auc_score(
                            y_true=y_true, y_score=y_score, average="weighted", multi_class="ovr"
                        )
                    except Exception as e:
                        # Best effort: a failing score must not abort the whole run,
                        # so it is logged and recorded as NaN.
                        f.write("Error on " + log_label + "-silo " + str(idx + 1) + "\n")
                        f.write("roc score calculate " + str(e) + " ---> " + str(y_score) + "\n")
                        result[key] = np.nan
    return result

In [24]:
def evaluate_all_on_local(
        targets,
        silos,
        metrics,
        tuned_parameters,
        cv, int_cols, cat_cols, full_classes,
        model,
        repeats, debug_mode=False
):
    """
    Repeat ``evaluate_models_on_local`` ``repeats`` times for every target and collect the scores.

    The global NumPy RNG is seeded with 42 once at the start. Progress is appended to
    ``logs/log_<ModelClass>_final_test_all.txt``; the file is opened once per target and is
    closed again when that target is done, also when an exception is raised.

    Parameters
    ----------
    targets : iterable of target column names.
    silos : iterable of per-silo tables, forwarded to ``evaluate_models_on_local``.
    metrics : iterable of metric names; every name must be produced by
        ``evaluate_models_on_local``, otherwise collecting the scores raises ``KeyError``.
    tuned_parameters, cv, int_cols, cat_cols, full_classes, model, debug_mode :
        forwarded unchanged to ``evaluate_models_on_local``.
    repeats : number of evaluation rounds per target.

    Returns
    -------
    dict
        ``target`` -> ``"silo<k>_<metric>_<local|distributed|centralised>"`` -> list with one
        score per repeat. The ``"_global"`` scores of ``evaluate_models_on_local`` are stored
        under ``"_distributed"``. An empty ``targets`` gives an empty dict.
    """
    total = {}
    np.random.seed(42)
    log_path = "logs/log_" + str(type(model).__name__) + "_final_test_all.txt"
    silo_keys = ["silo" + str(silonr + 1) for silonr, _ in enumerate(silos)]
    # (suffix used in the returned dict, suffix used by evaluate_models_on_local)
    variants = (
        ("_local", "_local"),
        ("_distributed", "_global"),
        ("_centralised", "_centralised"),
    )
    for target in targets:
        total[target] = {}
        # One handle per target, released by the context manager on every exit path.
        with open(log_path, "a") as f:
            log_to_file(f,["evaluating " , target , "... "])
            for metric in metrics:
                for silo_key in silo_keys:
                    for out_suffix, _ in variants:
                        total[target][silo_key + "_" + metric + out_suffix] = []
            for i in range(repeats):
                log_to_file(f,["repeat " , i ])
                t = evaluate_models_on_local(
                    silos=silos,
                    target=target,
                    metrics=metrics,
                    tuned_parameters=tuned_parameters,
                    cv=cv, int_cols=int_cols, cat_cols=cat_cols,
                    model=model, full_classes=full_classes,f=f, debug_mode=debug_mode
                )
                for metric in metrics:
                    for silo_key in silo_keys:
                        prefix = silo_key + "_" + metric
                        for out_suffix, in_suffix in variants:
                            total[target][prefix + out_suffix].append(t[prefix + in_suffix])

    return total

In [25]:
warnings.filterwarnings("ignore")

# SGDClassifier: repeated evaluation of all categorical targets on every silo.
# Both losses in the grid are the ones given here so that the tuned model is
# asked for probabilities by evaluate_models_on_local.
tt = evaluate_all_on_local(
    targets=target_cat_cols,
    silos=silo_imputed,
    metrics=["roc_auc_score", "auprc"],
    tuned_parameters=[
        {
            "alpha": [0.0001, 0.01, 0.001],
            "l1_ratio": [0.05],
            "loss": ["log_loss", "modified_huber"],
        }
    ],
    cv=CV_NUMBER,
    int_cols=int_cols,
    cat_cols=cat_cols,
    full_classes=classes_dict,
    model=SGDClassifier(),
    repeats=REPS,
    debug_mode=False,
)

In [26]:
# Convert the SGD scores with from_dict_to_df_raw and store them as CSV;
# the file name carries the current date and hour (YYYYMMDDHH).
df_sgd = from_dict_to_df_raw(tt, "SGD")
now = datetime.now()
date_time = now.strftime("%Y%m%d%H")
df_sgd.to_csv(f"results/classif_all_in_all_sgd{date_time}.csv")

In [None]:
warnings.filterwarnings("ignore")

# DecisionTreeClassifier: repeated evaluation of all categorical targets on every silo.
# max_features uses "sqrt" instead of "auto": for tree classifiers "auto" was an alias
# of "sqrt" that scikit-learn deprecated in 1.1 and removed in 1.3, where it makes every
# fit with that value fail inside the grid search.
tt_dt = evaluate_all_on_local(targets=target_cat_cols,
                           metrics=["roc_auc_score", "auprc"],
                           silos=silo_imputed,
                           cv=CV_NUMBER, int_cols=int_cols, cat_cols=cat_cols, full_classes=classes_dict,
                           tuned_parameters=[{"criterion": ["gini", "entropy"], "max_features": ["log2", "sqrt"]}],
                           model=DecisionTreeClassifier(),
                           repeats=REPS,debug_mode=False)

In [None]:
# Convert the decision-tree scores with from_dict_to_df_raw and store them as CSV;
# the file name carries the current date and hour (YYYYMMDDHH).
df_dt = from_dict_to_df_raw(tt_dt, "decisionTree")
now = datetime.now()
date_time = now.strftime("%Y%m%d%H")
df_dt.to_csv(f"results/classif_all_in_all_dt{date_time}.csv")

In [None]:
warnings.filterwarnings("ignore")

# GaussianNB: repeated evaluation of all categorical targets on every silo.
tt_nb = evaluate_all_on_local(
    targets=target_cat_cols,
    silos=silo_imputed,
    metrics=["roc_auc_score", "auprc"],
    tuned_parameters=[{"var_smoothing": [1e-9, 1e-8, 1e-7]}],
    cv=CV_NUMBER,
    int_cols=int_cols,
    cat_cols=cat_cols,
    full_classes=classes_dict,
    model=GaussianNB(),
    repeats=REPS,
    debug_mode=False,
)

In [None]:
# Convert the naive-Bayes scores with from_dict_to_df_raw and store them as CSV;
# the file name carries the current date and hour (YYYYMMDDHH).
df_nb = from_dict_to_df_raw(tt_nb, "NaiveBayes")
now = datetime.now()
date_time = now.strftime("%Y%m%d%H")
df_nb.to_csv(f"results/classif_all_in_all_nb{date_time}.csv")

In [None]:
warnings.filterwarnings("ignore")

# AdaBoostClassifier: repeated evaluation of all categorical targets on every silo.
tt_ada = evaluate_all_on_local(
    targets=target_cat_cols,
    silos=silo_imputed,
    metrics=["roc_auc_score", "auprc"],
    tuned_parameters={
        "learning_rate": [1, 2, 0.5],
        "n_estimators": [25, 50],
    },
    cv=CV_NUMBER,
    int_cols=int_cols,
    cat_cols=cat_cols,
    full_classes=classes_dict,
    model=AdaBoostClassifier(),
    repeats=REPS,
    debug_mode=False,
)

In [None]:
# Convert the AdaBoost scores with from_dict_to_df_raw and store them as CSV;
# the file name carries the current date and hour (YYYYMMDDHH).
df_adaboost = from_dict_to_df_raw(tt_ada, "ADABOOST")
now = datetime.now()
date_time = now.strftime("%Y%m%d%H")
df_adaboost.to_csv(f"results/classif_all_in_all_adaboost{date_time}.csv")

In [None]:
warnings.filterwarnings("ignore")

# KNeighborsClassifier: repeated evaluation of all categorical targets on every silo.
# Original note on this cell: "270m" (presumably the run time in minutes -- confirm).
tt_knn = evaluate_all_on_local(
    targets=target_cat_cols,
    silos=silo_imputed,
    metrics=["roc_auc_score", "auprc"],
    tuned_parameters={
        "n_neighbors": [5, 7, 10],
        "p": [1, 2],
    },
    cv=CV_NUMBER,
    int_cols=int_cols,
    cat_cols=cat_cols,
    full_classes=classes_dict,
    model=KNeighborsClassifier(),
    repeats=REPS,
    debug_mode=False,
)

In [None]:
# Convert the KNN scores with from_dict_to_df_raw and store them as CSV;
# the file name carries the current date and hour (YYYYMMDDHH).
df_knn = from_dict_to_df_raw(tt_knn, "KNN")
now = datetime.now()
date_time = now.strftime("%Y%m%d%H")
df_knn.to_csv(f"results/classif_all_in_all_knn{date_time}.csv")

In [None]:
warnings.filterwarnings("ignore")

# MLPClassifier: repeated evaluation of all categorical targets on every silo.
# NOTE(review): the only solver in the grid is "lbfgs"; per the scikit-learn docs
# learning_rate_init applies to sgd/adam and learning_rate to sgd only, so these
# entries may just multiply the number of fits -- confirm before trimming.
# NOTE(review): tol values of 10 and 20 are unusually large -- confirm they are intended.
tt_nn = evaluate_all_on_local(
    targets=target_cat_cols,
    silos=silo_imputed,
    metrics=["roc_auc_score", "auprc"],
    tuned_parameters={
        "solver": ["lbfgs"],
        "learning_rate_init": [0.001, 1e-4],
        "max_iter": [10000, 500],
        "hidden_layer_sizes": [(100,)],
        "alpha": [1e-5, 1e-4],
        "learning_rate": ["adaptive"],
        "tol": [10, 20],
    },
    cv=CV_NUMBER,
    int_cols=int_cols,
    cat_cols=cat_cols,
    full_classes=classes_dict,
    model=MLPClassifier(),
    repeats=REPS,
    debug_mode=False,
)

In [None]:
# Convert the neural-network scores with from_dict_to_df_raw and store them as CSV;
# the file name carries the current date and hour (YYYYMMDDHH).
df_nn = from_dict_to_df_raw(tt_nn, "NN")
now = datetime.now()
date_time = now.strftime("%Y%m%d%H")
df_nn.to_csv(f"results/classif_all_in_all_nn{date_time}.csv")

In [30]:
# Stack the per-model result tables row-wise and store the combined table as CSV;
# the file name carries the current date and hour (YYYYMMDDHH).
# Every df_* below must have been produced by its own cell in this kernel session.
now = datetime.now()
date_time = now.strftime("%Y%m%d%H")
result = pd.concat(
    [
        df_sgd,
        df_dt,
        df_nb,
        df_adaboost,
        df_knn,
        df_nn,
    ],
    axis=0,
)
result.to_csv(f"results/classif_result_all_in_all_df_{date_time}.csv")