In [None]:
import re
import sys
import warnings
from collections import Counter, defaultdict
from datetime import datetime
from federated_eval_helper_functions import *
import matplotlib as mpl
import matplotlib.font_manager as fm
import numpy as np
import pandas as pd
import scipy.stats as st
import statsmodels.stats.api as sms
from imblearn.over_sampling import ADASYN, SMOTE, SVMSMOTE, RandomOverSampler
from IPython.display import Audio
from matplotlib import pyplot as plt
from mlxtend.classifier import EnsembleVoteClassifier, StackingClassifier
from pylab import cm
from scipy.stats import ttest_ind
from sklearn import set_config
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.linear_model import BayesianRidge, LogisticRegression, SGDClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import (
    accuracy_score,
    balanced_accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
    multilabel_confusion_matrix,
    plot_confusion_matrix,
    roc_auc_score,
)
from sklearn.model_selection import (
    GridSearchCV,
    KFold,
    RepeatedKFold,
    RepeatedStratifiedKFold,
    StratifiedKFold,
    cross_val_score,
    train_test_split,
)
import statistics
from sklearn.multiclass import OneVsRestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import (
    FunctionTransformer,
    LabelBinarizer,
    LabelEncoder,
    OneHotEncoder,
    OrdinalEncoder,
    StandardScaler,
)
from itertools import compress
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
import glob


# Import processed data

In [None]:
# Names of every font registered with matplotlib (handy to check a family exists)
font_names = [font.name for font in fm.fontManager.ttflist]
# Global figure styling: font family, font size and axes line width
mpl.rcParams.update(
    {"font.family": "Avenir", "font.size": 10, "axes.linewidth": 2}
)
# Two-colour palette sampled from the 'tab10' colormap
colors = cm.get_cmap("tab10", 2)

# Audio widget that autoplays when displayed; long-running cells end with `alarm`
sound_file = "https://www.soundjay.com/buttons/button-09a.wav"  # https://www.soundjay.com/buttons/sounds/button-1.mp3
alarm = Audio(sound_file, autoplay=True)

In [None]:
# Load every imputed silo (one CSV per silo); the first CSV column is the index.
# glob returns paths in arbitrary, OS-dependent order, so sort them to keep the
# silo numbering (and anything that relies on "the first silo") stable between
# runs. The sort is lexicographic, i.e. "silo10" comes before "silo2".
silo_imputed = [
    pd.read_csv(path, index_col=0)
    for path in sorted(glob.glob("<PATH>/imputed/silo*.csv"))
]

In [None]:
# Restore `int_cols` from IPython's %store database (it must have been stored
# by an earlier notebook/session), then display it as the cell output.
%store -r int_cols
int_cols

In [None]:
# Restore `cat_cols` from IPython's %store database (stored by an earlier notebook/session).
%store -r cat_cols

# Modeling

In [None]:
def get_all_classes(dfs, target):
    """Collect the distinct values of column ``target`` across several DataFrames.

    Parameters
    ----------
    dfs : iterable of pandas.DataFrame
        Frames that all contain the column ``target``.
    target : hashable
        Label of the column whose values are collected.

    Returns
    -------
    list
        Every distinct value found, in order of first appearance.
    """
    final_list = []
    for df in dfs:
        for value in df[target].unique():
            # List membership (==) rather than a set keeps first-appearance
            # order and the original comparison semantics.
            if value not in final_list:
                final_list.append(value)
    return final_list

In [None]:
# For every categorical column, the full set of values seen in any silo
classes_dict = {
    column: get_all_classes(silo_imputed, column) for column in cat_cols
}

In [None]:
def prepare_global_model_init(g_model, X_train, y_train, model_type):
    """Initialise the global-model variants with one training set.

    The "myvoting" entry gets its threshold set to the first element returned
    by ``get_best_threshold`` (helper not defined in this notebook), and the
    "ensemble" entry is fitted on ``X_train`` / ``y_train``.

    ``model_type`` is accepted but not used here.

    Returns the same ``g_model`` mapping, modified in place.
    """
    voting = g_model["myvoting"]
    best_threshold = get_best_threshold(voting, X_train, y_train)
    voting.set_threshold(best_threshold[0])

    g_model["ensemble"].fit(X_train, y_train)
    return g_model

In [None]:
def evaluate_variables_and_transform_variables(df,target,cat_cols,int_cols,class_list=None,threshold=12,nr_rows=25):
    """Split ``df`` into features and target, oversampling rare target classes.

    Steps:
    1. If ``class_list`` is given, every class in it that is absent from
       ``df[target]`` is added through ``dummy_row_creation`` (helper not
       defined in this notebook; presumably it appends ``nr_rows`` synthetic
       rows for that class -- confirm).
    2. Classes with at most ``threshold`` rows are randomly oversampled to
       ``nr_rows`` rows each with ``RandomOverSampler(random_state=0)``.

    Returns
    -------
    (X, y)
        Features (all columns except ``target``) and the target column,
        resampled when at least one class was at or below ``threshold``.
    """
    if class_list:
        absent = list(set(class_list) - set(df[str(target)].unique()))
        if len(absent) > 0:
            print("missing class:", absent)
            print("existing class:", df[str(target)].unique())
            for label in absent:
                df = dummy_row_creation(df, target, label, int_cols, cat_cols, nr_rows)

    # Boolean Series indexed by class: True where the class has <= threshold rows
    is_rare = df[target].value_counts().le(threshold)
    to_smote = list(is_rare[is_rare].index.values)
    print("vars to be enhanced", to_smote)

    y = df[target]
    X = df.drop(columns=[target])

    # Nothing rare: hand back the plain split
    if len(to_smote) == 0:
        return X, y

    # Requested number of rows for each rare class
    sampling = {label: nr_rows for label in to_smote}
    ros = RandomOverSampler(random_state=0, sampling_strategy=sampling)
    X_resampled, y_resampled = ros.fit_resample(X, y)
    return X_resampled, y_resampled

In [None]:
def evaluate_federated_model(
    silos, target, metrics, cv,int_cols,cat_cols, tuned_parameters, model, full_classes, model_type="myvoting",debug_mode=False
):
    """Train one tuned local model per silo, build a global model, score both.

    For every silo the data is prepared with
    ``evaluate_variables_and_transform_variables`` (missing classes added,
    rare classes oversampled), split 80/20 with stratification, and a
    ``GridSearchCV`` (repeated stratified k-fold, ``f1_micro``) is fitted on
    the training part. The best local estimators are combined with
    ``define_weights`` / ``create_global_model`` (helpers not defined in this
    notebook) and the global model is initialised on the FIRST silo's
    training data only. Each silo's held-out test set is then scored with its
    own local model and with ``global_model[model_type]``.

    Parameters
    ----------
    silos : sequence of pandas.DataFrame
    target : str
        Target column name; must be a key of ``full_classes``.
    metrics : iterable of str
        Any of "accuracy", "roc_auc_score", "f1".
    cv : int
        Number of splits of the repeated stratified k-fold (2 repeats).
    int_cols, cat_cols : list
        Column lists forwarded to the data-preparation step.
    tuned_parameters : dict or list of dict
        ``param_grid`` for ``GridSearchCV``.
    model : estimator
        Base estimator. NOTE: if it exposes ``random_state`` it is mutated in
        place with a fresh seed for every silo.
    full_classes : dict
        Maps each target to the list of all its classes across silos.
    model_type : str
        Key of the global-model mapping used for evaluation.
    debug_mode : bool
        Print the silo index and the seed actually used.

    Returns
    -------
    (result, models, g_model)
        ``result`` maps "silo<N>_<metric>_local|global" to a score (NaN when
        the AUC could not be computed), ``models`` are the best local
        estimators, ``g_model`` is the global-model mapping.
    """
    grid_list = []
    result = {}
    models = []
    test_sets = []
    X_train_list = []
    y_train_list = []
    for idx, silo in enumerate(silos):
        if debug_mode:
            print("silo", str(idx))
        if "random_state" in model.get_params().keys():
            # Draw the seed once and print that same value: debug mode must
            # not consume extra random numbers, or it would change results.
            seed = np.random.randint(1, 20)
            if debug_mode:
                print(seed)
            model.set_params(random_state=seed)
        clf = GridSearchCV(
            model,
            tuned_parameters,
            cv=RepeatedStratifiedKFold(n_splits=cv, n_repeats=2),
            n_jobs=-2,
            scoring="f1_micro",
        )
        # Keywords on purpose: the callee's positional order is
        # (df, target, cat_cols, int_cols, ...), the reverse of this
        # function's own parameter order.
        X, y = evaluate_variables_and_transform_variables(
            silo,
            target,
            cat_cols=cat_cols,
            int_cols=int_cols,
            class_list=full_classes[target],
        )
        # Number of classes this silo's model is actually trained on
        n_classes = len(pd.unique(y))

        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, stratify=y
        )

        test_sets.append((y_test, X_test, n_classes))
        X_train_list.append(X_train)
        y_train_list.append(y_train)
        models.append(clf.fit(X_train, y_train).best_estimator_)
        grid_list.append(clf)

    w = define_weights(grid_list)
    global_model = create_global_model(models, "voting", w)
    global_model = prepare_global_model_init(
        global_model, X_train_list[0], y_train_list[0], model_type
    )

    for idx, (y_test, X_test, n_classes) in enumerate(test_sets):
        prefix = "silo" + str(idx + 1)
        local_estimator = models[idx]
        global_estimator = global_model[model_type]

        y_pred_l = local_estimator.predict(X_test)
        y_pred_g = global_estimator.predict(X_test)
        if n_classes < 3:
            # Binary target: AUC is computed from the hard predictions.
            # NOTE(review): class-1 probabilities would give a proper ROC AUC;
            # kept as-is so existing results stay comparable.
            y_pred_l_auc = y_pred_l
            y_pred_g_auc = y_pred_g
        else:
            y_pred_l_auc = local_estimator.predict_proba(X_test)
            y_pred_g_auc = global_estimator.predict_proba(X_test)

        for metric in metrics:
            if metric == "accuracy":
                # accuracy_score has no `average` parameter
                result[prefix + "_accuracy_local"] = accuracy_score(
                    y_true=y_test, y_pred=y_pred_l
                )
                result[prefix + "_accuracy_global"] = accuracy_score(
                    y_true=y_test, y_pred=y_pred_g
                )
            if metric == "roc_auc_score":
                for scope, scores in (("local", y_pred_l_auc), ("global", y_pred_g_auc)):
                    key = prefix + "_roc_auc_score_" + scope
                    try:
                        result[key] = roc_auc_score(
                            y_true=y_test,
                            y_score=scores,
                            average="weighted",
                            multi_class="ovr",
                        )
                    except Exception as e:
                        # Best effort: a failed AUC is recorded as NaN so the
                        # remaining metrics and silos are still evaluated.
                        print("Error on " + scope + "-silo", str(idx + 1), "roc score calculate", e, "--->", scores)
                        result[key] = np.nan
            if metric == "f1":
                result[prefix + "_f1_local"] = f1_score(
                    y_pred=y_pred_l, y_true=y_test, average="weighted"
                )
                result[prefix + "_f1_global"] = f1_score(
                    y_pred=y_pred_g, y_true=y_test, average="weighted"
                )

    g_model = global_model

    return result, models, g_model

In [None]:
def evalute_full_method(
    targets,
    silos,
    metrics,
    tuned_parameters,
    cv,int_cols,cat_cols,full_classes,
    model=SGDClassifier(loss="log"),
    repeats=2,
    model_type="myvoting",debug_mode=False
):
    """Run ``evaluate_federated_model`` ``repeats`` times for every target.

    NumPy's global random generator is seeded with 42 once at the start.

    Returns
    -------
    dict
        ``total[target]`` holds, for each repeat, the local models
        ("models"), the global model ("g_model") and one list of scores per
        "silo<N>_<metric>_local" / "silo<N>_<metric>_global" key.
    """
    total = {name: [] for name in targets}
    np.random.seed(42)

    # Score keys in the order metric -> silo -> local/global
    score_keys = [
        "silo" + str(silo_nr + 1) + "_" + metric + "_" + scope
        for metric in metrics
        for silo_nr, _ in enumerate(silos)
        for scope in ("local", "global")
    ]

    for target in targets:
        print("evaluating " + target + "... ")
        collected = {"models": [], "g_model": []}
        collected.update((key, []) for key in score_keys)
        total[target] = collected

        for _ in range(repeats):
            scores, local_models, g_model = evaluate_federated_model(
                silos=silos,
                target=target,
                metrics=metrics,
                tuned_parameters=tuned_parameters,
                cv=cv,
                int_cols=int_cols,
                cat_cols=cat_cols,
                model=model,
                full_classes=full_classes,
                model_type=model_type,
                debug_mode=debug_mode,
            )
            for key in score_keys:
                collected[key].append(scores[key])
            collected["models"].append(local_models)
            collected["g_model"].append(g_model)
    return total

## SGD

In [None]:
%%time
# Wall time: 35min 17s
# Federated evaluation with the default model (SGD, logistic loss)
total = evalute_full_method(
    targets=[
        "GRUPO_ROBSON",
        "TIPO_GRAVIDEZ",
        "EUTOCITO_ANTERIOR",
        "CESARIANAS_ANTERIOR",
        "TRAB_PARTO_NO_PARTO",
        "TIPO_PARTO",
        "BACIA",
        "DIABETES_GESTACIONAL",
        "GS",
    ],
    silos=silo_imputed,
    metrics=["f1", "roc_auc_score"],
    tuned_parameters=[{"alpha": [0.0001, 0.01], "l1_ratio": [0.05]}],
    cv=10,
    int_cols=int_cols,
    cat_cols=cat_cols,
    full_classes=classes_dict,
    repeats=10,
    model_type="ensemble",
    debug_mode=False,
)
alarm

In [None]:
%%time
# CPU times: user 2h 21min 54s, sys: 8min 2s, total: 2h 29min 56s
# Debug run of the SGD evaluation on two targets only
total_teste = evalute_full_method(
    targets=["GRUPO_ROBSON", "TIPO_GRAVIDEZ"],
    silos=silo_imputed,
    metrics=["f1", "roc_auc_score"],
    tuned_parameters=[{"alpha": [0.0001, 0.01], "l1_ratio": [0.05]}],
    cv=10,
    int_cols=int_cols,
    cat_cols=cat_cols,
    full_classes=classes_dict,
    repeats=10,
    model_type="ensemble",
    debug_mode=True,
)
alarm

In [None]:
# Summarise the SGD run. `get_stats` is not defined in this notebook
# (presumably from federated_eval_helper_functions -- confirm what it returns).
final = get_stats(total)

In [None]:
# t-test data for the SGD run. `get_ttest` is not defined in this notebook
# (presumably it compares local vs. global scores -- confirm in the helper module).
ttest_data=get_ttest(total)

In [None]:
%%time

warnings.filterwarnings("ignore")
# decision tree
# Wall time: 4min 49s

# voting is the only fair option: stacking, when it learns from a single
# X_train, does very well on that silo and badly on the others
# ensemble: still not sure
total_dt = evalute_full_method(
    model=DecisionTreeClassifier(),
    targets=[
        "GRUPO_ROBSON",
        "TIPO_GRAVIDEZ",
        "EUTOCITO_ANTERIOR",
        "CESARIANAS_ANTERIOR",
        "TRAB_PARTO_NO_PARTO",
        "TIPO_PARTO",
        "BACIA",
        "DIABETES_GESTACIONAL",
        "GS",
    ],
    silos=silo_imputed,
    metrics=["f1", "roc_auc_score"],
    tuned_parameters=[
        {"criterion": ["gini", "entropy"], "max_features": ["log2", "auto"]}
    ],
    cv=10,
    int_cols=int_cols,
    cat_cols=cat_cols,
    full_classes=classes_dict,
    repeats=10,
    model_type="ensemble",
)
alarm

In [None]:
# Summarise the decision-tree run (`get_stats` is not defined in this notebook).
final_dt = get_stats(total_dt)

In [None]:
# t-test data for the decision-tree run (`get_ttest` is not defined in this notebook).
tt = get_ttest(total_dt)

### Gaussian NB

In [None]:
%%time
# Wall time: 3min 59s
# Federated evaluation with Gaussian naive Bayes
total_nb = evalute_full_method(
    model=GaussianNB(),
    targets=[
        "GRUPO_ROBSON",
        "TIPO_GRAVIDEZ",
        "EUTOCITO_ANTERIOR",
        "CESARIANAS_ANTERIOR",
        "TRAB_PARTO_NO_PARTO",
        "TIPO_PARTO",
        "BACIA",
        "DIABETES_GESTACIONAL",
        "GS",
    ],
    silos=silo_imputed,
    metrics=["f1", "roc_auc_score"],
    tuned_parameters=[{"var_smoothing": [1e-9, 1e-8, 1e-7]}],
    cv=10,
    int_cols=int_cols,
    cat_cols=cat_cols,
    full_classes=classes_dict,
    repeats=10,
    model_type="ensemble",
)
alarm

In [None]:
# Summarise the Gaussian naive Bayes run (`get_stats` is not defined in this notebook).
final_nb = get_stats(total_nb)

### KNN

In [None]:
%%time
# Wall time: 1h 49min 18s
# Federated evaluation with k-nearest neighbours
total_knn = evalute_full_method(
    model=KNeighborsClassifier(),
    targets=[
        "GRUPO_ROBSON",
        "TIPO_GRAVIDEZ",
        "EUTOCITO_ANTERIOR",
        "CESARIANAS_ANTERIOR",
        "TRAB_PARTO_NO_PARTO",
        "TIPO_PARTO",
        "BACIA",
        "DIABETES_GESTACIONAL",
        "GS",
    ],
    silos=silo_imputed,
    metrics=["f1", "roc_auc_score"],
    tuned_parameters={"n_neighbors": [5, 7, 10], "p": [1, 2]},
    cv=10,
    int_cols=int_cols,
    cat_cols=cat_cols,
    full_classes=classes_dict,
    repeats=10,
    model_type="ensemble",
)
alarm

### ADABOOST

In [None]:
%%time
# Wall time: 43min 37s
# Federated evaluation with AdaBoost
total_adaboost = evalute_full_method(
    model=AdaBoostClassifier(),
    targets=[
        "GRUPO_ROBSON",
        "TIPO_GRAVIDEZ",
        "EUTOCITO_ANTERIOR",
        "CESARIANAS_ANTERIOR",
        "TRAB_PARTO_NO_PARTO",
        "TIPO_PARTO",
        "BACIA",
        "DIABETES_GESTACIONAL",
        "GS",
    ],
    silos=silo_imputed,
    metrics=["f1", "roc_auc_score"],
    tuned_parameters={"learning_rate": [1, 2, 0.5], "n_estimators": [25, 50]},
    cv=10,
    int_cols=int_cols,
    cat_cols=cat_cols,
    full_classes=classes_dict,
    repeats=10,
    model_type="ensemble",
)
alarm

In [None]:
# Summarise the AdaBoost run (`get_stats` is not defined in this notebook).
final_adaboost = get_stats(total_adaboost)

### Total

In [None]:
def create_total_model_and_evaluate(
    silos,
    targets,
    parameters,
    global_model,cat_cols,int_cols,
    full_classes,
    model=SGDClassifier(loss="log"),
    cv=10,
    nr_repeats=10,samplingsilo=False
):
    """Compare a centralised ("total") model with the federated global model.

    All silos are concatenated into one dataset. For every target and repeat
    the data is prepared with ``evaluate_variables_and_transform_variables``,
    split 80/20 with stratification, a ``GridSearchCV`` is fitted on the
    training part (the "total" model), and the first stored federated
    ensemble for that target is prepared with ``prepare_global_model``
    (helper not defined in this notebook) on the same training data. Both are
    scored on the test part with ROC AUC (one-vs-rest) and weighted F1.

    Parameters
    ----------
    silos : sequence of pandas.DataFrame
    targets : iterable of str
    parameters : dict or list of dict
        ``param_grid`` for ``GridSearchCV``.
    global_model : dict
        Output of ``evalute_full_method``; only
        ``global_model[target]["g_model"][0]["ensemble"]`` is read.
    cat_cols, int_cols : list
        Column lists forwarded to the data-preparation step.
    full_classes : dict
        Maps each target to the list of all its classes.
    model : estimator
        Base estimator for the centralised model. NOTE: if it exposes
        ``random_state`` it is mutated in place.
    cv : int
        Number of CV splits (2 repeats).
    nr_repeats : int
        Number of train/test repetitions per target.
    samplingsilo : bool
        If True, silos with more than 1000 rows are down-sampled to 1000
        rows before concatenation (keeps slow models such as KNN tractable).

    Returns
    -------
    defaultdict
        ``full_metric[target][“total”|"global"]["auc"|"f1"]`` -> list of scores.
    """
    np.random.seed(42)
    if samplingsilo:
        # Previously this list was overwritten unconditionally right after
        # being built, so the flag had no effect.
        minisilos = [
            silo.sample(1000) if len(silo) > 1000 else silo for silo in silos
        ]
    else:
        minisilos = list(silos)
    full_data = pd.concat(minisilos).reset_index(drop=True)
    full_metric = defaultdict(dict)

    for target in targets:
        print("testing....", target)
        full_metric[target] = {
            "total": {"auc": [], "f1": []},
            "global": {"auc": [], "f1": []},
        }
        # Probabilities are needed for multiclass AUC; with two classes the
        # hard predictions are used, matching evaluate_federated_model (< 3).
        is_multiclass = len(full_classes[target]) > 2

        for i in range(nr_repeats):
            # max(..., 2) keeps randint valid when nr_repeats == 1
            r_s = np.random.randint(1, max(nr_repeats, 2))
            if "random_state" in model.get_params().keys():
                model.set_params(random_state=r_s)
                # NOTE(review): plain RepeatedKFold here vs. stratified below
                # (and in the federated path) looks accidental -- confirm.
                splitter = RepeatedKFold(n_splits=cv, n_repeats=2)
            else:
                splitter = RepeatedStratifiedKFold(n_splits=cv, n_repeats=2)
            total_clf = GridSearchCV(
                model,
                param_grid=parameters,
                cv=splitter,
                n_jobs=-2,
            )

            X, y = evaluate_variables_and_transform_variables(
                full_data,
                target,
                cat_cols=cat_cols,
                int_cols=int_cols,
                class_list=full_classes[target],
            )

            X_train, X_test, y_train, y_test = train_test_split(
                X, y, test_size=0.2, stratify=y
            )

            total_clf.fit(X_train, y_train)
            g_model = prepare_global_model(
                global_model[target]["g_model"][0]["ensemble"], X_train, y_train
            )

            y_pred_t_auc = y_pred_t = total_clf.best_estimator_.predict(X_test)
            y_pred_g_auc = y_pred_g = g_model.predict(X_test)

            if is_multiclass:
                y_pred_t_auc = total_clf.best_estimator_.predict_proba(X_test)
                y_pred_g_auc = g_model.predict_proba(X_test)

            full_metric[target]["total"]["auc"].append(
                roc_auc_score(y_score=y_pred_t_auc, y_true=y_test, multi_class="ovr")
            )

            try:
                full_metric[target]["global"]["auc"].append(
                    roc_auc_score(
                        y_score=y_pred_g_auc, y_true=y_test, multi_class="ovr"
                    )
                )
            except Exception as e:
                # Best effort: record NaN so the remaining repeats still run.
                print("ERROR", e)
                # pd.isna also copes with non-numeric label arrays
                print("error", np.sum(pd.isna(y_pred_g_auc), axis=0))
                full_metric[target]["global"]["auc"].append(np.nan)

            # Keywords matter: weighted F1 weights classes by their support in
            # y_true, so swapping the arguments changes the score.
            full_metric[target]["total"]["f1"].append(
                f1_score(y_true=y_test, y_pred=y_pred_t, average="weighted")
            )

            full_metric[target]["global"]["f1"].append(
                f1_score(y_true=y_test, y_pred=y_pred_g, average="weighted")
            )

    return full_metric


In [None]:
# Centralised-vs-federated comparison for every model family except KNN,
# which is evaluated separately further down.
result_dict = {}
# name -> [base estimator, parameter grid, federated results for that estimator]
data_dict = {
    "SGD": [
        SGDClassifier(loss="log"),
        [{"alpha": [0.0001, 0.01], "l1_ratio": [0.05]}],
        total,
    ],
    "decisionTree": [
        DecisionTreeClassifier(),
        [{"criterion": ["gini", "entropy"], "max_features": ["log2", "auto"]}],
        total_dt,
    ],
    "NaiveBayes": [
        GaussianNB(),
        [{"var_smoothing": [1e-9, 1e-8, 1e-7]}],
        total_nb,
    ],
    "ADABOOST": [
        AdaBoostClassifier(),
        {"learning_rate": [1, 2, 0.5], "n_estimators": [25, 50]},
        total_adaboost,
    ],
}

for k, v in data_dict.items():
    print(k)
    estimator, param_grid, federated_runs = v
    result_dict[k] = create_total_model_and_evaluate(
        model=estimator,
        parameters=param_grid,
        global_model=federated_runs,
        silos=silo_imputed,
        targets=[
            "GRUPO_ROBSON",
            "TIPO_GRAVIDEZ",
            "EUTOCITO_ANTERIOR",
            "CESARIANAS_ANTERIOR",
            "TRAB_PARTO_NO_PARTO",
            "TIPO_PARTO",
            "BACIA",
            "DIABETES_GESTACIONAL",
            "GS",
        ],
        cat_cols=cat_cols,
        int_cols=int_cols,
        full_classes=classes_dict,
        samplingsilo=False,
    )

In [None]:
# Keep only the first global ensemble of each target from the KNN run,
# in the same nested layout that create_total_model_and_evaluate reads.
total_knn_slim = {
    target_name: {"g_model": [{"ensemble": runs["g_model"][0]["ensemble"]}]}
    for target_name, runs in total_knn.items()
}
total_knn_slim

In [None]:
# Persist the slimmed-down KNN results to disk as a pickle.
# NOTE: only unpickle this file from a trusted source -- pickle.load can
# execute arbitrary code.
import pickle
with open('total_knn_slim.pkl', 'wb') as f:
    pickle.dump(total_knn_slim, f)

In [None]:
# Centralised-vs-federated comparison for KNN only, with silo sampling requested
# name -> [base estimator, parameter grid, federated results for that estimator]
data_dict = {
    "KNN": [
        KNeighborsClassifier(),
        {"n_neighbors": [5, 7, 10], "p": [1, 2]},
        total_knn_slim,
    ]
}

for k, v in data_dict.items():
    print(k)
    estimator, param_grid, federated_runs = v
    result_dict[k] = create_total_model_and_evaluate(
        model=estimator,
        parameters=param_grid,
        global_model=federated_runs,
        silos=silo_imputed,
        targets=[
            "GRUPO_ROBSON",
            "TIPO_GRAVIDEZ",
            "EUTOCITO_ANTERIOR",
            "CESARIANAS_ANTERIOR",
            "TRAB_PARTO_NO_PARTO",
            "TIPO_PARTO",
            "BACIA",
            "DIABETES_GESTACIONAL",
            "GS",
        ],
        cat_cols=cat_cols,
        int_cols=int_cols,
        full_classes=classes_dict,
        samplingsilo=True,
    )