# V3 - 2024


In [16]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold,cross_val_score
import json
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.metrics import (
    mean_squared_error,
    r2_score,
    ndcg_score,
    cohen_kappa_score,
)
from sklearn.impute import SimpleImputer
from sklearn.linear_model import BayesianRidge
from sklearn.naive_bayes import GaussianNB 
from sklearn.linear_model import LinearRegression,LogisticRegression
from sklearn.svm import SVC,SVR
from sklearn.neighbors import KNeighborsRegressor,KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier,RandomForestRegressor

from sklearn.base import clone

import rbo
from sklearn.inspection import permutation_importance

import scipy.stats as st
import random
from textdistance import (
    levenshtein,
    damerau_levenshtein,
    jaro_winkler,
    hamming,
)
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.metrics import accuracy_score
import pickle 


In [17]:
# Shared 10-fold splitter; the trial cells below pass it to trial_permutatin as `cv`.
cv = KFold(n_splits=10)

## Functions

In [18]:
def get_several_feat_imp_dataset_2(
    data,
    categorical_cols,
    int_cols,
    rep=5,
    seed=42,
    test_size=0.05,
    models=[DecisionTreeClassifier(), DecisionTreeRegressor()],
):
    """
    Collect repeated feature-importance estimates for every column of `data`.

    Steps:
    1. for each column (used as the prediction target)
    2. for each repetition
    3. train a model on a random sub-sample of the rows
    4. read the importance of every other column

    Columns listed in `categorical_cols` are modelled with a clone of
    ``models[0]``, all other columns with a clone of ``models[1]``.
    `int_cols` is accepted but not used.

    result: {target: {feature1: [v_rep1, v_rep2, ...], feature2: [...]}}
    """
    # Both generators are seeded: `random` drives the splits, `np.random` the
    # estimators' random_state.
    np.random.seed(seed)
    random.seed(seed)

    importances_by_target = {}
    for target in data.columns:
        per_feature = {name: [] for name in data.columns if name != target}

        for _ in range(rep):
            split_seed = random.randint(0, 100)

            template = models[0] if target in categorical_cols else models[1]
            estimator = clone(template)
            if "random_state" in estimator.get_params():
                estimator = estimator.set_params(
                    random_state=np.random.randint(1, 20)
                )

            # The held-out part is discarded; the split only serves to vary
            # the training rows between repetitions.
            X_train, _X_test, y_train, _y_test = train_test_split(
                data.drop(target, axis=1),
                data[target],
                test_size=test_size,
                random_state=split_seed,
            )
            fitted = estimator.fit(X_train, y_train)

            if hasattr(estimator, "feature_importances_"):
                pairs = zip(fitted.feature_names_in_, fitted.feature_importances_)
            else:
                # No built-in importances: fall back to permutation importance
                # measured on the training rows.
                perm = permutation_importance(
                    fitted,
                    X_train,
                    y_train,
                    n_repeats=15,
                    random_state=split_seed,
                    n_jobs=-2,
                )
                pairs = zip(X_train.columns, perm.importances_mean)

            for name, value in pairs:
                per_feature[name].append(value)

            importances_by_target[target] = per_feature
    return importances_by_target

In [19]:
def create_scores_v2(result1, result2):
    """
    Compare two sets of feature-importance results, target by target.

    Both arguments are dicts of the form
    ``{target: {feature: [importance_rep1, importance_rep2, ...]}}``.
    For every target in `result1` the per-feature importances are averaged,
    turned into ranks, and the two rankings are compared with several
    agreement metrics. `result2` must contain every target of `result1`.

    Does not work for more than two datasets.

    Returns a dict with one entry per target (metric values under
    ``"results"`` plus the raw scores and ranks) and an ``"aggregated"``
    entry holding the mean of every metric over all targets.

    NOTE: a function with the same name is defined again in a later cell;
    when the notebook is run top to bottom the later definition is the one
    in effect.

    References:
    #https://towardsdatascience.com/rbo-v-s-kendall-tau-to-compare-ranked-lists-of-items-8776c5182899
    #https://stats.stackexchange.com/questions/51295/comparison-of-ranked-lists
    #https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.weightedtau.html
    """
    keys = result1.keys()
    scores_ = {}
    for target in keys:
        # every key other than the current target is treated as a feature
        ftkeys = [key for key in keys if key != target]

        # average the per-repetition importances of each feature
        m1 = {k: np.mean(v) for k, v in result1[target].items()}
        m2 = {k: np.mean(v) for k, v in result2[target].items()}
        # rank the negated means, so rank 1 goes to the largest mean importance;
        # method="ordinal" gives distinct ranks even when values tie
        x1_rank = st.rankdata(
            [-1 * el for el in m1.values()], method="ordinal"
        )  # avoid tie
        x1_rank_dict = {k: v for k, v in zip(m1.keys(), x1_rank)}

        # NOTE(review): the `else` branch multiplies `el`, which is 0 there, so
        # it always evaluates to zero and adds no jitter; it still draws from
        # the global NumPy RNG once per zero-valued feature.
        x2_rank = st.rankdata(
            [
                -1 * el if el != 0 else el * np.random.randint(1, 10) * 0.00001 * -1
                for el in m2.values()
            ],
            method="ordinal",  # avoid tie
        )  # avoid being zero
        x2_rank_dict = {k: v for k, v in zip(m2.keys(), x2_rank)}

        # collect scores and ranks in a common feature order (that of ftkeys)
        true_score = []
        model_score = []
        true_score_rank = []
        model_score_rank = []
        for key in ftkeys:
            true_score_rank.append(x1_rank_dict[key])
            model_score_rank.append(x2_rank_dict[key])
            true_score.append(m1[key])
            model_score.append(m2[key])

        # ranks concatenated into one string each; stored in the output only
        true_score_rank_join = "".join(str(int(e)) for e in true_score_rank)
        model_score_rank_join = "".join(str(int(e)) for e in model_score_rank)

        #  l_=ndcg_score([true_score_rank],[model_score])
        n_l = ndcg_score([true_score_rank], [model_score_rank])

        # NOTE(review): this helper is not called anywhere in this function.
        def mae_over_max(mae, max_):
            if max_ == 0:
                return 1
            else:
                return mae / max_

        # agreement metrics; all but r2_score work on the ranks, r2_score on
        # the mean importances themselves
        sc = {}
        sc["ndgc_score"] = n_l
        sc["cohen_kappa_score"] = cohen_kappa_score(true_score_rank, model_score_rank)

        sc["r2_score"] = r2_score(true_score, model_score)
        sc["levenshtein_normalized_similarity"] = levenshtein.normalized_similarity(
            true_score_rank, model_score_rank
        )
        sc["kendalltau"] = st.kendalltau(true_score_rank, model_score_rank)[0]
        sc["weightedtau"] = st.weightedtau(true_score_rank, model_score_rank)[0]
        sc["rbo"] = rbo.RankingSimilarity(true_score_rank, model_score_rank).rbo()

        sc["damerau_levenshtein_normalized_similarity"] = (
            damerau_levenshtein.normalized_similarity(true_score_rank, model_score_rank)
        )
        sc["jaro_winkler_normalized_similarity"] = jaro_winkler.normalized_similarity(
            true_score_rank, model_score_rank
        )

        sc["hamming_normalized_similarity"] = hamming.normalized_similarity(
            true_score_rank, model_score_rank
        )

        scores_[target] = {
            "results": sc,
            "true_score": true_score,
            "model_score": model_score,
            "true_score_rank": true_score_rank,
            "model_score_rank": model_score_rank,
            "true_score_rank_join": true_score_rank_join,
            "model_score_rank_join": model_score_rank_join,
        }
    # for aggregated scores: stack one row of metrics per target ...
    full_df = None
    for k, v in scores_.items():
        res_df = pd.DataFrame(scores_[k]["results"], index=[0])
        if full_df is None:
            full_df = res_df
        else:
            full_df = pd.concat([full_df, res_df])

    # ... then append the column-wise mean as an extra row labelled "mean"
    full_df.loc["mean"] = full_df.mean()

    scores_["aggregated"] = full_df.loc["mean"].to_dict()
    return scores_

In [20]:

# for comparison
def get_several_dif_dataset(
    data1_,
    data2_,
    categorical_cols,
    int_cols,
    cv,
    models=[DecisionTreeClassifier, LinearRegression],
):
    """
    Train one model per column on the first dataset and score it on both.

    1. ordinal-encode the categorical columns of both datasets with an
       encoder fitted on the first dataset only
    2. for every column, used as the target:
       2.1 fit a model on an 80% split of the first dataset
       2.2 score it on the held-out 20% of the first dataset and on the
           whole second dataset

    Categorical targets use ``models[0](random_state=42)`` with accuracy,
    other targets use ``models[1]()`` with mean squared error.
    `int_cols` and `cv` are accepted but not used.

    Returns ``{column: (second_score / first_score, first_score, second_score)}``.
    """
    data1 = data1_.copy()
    data2 = data2_.copy()

    reference_cats = data1_[categorical_cols].astype(str)
    encoder = preprocessing.OrdinalEncoder()
    encoder.fit(reference_cats)
    data1[categorical_cols] = encoder.transform(reference_cats)
    data2[categorical_cols] = encoder.transform(data2[categorical_cols].astype(str))

    result = {}
    for target in data1.columns:
        if target in categorical_cols:
            model = models[0](random_state=42)
            metric = accuracy_score
        else:
            model = models[1]()
            metric = mean_squared_error

        features_1 = data1.drop(target, axis=1)
        labels_1 = data1[target]
        features_2 = data2.drop(target, axis=1)
        labels_2 = data2[target]

        X_train1, X_test1, y_train1, y_test1 = train_test_split(
            features_1, labels_1, test_size=0.2, random_state=42
        )
        model.fit(X_train1, y_train1)

        real_real = metric(y_test1, model.predict(X_test1))
        real_synth = metric(labels_2, model.predict(features_2))

        result[target] = (real_synth / real_real, real_real, real_synth)
    return result


In [21]:

def aggregate_data_cross(
    real_data, synth_data, categorical_values, continuous_values, cv
):
    """
    Run get_several_dif_dataset in both directions and combine the results.

    The per-column results of each direction are averaged with ``np.mean``
    (i.e. over the ratio and the two raw scores returned for that column),
    and the synth->real average is divided by the real->synth average.

    Returns a dict with the raw results of both directions under
    ``"real_synth"`` and ``"synth_real"`` and the list of per-column
    quotients under ``"aggregated"``.
    """
    forward = get_several_dif_dataset(
        real_data, synth_data, categorical_values, continuous_values, cv
    )
    backward = get_several_dif_dataset(
        synth_data, real_data, categorical_values, continuous_values, cv
    )

    backward_mean = {col: np.mean(vals) for col, vals in backward.items()}
    forward_mean = {col: np.mean(vals) for col, vals in forward.items()}

    return {
        "real_synth": forward,
        "synth_real": backward,
        "aggregated": [
            backward_mean[col] / forward_mean[col] for col in backward_mean
        ],
    }

In [22]:
def test_two_datasets(
    data,
    data_1,
    categorical_values,
    continuous_values,
    reps=10,
    seed=42,
    models=[DecisionTreeClassifier(), DecisionTreeRegressor()],
):
    """
    Compare two datasets through feature importances and cross-dataset scores.

    1. collect repeated feature importances for `data`
    2. collect repeated feature importances for `data_1`
    3. turn both into ranking-agreement scores
    4. add the cross-dataset scores under the ``"cross"`` key

    Note that the cross-dataset step is called with `data_1` first and
    `data` second.
    """
    importances = [
        get_several_feat_imp_dataset_2(
            frame,
            categorical_values,
            continuous_values,
            rep=reps,
            seed=seed,
            models=models,
        )
        for frame in (data, data_1)
    ]
    scores = create_scores_v2(*importances)

    # The trailing 10 fills the `cv` parameter of aggregate_data_cross.
    scores["cross"] = aggregate_data_cross(
        data_1, data, categorical_values, continuous_values, 10
    )
    return scores

In [23]:
def get_several_feat_imp_dataset_2(
    data,
    categorical_cols,
    int_cols,
    rep=5,
    seed=42,
    test_size=0.05,
    models=None,
):
    """
    Collect repeated feature-importance estimates for every column of `data`.

    Each column in turn is used as the prediction target. For every
    repetition a fresh clone of the matching model is fitted on a random
    sub-sample of the rows, and the importance of every other column is
    recorded.

    Parameters
    ----------
    data : DataFrame with all columns already numeric/encoded.
    categorical_cols : columns modelled with ``models[0]``; every other
        column is modelled with ``models[1]``.
    int_cols : accepted for interface compatibility, not used.
    rep : number of repetitions per target column.
    seed : seed for both ``random`` and ``np.random``.
    test_size : fraction of rows left out of each fit (only used to vary the
        training sample between repetitions).
    models : ``[classifier, regressor]`` estimator instances; defaults to
        ``[DecisionTreeClassifier(), DecisionTreeRegressor()]``.

    Returns
    -------
    dict ``{target: {feature: [v_rep1, v_rep2, ...]}}``. Every column gets an
    entry; with ``rep=0`` the lists are empty.
    """
    if models is None:
        models = [DecisionTreeClassifier(), DecisionTreeRegressor()]

    np.random.seed(seed)
    random.seed(seed)

    result = {}
    for target in data.columns:
        # Loop-invariant for all repetitions of this target.
        X = data.drop(target, axis=1)
        y = data[target]
        template = models[0] if target in categorical_cols else models[1]
        importances = {name: [] for name in data.columns if name != target}

        for _ in range(rep):
            # Draw order (split seed first, estimator seed second) is kept so
            # results stay reproducible for a given `seed`.
            split_seed = random.randint(0, 100)
            model = clone(template)
            if "random_state" in model.get_params():
                model.set_params(random_state=np.random.randint(1, 20))

            # The held-out rows are discarded; the split is just a bootstrap-
            # like way of varying the training sample.
            X_train, _, y_train, _ = train_test_split(
                X, y, test_size=test_size, random_state=split_seed
            )
            model.fit(X_train, y_train)

            if hasattr(model, "feature_importances_"):
                values = model.feature_importances_
            else:
                # No built-in importances: use permutation importance measured
                # on the training rows.
                values = permutation_importance(
                    model,
                    X_train,
                    y_train,
                    n_repeats=15,
                    random_state=split_seed,
                    n_jobs=-2,
                ).importances_mean

            # Importances follow the column order of the training frame; using
            # X_train.columns also works when column names are not strings.
            for name, value in zip(X_train.columns, values):
                importances[name].append(value)

        result[target] = importances
    return result

In [24]:
def create_scores_v2(result1, result2):
    """
    Compare two sets of feature-importance results, target by target.

    Both arguments are dicts of the form
    ``{target: {feature: [importance_rep1, importance_rep2, ...]}}``.
    For every target in `result1` the per-feature importances are averaged,
    ranked (rank 1 = largest mean importance), and the two rankings are
    compared with several agreement metrics. `result2` must contain every
    target of `result1`.

    Does not work for more than two datasets.

    Returns
    -------
    dict with one entry per target holding the metric values under
    ``"results"`` plus the raw scores and ranks, and an ``"aggregated"``
    entry with the mean of every metric over all targets (an empty dict when
    `result1` is empty).

    References:
    #https://towardsdatascience.com/rbo-v-s-kendall-tau-to-compare-ranked-lists-of-items-8776c5182899
    #https://stats.stackexchange.com/questions/51295/comparison-of-ranked-lists
    #https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.weightedtau.html
    """
    targets = list(result1)
    scores_ = {}
    for target in targets:
        # Every key other than the current target is treated as a feature.
        ftkeys = [key for key in targets if key != target]

        # Average the per-repetition importances of each feature.
        m1 = {k: np.mean(v) for k, v in result1[target].items()}
        m2 = {k: np.mean(v) for k, v in result2[target].items()}

        # Rank the negated means so rank 1 is the most important feature;
        # method="ordinal" yields distinct ranks even for tied values.
        x1_rank = st.rankdata([-1 * el for el in m1.values()], method="ordinal")
        x1_rank_dict = dict(zip(m1, x1_rank))

        # NOTE(review): the `else` branch multiplies `el`, which is 0 there,
        # so it always evaluates to zero and adds no jitter. It is kept as is
        # because it draws from the global NumPy RNG and removing it would
        # shift every later random draw.
        x2_rank = st.rankdata(
            [
                -1 * el if el != 0 else el * np.random.randint(1, 10) * 0.00001 * -1
                for el in m2.values()
            ],
            method="ordinal",
        )
        x2_rank_dict = dict(zip(m2, x2_rank))

        # Scores and ranks in a common feature order (that of ftkeys).
        true_score = [m1[key] for key in ftkeys]
        model_score = [m2[key] for key in ftkeys]
        true_score_rank = [x1_rank_dict[key] for key in ftkeys]
        model_score_rank = [x2_rank_dict[key] for key in ftkeys]

        true_score_rank_join = "".join(str(int(e)) for e in true_score_rank)
        model_score_rank_join = "".join(str(int(e)) for e in model_score_rank)

        # All metrics but r2_score compare the ranks; r2_score compares the
        # mean importances themselves.
        sc = {
            "ndgc_score": ndcg_score([true_score_rank], [model_score_rank]),
            "cohen_kappa_score": cohen_kappa_score(true_score_rank, model_score_rank),
            "r2_score": r2_score(true_score, model_score),
            "levenshtein_normalized_similarity": levenshtein.normalized_similarity(
                true_score_rank, model_score_rank
            ),
            "kendalltau": st.kendalltau(true_score_rank, model_score_rank)[0],
            "weightedtau": st.weightedtau(true_score_rank, model_score_rank)[0],
            "rbo": rbo.RankingSimilarity(true_score_rank, model_score_rank).rbo(),
            "damerau_levenshtein_normalized_similarity": (
                damerau_levenshtein.normalized_similarity(
                    true_score_rank, model_score_rank
                )
            ),
            "jaro_winkler_normalized_similarity": jaro_winkler.normalized_similarity(
                true_score_rank, model_score_rank
            ),
            "hamming_normalized_similarity": hamming.normalized_similarity(
                true_score_rank, model_score_rank
            ),
        }

        scores_[target] = {
            "results": sc,
            "true_score": true_score,
            "model_score": model_score,
            "true_score_rank": true_score_rank,
            "model_score_rank": model_score_rank,
            "true_score_rank_join": true_score_rank_join,
            "model_score_rank_join": model_score_rank_join,
        }

    # Aggregated scores: mean of every metric over all targets.
    if not scores_:
        scores_["aggregated"] = {}
        return scores_

    per_target = pd.DataFrame([entry["results"] for entry in scores_.values()])
    scores_["aggregated"] = per_target.mean().to_dict()
    return scores_

In [25]:

# for comparison
def get_several_dif_dataset(
    data1_,
    data2_,
    categorical_cols,
    int_cols,
    cv,
    models=[DecisionTreeClassifier, LinearRegression],
):
    """
    Train one model per column on the first dataset and score it on both.

    This is the gold standard as of now.

    1. ordinal-encode the categorical columns of both datasets with an
       encoder fitted on the first dataset only
    2. for every column, used as the target:
       2.1 fit a model on an 80% split of the first dataset
       2.2 score it on the held-out 20% of the first dataset and on the
           whole second dataset

    Parameters
    ----------
    data1_, data2_ : DataFrames with the same columns; neither is modified.
    categorical_cols : columns that are encoded and, as targets, modelled
        with ``models[0](random_state=42)`` and scored with accuracy. All
        other targets use ``models[1]()`` and mean squared error.
    int_cols, cv : accepted for interface compatibility, not used.
    models : ``[classifier_class, regressor_class]`` (classes, not instances).

    Returns
    -------
    dict ``{column: [second_score / first_score, first_score, second_score]}``.
    The ratio is ``inf`` or ``nan`` when the first score is 0.

    Notes
    -----
    The encoder uses its default settings, so a category present in
    `data2_` but absent from `data1_` makes ``transform`` raise.
    Accuracy is higher-is-better while mean squared error is lower-is-better,
    so the ratio reads in opposite directions for the two kinds of column.
    """
    data1 = data1_.copy()
    data2 = data2_.copy()

    reference_cats = data1_[categorical_cols].astype(str)
    encoder = preprocessing.OrdinalEncoder()
    encoder.fit(reference_cats)
    data1[categorical_cols] = encoder.transform(reference_cats)
    data2[categorical_cols] = encoder.transform(data2[categorical_cols].astype(str))

    result = {}
    for target in data1.columns:
        if target in categorical_cols:
            model = models[0](random_state=42)
            metric = accuracy_score
        else:
            model = models[1]()
            metric = mean_squared_error

        X1 = data1.drop(target, axis=1)
        y1 = data1[target]
        X2 = data2.drop(target, axis=1)
        y2 = data2[target]

        X_train1, X_test1, y_train1, y_test1 = train_test_split(
            X1, y1, test_size=0.2, random_state=42
        )
        model.fit(X_train1, y_train1)
        real_real = metric(y_test1, model.predict(X_test1))
        real_synth = metric(y2, model.predict(X2))

        # Divide as NumPy scalars: a baseline of exactly 0 (perfect MSE or
        # zero accuracy) then yields inf/nan with a RuntimeWarning instead of
        # raising ZeroDivisionError when the metric returns a plain float.
        ratio = np.float64(real_synth) / real_real

        result[target] = [ratio, real_real, real_synth]
    return result


In [26]:

def aggregate_data_cross(
    real_data, synth_data, categorical_values, continuous_values, cv
):
    """
    Score the two datasets against each other in both directions.

    get_several_dif_dataset is run with (real, synth) and with (synth, real).
    For every column the values it returns are averaged with ``np.mean`` and
    the synth->real average is divided by the real->synth average.

    Returns ``{"real_synth": ..., "synth_real": ..., "aggregated": [...]}``
    where the first two hold the raw per-column results and ``"aggregated"``
    the per-column quotients, in the column order of the synth->real run.
    """
    shared_args = (categorical_values, continuous_values, cv)
    real_synth_dif = get_several_dif_dataset(real_data, synth_data, *shared_args)
    synth_real_dif = get_several_dif_dataset(synth_data, real_data, *shared_args)

    def _column_means(per_column):
        # One number per column: the mean of everything stored for it.
        return {name: np.mean(values) for name, values in per_column.items()}

    synth_real_score = _column_means(synth_real_dif)
    real_synth_score = _column_means(real_synth_dif)

    aggregated = []
    for name in synth_real_score:
        aggregated.append(synth_real_score[name] / real_synth_score[name])

    return {
        "real_synth": real_synth_dif,
        "synth_real": synth_real_dif,
        "aggregated": aggregated,
    }

In [27]:

def trial_permutatin(
    data,
    categorical_values,
    continuous_values,
    cv,
    reps=20,
    nr_cols_to_test=7,
    models=None,
):
    """
    Score `data` against copies of itself with 0..nr_cols_to_test columns shuffled.

    For every run ``i`` (number of shuffled columns) and every repetition,
    ``i`` randomly chosen columns of a copy of `data` have their rows
    permuted together, and the copy is compared with the original through
    test_two_datasets.

    Parameters
    ----------
    data : DataFrame to perturb.
    categorical_values, continuous_values : column groups, passed through.
    cv : accepted for interface compatibility, not used.
    reps : repetitions per run; also passed on as the number of
        feature-importance repetitions.
    nr_cols_to_test : largest number of columns to shuffle.
    models : ``[classifier, regressor]`` instances, passed through; defaults
        to ``[DecisionTreeClassifier(), LinearRegression()]``.

    Returns
    -------
    dict ``{"run <i>": result}``. With ``reps=0`` each run holds the
    placeholder ``{"cross": []}``.

    Raises
    ------
    ValueError (from ``random.sample``) if a run asks for more columns than
    `data` has.

    NOTE(review): only the result of the LAST repetition of each run is
    stored; earlier repetitions are computed and then overwritten. Keeping
    them all would change the shape of the returned dict, so this is left as
    is -- confirm whether that is intended.
    """
    if models is None:
        models = [DecisionTreeClassifier(), LinearRegression()]

    plot_data = {}
    n_columns = len(data.columns)
    for i in range(nr_cols_to_test + 1):  # number of shuffled columns
        # Start from the placeholder so `res` is always bound for this run,
        # even when there are no repetitions.
        res = {"cross": []}
        print(f"run nr {i}", "++" * 40)

        for j in range(reps):  # number of repetitions
            print("reps", str(j + 1))
            # Seeding with the repetition index makes the column choice the
            # same across runs for a given repetition.
            random.seed(j)
            data_1 = data.copy()
            if i > 0:
                cols_to_shuffle = random.sample(range(n_columns), i)
                print(cols_to_shuffle)
                print(data.columns[cols_to_shuffle])

                # Rows of the selected columns are permuted as a block.
                data_1.iloc[:, cols_to_shuffle] = np.random.permutation(
                    data_1.iloc[:, cols_to_shuffle].values
                )
            seed = np.random.randint(1, 20)
            res = test_two_datasets(
                data,
                data_1,
                categorical_values,
                continuous_values,
                reps,
                seed,
                models,
            )

        plot_data["run " + str(i)] = res

    return plot_data

## Data

In [28]:
# Load the six test datasets and split each one's columns into continuous and
# categorical. For every dataset the categorical columns are simply all
# columns that are not listed as continuous.
#synth_data=pd.read_csv("synth_pop_2.csv",index_col=0,dtype=float).reset_index(drop=True)
# dataset 1 (all values read as float)
real_data=pd.read_csv("real_data_testing.csv",index_col=0,dtype=float).reset_index(drop=True)
# column groups for dataset 1
continuous_values = [
    "Age",
    "trestbps",
    "chol",
    "thalach",
    "oldpeak"
]
# NOTE(review): NA_REPLACE is not referenced anywhere else in the visible code.
NA_REPLACE=["?"]
categorical_values = real_data[real_data.columns.difference(continuous_values)].columns
# bare expression: a notebook only displays it when it is the last line of a cell
categorical_values
# dataset 2 (the "id" column is dropped)
real_data2=pd.read_csv("real_data2_testing.csv",index_col=0,dtype=float).reset_index(drop=True)
real_data2.drop(columns=["id"],inplace=True)

continuous_values2=["Clump_Thickness","Uniformity_of_Cell_Size","Uniformity_of_Cell_Shape","Marginal_Adhesion","Single_Epithelial_Cell_Size","Bare_Nuclei","Bland_Chromatin","Normal_Nucleoli","Mitoses"]
categorical_values2 = real_data2[real_data2.columns.difference(continuous_values2)].columns
real_data2
# dataset 3
real_data3=pd.read_csv("real_data3_testing.csv",index_col=0,dtype=float).reset_index(drop=True)
continuous_values3=["mcv","alkphos","sgpt","sgot","gammagt","drinks"]
categorical_values3 = real_data3[real_data3.columns.difference(continuous_values3)].columns
# dataset 4
real_data4=pd.read_csv("real_data4_testing.csv",index_col=0,dtype=float).reset_index(drop=True)
continuous_values4=["T3","TST","TSTRI","TSH","TMAX"]
categorical_values4 = real_data4[real_data4.columns.difference(continuous_values4)].columns

# dataset 5 (read as strings, unlike datasets 1-4)
real_data5=pd.read_csv("real_data5_testing.csv",index_col=0,dtype=str).reset_index(drop=True)
continuous_values5=["Age_linear"]
categorical_values5 = real_data5[real_data5.columns.difference(continuous_values5)].columns
categorical_values5
# dataset 6 (dtypes inferred by pandas)
real_data6=pd.read_csv("real_data6_testing.csv",index_col=0).reset_index(drop=True)

# A list holding one empty string: unless a column is literally named "",
# nothing is excluded and every column counts as categorical.
continuous_values6=[""]
categorical_values6 = real_data6[real_data6.columns.difference(continuous_values6)].columns
real_data6

Unnamed: 0,class,age,sex,histologic-type,degree-of-diffe,bone,bone-marrow,lung,pleura,peritoneum,liver,brain,skin,neck,supraclavicular,axillar,mediastinum,abdominal
0,1,1,1,2,3,2,2,1,2,2,2,2,2,2,2,2,2,2
1,1,1,1,2,3,2,2,2,2,2,1,2,2,2,1,2,1,2
2,1,1,2,2,3,1,2,2,2,2,2,2,2,2,2,2,1,2
3,1,1,2,2,3,1,2,1,1,2,2,2,2,2,2,2,1,2
4,1,1,2,2,3,1,2,1,1,2,2,2,2,2,2,2,1,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
334,22,2,2,2,3,2,2,2,2,2,2,2,2,2,2,1,2,2
335,22,2,2,2,3,2,2,2,2,2,2,2,2,2,2,1,2,2
336,22,2,2,2,3,1,2,2,2,2,2,2,2,2,1,1,2,2
337,22,3,2,2,2,2,2,2,2,2,2,2,2,1,1,1,2,2


## Trials

### Decision Tree

In [41]:
# Recursively convert NumPy types in a nested structure to built-in Python types
def convert_numpy_types(obj):
    """
    Return a copy of `obj` with NumPy values replaced by built-in Python ones.

    Dicts, lists and tuples are walked recursively (dict keys are left
    untouched). NumPy integers, floats and booleans become ``int``,
    ``float`` and ``bool``; arrays become nested lists. Anything else is
    returned unchanged. The result is meant to be passed to ``json.dump``.
    """
    if isinstance(obj, dict):
        return {k: convert_numpy_types(v) for k, v in obj.items()}
    if isinstance(obj, list):
        return [convert_numpy_types(v) for v in obj]
    if isinstance(obj, tuple):
        # Tuples keep their type; json serialises them as arrays anyway.
        return tuple(convert_numpy_types(v) for v in obj)
    if isinstance(obj, np.ndarray):
        return obj.tolist()
    if isinstance(obj, np.bool_):
        # np.bool_ is not a subclass of bool and is rejected by json.
        return bool(obj)
    if isinstance(obj, np.integer):
        return int(obj)
    if isinstance(obj, np.floating):
        return float(obj)
    return obj


In [85]:
# Decision-tree trial on dataset 1 (default nr_cols_to_test and models).
plot_data=trial_permutatin(real_data,categorical_values,continuous_values,cv,reps=10)
# Write the results to dt_1.json
with open("dt_1.json", "w") as outfile:
    json.dump(convert_numpy_types(plot_data), outfile)

run nr 0 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
reps 1
reps 2
reps 3
reps 4
reps 5
reps 6
reps 7
reps 8
reps 9
reps 10
run nr 1 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
reps 1
[13]
Index(['num'], dtype='object')
reps 2
[2]
Index(['cp'], dtype='object')
reps 3
[13]
Index(['num'], dtype='object')
reps 4
[3]
Index(['trestbps'], dtype='object')
reps 5
[3]
Index(['trestbps'], dtype='object')
reps 6
[9]
Index(['oldpeak'], dtype='object')
reps 7
[12]
Index(['thal'], dtype='object')
reps 8
[5]
Index(['fbs'], dtype='object')
reps 9
[3]
Index(['trestbps'], dtype='object')
reps 10
[7]
Index(['thalach'], dtype='object')
run nr 2 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
reps 1
[13, 6]
Index(['num', 'restecg'], dtype='object')
reps 2
[2, 9]
Index(['cp', 'oldpeak'], dtype='object')
reps 3
[13, 0]
Index(['num', 'Age'], dtype='object')
reps 4
[3, 9]
Index(['trestbps', 'oldpeak'], dt

In [82]:


# Decision-tree trial on dataset 2, shuffling up to 5 columns.
plot_data2=trial_permutatin(real_data2,categorical_values2,continuous_values2,cv,reps=10,nr_cols_to_test=5)
# Write the results to dt_2.json
with open("dt_2.json", "w") as outfile:
    json.dump(convert_numpy_types(plot_data2), outfile)


run nr 0 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
reps 1
reps 2
reps 3
reps 4
reps 5
reps 6
reps 7
reps 8
reps 9
reps 10
run nr 1 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
reps 1
[6]
Index(['Bland_Chromatin'], dtype='object')
reps 2
[2]
Index(['Uniformity_of_Cell_Shape'], dtype='object')
reps 3
[0]
Index(['Clump_Thickness'], dtype='object')
reps 4
[3]
Index(['Marginal_Adhesion'], dtype='object')
reps 5
[3]
Index(['Marginal_Adhesion'], dtype='object')
reps 6
[9]
Index(['Class'], dtype='object')
reps 7
[9]
Index(['Class'], dtype='object')
reps 8
[5]
Index(['Bare_Nuclei'], dtype='object')
reps 9
[3]
Index(['Marginal_Adhesion'], dtype='object')
reps 10
[7]
Index(['Normal_Nucleoli'], dtype='object')
run nr 2 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
reps 1
[6, 9]
Index(['Bland_Chromatin', 'Class'], dtype='object')
reps 2
[2, 1]
Index(['Uniformity_of_Cell_Shape', 'Uniformity_

In [83]:

# Decision-tree trial on dataset 3, shuffling up to 4 columns.
plot_data3=trial_permutatin(real_data3,categorical_values3,continuous_values3,cv,reps=10,nr_cols_to_test=4)
# Write the results to dt_3.json
with open("dt_3.json", "w") as outfile:
    json.dump(convert_numpy_types(plot_data3), outfile)

# Decision-tree trial on dataset 4, shuffling up to 4 columns.
plot_data4=trial_permutatin(real_data4,categorical_values4,continuous_values4,cv,reps=10,nr_cols_to_test=4)
# Write the results to dt_4.json
with open("dt_4.json", "w") as outfile:
    json.dump(convert_numpy_types(plot_data4), outfile)

 
# Decision-tree trial on dataset 6, shuffling up to 9 columns.
# (No trial is run for dataset 5 in this cell.)
plot_data6=trial_permutatin(real_data6,categorical_values6,continuous_values6,cv,reps=10,nr_cols_to_test=9)
# Write the results to dt_6.json
with open("dt_6.json", "w") as outfile:
    json.dump(convert_numpy_types(plot_data6), outfile)
 

run nr 0 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
reps 1
reps 2
reps 3
reps 4
reps 5
reps 6
reps 7
reps 8
reps 9
reps 10
run nr 1 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
reps 1
[6]
Index(['Selector'], dtype='object')
reps 2
[1]
Index(['alkphos'], dtype='object')
reps 3
[6]
Index(['Selector'], dtype='object')
reps 4
[1]
Index(['alkphos'], dtype='object')
reps 5
[1]
Index(['alkphos'], dtype='object')
reps 6
[4]
Index(['gammagt'], dtype='object')
reps 7
[6]
Index(['Selector'], dtype='object')
reps 8
[2]
Index(['sgpt'], dtype='object')
reps 9
[1]
Index(['alkphos'], dtype='object')
reps 10
[3]
Index(['sgot'], dtype='object')
run nr 2 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
reps 1
[6, 3]
Index(['Selector', 'sgot'], dtype='object')
reps 2
[1, 4]
Index(['alkphos', 'gammagt'], dtype='object')
reps 3
[6, 0]
Index(['Selector', 'mcv'], dtype='object')
reps 4
[1, 4]
Index(['alkp

### Random Forest

In [55]:

# Random-forest trial on dataset 1 (default nr_cols_to_test).
plot_data_rf=trial_permutatin(real_data,categorical_values,continuous_values,cv,reps=10,models=[RandomForestClassifier(),RandomForestRegressor()])
# Original note: "145min" -- presumably the runtime of this cell; confirm.
# Write the results to rf_1.json
with open("rf_1.json", "w") as outfile:
    json.dump(convert_numpy_types(plot_data_rf), outfile)
 

run nr 0 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
reps 1
reps 2
reps 3
reps 4
reps 5
reps 6
reps 7
reps 8
reps 9
reps 10
run nr 1 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
reps 1
[13]
Index(['num'], dtype='object')
reps 2
[2]
Index(['cp'], dtype='object')
reps 3
[13]
Index(['num'], dtype='object')
reps 4
[3]
Index(['trestbps'], dtype='object')
reps 5
[3]
Index(['trestbps'], dtype='object')
reps 6
[9]
Index(['oldpeak'], dtype='object')
reps 7
[12]
Index(['thal'], dtype='object')
reps 8
[5]
Index(['fbs'], dtype='object')
reps 9
[3]
Index(['trestbps'], dtype='object')
reps 10
[7]
Index(['thalach'], dtype='object')
run nr 2 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
reps 1
[13, 6]
Index(['num', 'restecg'], dtype='object')
reps 2
[2, 9]
Index(['cp', 'oldpeak'], dtype='object')
reps 3
[13, 0]
Index(['num', 'Age'], dtype='object')
reps 4
[3, 9]
Index(['trestbps', 'oldpeak'], dt

In [56]:
# Random Forest permutation trial on real_data2 (10 reps, nr_cols_to_test=5).
plot_data_rf2 = trial_permutatin(
    real_data2,
    categorical_values2,
    continuous_values2,
    cv,
    reps=10,
    nr_cols_to_test=5,
    models=[RandomForestClassifier(), RandomForestRegressor()],
)
# Persist to "rf_2.json"; convert_numpy_types (defined elsewhere) presumably
# makes the result JSON-serializable.
with open("rf_2.json", "w") as outfile:
    json.dump(convert_numpy_types(plot_data_rf2), outfile)

run nr 0 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
reps 1
reps 2
reps 3
reps 4
reps 5
reps 6
reps 7
reps 8
reps 9
reps 10
run nr 1 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
reps 1
[6]
Index(['Bland_Chromatin'], dtype='object')
reps 2
[2]
Index(['Uniformity_of_Cell_Shape'], dtype='object')
reps 3
[0]
Index(['Clump_Thickness'], dtype='object')
reps 4
[3]
Index(['Marginal_Adhesion'], dtype='object')
reps 5
[3]
Index(['Marginal_Adhesion'], dtype='object')
reps 6
[9]
Index(['Class'], dtype='object')
reps 7
[9]
Index(['Class'], dtype='object')
reps 8
[5]
Index(['Bare_Nuclei'], dtype='object')
reps 9
[3]
Index(['Marginal_Adhesion'], dtype='object')
reps 10
[7]
Index(['Normal_Nucleoli'], dtype='object')
run nr 2 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
reps 1
[6, 9]
Index(['Bland_Chromatin', 'Class'], dtype='object')
reps 2
[2, 1]
Index(['Uniformity_of_Cell_Shape', 'Uniformity_

In [57]:
# Random Forest permutation trial on real_data3 (10 reps, nr_cols_to_test=4).
plot_data_rf3 = trial_permutatin(
    real_data3,
    categorical_values3,
    continuous_values3,
    cv,
    reps=10,
    nr_cols_to_test=4,
    models=[RandomForestClassifier(), RandomForestRegressor()],
)
# Persist to "rf_3.json"; the raw result is passed through convert_numpy_types
# (defined elsewhere) before dumping, presumably for JSON compatibility.
with open("rf_3.json", "w") as outfile:
    json.dump(convert_numpy_types(plot_data_rf3), outfile)

run nr 0 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
reps 1
reps 2
reps 3
reps 4
reps 5
reps 6
reps 7
reps 8
reps 9
reps 10
run nr 1 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
reps 1
[6]
Index(['Selector'], dtype='object')
reps 2
[1]
Index(['alkphos'], dtype='object')
reps 3
[6]
Index(['Selector'], dtype='object')
reps 4
[1]
Index(['alkphos'], dtype='object')
reps 5
[1]
Index(['alkphos'], dtype='object')
reps 6
[4]
Index(['gammagt'], dtype='object')
reps 7
[6]
Index(['Selector'], dtype='object')
reps 8
[2]
Index(['sgpt'], dtype='object')
reps 9
[1]
Index(['alkphos'], dtype='object')
reps 10
[3]
Index(['sgot'], dtype='object')
run nr 2 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
reps 1
[6, 3]
Index(['Selector', 'sgot'], dtype='object')
reps 2
[1, 4]
Index(['alkphos', 'gammagt'], dtype='object')
reps 3
[6, 0]
Index(['Selector', 'mcv'], dtype='object')
reps 4
[1, 4]
Index(['alkp

In [58]:
# Random Forest permutation trial on real_data4 (10 reps, nr_cols_to_test=4).
plot_data_rf4 = trial_permutatin(
    real_data4,
    categorical_values4,
    continuous_values4,
    cv,
    reps=10,
    nr_cols_to_test=4,
    models=[RandomForestClassifier(), RandomForestRegressor()],
)
# Persist to "rf_4.json"; the raw result is passed through convert_numpy_types
# (defined elsewhere) before dumping, presumably for JSON compatibility.
with open("rf_4.json", "w") as outfile:
    json.dump(convert_numpy_types(plot_data_rf4), outfile)

run nr 0 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
reps 1
reps 2
reps 3
reps 4
reps 5
reps 6
reps 7
reps 8
reps 9
reps 10
run nr 1 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
reps 1
[3]
Index(['TSTRI'], dtype='object')
reps 2
[1]
Index(['T3'], dtype='object')
reps 3
[0]
Index(['Class'], dtype='object')
reps 4
[1]
Index(['T3'], dtype='object')
reps 5
[1]
Index(['T3'], dtype='object')
reps 6
[4]
Index(['TSH'], dtype='object')
reps 7
[4]
Index(['TSH'], dtype='object')
reps 8
[2]
Index(['TST'], dtype='object')
reps 9
[1]
Index(['T3'], dtype='object')
reps 10
[3]
Index(['TSTRI'], dtype='object')
run nr 2 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
reps 1
[3, 5]
Index(['TSTRI', 'TMAX'], dtype='object')
reps 2
[1, 4]
Index(['T3', 'TSH'], dtype='object')
reps 3
[0, 5]
Index(['Class', 'TMAX'], dtype='object')
reps 4
[1, 4]
Index(['T3', 'TSH'], dtype='object')
reps 5
[1, 2]
Index(['T3

In [59]:
# Random Forest permutation trial on real_data6 (10 reps, nr_cols_to_test=9).
plot_data_rf6 = trial_permutatin(
    real_data6,
    categorical_values6,
    continuous_values6,
    cv,
    reps=10,
    nr_cols_to_test=9,
    models=[RandomForestClassifier(), RandomForestRegressor()],
)
# Persist to "rf_6.json"; the raw result is passed through convert_numpy_types
# (defined elsewhere) before dumping, presumably for JSON compatibility.
with open("rf_6.json", "w") as outfile:
    json.dump(convert_numpy_types(plot_data_rf6), outfile)

run nr 0 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
reps 1
reps 2
reps 3
reps 4
reps 5
reps 6
reps 7
reps 8
reps 9
reps 10
run nr 1 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
reps 1
[12]
Index(['skin'], dtype='object')
reps 2
[4]
Index(['degree-of-diffe'], dtype='object')
reps 3
[1]
Index(['age'], dtype='object')
reps 4
[7]
Index(['lung'], dtype='object')
reps 5
[7]
Index(['lung'], dtype='object')
reps 6
[8]
Index(['pleura'], dtype='object')
reps 7
[2]
Index(['sex'], dtype='object')
reps 8
[10]
Index(['liver'], dtype='object')
reps 9
[7]
Index(['lung'], dtype='object')
reps 10
[14]
Index(['supraclavicular'], dtype='object')
run nr 2 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
reps 1
[12, 13]
Index(['skin', 'neck'], dtype='object')
reps 2
[4, 2]
Index(['degree-of-diffe', 'sex'], dtype='object')
reps 3
[1, 2]
Index(['age', 'sex'], dtype='object')
reps 4
[7, 4]
Index(['lung', '

### KNN

In [60]:
# k-nearest-neighbours permutation trial on real_data (10 reps, default
# nr_cols_to_test), using a classifier/regressor pair passed via `models`.
plot_data_knn = trial_permutatin(
    real_data,
    categorical_values,
    continuous_values,
    cv,
    reps=10,
    models=[KNeighborsClassifier(), KNeighborsRegressor()],
)
# Persist to "knn_1.json"; convert_numpy_types (defined elsewhere) presumably
# makes the result JSON-serializable.
with open("knn_1.json", "w") as outfile:
    json.dump(convert_numpy_types(plot_data_knn), outfile)

run nr 0 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
reps 1
reps 2
reps 3
reps 4
reps 5
reps 6
reps 7
reps 8
reps 9
reps 10
run nr 1 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
reps 1
[13]
Index(['num'], dtype='object')
reps 2
[2]
Index(['cp'], dtype='object')
reps 3
[13]
Index(['num'], dtype='object')
reps 4
[3]
Index(['trestbps'], dtype='object')
reps 5
[3]
Index(['trestbps'], dtype='object')
reps 6
[9]
Index(['oldpeak'], dtype='object')
reps 7
[12]
Index(['thal'], dtype='object')
reps 8
[5]
Index(['fbs'], dtype='object')
reps 9
[3]
Index(['trestbps'], dtype='object')
reps 10
[7]
Index(['thalach'], dtype='object')
run nr 2 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
reps 1
[13, 6]
Index(['num', 'restecg'], dtype='object')
reps 2
[2, 9]
Index(['cp', 'oldpeak'], dtype='object')
reps 3
[13, 0]
Index(['num', 'Age'], dtype='object')
reps 4
[3, 9]
Index(['trestbps', 'oldpeak'], dt

In [61]:
# k-nearest-neighbours permutation trial on real_data2 (10 reps,
# nr_cols_to_test=5).
plot_data_knn2 = trial_permutatin(
    real_data2,
    categorical_values2,
    continuous_values2,
    cv,
    reps=10,
    nr_cols_to_test=5,
    models=[KNeighborsClassifier(), KNeighborsRegressor()],
)
# Persist to "knn_2.json"; convert_numpy_types (defined elsewhere) presumably
# makes the result JSON-serializable.
with open("knn_2.json", "w") as outfile:
    json.dump(convert_numpy_types(plot_data_knn2), outfile)

run nr 0 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
reps 1
reps 2
reps 3
reps 4
reps 5
reps 6
reps 7
reps 8
reps 9
reps 10
run nr 1 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
reps 1
[6]
Index(['Bland_Chromatin'], dtype='object')
reps 2
[2]
Index(['Uniformity_of_Cell_Shape'], dtype='object')
reps 3
[0]
Index(['Clump_Thickness'], dtype='object')
reps 4
[3]
Index(['Marginal_Adhesion'], dtype='object')
reps 5
[3]
Index(['Marginal_Adhesion'], dtype='object')
reps 6
[9]
Index(['Class'], dtype='object')
reps 7
[9]
Index(['Class'], dtype='object')
reps 8
[5]
Index(['Bare_Nuclei'], dtype='object')
reps 9
[3]
Index(['Marginal_Adhesion'], dtype='object')
reps 10
[7]
Index(['Normal_Nucleoli'], dtype='object')
run nr 2 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
reps 1
[6, 9]
Index(['Bland_Chromatin', 'Class'], dtype='object')
reps 2
[2, 1]
Index(['Uniformity_of_Cell_Shape', 'Uniformity_

In [62]:
# k-nearest-neighbours permutation trial on real_data3 (10 reps,
# nr_cols_to_test=4).
plot_data_knn3 = trial_permutatin(
    real_data3,
    categorical_values3,
    continuous_values3,
    cv,
    reps=10,
    nr_cols_to_test=4,
    models=[KNeighborsClassifier(), KNeighborsRegressor()],
)
# Persist to "knn_3.json"; the raw result is passed through
# convert_numpy_types (defined elsewhere), presumably for JSON compatibility.
with open("knn_3.json", "w") as outfile:
    json.dump(convert_numpy_types(plot_data_knn3), outfile)

run nr 0 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
reps 1
reps 2
reps 3
reps 4
reps 5
reps 6
reps 7
reps 8
reps 9
reps 10
run nr 1 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
reps 1
[6]
Index(['Selector'], dtype='object')
reps 2
[1]
Index(['alkphos'], dtype='object')
reps 3
[6]
Index(['Selector'], dtype='object')
reps 4
[1]
Index(['alkphos'], dtype='object')
reps 5
[1]
Index(['alkphos'], dtype='object')
reps 6
[4]
Index(['gammagt'], dtype='object')
reps 7
[6]
Index(['Selector'], dtype='object')
reps 8
[2]
Index(['sgpt'], dtype='object')
reps 9
[1]
Index(['alkphos'], dtype='object')
reps 10
[3]
Index(['sgot'], dtype='object')
run nr 2 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
reps 1
[6, 3]
Index(['Selector', 'sgot'], dtype='object')
reps 2
[1, 4]
Index(['alkphos', 'gammagt'], dtype='object')
reps 3
[6, 0]
Index(['Selector', 'mcv'], dtype='object')
reps 4
[1, 4]
Index(['alkp

In [63]:
# k-nearest-neighbours permutation trial on real_data4 (10 reps,
# nr_cols_to_test=4).
plot_data_knn4 = trial_permutatin(
    real_data4,
    categorical_values4,
    continuous_values4,
    cv,
    reps=10,
    nr_cols_to_test=4,
    models=[KNeighborsClassifier(), KNeighborsRegressor()],
)
# Persist to "knn_4.json"; the raw result is passed through
# convert_numpy_types (defined elsewhere), presumably for JSON compatibility.
with open("knn_4.json", "w") as outfile:
    json.dump(convert_numpy_types(plot_data_knn4), outfile)

run nr 0 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
reps 1
reps 2
reps 3
reps 4
reps 5
reps 6
reps 7
reps 8
reps 9
reps 10
run nr 1 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
reps 1
[3]
Index(['TSTRI'], dtype='object')
reps 2
[1]
Index(['T3'], dtype='object')
reps 3
[0]
Index(['Class'], dtype='object')
reps 4
[1]
Index(['T3'], dtype='object')
reps 5
[1]
Index(['T3'], dtype='object')
reps 6
[4]
Index(['TSH'], dtype='object')
reps 7
[4]
Index(['TSH'], dtype='object')
reps 8
[2]
Index(['TST'], dtype='object')
reps 9
[1]
Index(['T3'], dtype='object')
reps 10
[3]
Index(['TSTRI'], dtype='object')
run nr 2 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
reps 1
[3, 5]
Index(['TSTRI', 'TMAX'], dtype='object')
reps 2
[1, 4]
Index(['T3', 'TSH'], dtype='object')
reps 3
[0, 5]
Index(['Class', 'TMAX'], dtype='object')
reps 4
[1, 4]
Index(['T3', 'TSH'], dtype='object')
reps 5
[1, 2]
Index(['T3

In [64]:
# k-nearest-neighbours permutation trial on real_data6 (10 reps,
# nr_cols_to_test=9).
plot_data_knn6 = trial_permutatin(
    real_data6,
    categorical_values6,
    continuous_values6,
    cv,
    reps=10,
    nr_cols_to_test=9,
    models=[KNeighborsClassifier(), KNeighborsRegressor()],
)
# Persist to "knn_6.json"; the raw result is passed through
# convert_numpy_types (defined elsewhere), presumably for JSON compatibility.
with open("knn_6.json", "w") as outfile:
    json.dump(convert_numpy_types(plot_data_knn6), outfile)

run nr 0 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
reps 1
reps 2
reps 3
reps 4
reps 5
reps 6
reps 7
reps 8
reps 9
reps 10
run nr 1 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
reps 1
[12]
Index(['skin'], dtype='object')
reps 2
[4]
Index(['degree-of-diffe'], dtype='object')
reps 3
[1]
Index(['age'], dtype='object')
reps 4
[7]
Index(['lung'], dtype='object')
reps 5
[7]
Index(['lung'], dtype='object')
reps 6
[8]
Index(['pleura'], dtype='object')
reps 7
[2]
Index(['sex'], dtype='object')
reps 8
[10]
Index(['liver'], dtype='object')
reps 9
[7]
Index(['lung'], dtype='object')
reps 10
[14]
Index(['supraclavicular'], dtype='object')
run nr 2 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
reps 1
[12, 13]
Index(['skin', 'neck'], dtype='object')
reps 2
[4, 2]
Index(['degree-of-diffe', 'sex'], dtype='object')
reps 3
[1, 2]
Index(['age', 'sex'], dtype='object')
reps 4
[7, 4]
Index(['lung', '

### SVM

In [65]:
# Support-vector permutation trial on real_data (10 reps, default
# nr_cols_to_test), using SVC for classification and SVR for regression.
plot_data_svm = trial_permutatin(
    real_data,
    categorical_values,
    continuous_values,
    cv,
    reps=10,
    models=[SVC(), SVR()],
)
# Persist to "SVM_1.json". NOTE(review): this file name is upper-case while
# the other SVM cells write lower-case "svm_N.json" -- keep in mind on
# case-sensitive file systems when loading the results.
with open("SVM_1.json", "w") as outfile:
    json.dump(convert_numpy_types(plot_data_svm), outfile)

run nr 0 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
reps 1
reps 2
reps 3
reps 4
reps 5
reps 6
reps 7
reps 8
reps 9
reps 10
run nr 1 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
reps 1
[13]
Index(['num'], dtype='object')
reps 2
[2]
Index(['cp'], dtype='object')
reps 3
[13]
Index(['num'], dtype='object')
reps 4
[3]
Index(['trestbps'], dtype='object')
reps 5
[3]
Index(['trestbps'], dtype='object')
reps 6
[9]
Index(['oldpeak'], dtype='object')
reps 7
[12]
Index(['thal'], dtype='object')
reps 8
[5]
Index(['fbs'], dtype='object')
reps 9
[3]
Index(['trestbps'], dtype='object')
reps 10
[7]
Index(['thalach'], dtype='object')
run nr 2 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
reps 1
[13, 6]
Index(['num', 'restecg'], dtype='object')
reps 2
[2, 9]
Index(['cp', 'oldpeak'], dtype='object')
reps 3
[13, 0]
Index(['num', 'Age'], dtype='object')
reps 4
[3, 9]
Index(['trestbps', 'oldpeak'], dt

In [66]:
# Support-vector permutation trial on real_data2 (10 reps, nr_cols_to_test=5).
plot_data_svm2 = trial_permutatin(
    real_data2,
    categorical_values2,
    continuous_values2,
    cv,
    reps=10,
    nr_cols_to_test=5,
    models=[SVC(), SVR()],
)
# Persist to "svm_2.json"; convert_numpy_types (defined elsewhere) presumably
# makes the result JSON-serializable.
with open("svm_2.json", "w") as outfile:
    json.dump(convert_numpy_types(plot_data_svm2), outfile)
 

run nr 0 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
reps 1
reps 2
reps 3
reps 4
reps 5
reps 6
reps 7
reps 8
reps 9
reps 10
run nr 1 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
reps 1
[6]
Index(['Bland_Chromatin'], dtype='object')
reps 2
[2]
Index(['Uniformity_of_Cell_Shape'], dtype='object')
reps 3
[0]
Index(['Clump_Thickness'], dtype='object')
reps 4
[3]
Index(['Marginal_Adhesion'], dtype='object')
reps 5
[3]
Index(['Marginal_Adhesion'], dtype='object')
reps 6
[9]
Index(['Class'], dtype='object')
reps 7
[9]
Index(['Class'], dtype='object')
reps 8
[5]
Index(['Bare_Nuclei'], dtype='object')
reps 9
[3]
Index(['Marginal_Adhesion'], dtype='object')
reps 10
[7]
Index(['Normal_Nucleoli'], dtype='object')
run nr 2 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
reps 1
[6, 9]
Index(['Bland_Chromatin', 'Class'], dtype='object')
reps 2
[2, 1]
Index(['Uniformity_of_Cell_Shape', 'Uniformity_



reps 3
[0, 1, 8, 2]
Index(['Clump_Thickness', 'Uniformity_of_Cell_Size', 'Mitoses',
       'Uniformity_of_Cell_Shape'],
      dtype='object')
reps 4
[3, 8, 2, 7]
Index(['Marginal_Adhesion', 'Mitoses', 'Uniformity_of_Cell_Shape',
       'Normal_Nucleoli'],
      dtype='object')
reps 5
[3, 4, 1, 5]
Index(['Marginal_Adhesion', 'Single_Epithelial_Cell_Size',
       'Uniformity_of_Cell_Size', 'Bare_Nuclei'],
      dtype='object')
reps 6
[9, 4, 5, 6]
Index(['Class', 'Single_Epithelial_Cell_Size', 'Bare_Nuclei',
       'Bland_Chromatin'],
      dtype='object')
reps 7
[9, 1, 7, 6]
Index(['Class', 'Uniformity_of_Cell_Size', 'Normal_Nucleoli',
       'Bland_Chromatin'],
      dtype='object')
reps 8
[5, 2, 6, 9]
Index(['Bare_Nuclei', 'Uniformity_of_Cell_Shape', 'Bland_Chromatin', 'Class'], dtype='object')
reps 9
[3, 5, 6, 1]
Index(['Marginal_Adhesion', 'Bare_Nuclei', 'Bland_Chromatin',
       'Uniformity_of_Cell_Size'],
      dtype='object')
reps 10
[7, 5, 4, 1]
Index(['Normal_Nucleoli', 'Bare_Nu

In [67]:
# Support-vector permutation trial on real_data3 (10 reps, nr_cols_to_test=4).
plot_data_svm3 = trial_permutatin(
    real_data3,
    categorical_values3,
    continuous_values3,
    cv,
    reps=10,
    nr_cols_to_test=4,
    models=[SVC(), SVR()],
)
# Persist to "svm_3.json"; the raw result is passed through
# convert_numpy_types (defined elsewhere), presumably for JSON compatibility.
with open("svm_3.json", "w") as outfile:
    json.dump(convert_numpy_types(plot_data_svm3), outfile)

run nr 0 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
reps 1
reps 2
reps 3
reps 4
reps 5
reps 6
reps 7
reps 8
reps 9
reps 10
run nr 1 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
reps 1
[6]
Index(['Selector'], dtype='object')
reps 2
[1]
Index(['alkphos'], dtype='object')
reps 3
[6]
Index(['Selector'], dtype='object')
reps 4
[1]
Index(['alkphos'], dtype='object')
reps 5
[1]
Index(['alkphos'], dtype='object')
reps 6
[4]
Index(['gammagt'], dtype='object')
reps 7
[6]
Index(['Selector'], dtype='object')
reps 8
[2]
Index(['sgpt'], dtype='object')
reps 9
[1]
Index(['alkphos'], dtype='object')
reps 10
[3]
Index(['sgot'], dtype='object')
run nr 2 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
reps 1
[6, 3]
Index(['Selector', 'sgot'], dtype='object')
reps 2
[1, 4]
Index(['alkphos', 'gammagt'], dtype='object')
reps 3
[6, 0]
Index(['Selector', 'mcv'], dtype='object')
reps 4
[1, 4]
Index(['alkp

In [68]:
# Support-vector permutation trial on real_data4 (10 reps, nr_cols_to_test=4).
plot_data_svm4 = trial_permutatin(
    real_data4,
    categorical_values4,
    continuous_values4,
    cv,
    reps=10,
    nr_cols_to_test=4,
    models=[SVC(), SVR()],
)
# Persist to "svm_4.json"; the raw result is passed through
# convert_numpy_types (defined elsewhere), presumably for JSON compatibility.
with open("svm_4.json", "w") as outfile:
    json.dump(convert_numpy_types(plot_data_svm4), outfile)

run nr 0 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
reps 1
reps 2
reps 3
reps 4
reps 5
reps 6
reps 7
reps 8
reps 9
reps 10
run nr 1 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
reps 1
[3]
Index(['TSTRI'], dtype='object')
reps 2
[1]
Index(['T3'], dtype='object')
reps 3
[0]
Index(['Class'], dtype='object')
reps 4
[1]
Index(['T3'], dtype='object')
reps 5
[1]
Index(['T3'], dtype='object')
reps 6
[4]
Index(['TSH'], dtype='object')
reps 7
[4]
Index(['TSH'], dtype='object')
reps 8
[2]
Index(['TST'], dtype='object')
reps 9
[1]
Index(['T3'], dtype='object')
reps 10
[3]
Index(['TSTRI'], dtype='object')
run nr 2 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
reps 1
[3, 5]
Index(['TSTRI', 'TMAX'], dtype='object')
reps 2
[1, 4]
Index(['T3', 'TSH'], dtype='object')
reps 3
[0, 5]
Index(['Class', 'TMAX'], dtype='object')
reps 4
[1, 4]
Index(['T3', 'TSH'], dtype='object')
reps 5
[1, 2]
Index(['T3

In [70]:
# Support-vector permutation trial on real_data6 (10 reps, nr_cols_to_test=9).
plot_data_svm6 = trial_permutatin(
    real_data6,
    categorical_values6,
    continuous_values6,
    cv,
    reps=10,
    nr_cols_to_test=9,
    models=[SVC(), SVR()],
)
# Persist to "svm_6.json"; the raw result is passed through
# convert_numpy_types (defined elsewhere), presumably for JSON compatibility.
with open("svm_6.json", "w") as outfile:
    json.dump(convert_numpy_types(plot_data_svm6), outfile)

run nr 0 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
reps 1
reps 2
reps 3
reps 4
reps 5
reps 6
reps 7
reps 8
reps 9
reps 10
run nr 1 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
reps 1
[12]
Index(['skin'], dtype='object')
reps 2
[4]
Index(['degree-of-diffe'], dtype='object')
reps 3
[1]
Index(['age'], dtype='object')
reps 4
[7]
Index(['lung'], dtype='object')
reps 5
[7]
Index(['lung'], dtype='object')
reps 6
[8]
Index(['pleura'], dtype='object')
reps 7
[2]
Index(['sex'], dtype='object')
reps 8
[10]
Index(['liver'], dtype='object')
reps 9
[7]
Index(['lung'], dtype='object')
reps 10
[14]
Index(['supraclavicular'], dtype='object')
run nr 2 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
reps 1
[12, 13]
Index(['skin', 'neck'], dtype='object')
reps 2
[4, 2]
Index(['degree-of-diffe', 'sex'], dtype='object')
reps 3
[1, 2]
Index(['age', 'sex'], dtype='object')
reps 4
[7, 4]
Index(['lung', '

### LINEAR

In [71]:
# Linear-model permutation trial on real_data (10 reps, default
# nr_cols_to_test): liblinear logistic regression for classification and
# ordinary linear regression for regression.
plot_data_lm = trial_permutatin(
    real_data,
    categorical_values,
    continuous_values,
    cv,
    reps=10,
    models=[LogisticRegression(solver="liblinear"), LinearRegression()],
)
# Author's note: 103 min (presumably the wall-clock runtime of the call above).
# Persist to "lm_1.json"; convert_numpy_types (defined elsewhere) presumably
# makes the result JSON-serializable.
with open("lm_1.json", "w") as outfile:
    json.dump(convert_numpy_types(plot_data_lm), outfile)

run nr 0 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
reps 1
reps 2
reps 3
reps 4
reps 5
reps 6
reps 7
reps 8
reps 9
reps 10
run nr 1 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
reps 1
[13]
Index(['num'], dtype='object')
reps 2
[2]
Index(['cp'], dtype='object')
reps 3
[13]
Index(['num'], dtype='object')
reps 4
[3]
Index(['trestbps'], dtype='object')
reps 5
[3]
Index(['trestbps'], dtype='object')
reps 6
[9]
Index(['oldpeak'], dtype='object')
reps 7
[12]
Index(['thal'], dtype='object')
reps 8
[5]
Index(['fbs'], dtype='object')
reps 9
[3]
Index(['trestbps'], dtype='object')
reps 10
[7]
Index(['thalach'], dtype='object')
run nr 2 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
reps 1
[13, 6]
Index(['num', 'restecg'], dtype='object')
reps 2
[2, 9]
Index(['cp', 'oldpeak'], dtype='object')
reps 3
[13, 0]
Index(['num', 'Age'], dtype='object')
reps 4
[3, 9]
Index(['trestbps', 'oldpeak'], dt

In [72]:
# Linear-model permutation trial on real_data2 (10 reps, nr_cols_to_test=5).
plot_data_lm2 = trial_permutatin(
    real_data2,
    categorical_values2,
    continuous_values2,
    cv,
    reps=10,
    nr_cols_to_test=5,
    models=[LogisticRegression(solver="liblinear"), LinearRegression()],
)
# Persist to "lm_2.json"; convert_numpy_types (defined elsewhere) presumably
# makes the result JSON-serializable.
with open("lm_2.json", "w") as outfile:
    json.dump(convert_numpy_types(plot_data_lm2), outfile)

run nr 0 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
reps 1
reps 2
reps 3
reps 4
reps 5
reps 6
reps 7
reps 8
reps 9
reps 10
run nr 1 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
reps 1
[6]
Index(['Bland_Chromatin'], dtype='object')
reps 2
[2]
Index(['Uniformity_of_Cell_Shape'], dtype='object')
reps 3
[0]
Index(['Clump_Thickness'], dtype='object')
reps 4
[3]
Index(['Marginal_Adhesion'], dtype='object')
reps 5
[3]
Index(['Marginal_Adhesion'], dtype='object')
reps 6
[9]
Index(['Class'], dtype='object')
reps 7
[9]
Index(['Class'], dtype='object')
reps 8
[5]
Index(['Bare_Nuclei'], dtype='object')
reps 9
[3]
Index(['Marginal_Adhesion'], dtype='object')
reps 10
[7]
Index(['Normal_Nucleoli'], dtype='object')
run nr 2 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
reps 1
[6, 9]
Index(['Bland_Chromatin', 'Class'], dtype='object')
reps 2
[2, 1]
Index(['Uniformity_of_Cell_Shape', 'Uniformity_

In [73]:
# Linear-model permutation trial on real_data3 (10 reps, nr_cols_to_test=4).
plot_data_lm3 = trial_permutatin(
    real_data3,
    categorical_values3,
    continuous_values3,
    cv,
    reps=10,
    nr_cols_to_test=4,
    models=[LogisticRegression(solver="liblinear"), LinearRegression()],
)
# Persist to "lm_3.json"; the raw result is passed through
# convert_numpy_types (defined elsewhere), presumably for JSON compatibility.
with open("lm_3.json", "w") as outfile:
    json.dump(convert_numpy_types(plot_data_lm3), outfile)

run nr 0 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
reps 1
reps 2
reps 3
reps 4
reps 5
reps 6
reps 7
reps 8
reps 9
reps 10
run nr 1 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
reps 1
[6]
Index(['Selector'], dtype='object')
reps 2
[1]
Index(['alkphos'], dtype='object')
reps 3
[6]
Index(['Selector'], dtype='object')
reps 4
[1]
Index(['alkphos'], dtype='object')
reps 5
[1]
Index(['alkphos'], dtype='object')
reps 6
[4]
Index(['gammagt'], dtype='object')
reps 7
[6]
Index(['Selector'], dtype='object')
reps 8
[2]
Index(['sgpt'], dtype='object')
reps 9
[1]
Index(['alkphos'], dtype='object')
reps 10
[3]
Index(['sgot'], dtype='object')
run nr 2 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
reps 1
[6, 3]
Index(['Selector', 'sgot'], dtype='object')
reps 2
[1, 4]
Index(['alkphos', 'gammagt'], dtype='object')
reps 3
[6, 0]
Index(['Selector', 'mcv'], dtype='object')
reps 4
[1, 4]
Index(['alkp

In [74]:
# Linear-model permutation trial on real_data4 (10 reps, nr_cols_to_test=4).
plot_data_lm4 = trial_permutatin(
    real_data4,
    categorical_values4,
    continuous_values4,
    cv,
    reps=10,
    nr_cols_to_test=4,
    models=[LogisticRegression(solver="liblinear"), LinearRegression()],
)
# Persist to "lm_4.json"; the raw result is passed through
# convert_numpy_types (defined elsewhere), presumably for JSON compatibility.
with open("lm_4.json", "w") as outfile:
    json.dump(convert_numpy_types(plot_data_lm4), outfile)

run nr 0 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
reps 1
reps 2
reps 3
reps 4
reps 5
reps 6
reps 7
reps 8
reps 9
reps 10
run nr 1 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
reps 1
[3]
Index(['TSTRI'], dtype='object')
reps 2
[1]
Index(['T3'], dtype='object')
reps 3
[0]
Index(['Class'], dtype='object')
reps 4
[1]
Index(['T3'], dtype='object')
reps 5
[1]
Index(['T3'], dtype='object')
reps 6
[4]
Index(['TSH'], dtype='object')
reps 7
[4]
Index(['TSH'], dtype='object')
reps 8
[2]
Index(['TST'], dtype='object')
reps 9
[1]
Index(['T3'], dtype='object')
reps 10
[3]
Index(['TSTRI'], dtype='object')
run nr 2 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
reps 1
[3, 5]
Index(['TSTRI', 'TMAX'], dtype='object')
reps 2
[1, 4]
Index(['T3', 'TSH'], dtype='object')
reps 3
[0, 5]
Index(['Class', 'TMAX'], dtype='object')
reps 4
[1, 4]
Index(['T3', 'TSH'], dtype='object')
reps 5
[1, 2]
Index(['T3

In [75]:
# Linear-model permutation trial on real_data6 (10 reps, nr_cols_to_test=9).
plot_data_lm6 = trial_permutatin(
    real_data6,
    categorical_values6,
    continuous_values6,
    cv,
    reps=10,
    nr_cols_to_test=9,
    models=[LogisticRegression(solver="liblinear"), LinearRegression()],
)
# Persist to "lm_6.json"; the raw result is passed through
# convert_numpy_types (defined elsewhere), presumably for JSON compatibility.
with open("lm_6.json", "w") as outfile:
    json.dump(convert_numpy_types(plot_data_lm6), outfile)

run nr 0 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
reps 1
reps 2
reps 3
reps 4
reps 5
reps 6
reps 7
reps 8
reps 9
reps 10
run nr 1 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
reps 1
[12]
Index(['skin'], dtype='object')
reps 2
[4]
Index(['degree-of-diffe'], dtype='object')
reps 3
[1]
Index(['age'], dtype='object')
reps 4
[7]
Index(['lung'], dtype='object')
reps 5
[7]
Index(['lung'], dtype='object')
reps 6
[8]
Index(['pleura'], dtype='object')
reps 7
[2]
Index(['sex'], dtype='object')
reps 8
[10]
Index(['liver'], dtype='object')
reps 9
[7]
Index(['lung'], dtype='object')
reps 10
[14]
Index(['supraclavicular'], dtype='object')
run nr 2 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
reps 1
[12, 13]
Index(['skin', 'neck'], dtype='object')
reps 2
[4, 2]
Index(['degree-of-diffe', 'sex'], dtype='object')
reps 3
[1, 2]
Index(['age', 'sex'], dtype='object')
reps 4
[7, 4]
Index(['lung', '

### NB

In [76]:

# "NB" permutation trial on real_data (10 reps, default nr_cols_to_test):
# GaussianNB for classification paired with BayesianRidge for regression.
plot_data_nb = trial_permutatin(
    real_data,
    categorical_values,
    continuous_values,
    cv,
    reps=10,
    models=[GaussianNB(), BayesianRidge()],
)
# Author's note: 107 min (presumably the wall-clock runtime of the call above).
# Persist to "nb_1.json"; convert_numpy_types (defined elsewhere) presumably
# makes the result JSON-serializable.
with open("nb_1.json", "w") as outfile:
    json.dump(convert_numpy_types(plot_data_nb), outfile)

run nr 0 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
reps 1
reps 2
reps 3
reps 4
reps 5
reps 6
reps 7
reps 8
reps 9
reps 10
run nr 1 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
reps 1
[13]
Index(['num'], dtype='object')
reps 2
[2]
Index(['cp'], dtype='object')
reps 3
[13]
Index(['num'], dtype='object')
reps 4
[3]
Index(['trestbps'], dtype='object')
reps 5
[3]
Index(['trestbps'], dtype='object')
reps 6
[9]
Index(['oldpeak'], dtype='object')
reps 7
[12]
Index(['thal'], dtype='object')
reps 8
[5]
Index(['fbs'], dtype='object')
reps 9
[3]
Index(['trestbps'], dtype='object')
reps 10
[7]
Index(['thalach'], dtype='object')
run nr 2 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
reps 1
[13, 6]
Index(['num', 'restecg'], dtype='object')
reps 2
[2, 9]
Index(['cp', 'oldpeak'], dtype='object')
reps 3
[13, 0]
Index(['num', 'Age'], dtype='object')
reps 4
[3, 9]
Index(['trestbps', 'oldpeak'], dt

In [77]:
# GaussianNB / BayesianRidge permutation trial on real_data2 (10 reps,
# nr_cols_to_test=5).
plot_data_nb2 = trial_permutatin(
    real_data2,
    categorical_values2,
    continuous_values2,
    cv,
    reps=10,
    nr_cols_to_test=5,
    models=[GaussianNB(), BayesianRidge()],
)
# Persist to "nb_2.json"; convert_numpy_types (defined elsewhere) presumably
# makes the result JSON-serializable.
with open("nb_2.json", "w") as outfile:
    json.dump(convert_numpy_types(plot_data_nb2), outfile)

run nr 0 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
reps 1
reps 2
reps 3
reps 4
reps 5
reps 6
reps 7
reps 8
reps 9
reps 10
run nr 1 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
reps 1
[6]
Index(['Bland_Chromatin'], dtype='object')
reps 2
[2]
Index(['Uniformity_of_Cell_Shape'], dtype='object')
reps 3
[0]
Index(['Clump_Thickness'], dtype='object')
reps 4
[3]
Index(['Marginal_Adhesion'], dtype='object')
reps 5
[3]
Index(['Marginal_Adhesion'], dtype='object')
reps 6
[9]
Index(['Class'], dtype='object')
reps 7
[9]
Index(['Class'], dtype='object')
reps 8
[5]
Index(['Bare_Nuclei'], dtype='object')
reps 9
[3]
Index(['Marginal_Adhesion'], dtype='object')
reps 10
[7]
Index(['Normal_Nucleoli'], dtype='object')
run nr 2 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
reps 1
[6, 9]
Index(['Bland_Chromatin', 'Class'], dtype='object')
reps 2
[2, 1]
Index(['Uniformity_of_Cell_Shape', 'Uniformity_

In [78]:
# GaussianNB / BayesianRidge permutation trial on real_data3 (10 reps,
# nr_cols_to_test=4).
plot_data_nb3 = trial_permutatin(
    real_data3,
    categorical_values3,
    continuous_values3,
    cv,
    reps=10,
    nr_cols_to_test=4,
    models=[GaussianNB(), BayesianRidge()],
)
# Persist to "nb_3.json"; the raw result is passed through
# convert_numpy_types (defined elsewhere), presumably for JSON compatibility.
with open("nb_3.json", "w") as outfile:
    json.dump(convert_numpy_types(plot_data_nb3), outfile)

run nr 0 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
reps 1
reps 2
reps 3
reps 4
reps 5
reps 6
reps 7
reps 8
reps 9
reps 10
run nr 1 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
reps 1
[6]
Index(['Selector'], dtype='object')
reps 2
[1]
Index(['alkphos'], dtype='object')
reps 3
[6]
Index(['Selector'], dtype='object')
reps 4
[1]
Index(['alkphos'], dtype='object')
reps 5
[1]
Index(['alkphos'], dtype='object')
reps 6
[4]
Index(['gammagt'], dtype='object')
reps 7
[6]
Index(['Selector'], dtype='object')
reps 8
[2]
Index(['sgpt'], dtype='object')
reps 9
[1]
Index(['alkphos'], dtype='object')
reps 10
[3]
Index(['sgot'], dtype='object')
run nr 2 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
reps 1
[6, 3]
Index(['Selector', 'sgot'], dtype='object')
reps 2
[1, 4]
Index(['alkphos', 'gammagt'], dtype='object')
reps 3
[6, 0]
Index(['Selector', 'mcv'], dtype='object')
reps 4
[1, 4]
Index(['alkp

In [79]:
# GaussianNB / BayesianRidge permutation trial on real_data4 (10 reps,
# nr_cols_to_test=4).
plot_data_nb4 = trial_permutatin(
    real_data4,
    categorical_values4,
    continuous_values4,
    cv,
    reps=10,
    nr_cols_to_test=4,
    models=[GaussianNB(), BayesianRidge()],
)
# Persist to "nb_4.json"; the raw result is passed through
# convert_numpy_types (defined elsewhere), presumably for JSON compatibility.
with open("nb_4.json", "w") as outfile:
    json.dump(convert_numpy_types(plot_data_nb4), outfile)

run nr 0 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
reps 1
reps 2
reps 3
reps 4
reps 5
reps 6
reps 7
reps 8
reps 9
reps 10
run nr 1 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
reps 1
[3]
Index(['TSTRI'], dtype='object')
reps 2
[1]
Index(['T3'], dtype='object')
reps 3
[0]
Index(['Class'], dtype='object')
reps 4
[1]
Index(['T3'], dtype='object')
reps 5
[1]
Index(['T3'], dtype='object')
reps 6
[4]
Index(['TSH'], dtype='object')
reps 7
[4]
Index(['TSH'], dtype='object')
reps 8
[2]
Index(['TST'], dtype='object')
reps 9
[1]
Index(['T3'], dtype='object')
reps 10
[3]
Index(['TSTRI'], dtype='object')
run nr 2 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
reps 1
[3, 5]
Index(['TSTRI', 'TMAX'], dtype='object')
reps 2
[1, 4]
Index(['T3', 'TSH'], dtype='object')
reps 3
[0, 5]
Index(['Class', 'TMAX'], dtype='object')
reps 4
[1, 4]
Index(['T3', 'TSH'], dtype='object')
reps 5
[1, 2]
Index(['T3

In [80]:
# Permutation trial for dataset 6 using the GaussianNB / BayesianRidge model
# pair; the outcome is stored in nb_6.json after being passed through
# convert_numpy_types.
plot_data_nb6 = trial_permutatin(
    real_data6,
    categorical_values6,
    continuous_values6,
    cv,
    reps=10,
    nr_cols_to_test=9,
    models=[GaussianNB(), BayesianRidge()],
)
with open("nb_6.json", "w") as json_file:
    json.dump(convert_numpy_types(plot_data_nb6), json_file)

run nr 0 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
reps 1
reps 2
reps 3
reps 4
reps 5
reps 6
reps 7
reps 8
reps 9
reps 10
run nr 1 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
reps 1
[12]
Index(['skin'], dtype='object')
reps 2
[4]
Index(['degree-of-diffe'], dtype='object')
reps 3
[1]
Index(['age'], dtype='object')
reps 4
[7]
Index(['lung'], dtype='object')
reps 5
[7]
Index(['lung'], dtype='object')
reps 6
[8]
Index(['pleura'], dtype='object')
reps 7
[2]
Index(['sex'], dtype='object')
reps 8
[10]
Index(['liver'], dtype='object')
reps 9
[7]
Index(['lung'], dtype='object')
reps 10
[14]
Index(['supraclavicular'], dtype='object')
run nr 2 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
reps 1
[12, 13]
Index(['skin', 'neck'], dtype='object')
reps 2
[4, 2]
Index(['degree-of-diffe', 'sex'], dtype='object')
reps 3
[1, 2]
Index(['age', 'sex'], dtype='object')
reps 4
[7, 4]
Index(['lung', '

## Synth Data Test synthpop

In [33]:

import os

# Compare every real dataset with its synthpop-generated counterpart via
# test_two_datasets (10 repetitions, seed 42) and pickle the five results.

# Dataset 1
synth_data1 = pd.read_csv(
    "synth_pop_1.csv", index_col=0, dtype=float
).reset_index(drop=True)
result1 = test_two_datasets(
    real_data, synth_data1, categorical_values, continuous_values,
    reps=10, seed=42,
)

# Dataset 2 -- the synthetic file carries an extra "id" column that is
# dropped before the comparison.
synth_data2 = pd.read_csv(
    "synth_pop_2.csv", index_col=0, dtype=float
).reset_index(drop=True)
synth_data2.drop(columns=["id"], inplace=True)
result2 = test_two_datasets(
    real_data2, synth_data2, categorical_values2, continuous_values2,
    reps=10, seed=42,
)

# Dataset 3
synth_data3 = pd.read_csv(
    "synth_pop_3.csv", index_col=0, dtype=float
).reset_index(drop=True)
result3 = test_two_datasets(
    real_data3, synth_data3, categorical_values3, continuous_values3,
    reps=10, seed=42,
)

# Dataset 4
synth_data4 = pd.read_csv(
    "synth_pop_4.csv", index_col=0, dtype=float
).reset_index(drop=True)
result4 = test_two_datasets(
    real_data4, synth_data4, categorical_values4, continuous_values4,
    reps=10, seed=42,
)

# Dataset 6 -- read as strings rather than floats.
synth_data6 = pd.read_csv(
    "synth_pop_6_2.csv", index_col=0, dtype=str
).reset_index(drop=True)
# NOTE(review): this rebinds the notebook-level continuous_values6 to a single
# empty-string placeholder, so every cell executed afterwards that passes
# continuous_values6 sees this value. Presumably it means "no continuous
# columns" -- confirm.
continuous_values6 = [""]
# Overwrite the synthetic column labels with the real dataset's labels
# (both frames must have the same number of columns).
synth_data6.columns = real_data6.columns
result6 = test_two_datasets(
    real_data6, synth_data6, categorical_values6, continuous_values6,
    reps=10, seed=42,
)

synth_result = [result1, result2, result3, result4, result6]

# Make sure the output directory exists: without it, open() raises
# FileNotFoundError only after all five comparisons have already been run.
os.makedirs("results", exist_ok=True)
with open("results/synth_result.pkl", "wb") as f:
    pickle.dump(synth_result, f)

## Synth Data Test CTGAN

In [87]:
import os


def _impute_missing(frame, categorical_cols, continuous_cols=None):
    """Fill NaN cells of ``frame`` in place and return the same frame.

    Columns in ``continuous_cols`` are imputed with the column mean and columns
    in ``categorical_cols`` with the most frequent value, both through
    ``SimpleImputer``. A column group that is ``None`` or empty is skipped.

    Both groups are imputed from the untouched frame before anything is
    written back, and the imputed values are aligned on ``frame.index``.
    """
    plans = ((continuous_cols, "mean"), (categorical_cols, "most_frequent"))
    imputed = []
    for cols, strategy in plans:
        if cols is None or len(cols) == 0:
            continue
        imputer = SimpleImputer(missing_values=np.nan, strategy=strategy)
        imputed.append((cols, imputer.fit_transform(frame[cols])))
    for cols, values in imputed:
        frame[cols] = pd.DataFrame(values, columns=cols, index=frame.index)
    return frame


# Compare every real dataset with its CTGAN-generated counterpart via
# test_two_datasets (10 repetitions, seed 42) and pickle the five results.

# Dataset 1 -- the file is parsed with dtype=float, so a "?" placeholder has to
# be turned into NaN at parse time; a replace("?") after the read could never
# see it because the float conversion would already have raised.
synth_data1_ctgan = pd.read_csv(
    "synth1_ctgan.csv", index_col=0, dtype=float, na_values="?"
).reset_index(drop=True)
_impute_missing(synth_data1_ctgan, categorical_values, continuous_values)
result = test_two_datasets(
    real_data, synth_data1_ctgan, categorical_values, continuous_values,
    reps=10, seed=42,
)

# Dataset 2 -- read with inferred dtypes, "?" mapped to NaN, imputed, and only
# then cast to float.
synth_data2_ctgan = pd.read_csv(
    "synth2_ctgan.csv", index_col=0
).reset_index(drop=True)
synth_data2_ctgan.replace("?", np.nan, inplace=True)
_impute_missing(synth_data2_ctgan, categorical_values2, continuous_values2)
synth_data2_ctgan = synth_data2_ctgan.astype(float)
result2 = test_two_datasets(
    real_data2, synth_data2_ctgan, categorical_values2, continuous_values2,
    reps=10, seed=42,
)

# Dataset 3
synth_data3_ctgan = pd.read_csv(
    "synth3_ctgan.csv", index_col=0, dtype=float
).reset_index(drop=True)
result3 = test_two_datasets(
    real_data3, synth_data3_ctgan, categorical_values3, continuous_values3,
    reps=10, seed=42,
)

# Dataset 4
synth_data4_ctgan = pd.read_csv(
    "synth4_ctgan.csv", index_col=0, dtype=float
).reset_index(drop=True)
result4 = test_two_datasets(
    real_data4, synth_data4_ctgan, categorical_values4, continuous_values4,
    reps=10, seed=42,
)

# Dataset 6 -- read as strings; only the categorical columns are imputed
# (mean imputation of continuous_values6 is deliberately not applied here).
synth_data6_ctgan = pd.read_csv(
    "synth6_ctgan2.csv", index_col=0, dtype=str
).reset_index(drop=True)
synth_data6_ctgan.replace("?", np.nan, inplace=True)
_impute_missing(synth_data6_ctgan, categorical_values6)
result6 = test_two_datasets(
    real_data6, synth_data6_ctgan, categorical_values6, continuous_values6,
    reps=10, seed=42,
)

synth_result = [result, result2, result3, result4, result6]

# Make sure the output directory exists: without it, open() raises
# FileNotFoundError only after all five comparisons have already been run.
os.makedirs("results", exist_ok=True)
with open("results/synth_result_Ctgan.pkl", "wb") as f:
    pickle.dump(synth_result, f)

## Synth Data Test Gaussian

In [88]:
import os


def _impute_missing(frame, categorical_cols, continuous_cols=None):
    """Fill NaN cells of ``frame`` in place and return the same frame.

    Columns in ``continuous_cols`` are imputed with the column mean and columns
    in ``categorical_cols`` with the most frequent value, both through
    ``SimpleImputer``. A column group that is ``None`` or empty is skipped.

    Both groups are imputed from the untouched frame before anything is
    written back, and the imputed values are aligned on ``frame.index``.
    """
    plans = ((continuous_cols, "mean"), (categorical_cols, "most_frequent"))
    imputed = []
    for cols, strategy in plans:
        if cols is None or len(cols) == 0:
            continue
        imputer = SimpleImputer(missing_values=np.nan, strategy=strategy)
        imputed.append((cols, imputer.fit_transform(frame[cols])))
    for cols, values in imputed:
        frame[cols] = pd.DataFrame(values, columns=cols, index=frame.index)
    return frame


# Compare every real dataset with its "gaussian" synthetic counterpart via
# test_two_datasets (10 repetitions, seed 42) and pickle the five results.

# Dataset 1 -- the file is parsed with dtype=float, so a "?" placeholder has to
# be turned into NaN at parse time; a replace("?") after the read could never
# see it because the float conversion would already have raised.
synth_data1_gaussian = pd.read_csv(
    "synth1_gaussian.csv", index_col=0, dtype=float, na_values="?"
).reset_index(drop=True)
_impute_missing(synth_data1_gaussian, categorical_values, continuous_values)
result = test_two_datasets(
    real_data, synth_data1_gaussian, categorical_values, continuous_values,
    reps=10, seed=42,
)

# Dataset 2 -- read with inferred dtypes, "?" mapped to NaN, imputed, and only
# then cast to float.
synth_data2_gaussian = pd.read_csv(
    "synth2_gaussian.csv", index_col=0
).reset_index(drop=True)
synth_data2_gaussian.replace("?", np.nan, inplace=True)
_impute_missing(synth_data2_gaussian, categorical_values2, continuous_values2)
synth_data2_gaussian = synth_data2_gaussian.astype(float)
result2 = test_two_datasets(
    real_data2, synth_data2_gaussian, categorical_values2, continuous_values2,
    reps=10, seed=42,
)

# Dataset 3
synth_data3_gaussian = pd.read_csv(
    "synth3_gaussian.csv", index_col=0, dtype=float
).reset_index(drop=True)
result3 = test_two_datasets(
    real_data3, synth_data3_gaussian, categorical_values3, continuous_values3,
    reps=10, seed=42,
)

# Dataset 4
synth_data4_gaussian = pd.read_csv(
    "synth4_gaussian.csv", index_col=0, dtype=float
).reset_index(drop=True)
result4 = test_two_datasets(
    real_data4, synth_data4_gaussian, categorical_values4, continuous_values4,
    reps=10, seed=42,
)

# Dataset 6 -- read as strings; only the categorical columns are imputed
# (mean imputation of continuous_values6 is deliberately not applied here).
synth_data6_gaussian = pd.read_csv(
    "synth6_gaussian2.csv", index_col=0, dtype=str
).reset_index(drop=True)
synth_data6_gaussian.replace("?", np.nan, inplace=True)
_impute_missing(synth_data6_gaussian, categorical_values6)
result6 = test_two_datasets(
    real_data6, synth_data6_gaussian, categorical_values6, continuous_values6,
    reps=10, seed=42,
)

synth_result = [result, result2, result3, result4, result6]

# Make sure the output directory exists: without it, open() raises
# FileNotFoundError only after all five comparisons have already been run.
os.makedirs("results", exist_ok=True)
with open("results/synth_result_gaussian.pkl", "wb") as f:
    pickle.dump(synth_result, f)

  result[r_cols[i]] = [real_synth / real_real,real_real,real_synth]
