# redo sept 2024


In [9]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
import json
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.metrics import (
    mean_squared_error,
    r2_score,
    ndcg_score,
    cohen_kappa_score,
)
from sklearn.base import clone

import rbo
from sklearn.inspection import permutation_importance

# import shap
import itertools
import scipy.stats as st
import random
from textdistance import (
    levenshtein,
    damerau_levenshtein,
    jaro_winkler,
    hamming,
)
from sklearn.model_selection import train_test_split
import plotly.express as px
from sklearn import preprocessing
from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score


In [10]:
# Alternative synthetic population, currently disabled:
# synth_data=pd.read_csv("synth_pop_2.csv",index_col=0,dtype=float).reset_index(drop=True)

# Real dataset: the first CSV column is read as the index and then dropped in
# favour of a fresh 0..n-1 index; every value is parsed as float.
real_data = pd.read_csv("real_data_testing.csv", index_col=0, dtype=float)
real_data = real_data.reset_index(drop=True)

# Variables
# Columns handled as continuous.
continuous_values = ["Age", "trestbps", "chol", "thalach", "oldpeak"]
# Not referenced by the code visible in this file; presumably the markers for
# missing values in the raw data -- TODO confirm.
NA_REPLACE = ["?"]

In [11]:
# Every column of real_data that is not listed in continuous_values is treated
# as categorical.
# NOTE: Index.difference sorts its result by default, so the order of these
# labels may differ from the column order in the CSV.
categorical_values = real_data[real_data.columns.difference(continuous_values)].columns

In [12]:
# Synthetic population #1, loaded the same way as the real data: the first CSV
# column is read as the index and then replaced by a fresh 0..n-1 index, and
# every value is parsed as float.
synth_data1 = pd.read_csv("synth_pop_1.csv", index_col=0, dtype=float)
synth_data1 = synth_data1.reset_index(drop=True)

In [13]:
def get_several_feat_imp_dataset_2(
    data,
    categorical_cols,
    int_cols,
    rep=5,
    seed=42,
    test_size=0.05,
    models=None,
):
    """Collect repeated feature-importance estimates for every column of ``data``.

    Each column is used in turn as the prediction target, with all remaining
    columns as features. For every target:

    1. a fresh clone of the matching model is created for each repetition,
    2. it is fit on a random subsample of the rows,
    3. the importance of every feature is recorded.

    The per-repetition values are returned as-is; averaging is left to the
    caller.

    Args:
        data: DataFrame whose columns are used both as targets and as features.
        categorical_cols: Container of column names. Targets found in it are
            fit with ``models[0]``; every other target is fit with
            ``models[1]``.
        int_cols: Unused; kept so the signature stays compatible.
        rep: Number of repetitions (fits) per target column.
        seed: Seed applied to both ``random`` and the global NumPy RNG. This
            is a deliberate global side effect that makes runs repeatable.
        test_size: Fraction of rows held out by ``train_test_split``. The
            held-out rows are never used; the split only resamples the
            training rows on each repetition.
        models: Two-item sequence ``(model for categorical targets, model for
            other targets)`` of unfitted estimators. They are cloned before
            use, so the passed objects are never fitted. Defaults to
            ``[DecisionTreeClassifier(), DecisionTreeRegressor()]``.

    Returns:
        Nested dict of the form
        ``{target: {feature: [value_rep1, value_rep2, ...]}}``. Every target
        column has an entry, even when ``rep`` is 0 (the lists are then empty).
    """
    if models is None:
        # Built per call instead of as a shared mutable default argument.
        models = [DecisionTreeClassifier(), DecisionTreeRegressor()]

    np.random.seed(seed)
    random.seed(seed)

    result = {}
    for target in data.columns:
        X = data.drop(target, axis=1)
        y = data[target]
        template = models[0] if target in categorical_cols else models[1]

        importances = {name: [] for name in X.columns}
        result[target] = importances

        for _ in range(rep):
            # Draw order matters for reproducibility: the split seed comes
            # from `random`, the model seed from the global NumPy RNG.
            split_seed = random.randint(0, 100)

            model = clone(template)
            if "random_state" in model.get_params():
                model.set_params(random_state=np.random.randint(1, 20))

            # Only the training part is used; the split acts as a resampler.
            X_train, _, y_train, _ = train_test_split(
                X, y, test_size=test_size, random_state=split_seed
            )
            model.fit(X_train, y_train)

            if hasattr(model, "feature_importances_"):
                values = model.feature_importances_
            else:
                # Model-agnostic fallback for estimators that expose no
                # built-in importances.
                values = permutation_importance(
                    model,
                    X_train,
                    y_train,
                    n_repeats=15,
                    random_state=split_seed,
                    n_jobs=-2,
                ).importances_mean

            # Importances follow the order of the training columns.
            for name, value in zip(X_train.columns, values):
                importances[name].append(value)

    return result

In [14]:
def create_scores_v2(result1, result2):
    """Compare two feature-importance results with several similarity metrics.

    Does not work for more than two datasets.

    Background on comparing ranked lists:
    https://towardsdatascience.com/rbo-v-s-kendall-tau-to-compare-ranked-lists-of-items-8776c5182899
    https://stats.stackexchange.com/questions/51295/comparison-of-ranked-lists
    https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.weightedtau.html

    For every target (top-level key of ``result1``) the per-feature importance
    lists of both inputs are averaged and turned into ordinal ranks, where
    rank 1 is the largest mean importance. The two rankings and the two
    vectors of mean importances are then scored against each other.

    Args:
        result1: ``{target: {feature: [importance, ...]}}``; treated as the
            reference ("true") side.
        result2: Same structure and keys as ``result1``; treated as the
            "model" side.

    Returns:
        Dict with one entry per target holding ``results`` (metric name ->
        value), the mean importances (``true_score``, ``model_score``), their
        ranks (``true_score_rank``, ``model_score_rank``) and the ranks
        concatenated into strings (``*_rank_join``), plus an ``"aggregated"``
        entry with the mean of every metric over all targets. ``"aggregated"``
        is an empty dict when ``result1`` has no targets.

    Raises:
        KeyError: If ``result2`` lacks a target or feature present in
            ``result1``.
    """
    targets = list(result1)
    scores_ = {}

    for target in targets:
        # The features of a target are all the other targets.
        features = [name for name in targets if name != target]

        mean1 = {k: np.mean(v) for k, v in result1[target].items()}
        mean2 = {k: np.mean(v) for k, v in result2[target].items()}

        # Negate so that the largest importance gets rank 1. "ordinal" gives
        # tied values distinct ranks in order of appearance, so no jitter is
        # needed to break ties. The former zero-jitter on the second ranking
        # multiplied the zero value itself and therefore never changed
        # anything except the global NumPy RNG state.
        rank1 = dict(
            zip(mean1, st.rankdata([-v for v in mean1.values()], method="ordinal"))
        )
        rank2 = dict(
            zip(mean2, st.rankdata([-v for v in mean2.values()], method="ordinal"))
        )

        true_score = [mean1[name] for name in features]
        model_score = [mean2[name] for name in features]
        true_score_rank = [rank1[name] for name in features]
        model_score_rank = [rank2[name] for name in features]

        # Ranks are concatenated without a separator.
        true_score_rank_join = "".join(str(int(e)) for e in true_score_rank)
        model_score_rank_join = "".join(str(int(e)) for e in model_score_rank)

        sc = {}
        sc["ndgc_score"] = ndcg_score([true_score_rank], [model_score_rank])
        sc["cohen_kappa_score"] = cohen_kappa_score(true_score_rank, model_score_rank)
        sc["r2_score"] = r2_score(true_score, model_score)
        sc["levenshtein_normalized_similarity"] = levenshtein.normalized_similarity(
            true_score_rank, model_score_rank
        )
        sc["kendalltau"] = st.kendalltau(true_score_rank, model_score_rank)[0]
        sc["weightedtau"] = st.weightedtau(true_score_rank, model_score_rank)[0]
        sc["rbo"] = rbo.RankingSimilarity(true_score_rank, model_score_rank).rbo()
        sc["damerau_levenshtein_normalized_similarity"] = (
            damerau_levenshtein.normalized_similarity(true_score_rank, model_score_rank)
        )
        sc["jaro_winkler_normalized_similarity"] = jaro_winkler.normalized_similarity(
            true_score_rank, model_score_rank
        )
        sc["hamming_normalized_similarity"] = hamming.normalized_similarity(
            true_score_rank, model_score_rank
        )

        scores_[target] = {
            "results": sc,
            "true_score": true_score,
            "model_score": model_score,
            "true_score_rank": true_score_rank,
            "model_score_rank": model_score_rank,
            "true_score_rank_join": true_score_rank_join,
            "model_score_rank_join": model_score_rank_join,
        }

    # Aggregated scores: mean of every metric over all targets, computed from
    # one frame built in a single step.
    per_target = [entry["results"] for entry in scores_.values()]
    if per_target:
        aggregated = pd.DataFrame(per_target).mean().to_dict()
    else:
        aggregated = {}

    scores_["aggregated"] = aggregated
    return scores_

In [15]:
# for comparison
def get_several_dif_dataset(
    data1_,
    data2_,
    categorical_cols,
    int_cols,
    cv,
    models=None,
):
    """Score how well models trained on ``data1_`` transfer to ``data2_``.

    This is the gold standard as of now.

    1. Ordinal-encode the categorical columns of both datasets with an encoder
       fit on ``data1_`` only.
    2. For every column of ``data1_``:
       2.1 train a model on an 80% split of ``data1_`` to predict that column
           from all the others,
       2.2 evaluate it on the held-out 20% of ``data1_`` and on all of
           ``data2_``.

    Categorical targets use ``models[0]`` with accuracy; all other targets use
    ``models[1]`` with mean squared error. Because the two metrics point in
    opposite directions (higher accuracy is better, lower MSE is better), the
    ratio has the opposite meaning for the two kinds of column.

    Args:
        data1_: DataFrame used for training and for the held-out evaluation.
            Not modified.
        data2_: DataFrame with the same columns, used only for evaluation.
            Not modified.
        categorical_cols: Names of the categorical columns.
        int_cols: Unused; kept so the signature stays compatible.
        cv: Unused; kept so the signature stays compatible.
        models: Two-item sequence ``(classifier class, regressor class)``.
            The classifier is built with ``random_state=42``, the regressor
            with no arguments. Defaults to
            ``(DecisionTreeClassifier, LinearRegression)``.

    Returns:
        Dict mapping each column to ``metric(data2) / metric(data1 held-out)``
        as a float. When the denominator is 0 the value is ``inf`` (or ``nan``
        if the numerator is 0 as well) instead of raising
        ``ZeroDivisionError``.

    Raises:
        ValueError: Raised by the encoder when ``data2_`` contains a category
            that does not occur in ``data1_``.
    """
    if models is None:
        # Resolved per call instead of using a mutable default argument.
        models = (DecisionTreeClassifier, LinearRegression)
    classifier_cls, regressor_cls = models[0], models[1]

    data1 = data1_.copy()
    data2 = data2_.copy()

    # The encoder is fit on the first dataset only, so both datasets share
    # one category -> code mapping.
    encoder = preprocessing.OrdinalEncoder()
    data1[categorical_cols] = encoder.fit_transform(
        data1_[categorical_cols].astype(str)
    )
    data2[categorical_cols] = encoder.transform(data2[categorical_cols].astype(str))

    result = {}
    for target in data1.columns:
        if target in categorical_cols:
            model = classifier_cls(random_state=42)
            metric = accuracy_score
        else:
            model = regressor_cls()
            metric = mean_squared_error

        X1 = data1.drop(target, axis=1)
        y1 = data1[target]
        X2 = data2.drop(target, axis=1)
        y2 = data2[target]

        X_train1, X_test1, y_train1, y_test1 = train_test_split(
            X1, y1, test_size=0.2, random_state=42
        )
        model.fit(X_train1, y_train1)

        real_real = metric(y_test1, model.predict(X_test1))
        real_synth = metric(y2, model.predict(X2))

        # Divide as NumPy scalars so a zero held-out score (zero accuracy or a
        # perfect MSE) gives inf/nan rather than a ZeroDivisionError.
        with np.errstate(divide="ignore", invalid="ignore"):
            ratio = np.float64(real_synth) / np.float64(real_real)
        result[target] = float(ratio)

    return result


In [16]:
def aggregate_data_cross(
    real_data, synth_data, categorical_values, continuous_values, cv
):
    """Run ``get_several_dif_dataset`` in both directions and combine the results.

    The comparison is first computed with ``real_data`` as the training set
    and ``synth_data`` as the evaluation set, then with the roles swapped.

    Args:
        real_data: First dataset.
        synth_data: Second dataset.
        categorical_values: Categorical column names, passed through.
        continuous_values: Continuous column names, passed through.
        cv: Passed through unchanged to ``get_several_dif_dataset``.

    Returns:
        Dict with three entries:
        ``"real_synth"`` -- per-column result of the real -> synth run,
        ``"synth_real"`` -- per-column result of the synth -> real run,
        ``"aggregated"`` -- list holding, for each column of the synth -> real
        result and in that order, its value divided by the real -> synth value
        of the same column.
    """
    forward = get_several_dif_dataset(
        real_data, synth_data, categorical_values, continuous_values, cv
    )
    backward = get_several_dif_dataset(
        synth_data, real_data, categorical_values, continuous_values, cv
    )

    # np.mean turns each per-column value into a NumPy scalar before the
    # division.
    forward_score = {column: np.mean(value) for column, value in forward.items()}
    backward_score = {column: np.mean(value) for column, value in backward.items()}

    ratios = [backward_score[column] / forward_score[column] for column in backward_score]

    return {
        "real_synth": forward,
        "synth_real": backward,
        "aggregated": ratios,
    }

In [17]:
def test_two_datasets(
    data, data_1, categorical_values, continuous_values, reps=10, seed=42
):
    """Compare two datasets by feature importance and by cross-dataset scores.

    Steps:
    1. Compute repeated feature importances for ``data``.
    2. Compute repeated feature importances for ``data_1`` with the same seed.
    3. Score the two importance results against each other
       (``create_scores_v2``; ``data`` is the reference side).
    4. Add the cross-dataset result of ``aggregate_data_cross`` under the
       ``"cross"`` key.

    NOTE(review): ``aggregate_data_cross`` is called with ``data_1`` as its
    ``real_data`` argument and ``data`` as ``synth_data``, i.e. in the
    opposite order to step 3 -- confirm this is intended. Its ``cv`` argument
    is fixed to 10.

    Args:
        data: First dataset.
        data_1: Second dataset.
        categorical_values: Categorical column names.
        continuous_values: Continuous column names.
        reps: Number of repetitions for the feature-importance estimation.
        seed: Seed used for both feature-importance runs.

    Returns:
        The dict produced by ``create_scores_v2`` with the extra ``"cross"``
        entry.
    """
    importance_runs = [
        get_several_feat_imp_dataset_2(
            frame, categorical_values, continuous_values, rep=reps, seed=seed
        )
        for frame in (data, data_1)
    ]
    scores = create_scores_v2(*importance_runs)

    cross_scores = aggregate_data_cross(
        data_1, data, categorical_values, continuous_values, 10
    )
    scores["cross"] = cross_scores

    return scores