In [None]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sksurv.svm import FastSurvivalSVM
import seaborn as sns
import matplotlib.pyplot as plt
import joblib


def map_to_scikit_surv(y):
    """Turn a two-column frame into a NumPy record array for scikit-survival.

    Each row of ``y`` is read positionally: the first value is coerced to
    ``bool`` and the second value is passed through unchanged. The rows are
    rebuilt as a DataFrame with default integer column labels and returned via
    ``DataFrame.to_records(index=False)``.

    Callers in this notebook pass the columns in ``['event', 'tte']`` order.
    """
    rows = y.values.tolist()
    # Only the first entry of every row is converted; the second keeps its value.
    pairs = [[bool(row[0]), row[1]] for row in rows]
    return pd.DataFrame(pairs).to_records(index=False)


def create_feature_importance(classifier, feature_names: list):
    """Tabulate a fitted model's coefficients, largest first.

    ``classifier.coef_`` is flattened and paired position by position with
    ``feature_names``. The returned frame has the columns ``Feature`` and
    ``Coefficient`` and is sorted by ``Coefficient`` in descending order; its
    index keeps each feature's original position.
    """
    flat_coefficients = classifier.coef_.ravel()
    table = pd.DataFrame({"Feature": feature_names, "Coefficient": flat_coefficients})
    ranked = table.sort_values(by="Coefficient", ascending=False)
    return ranked


# Shape label per dataset; the string is appended to the dataset name in plot
# titles and in the coefficient tables (presumably rows x columns — confirm).
dataset_shapes = dict(
    brca='198x84',
    gbsg2='686x11',
    whas500='500x16',
    # microbiome='150x1995',
)

In [None]:
# Passed as `alpha` to every FastSurvivalSVM fitted in this notebook.
alpha = 0.0001

dataset_names = ['brca', 'gbsg2', 'whas500',
                 # 'microbiome'
                 ]
n_clients = [1, 3, 5]
n_splits = range(1, 10)

result_columns = ["dataset", "n_clients", "client", "split", "c_index"]
# One dict per fitted model. The frame is built once after the sweep instead of
# being grown with pd.concat inside the loop: repeated concat copies the whole
# frame every iteration, leaves every column as object dtype and warns about
# concatenating an empty frame on recent pandas versions.
result_rows = []

for dataset_name in dataset_names:
    print(f'Dataset {dataset_name}')
    for n_client in n_clients:
        clients = f'{n_client}_clients'
        for client_n in range(1, n_client + 1):
            client = f'client_{client_n}'
            for n_split in n_splits:
                path = f'federated-analysis/{dataset_name}/{clients}/{client}/data/split_{n_split}'
                train_data = pd.read_csv(f'{path}/train.csv')
                test_data = pd.read_csv(f'{path}/test.csv')
                pipe = Pipeline([('scaler', StandardScaler()), (
                    'ssvm', FastSurvivalSVM(rank_ratio=0, alpha=alpha, max_iter=50, fit_intercept=True, tol=0.00001))])
                # Every column except 'tte' and 'event' is used as a feature.
                pipe.fit(train_data.drop(['tte', 'event'], axis=1),
                         map_to_scikit_surv(train_data.loc[:, ['event', 'tte']]))
                try:
                    c_index = pipe.score(test_data.drop(['tte', 'event'], axis=1),
                                         map_to_scikit_surv(test_data.loc[:, ['event', 'tte']]))
                except ValueError:
                    # Scoring failed for this split; record it as missing
                    # rather than aborting the whole sweep.
                    c_index = np.nan
                result_rows.append({
                    "dataset": dataset_name,
                    "n_clients": n_client,
                    "client": client_n,
                    "split": n_split,
                    "c_index": c_index,
                })

# `columns=` keeps the expected layout even if no row was collected.
result_df = pd.DataFrame(result_rows, columns=result_columns)

result_df

In [None]:
# Collect the 'mean c-index' entries of the federated evaluation output for
# every client count and dataset; the rows are tagged client == 'fed'.
c_index_columns = ['dataset', 'n_clients', 'client', 'split', 'c_index']
fed_frames = []

for n_client in [3, 5]:
    clients = f'{n_client}_clients'
    for dataset_name in dataset_names:
        print(dataset_name)
        score_frames = []
        try:
            for client_n in range(1, n_client + 1):
                client = f'client_{client_n}'
                for split in n_splits:
                    path = f'federated-analysis/{dataset_name}/{clients}/fc_normalization/fc_survival_svm/fc_survival_evaluation/{client}/data/split_{split}/global_scores.tsv'
                    # The file is read without a header and with its first column as
                    # index; transposing turns those index labels into column names.
                    scores = pd.read_csv(path, sep='\t', header=None, index_col=0).T
                    # Remember which split the row came from. The previous version
                    # labelled rows with their running position across all clients
                    # and splits (1..n_client * len(n_splits)) and called that 'split'.
                    scores['split'] = split
                    score_frames.append(scores)
        except FileNotFoundError as err:
            # Still best-effort: a combination with a missing file is skipped as a
            # whole, but it is now reported instead of being dropped silently.
            print(f'  skipped {dataset_name} ({clients}): {err}')
            continue

        scores = pd.concat(score_frames, ignore_index=True)
        fed_frames.append(pd.DataFrame({
            'dataset': dataset_name,
            'n_clients': n_client,
            'client': 'fed',
            'split': scores['split'],
            'c_index': scores['mean c-index'],
        }))

if fed_frames:
    c_indices = pd.concat(fed_frames)
else:
    # pd.concat raises on an empty list; keep an empty table with the expected
    # columns so that downstream cells still run.
    c_indices = pd.DataFrame(columns=c_index_columns)
display(c_indices)

In [None]:
# Collect the 'mean c-index' entries for the secure-aggregation runs; the rows
# are tagged client == 'fed + smpc'.
# NOTE(review): the path below has no secure-aggregation-specific component and
# is identical to the one read for the 'fed' rows, so both tables would hold the
# same scores — confirm the intended input directory.
c_index_smpc_columns = ['dataset', 'n_clients', 'client', 'split', 'c_index']
smpc_frames = []

for n_client in [3, 5]:
    clients = f'{n_client}_clients'
    for dataset_name in dataset_names:
        score_frames = []
        try:
            for client_n in range(1, n_client + 1):
                client = f'client_{client_n}'
                for split in n_splits:
                    path = f'federated-analysis/{dataset_name}/{clients}/fc_normalization/fc_survival_svm/fc_survival_evaluation/{client}/data/split_{split}/global_scores.tsv'
                    # The file is read without a header and with its first column as
                    # index; transposing turns those index labels into column names.
                    scores = pd.read_csv(path, sep='\t', header=None, index_col=0).T
                    # Remember which split the row came from. The previous version
                    # labelled rows with their running position across all clients
                    # and splits (1..n_client * len(n_splits)) and called that 'split'.
                    scores['split'] = split
                    score_frames.append(scores)
        except FileNotFoundError as err:
            # Still best-effort: a combination with a missing file is skipped as a
            # whole, but it is now reported instead of being dropped silently.
            print(f'skipped {dataset_name} ({clients}): {err}')
            continue

        scores = pd.concat(score_frames, ignore_index=True)
        smpc_frames.append(pd.DataFrame({
            'dataset': dataset_name,
            'n_clients': n_client,
            'client': 'fed + smpc',
            'split': scores['split'],
            'c_index': scores['mean c-index'],
        }))

if smpc_frames:
    c_indices_smpc = pd.concat(smpc_frames)
else:
    # pd.concat raises on an empty list; keep an empty table with the expected
    # columns so that downstream cells still run.
    c_indices_smpc = pd.DataFrame(columns=c_index_smpc_columns)
display(c_indices_smpc)

In [None]:
# NOTE(review): per the seaborn documentation set_theme() re-applies its own
# default style, so the "whitegrid" request on the line before it does not
# survive, and despine() runs before the figure below exists. The three calls
# are kept unchanged so the rendered figure stays the same — confirm the
# intended look before touching them.
sns.set_style("whitegrid")
sns.set_theme('paper')
sns.despine()

# One panel per dataset, filled row by row; panels beyond len(dataset_names)
# stay empty.
fig, axes = plt.subplots(2, 2, sharex='all', sharey='all', figsize=(8, 6), dpi=350)

# Hue levels in the order of the legend labels passed to fig.legend() below.
# 'fed' used to come first, which drew the federated scores in green and the
# secure-aggregation scores in orange while the legend named them the other way
# round. This order also matches the coefficient figure (federated = orange,
# federated + secure aggregation = green).
hue_order = ['fed + smpc', 'fed', '1', '2', '3', '4', '5']
palette = ['tab:green', 'tab:orange'] + ['tab:blue'] * 5

for position, dataset_name in enumerate(dataset_names):
    row, col = divmod(position, 2)
    ax = axes[row, col]
    ax.set_title(dataset_name + " " + dataset_shapes[dataset_name])

    local_scores = result_df[result_df['dataset'] == dataset_name]
    fed_scores = c_indices[c_indices['dataset'] == dataset_name]
    smpc_scores = c_indices_smpc[c_indices_smpc['dataset'] == dataset_name]
    plot_data = pd.concat([smpc_scores, fed_scores, local_scores]).reset_index()
    # Sort before the string conversion below so the x categories appear in
    # the order of the sorted n_clients values.
    plot_data = plot_data.sort_values(['n_clients', 'client'])
    plot_data['c_index'] = plot_data['c_index'].astype(float)
    plot_data['n_clients'] = plot_data['n_clients'].astype(str)
    plot_data['client'] = plot_data['client'].astype(str)

    sns.boxplot(data=plot_data, y='c_index', x='n_clients', hue='client', ax=ax, hue_order=hue_order, palette=palette, boxprops=dict(alpha=.5))
    sns.stripplot(data=plot_data, y='c_index', x='n_clients', hue='client', ax=ax, hue_order=hue_order, palette=palette, dodge=True, size=3)

    if position == 0:
        # The handles of the first panel feed the single figure-level legend.
        handles, labels = ax.get_legend_handles_labels()
    ax.get_legend().remove()

    # Only the left column carries a y label.
    if col > 0:
        ax.set(ylabel=None)
    else:
        ax.set(ylabel='c-index')
    # No panel carries an x label; the bottom row gets readable tick labels.
    ax.set(xlabel=None)
    if row == 1:
        ax.set(xticklabels=['central', '3 participants', '5 participants'])

fig.legend(handles[:3], ['federated + secure aggregation', 'federated', 'clients'], loc='lower center', ncol=3, bbox_to_anchor=(0.5, 0.065))
plt.tight_layout()
plt.show()
fig.savefig('c_index.jpg', dpi=350)
fig.savefig('c_index.eps', dpi=350)
fig.savefig('c_index.svg', dpi=350)

In [None]:
# Compare the federated models (3 and 5 clients) with a model fitted on the
# central data set: per-coefficient differences and the share of feature names
# that agree between the central and the 5-client feature tables.
coef_results = []
fi_results = {}
for dataset_name in dataset_names:
    coefs = []
    fis = []
    for n_client in [3, 5]:
        client = f'{n_client}_clients'
        path = f'federated-analysis/{dataset_name}/{client}/fc_normalization/fc_survival_svm/client_1'
        # joblib.load unpickles the file, which can execute arbitrary code;
        # only load model files produced by your own runs.
        coef = pd.Series(joblib.load(path + '/model.pickle').coef_).to_frame()
        coef.columns = [client]
        coefs.append(coef)

        # Keep only what remains after dropping 'Coefficient' and name that
        # column after the client count.
        fi = pd.read_csv(path + '/feature_importance.tsv', sep='\t', index_col=0).drop('Coefficient', axis=1)
        fi.columns = [client]
        fis.append(fi)

    path = f'central/{dataset_name}/data.csv'
    data = pd.read_csv(path)
    X = data.drop(['event', 'tte'], axis=1)
    y = data.loc[:, ['event', 'tte']]
    pipe = Pipeline([('scaler', StandardScaler()), (
        'ssvm', FastSurvivalSVM(rank_ratio=0, alpha=alpha, max_iter=50, fit_intercept=True, tol=0.00001))])
    pipe.fit(X, map_to_scikit_surv(y))
    coef = pd.Series(pipe['ssvm'].coef_).to_frame()
    coef.columns = ['central']
    coefs.append(coef)

    coef_df = pd.concat(coefs, axis=1)
    # Only the 5-client model is compared with the central one.
    coef_df['difference'] = coef_df['central'] - coef_df['5_clients']
    coef_df['dataset'] = dataset_name + " " + dataset_shapes[dataset_name]
    coef_df = coef_df.reset_index().rename({'index': 'coef_no'}, axis=1)
    coef_results.append(coef_df)

    fi = create_feature_importance(pipe['ssvm'], feature_names=X.columns).drop('Coefficient', axis=1)
    fi.columns = ['central']
    fis.append(fi)
    # Column-wise concat aligns the tables on their index labels, so 'match'
    # compares the feature names stored under the same label.
    fi_df = pd.concat(fis, axis=1)
    fi_df['match'] = fi_df['central'] == fi_df['5_clients']
    # Summing the boolean column counts the matching rows and yields 0 when
    # nothing matches; looking up the True group of a groupby raised KeyError
    # in that case.
    fi_result = fi_df['match'].sum()
    fi_results[dataset_name] = fi_result * 100 / len(X.columns)


fi_data = pd.DataFrame(fi_results, index=['match (%)'])
fi_data.to_csv('match.csv')
coef_data = pd.concat(coef_results, ignore_index=True)

coef_data['method'] = 'federated'


In [None]:
# Same comparison as for the plain federated runs, stored under the *_smpc
# names and labelled 'federated + secure aggregation'.
# NOTE(review): the model and feature-importance paths below are identical to
# the ones used for the 'federated' tables, so this cell would reproduce the
# same numbers — confirm where the secure-aggregation outputs are stored.
coef_results = []
fi_results = {}
for dataset_name in dataset_names:
    coefs = []
    fis = []
    for n_client in [3, 5]:
        client = f'{n_client}_clients'
        path = f'federated-analysis/{dataset_name}/{client}/fc_normalization/fc_survival_svm/client_1'
        # joblib.load unpickles the file, which can execute arbitrary code;
        # only load model files produced by your own runs.
        coef = pd.Series(joblib.load(path + '/model.pickle').coef_).to_frame()
        coef.columns = [client]
        coefs.append(coef)

        # Keep only what remains after dropping 'Coefficient' and name that
        # column after the client count.
        fi = pd.read_csv(path + '/feature_importance.tsv', sep='\t', index_col=0).drop('Coefficient', axis=1)
        fi.columns = [client]
        fis.append(fi)

    path = f'central/{dataset_name}/data.csv'
    data = pd.read_csv(path)
    X = data.drop(['event', 'tte'], axis=1)
    y = data.loc[:, ['event', 'tte']]
    pipe = Pipeline([('scaler', StandardScaler()), (
        'ssvm', FastSurvivalSVM(rank_ratio=0, alpha=alpha, max_iter=50, fit_intercept=True, tol=0.00001))])
    pipe.fit(X, map_to_scikit_surv(y))
    coef = pd.Series(pipe['ssvm'].coef_).to_frame()
    coef.columns = ['central']
    coefs.append(coef)

    coef_df = pd.concat(coefs, axis=1)
    # Only the 5-client model is compared with the central one.
    coef_df['difference'] = coef_df['central'] - coef_df['5_clients']
    coef_df['dataset'] = dataset_name + " " + dataset_shapes[dataset_name]
    coef_df = coef_df.reset_index().rename({'index': 'coef_no'}, axis=1)
    coef_results.append(coef_df)

    fi = create_feature_importance(pipe['ssvm'], feature_names=X.columns).drop('Coefficient', axis=1)
    fi.columns = ['central']
    fis.append(fi)
    # Column-wise concat aligns the tables on their index labels, so 'match'
    # compares the feature names stored under the same label.
    fi_df = pd.concat(fis, axis=1)
    fi_df['match'] = fi_df['central'] == fi_df['5_clients']
    # Summing the boolean column counts the matching rows and yields 0 when
    # nothing matches; looking up the True group of a groupby raised KeyError
    # in that case.
    fi_result = fi_df['match'].sum()
    fi_results[dataset_name] = fi_result * 100 / len(X.columns)


fi_data_smpc = pd.DataFrame(fi_results, index=['match (%)'])
# Build the table from THIS cell's results. The previous code concatenated them
# and then immediately replaced the result with a copy of `coef_data`.
coef_data_smpc = pd.concat(coef_results, ignore_index=True)
coef_data_smpc['method'] = 'federated + secure aggregation'

display(coef_data_smpc)

In [None]:
# Stack the two match-percentage tables, label each row with its method and
# write the combined table to match.csv.
method_labels = ['federated', 'federated + secure aggregation']
fi_data_overall = pd.concat([fi_data, fi_data_smpc], axis=0)
fi_data_overall.index = method_labels
fi_data_overall.to_csv('match.csv')

In [None]:
# Stack the coefficient tables of both methods for the comparison figure.
# The former `.apply(lambda x: x)` on 'difference' mapped every value to itself
# and has been removed.
coefs_overall = pd.concat([coef_data, coef_data_smpc])
coefs_overall

In [None]:
# NOTE(review): sns.set() is an alias of set_theme(), so per the seaborn
# documentation the last call below re-applies the default context and style
# and only font_scale=1.1 differs from the defaults; despine() runs before the
# figure exists. The calls are reproduced unchanged — confirm the intended look.
sns.set_style("whitegrid")
sns.set_theme('paper')
sns.despine()
sns.set(font_scale=1.1)

fig, ax = plt.subplots(1, 1, figsize=(8, 6), dpi=350)

# Both layers use the same frame and the same colour per method.
method_palette = ['tab:orange', 'tab:green']
plot_data = coefs_overall.reset_index()
sns.stripplot(data=plot_data, y='difference', x='dataset', hue='method', palette=method_palette, dodge=True, size=3, alpha=0.3, ax=ax)
sns.boxplot(data=plot_data, y='difference', x='dataset', hue='method', palette=method_palette, ax=ax)

# Replace the axes legend with one figure-level legend. The first two entries
# are skipped (presumably the strip-plot markers, since that layer is drawn first).
handles, labels = ax.get_legend_handles_labels()
ax.get_legend().remove()
fig.legend(handles[2:], labels[2:], loc='lower center', ncol=2, bbox_to_anchor=(0.55, 0.1))
ax.set(ylabel='Coefficient differences compared to centralized')
plt.tight_layout()
plt.show()
for extension in ('jpg', 'eps', 'svg'):
    fig.savefig(f'coefs.{extension}', dpi=350)

In [None]:
def plot_fi(coef_df: pd.DataFrame, top_features: int = 10):
    """Draw a bar chart of feature coefficients, coloured by absolute value.

    Parameters
    ----------
    coef_df:
        Frame with a ``Feature`` and a ``Coefficient`` column; every row is
        drawn as one bar.
    top_features:
        Only controls the figure size (width ``top_features``, height
        ``round(top_features / 2, 1)``); it does not limit the number of bars.

    Returns
    -------
    The matplotlib figure holding the bar chart.

    Notes
    -----
    Calling this function changes the global seaborn theme ("talk" context,
    font scale 1.2, "whitegrid" style).
    """
    def colors_from_values(values, palette_name):
        # One palette colour per value; each value picks the colour at the
        # position that corresponds to its rank between min and max.
        palette = np.array(sns.color_palette(palette_name, len(values)))
        lowest = min(values)
        span = max(values) - lowest
        if span == 0:
            # All values are equal (including the single-feature case):
            # normalising would compute 0 / 0 and yield invalid indices, so
            # every bar gets the first palette colour instead.
            indices = np.zeros(len(values), dtype=np.int32)
        else:
            # normalize the values to range [0, 1]
            normalized = (values - lowest) / span
            # convert to indices
            indices = np.round(normalized * (len(values) - 1)).astype(np.int32)
        # use the indices to get the colors
        return palette.take(indices, axis=0)

    sns.set("talk", font_scale=1.2)
    sns.set_style("whitegrid")
    fig, ax = plt.subplots(1, 1, figsize=(top_features, round(top_features / 2, 1)))
    sns.barplot(data=coef_df, x="Feature", y="Coefficient", ax=ax,
                palette=colors_from_values(coef_df['Coefficient'].abs(), "flare"))
    # Install an empty, frameless legend so no legend box is shown.
    ax.legend([], [], frameon=False)
    ax.xaxis.set_tick_params(rotation=90)

    return fig