In [None]:
import warnings
warnings.filterwarnings('ignore')

In [None]:
from src.analysis.aggregate_results import collect_experiment_results
from src.analysis.visualiser import ResultsVisualiser
import matplotlib.pyplot as plt 
import pandas as pd
import pickle
import shap

In [None]:
pd.set_option('display.max_columns', None)        # show all columns

In [None]:
df = collect_experiment_results('results_all')

***

# PART I

In [None]:
vis = ResultsVisualiser()

## CLASSIFICATION

In [None]:
df_cl = df[df['task'] == 'classification']

### EVALUATION

In [None]:
vis.plot_facet_grid(df_cl, x='delta_accuracy', y='dataset', hue='missing_frac', plot_type='box')

In [None]:
df_temp = df_cl[(df_cl['dataset'] == 'phoneme') & (df_cl['imputer'] != 'original')]
vis.plot_boxplot(df_temp, x='delta_auc', y='imputer', hue='missing_frac')

In [None]:
df_temp = df_cl[(df_cl['dataset'] == 'loan') & (df_cl['imputer'] != 'original')]
vis.plot_boxplot(df_temp, x='delta_auc', y='imputer', hue='missing_frac')
df_temp = df_cl[(df_cl['dataset'] == 'sensors') & (df_cl['imputer'] != 'original')]
vis.plot_boxplot(df_temp, x='delta_auc', y='imputer', hue='missing_frac')
df_temp = df_cl[(df_cl['dataset'] == 'diabetes') & (df_cl['imputer'] != 'original')]
vis.plot_boxplot(df_temp, x='delta_auc', y='imputer', hue='missing_frac')

In [None]:
df_temp = df_cl[(df_cl['dataset'] == 'phoneme') & (df_cl['imputer'] != 'original')]
vis.plot_scatter(df_temp, x='imputer_rmse', y='delta_accuracy', hue='imputer', style='model')

In [None]:
df_temp = df_cl[(df_cl['dataset'] == 'loan') & (df_cl['imputer'] != 'original')]
vis.plot_scatter(df_temp, x='imputer_rmse', y='delta_accuracy', hue='imputer', style='model')
df_temp = df_cl[(df_cl['dataset'] == 'diabetes') & (df_cl['imputer'] != 'original')]
vis.plot_scatter(df_temp, x='imputer_rmse', y='delta_accuracy', hue='imputer', style='model')
df_temp = df_cl[(df_cl['dataset'] == 'sensors') & (df_cl['imputer'] != 'original')]
vis.plot_scatter(df_temp, x='imputer_rmse', y='delta_accuracy', hue='imputer', style='model')

### XAI

In [None]:
vis.plot_boxplot(df_cl, x='shap_rmse_overall', y='dataset', hue='missing_frac')

In [None]:
df_temp = df_cl[df_cl['dataset'] =='loan']
vis.plot_scatter(df_temp, x='shap_rmse_overall', y='delta_accuracy', hue='imputer', style='model', figsize=(12,6))

In [None]:
df_temp = df_cl[df_cl['dataset'] =='sensors']
vis.plot_scatter(df_temp, x='shap_rmse_overall', y='delta_accuracy', hue='imputer', style='model')
df_temp = df_cl[df_cl['dataset'] =='diabetes']
vis.plot_scatter(df_temp, x='shap_rmse_overall', y='delta_accuracy', hue='imputer', style='model')
df_temp = df_cl[df_cl['dataset'] =='phoneme']
vis.plot_scatter(df_temp, x='shap_rmse_overall', y='delta_accuracy', hue='imputer', style='model')

In [None]:
df_temp = df_cl[df_cl['model'] == 'xgboost']
vis.plot_heatmap(df_temp, x='imputer', y='missing_frac', metric='shap_rmse_overall', aggfunc='mean')

In [None]:
df_temp = df_cl[df_cl['model'] == 'logistic_regression']
vis.plot_heatmap(df_temp, x='imputer', y='missing_frac', metric='shap_rmse_overall', aggfunc='mean')

df_temp = df_cl[df_cl['model'] == 'random_forest']
vis.plot_heatmap(df_temp, x='imputer', y='missing_frac', metric='shap_rmse_overall', aggfunc='mean')

df_temp = df_cl[df_cl['model'] == 'knn']
vis.plot_heatmap(df_temp, x='imputer', y='missing_frac', metric='shap_rmse_overall', aggfunc='mean')

In [None]:
vis.plot_feature_rank_barplot_faceted_pfi(df_cl, dataset='phoneme')

In [None]:
vis.plot_feature_rank_barplot_faceted_pfi(df_cl, dataset='sensors')
vis.plot_feature_rank_barplot_faceted_pfi(df_cl, dataset='diabetes')
vis.plot_feature_rank_barplot_faceted_pfi(df_cl, dataset='loan')

In [None]:
df_temp = df_cl[df_cl['model'] =='xgboost']
vis.plot_boxplot(df_temp, x='pdp_aggregated', y='dataset', hue='missing_frac')

In [None]:
df_temp = df_cl[df_cl['model'] =='logistic_regression']
vis.plot_boxplot(df_temp, x='pdp_aggregated', y='dataset', hue='missing_frac')
df_temp = df_cl[df_cl['model'] =='knn']
vis.plot_boxplot(df_temp, x='pdp_aggregated', y='dataset', hue='missing_frac')
df_temp = df_cl[df_cl['model'] =='random_forest']
vis.plot_boxplot(df_temp, x='pdp_aggregated', y='dataset', hue='missing_frac')

In [None]:
df_temp = df_cl[df_cl['model'] =='xgboost']
vis.plot_scatter(df_temp, x='pred_rmse', y='pdp_aggregated', hue='imputer', style='dataset')

In [None]:
df_temp = df_cl[df_cl['model'] =='logistic_regression']
vis.plot_scatter(df_temp, x='pred_rmse', y='pdp_aggregated', hue='imputer', style='dataset')
df_temp = df_cl[df_cl['model'] =='knn']
vis.plot_scatter(df_temp, x='pred_rmse', y='pdp_aggregated', hue='imputer', style='dataset')
df_temp = df_cl[df_cl['model'] =='random_forest']
vis.plot_scatter(df_temp, x='pred_rmse', y='pdp_aggregated', hue='imputer', style='dataset')

***

***

***

***

***

***

***

## REGRESSION

In [None]:
df_regression = df[df['task'] == 'regression']

### EVALUATION

In [None]:
vis.plot_facet_grid(df_regression, x='delta_rmse', y='dataset', hue='missing_frac')

In [None]:
df_temp = df_regression[~((df_regression['dataset'] == 'gym_excercises') & (df_regression['delta_rmse'] > 500)) & ~((df_regression['dataset'] == 'cpu') & (df_regression['delta_rmse'] > 10))]
vis.plot_facet_grid(df_temp, x='delta_rmse', y='dataset', hue='missing_frac')

In [None]:
df_temp = df_regression[df_regression['dataset'] == 'gym_excercises']
vis.plot_boxplot(df_temp[df_temp['imputer'] != 'original'], x='imputer_rmse', y='imputer')

In [None]:
df_temp = df_regression[df_regression['dataset'] == 'cpu']
vis.plot_boxplot(df_temp[df_temp['imputer'] != 'original'], x='imputer_rmse', y='imputer')
df_temp = df_regression[df_regression['dataset'] == 'concrete']
vis.plot_boxplot(df_temp[df_temp['imputer'] != 'original'], x='imputer_rmse', y='imputer')
df_temp = df_regression[df_regression['dataset'] == 'housing']
vis.plot_boxplot(df_temp[df_temp['imputer'] != 'original'], x='imputer_rmse', y='imputer')

In [None]:
df_temp = df_regression[~((df_regression['dataset'] == 'gym_excercises') & (df_regression['delta_rmse'] > 500)) & ~((df_regression['dataset'] == 'cpu') & (df_regression['delta_rmse'] > 10))]

vis.plot_scatter(df_temp[df_temp['dataset'] == 'cpu'], x='imputer_rmse', y='pred_rmse', hue='missing_frac')

In [None]:
vis.plot_scatter(df_temp[df_temp['dataset'] == 'housing'], x='imputer_rmse', y='pred_rmse', hue='missing_frac')
vis.plot_scatter(df_temp[df_temp['dataset'] == 'gym_excercises'], x='imputer_rmse', y='pred_rmse', hue='missing_frac')
vis.plot_scatter(df_temp[df_temp['dataset'] == 'concrete'], x='imputer_rmse', y='pred_rmse', hue='missing_frac')

In [None]:
vis.plot_scatter(df_temp[df_temp['dataset'] == 'cpu'], x='imputer_rmse', y='pred_rmse', hue='imputer', style='model')

In [None]:
vis.plot_scatter(df_temp[df_temp['dataset'] == 'concrete'], x='imputer_rmse', y='pred_rmse', hue='imputer', style='model')
vis.plot_scatter(df_temp[df_temp['dataset'] == 'gym_excercises'], x='imputer_rmse', y='pred_rmse', hue='imputer', style='model')
vis.plot_scatter(df_temp[df_temp['dataset'] == 'housing'], x='imputer_rmse', y='pred_rmse', hue='imputer', style='model')

### XAI

In [None]:
df_temp = df_regression[~((df_regression['dataset'] == 'gym_excercises') & (df_regression['shap_rmse_overall'] > 200))
                     & ~((df_regression['dataset'] == 'cpu') & (df_regression['shap_rmse_overall'] > 10))
                     & ~((df_regression['dataset'] == 'housing') & (df_regression['shap_rmse_overall'] > 100000))
                     & ~((df_regression['dataset'] == 'concrete') & (df_regression['shap_rmse_overall'] > 15))]

In [None]:
vis.plot_facet_grid(df_temp, x='shap_rmse_overall', y='dataset', hue='missing_frac', plot_type='box')

In [None]:
df_temp = df_regression[~((df_regression['model'] == 'linear_regression') & (df_regression['imputer'] == 'MICEImputer'))]
vis.plot_heatmap(df_temp[df_temp['dataset'] == 'gym_excercises'], x='imputer', y='model', metric='shap_rmse_overall')

In [None]:
vis.plot_heatmap(df_temp[df_temp['dataset'] == 'cpu'], x='imputer', y='model', metric='shap_rmse_overall')
vis.plot_heatmap(df_temp[df_temp['dataset'] == 'housing'], x='imputer', y='model', metric='shap_rmse_overall')
vis.plot_heatmap(df_temp[df_temp['dataset'] == 'concrete'], x='imputer', y='model', metric='shap_rmse_overall')

In [None]:
vis.plot_scatter(df_temp[df_temp['dataset'] == 'cpu'], x='imputer_rmse', y='shap_rmse_overall', hue='imputer', style='model')

In [None]:
vis.plot_scatter(df_temp[df_temp['dataset'] == 'concrete'], x='imputer_rmse', y='shap_rmse_overall', hue='imputer', style='model')

In [None]:
vis.plot_scatter(df_temp[df_temp['dataset'] == 'gym_excercises'], x='imputer_rmse', y='shap_rmse_overall', hue='imputer', style='model')

In [None]:
vis.plot_scatter(df_temp[df_temp['dataset'] == 'housing'], x='imputer_rmse', y='shap_rmse_overall', hue='imputer', style='model')

In [None]:
vis.plot_feature_rank_barplot_faceted_pfi(df_regression, dataset='cpu')

In [None]:
vis.plot_feature_rank_barplot_faceted_pfi(df_regression, dataset='concrete')
vis.plot_feature_rank_barplot_faceted_pfi(df_regression, dataset='gym_excercises')
vis.plot_feature_rank_barplot_faceted_pfi(df_regression, dataset='housing')

In [None]:
vis.plot_facet_grid(df_regression, x='pdp_aggregated', y='dataset', hue='missing_frac', plot_type='box')

In [None]:
# df_temp = df_regression[~((df_regression['dataset'] == 'cpu') & (df_regression['pdp_aggregated'] < 0.75))
#                      & ~((df_regression['dataset'] == 'gym_excercises') & (df_regression['pdp_aggregated'] < 0.75))
#                      & ~((df_regression['dataset'] == 'housing') & (df_regression['pdp_aggregated'] < 0.75))
#                      & ~((df_regression['dataset'] == 'concrete') & (df_regression['pdp_aggregated'] < 0.75))]   

# vis.plot_facet_grid(df_temp, x='pdp_aggregated', y='dataset', hue='missing_frac', plot_type='box')

In [None]:
vis.plot_facet_grid(df_regression[df_regression['dataset'] == 'concrete'], x='imputer', y='model', hue='pdp_aggregated', plot_type='heatmap')

In [None]:
vis.plot_facet_grid(df_regression[df_regression['dataset'] == 'cpu'], x='imputer', y='model', hue='pdp_aggregated', plot_type='heatmap')
vis.plot_facet_grid(df_regression[df_regression['dataset'] == 'gym_excercises'], x='imputer', y='model', hue='pdp_aggregated', plot_type='heatmap')
vis.plot_facet_grid(df_regression[df_regression['dataset'] == 'housing'], x='imputer', y='model', hue='pdp_aggregated', plot_type='heatmap')

***

***

***

***

***

***

# PART II

## CASE I

In [None]:
df_f = df[(df['dataset'] == 'housing') & df['model'].isin(['linear_regression', 'xgboost']) & (df['missing_frac'] != 0.25)]

In [None]:
df_g = df_f.groupby(['dataset', 'model', 'missing_frac', 'imputer'])[['model_rmse', 'imputer_rmse', 'pred_rmse']].mean()

In [None]:
df_g

In [None]:
with open('results_all/results_33/housing/50/xgboost/evaluation_original.pkl', 'rb') as file:
    data_ori = pickle.load(file)
    pd_ori = data_ori['pdp']

with open('results_all/results_33/housing/50/xgboost/evaluation_kNNImputer.pkl', 'rb') as file:
    data_knn = pickle.load(file)
    pd_knn = data_knn['pdp']

with open('results_all/results_33/housing/50/xgboost/evaluation_MICEImputer.pkl', 'rb') as file:
    data_mice = pickle.load(file)
    pd_mice = data_mice['pdp']

In [None]:
from src.analysis.visualiser import ResultsVisualiser
vis = ResultsVisualiser()

In [None]:
vis.plot_multiple_pdp_curves([pd_ori, pd_knn, pd_mice], feature='sqft_living',
                             labels=['Original', 'kNNImputer', 'MICEImputer'],)

In [None]:
with open('results_all/results_33/housing/50/linear_regression/evaluation_original.pkl', 'rb') as file:
    data_ori = pickle.load(file)
    pd_ori = data_ori['pdp']

with open('results_all/results_33/housing/50/linear_regression/evaluation_kNNImputer.pkl', 'rb') as file:
    data_knn = pickle.load(file)
    pd_knn = data_knn['pdp']

with open('results_all/results_33/housing/50/linear_regression/evaluation_MICEImputer.pkl', 'rb') as file:
    data_mice = pickle.load(file)
    pd_mice = data_mice['pdp']

vis.plot_multiple_pdp_curves([pd_ori, pd_knn, pd_mice], feature='sqft_living',
                             labels=['Original', 'kNNImputer', 'MICEImputer'],)

## CASE II

In [None]:
df_f = df[(df['dataset'].isin(['loan', 'phoneme', 'diabetes', 'sensors']))]
# & df['model'].isin(['random_forest'])

In [None]:
df_g = df_f.groupby(['dataset', 'model'])[['accuracy', 'auc', 'imputer_rmse', 'pred_rmse']].mean()

In [None]:
df_g

In [None]:
df_f = df[(df['dataset'].isin(['loan', 'phoneme', 'diabetes'])) & (df['model'].isin(['random_forest', 'knn'])) & (~df['imputer'].isin(['SoftImputer', 'kNNImputer', 'MeanImputer'])) & (df['missing_frac'] == 0.5)]

In [None]:
df_g = df_f.groupby(['dataset', 'model', 'imputer'])[['accuracy', 'auc', 'imputer_rmse', 'pred_rmse', 'shap_rmse_overall']].mean()

In [None]:
df_g

In [None]:
def compare_importances(importance_df: pd.DataFrame, other_importance: pd.DataFrame) -> pd.DataFrame:
    """
    Compare two feature-importance tables feature by feature.

    Args:
        importance_df (pd.DataFrame): Reference importances with columns
            ['Feature', 'Importance'].
        other_importance (pd.DataFrame): Importances compared against the
            reference, with the same two columns.

    Returns:
        pd.DataFrame: Inner merge of both tables on 'Feature' (features that
            appear in only one table are dropped). The reference importance
            ends up in 'Importance_self', the other one in 'Importance_other',
            and 'Difference' holds other - self. Rows are sorted by
            'Difference' in descending order and re-indexed from 0.

    Raises:
        ValueError: If either table is None, lacks a required column, or
            contains duplicate feature names.
    """
    required_cols = {'Feature', 'Importance'}

    # Validate BOTH inputs up front, so a malformed reference table produces a
    # clear ValueError instead of a KeyError deep inside the merge/difference.
    for arg_name, frame in (('importance_df', importance_df),
                            ('other_importance', other_importance)):
        if frame is None:
            raise ValueError(f"{arg_name} must not be None")
        if not required_cols.issubset(frame.columns):
            raise ValueError(f"{arg_name} must contain columns {required_cols}")

    # Duplicate feature names would multiply rows in the merge below.
    if other_importance['Feature'].duplicated().any() or importance_df['Feature'].duplicated().any():
        raise ValueError("Duplicate feature names found in importance DataFrames!")

    # pd.merge returns a new frame and leaves its inputs untouched, so no
    # defensive copies are needed.
    merged = pd.merge(
        importance_df,
        other_importance,
        on="Feature",
        suffixes=('_self', '_other'),
        how="inner"  # switch to "outer" to also see features missing from one side
    )

    merged['Difference'] = merged['Importance_other'] - merged['Importance_self']
    return merged.sort_values(by='Difference', ascending=False).reset_index(drop=True)

def plot_difference(diff_df: pd.DataFrame) -> plt.Figure:
    """
    Draw a horizontal bar chart of per-feature importance differences.

    Args:
        diff_df (pd.DataFrame): Table with 'Feature' and 'Difference' columns.

    Returns:
        plt.Figure: The created 10x5 figure. It is not shown here; the caller
            decides when to display it.
    """
    differences = diff_df['Difference']

    # Strictly positive differences are green; zero and negative ones are red.
    bar_colors = []
    for delta in differences:
        bar_colors.append('green' if delta > 0 else 'red')

    fig, ax = plt.subplots(figsize=(10, 5))
    ax.barh(diff_df['Feature'], differences, color=bar_colors)
    ax.axvline(0, color='black', linestyle='--')  # zero reference line
    ax.set(
        xlabel="Importance Difference (Imputed - Original)",
        ylabel="Features",
        title="Change in Feature Importance Due to Imputation",
    )
    ax.invert_yaxis()  # first row of diff_df is drawn at the top
    fig.tight_layout()
    return fig

In [None]:
with open('results_all/results_123/diabetes/50/knn/evaluation_original.pkl', 'rb') as file:
    diabetes_knn_ori = pickle.load(file)
    diabetes_knn_ori_importance = diabetes_knn_ori['pfi']

with open('results_all/results_123/diabetes/50/knn/evaluation_RandomImputer.pkl', 'rb') as file:
    diabetes_knn_rand = pickle.load(file)
    diabetes_knn_rand_importance = diabetes_knn_rand['pfi']
with open('results_all/results_123/diabetes/50/knn/evaluation_MICEImputer.pkl', 'rb') as file:
    diabetes_knn_mice = pickle.load(file)
    diabetes_knn_mice_importance = diabetes_knn_mice['pfi']

with open('results_all/results_123/diabetes/50/random_forest/evaluation_original.pkl', 'rb') as file:
    diabetes_rf_ori = pickle.load(file)
    diabetes_rf_ori_importance = diabetes_rf_ori['pfi']
with open('results_all/results_123/diabetes/50/random_forest/evaluation_RandomImputer.pkl', 'rb') as file:
    diabetes_rf_rand = pickle.load(file)
    diabetes_rf_rand_importance = diabetes_rf_rand['pfi']

with open('results_all/results_123/diabetes/50/random_forest/evaluation_MICEImputer.pkl', 'rb') as file:
    diabetes_rf_mice = pickle.load(file)
    diabetes_rf_mice_importance = diabetes_rf_mice['pfi']

In [None]:
df_com = compare_importances(diabetes_knn_ori_importance, diabetes_knn_rand_importance)
plot_difference(df_com)
plt.show()

df_com = compare_importances(diabetes_knn_ori_importance, diabetes_knn_mice_importance)
plot_difference(df_com)
plt.show()

df_com = compare_importances(diabetes_rf_ori_importance, diabetes_rf_rand_importance)
plot_difference(df_com)  
plt.show()

df_com = compare_importances(diabetes_rf_ori_importance, diabetes_rf_mice_importance)
plot_difference(df_com)  
plt.show()

In [None]:
with open('results_all/results_123/loan/50/knn/evaluation_original.pkl', 'rb') as file:
    loan_knn_ori = pickle.load(file)
    loan_knn_ori_importance = loan_knn_ori['pfi']

with open('results_all/results_123/loan/50/knn/evaluation_RandomImputer.pkl', 'rb') as file:
    loan_knn_rand = pickle.load(file)
    loan_knn_rand_importance = loan_knn_rand['pfi']

with open('results_all/results_123/loan/50/knn/evaluation_MICEImputer.pkl', 'rb') as file:
    loan_knn_mice = pickle.load(file)
    loan_knn_mice_importance = loan_knn_mice['pfi']
with open('results_all/results_123/loan/50/random_forest/evaluation_original.pkl', 'rb') as file:
    loan_rf_ori = pickle.load(file)
    loan_rf_ori_importance = loan_rf_ori['pfi']

with open('results_all/results_123/loan/50/random_forest/evaluation_RandomImputer.pkl', 'rb') as file:
    loan_rf_rand = pickle.load(file)
    loan_rf_rand_importance = loan_rf_rand['pfi']
with open('results_all/results_123/loan/50/random_forest/evaluation_MICEImputer.pkl', 'rb') as file:
    loan_rf_mice = pickle.load(file)
    loan_rf_mice_importance = loan_rf_mice['pfi']

In [None]:
loan_rf_mice['shap_compared']

In [None]:
df_com = compare_importances(loan_knn_ori_importance, loan_knn_rand_importance)
plot_difference(df_com)
plt.show()

df_com = compare_importances(loan_knn_ori_importance, loan_knn_mice_importance)
plot_difference(df_com)
plt.show()

df_com = compare_importances(loan_rf_ori_importance, loan_rf_rand_importance)
plot_difference(df_com)  
plt.show()

df_com = compare_importances(loan_rf_ori_importance, loan_rf_mice_importance)
plot_difference(df_com)  
plt.show()

In [None]:
with open('results_all/results_123/phoneme/50/knn/evaluation_original.pkl', 'rb') as file:
    phoneme_knn_ori = pickle.load(file)
    phoneme_knn_ori_importance = phoneme_knn_ori['pfi']

with open('results_all/results_123/phoneme/50/knn/evaluation_RandomImputer.pkl', 'rb') as file:
    phoneme_knn_rand = pickle.load(file)
    phoneme_knn_rand_importance = phoneme_knn_rand['pfi']
with open('results_all/results_123/phoneme/50/knn/evaluation_MICEImputer.pkl', 'rb') as file:
    phoneme_knn_mice = pickle.load(file)
    phoneme_knn_mice_importance = phoneme_knn_mice['pfi']
with open('results_all/results_123/phoneme/50/random_forest/evaluation_original.pkl', 'rb') as file:
    phoneme_rf_ori = pickle.load(file)
    phoneme_rf_ori_importance = phoneme_rf_ori['pfi']

with open('results_all/results_123/phoneme/50/random_forest/evaluation_RandomImputer.pkl', 'rb') as file:
    phoneme_rf_rand = pickle.load(file)
    phoneme_rf_rand_importance = phoneme_rf_rand['pfi']
with open('results_all/results_123/phoneme/50/random_forest/evaluation_MICEImputer.pkl', 'rb') as file:
    phoneme_rf_mice = pickle.load(file)
    phoneme_rf_mice_importance = phoneme_rf_mice['pfi']

In [None]:
df_com = compare_importances(phoneme_knn_ori_importance, phoneme_knn_rand_importance)
plot_difference(df_com)
plt.show()

df_com = compare_importances(phoneme_knn_ori_importance, phoneme_knn_mice_importance)
plot_difference(df_com)
plt.show()

df_com = compare_importances(phoneme_rf_ori_importance, phoneme_rf_rand_importance)
plot_difference(df_com)  
plt.show()

df_com = compare_importances(phoneme_rf_ori_importance, phoneme_rf_mice_importance)
plot_difference(df_com)  
plt.show()

In [None]:
import pandas as pd
import pickle

models = ['random_forest', 'knn']
imputers = ['RandomImputer', 'MICEImputer']
base_dir = 'results_all/results_123'
ratio = 50

def load_shap_compared(dataset, model, imputer):
    """
    Load the 'shap_compared' entry of one pickled evaluation result.

    The file is looked up at
    "{base_dir}/{dataset}/{ratio}/{model}/evaluation_{imputer}.pkl", where
    base_dir and ratio are module-level variables.

    Args:
        dataset: Dataset directory name.
        model: Model directory name.
        imputer: Imputer name used in the file name; 'original' selects
            'evaluation_original.pkl'.

    Returns:
        The value stored under 'shap_compared', or None when the key is
        absent or when opening/unpickling/reading the file fails (in which
        case a message is printed instead of raising).
    """
    # 'original' needs no special case: the generic pattern already yields
    # 'evaluation_original.pkl'.
    path = f"{base_dir}/{dataset}/{ratio}/{model}/evaluation_{imputer}.pkl"
    try:
        # NOTE(review): pickle.load executes arbitrary code from the file;
        # only point this at result files you produced yourself.
        with open(path, 'rb') as handle:
            return pickle.load(handle).get('shap_compared', None)
    except Exception as err:
        # Best effort by design: report the problem and let the caller skip it.
        print(f"Missing or error in {path}: {err}")
        return None

def create_shap_rmse_table(datasets, models, imputers):
    """
    Build a per-feature table of SHAP RMSE values for every
    (dataset, model, imputer) combination.

    Each combination's 'shap_compared' entry is fetched with
    load_shap_compared(). Combinations whose entry is missing, or has no
    'rmse' key, are skipped.

    Args:
        datasets: Iterable of dataset names.
        models: Iterable of model names.
        imputers: Iterable of imputer names.

    Returns:
        pd.DataFrame: Pivot table indexed by 'feature' with
            (dataset, model, imputer) column levels, holding the per-feature
            RMSE plus an 'OVERALL' row with the overall RMSE. The index is
            ordered by pivot_table, so 'OVERALL' is not necessarily last.
            An empty DataFrame is returned when nothing could be loaded.
    """
    column_order = ['dataset', 'model', 'imputer', 'feature', 'rmse_diff', 'overall_rmse_diff']
    rows = []
    for dataset in datasets:
        for model in models:
            # The 'original' result is deliberately not loaded here: it was
            # never used, cost an extra pickle read, and could print a
            # misleading "Missing or error" message.
            for imputer in imputers:
                comp = load_shap_compared(dataset, model, imputer)
                if comp is None or 'rmse' not in comp:
                    continue
                rmse = comp['rmse']
                overall = rmse['overall']
                key = {
                    'dataset': dataset,
                    'model': model,
                    'imputer': imputer,
                    'overall_rmse_diff': overall,
                }
                for feature, value in rmse['per_feature'].items():
                    rows.append({**key, 'feature': feature, 'rmse_diff': value})
                # Synthetic row carrying the overall RMSE for this combination.
                rows.append({**key, 'feature': 'OVERALL', 'rmse_diff': overall})

    if not rows:
        # Selecting columns / pivoting an empty frame raises KeyError; return
        # an empty table so the notebook cell still renders.
        return pd.DataFrame()

    summary_df = pd.DataFrame(rows, columns=column_order)
    return summary_df.pivot_table(
        index='feature',
        columns=['dataset', 'model', 'imputer'],
        values='rmse_diff',
    )

In [None]:
create_shap_rmse_table(['loan'], models, imputers)

In [None]:
create_shap_rmse_table(['diabetes'], models, imputers)

In [None]:
create_shap_rmse_table(['phoneme'], models, imputers)

In [None]:
df_f.pdp_compared

In [None]:
# Aggregate PDP L2 distances from df_f['pdp_compared']
def create_pdp_l2_table(df: pd.DataFrame, dataset) -> pd.DataFrame:
    """
    Tabulate the per-feature values stored in 'pdp_compared' for one dataset.

    Args:
        df (pd.DataFrame): Results with at least the columns 'dataset',
            'model', 'imputer' and 'pdp_compared'. Each 'pdp_compared' cell is
            expected to be a dict mapping feature name -> numeric value
            (named "L2 distance" in this notebook); cells that are not dicts,
            or are empty dicts, are skipped.
        dataset: Value of the 'dataset' column to keep.

    Returns:
        pd.DataFrame: Pivot table indexed by 'feature' with
            (dataset, model, imputer) column levels. An extra 'OVERALL' row
            holds the mean over the features of each source row. Values from
            several rows that share the same column key are averaged by
            pivot_table. An empty DataFrame is returned when no row
            contributes any value.
    """
    # Local import kept from the original cell so the function does not depend
    # on numpy being imported elsewhere in the notebook.
    import numpy as np

    selected = df[df['dataset'] == dataset]
    records = []
    for _, row in selected.iterrows():
        pdp_dict = row['pdp_compared']
        # Empty dicts are skipped too: they would only add a NaN 'OVERALL'
        # entry (which pivot_table discards) and a numpy mean-of-empty warning.
        if not isinstance(pdp_dict, dict) or not pdp_dict:
            continue
        key = {
            'dataset': row['dataset'],
            'model': row['model'],
            'imputer': row['imputer'],
        }
        for feature, l2_dist in pdp_dict.items():
            records.append({**key, 'feature': feature, 'l2_dist': l2_dist})
        # Per-row average across all features.
        records.append({
            **key,
            'feature': 'OVERALL',
            'l2_dist': np.mean(list(pdp_dict.values())),
        })

    if not records:
        # pivot_table on an empty frame raises KeyError; return an empty table.
        return pd.DataFrame()

    return pd.DataFrame(records).pivot_table(
        index='feature',
        columns=['dataset', 'model', 'imputer'],
        values='l2_dist',
    )

In [None]:
create_pdp_l2_table(df_f, 'diabetes')

In [None]:
create_pdp_l2_table(df_f, 'loan')

In [None]:
create_pdp_l2_table(df_f, 'phoneme')

***

***

***

***

***

## CASE III

In [None]:
vis = ResultsVisualiser()

In [None]:
df_temp = df[
    ((df['dataset'] == 'gym_excercises') & (df['shap_rmse_overall'] > 6000)) |
    ((df['dataset'] == 'cpu') & (df['shap_rmse_overall'] > 50000)) |
   # ((df['dataset'] == 'housing') & (df['shap_rmse_overall'] > 100000)) |
    ((df['dataset'] == 'concrete') & (df['shap_rmse_overall'] > 50))
]

In [None]:
# Rows with imputer 'original', model 'linear_regression' and missing_frac 0.5,
# restricted to one hand-picked (dataset, seed) pair per dataset. The shared
# conditions are factored out and every group is parenthesised explicitly, so
# the result no longer hinges on `&` binding tighter than `|`.
df_temp2 = df[
    (df['imputer'] == 'original')
    & (df['model'] == 'linear_regression')
    & (df['missing_frac'] == 0.5)
    & (
        ((df['dataset'] == 'cpu') & (df['seed'] == 123))
        | ((df['dataset'] == 'gym_excercises') & (df['seed'] == 33))
        | ((df['dataset'] == 'concrete') & (df['seed'] == 987))
    )
]

In [None]:
# One (dataset, seed, missing_frac, model) combination per regression dataset.
cases = [
    dict(dataset='cpu', seed=123, missing_frac=0.5, model='linear_regression'),
    dict(dataset='gym_excercises', seed=33, missing_frac=0.5, model='linear_regression'),
    dict(dataset='concrete', seed=987, missing_frac=0.5, model='linear_regression'),
]

# For every case keep the single row with the smallest shap_rmse_overall;
# cases without any matching row are skipped.
lowest_shap_rows = []
for case in cases:
    subset = df[
        df['dataset'].eq(case['dataset'])
        & df['seed'].eq(case['seed'])
        & df['missing_frac'].eq(case['missing_frac'])
        & df['model'].eq(case['model'])
    ]
    if subset.empty:
        continue
    # NOTE(review): the subset is not filtered by imputer, so an 'original'
    # row can win if its shap_rmse_overall is the smallest - confirm intent.
    min_row = subset.loc[subset['shap_rmse_overall'].idxmin()]
    lowest_shap_rows.append(min_row)

# One DataFrame row per selected Series, for display and concatenation.
df_lowest_shap = pd.DataFrame(lowest_shap_rows)

In [None]:
# for col in df_temp.columns:
#     if df_temp[col].apply(lambda x: isinstance(x, list)).any():
#         df_temp[col] = df_temp[col].apply(str)
# for col in df_temp2.columns:
#     if df_temp2[col].apply(lambda x: isinstance(x, list)).any():
#         df_temp2[col] = df_temp2[col].apply(str)

df_finish = pd.concat([df_temp, df_temp2, df_lowest_shap]).reset_index(drop=True)

In [None]:
# Render both RMSE columns as fixed 4-decimal strings for display; the columns
# are no longer numeric after this cell.
df_finish['model_rmse'] = df_finish['model_rmse'].apply(lambda x: f"{float(x):.4f}")
df_finish['pred_rmse'] = df_finish['pred_rmse'].apply(lambda x: f"{float(x):.4f}")

# Later cells pick rows by position (.iloc[3], .iloc[4], .iloc[5]), so the order
# of rows within one dataset must be reproducible. The default sort algorithm
# is not stable; 'mergesort' keeps the order the rows had before sorting.
df_finish = (
    df_finish[['seed', 'dataset', 'model', 'imputer', 'missing_frac', 'model_rmse',
               'imputer_rmse', 'pred_rmse', 'shap_rmse_overall', 'result_path']]
    .sort_values(by='dataset', kind='mergesort')
    .reset_index(drop=True)
)

In [None]:
df_finish

In [None]:
#PLACE FOR SHAP FI PLOTS. 

In [None]:
df_finish['result_path'].iloc[5]

In [None]:
df[df['dataset'] == 'cpu']['pdp_compared'].iloc[2]

In [None]:
with open(df_finish['result_path'].iloc[3], 'rb') as file:
    data_bad = pickle.load(file)
    pd_bad = data_bad['pdp']

with open(df_finish['result_path'].iloc[4], 'rb') as file:
    data_ori = pickle.load(file)
    pd_ori = data_ori['pdp']

with open(df_finish['result_path'].iloc[5], 'rb') as file:
    data_good = pickle.load(file)
    pd_good = data_good['pdp']

vis.plot_multiple_pdp_curves([pd_ori, pd_bad, pd_good], feature='freeswap',
                             labels=['Original', 'Bad Case - MICE', 'Good Case - Mean'],)

In [None]:
shap_good = data_good['shap']
shap_bad = data_bad['shap']
shap_ori = data_ori['shap']

In [None]:
df_cpu = pd.read_csv('datasets/cpu.csv')
df_cpu.drop('usr', axis=1, inplace=True)
feature_names = df_cpu.columns.to_list()

def plot_shap_waterfall(shaps, index):
    """
    Show a SHAP waterfall plot for one row of an explanation object.

    Feature labels come from the module-level `feature_names` list, so that
    variable must match the dataset the explanation was computed on.

    Args:
        shaps: Object exposing `.values`, `.data` and `.base_values`
            (presumably a shap.Explanation - confirm against the pickles).
        index: Position of the sample to plot.
    """
    # Diagnostics: the feature-name count should line up with the array shapes.
    print("feature_names:", len(feature_names))
    print("shaps.data shape:", shaps.data.shape)
    print("shaps.values shape:", shaps.values.shape)

    baseline = shaps.base_values
    # A 2-D baseline is reduced to its first column.
    if getattr(baseline, 'ndim', None) == 2:
        baseline = baseline[:, 0]

    explanation = shap.Explanation(
        values=shaps.values,
        base_values=baseline,
        data=shaps.data,
        feature_names=feature_names,
    )
    # show=False defers rendering to the explicit plt.show() below.
    shap.plots.waterfall(explanation[index], show=False)
    plt.show()

In [None]:
plot_shap_waterfall(shap_ori, index=56)

In [None]:
plot_shap_waterfall(shap_bad, index=56)

In [None]:
df_temp = df[((df['dataset'] == 'phoneme') & (df['imputer'].isin(['original', 'MeanImputer'])) & (df['model'] == 'xgboost') & (df['missing_frac'] == 0.5) & (df['seed']==42))]

In [None]:
df_temp_best = df[
    (df['dataset'] == 'phoneme') &
    (df['imputer'].isin(['MeanImputer'])) &
    (df['missing_frac'] == 0.5) &
    (df['seed'] == 42)
]

best_row = df_temp_best.loc[(df_temp_best['model'] == 'random_forest')]# & (df_temp_best['shap_rmse_overall'].idxmin())]

In [None]:
best_row

In [None]:
df_temp

In [None]:
best_row.columns

In [None]:
df_temp.columns

In [None]:
# NOTE: this cell is not idempotent - re-running it appends best_row to df_temp
# again. Re-run the cells that define df_temp and best_row first.
df_temp = pd.concat([df_temp, best_row], ignore_index=True)

# The following cells pick rows by position (.iloc[0], .iloc[1], .iloc[2]), so
# the concatenation order must survive the sort. The default sort algorithm is
# not stable; 'mergesort' is.
df_finish = (
    df_temp[['seed', 'dataset', 'model', 'imputer', 'missing_frac', 'accuracy',
             'imputer_rmse', 'pred_rmse', 'shap_rmse_overall', 'pdp_aggregated', 'result_path']]
    .sort_values(by='dataset', kind='mergesort')
    .reset_index(drop=True)
)

In [None]:
df_finish

In [None]:
with open(df_finish['result_path'].iloc[0], 'rb') as file:
    data_ori = pickle.load(file)
    pd_ori = data_ori['pdp']

with open(df_finish['result_path'].iloc[1], 'rb') as file:
    data_bad = pickle.load(file)
    pd_bad = data_bad['pdp']

with open(df_finish['result_path'].iloc[2], 'rb') as file:
    data_good = pickle.load(file)
    pd_good = data_good['pdp']

vis.plot_multiple_pdp_curves([pd_ori, pd_bad, pd_good], feature='V4',
                             labels=['Original', 'Bad Case - xgboost', 'Good Case - random forest'],)

In [None]:
df_phoneme = pd.read_csv('datasets/phoneme.csv')
df_phoneme.drop('Class', axis=1, inplace=True)
feature_names = df_phoneme.columns.to_list()


In [None]:
shap_good = data_good['shap']
shap_bad = data_bad['shap']
shap_ori = data_ori['shap']

In [None]:
def plot_shap_waterfall(shaps, index):
    """
    Show a SHAP waterfall plot for one row of an explanation object.

    This redefinition replaces any earlier `plot_shap_waterfall` in the kernel.
    Feature labels come from the module-level `feature_names` list, so that
    variable must match the dataset the explanation was computed on.

    Args:
        shaps: Object exposing `.values`, `.data` and `.base_values`
            (presumably a shap.Explanation - confirm against the pickles).
        index: Position of the sample to plot.
    """
    baseline = shaps.base_values
    # A 2-D baseline is reduced to its first column.
    if getattr(baseline, 'ndim', None) == 2:
        baseline = baseline[:, 0]

    explanation = shap.Explanation(
        values=shaps.values,
        base_values=baseline,
        data=shaps.data,
        feature_names=feature_names,
    )
    # show=False defers rendering to the explicit plt.show() below.
    shap.plots.waterfall(explanation[index], show=False)
    plt.show()

In [None]:
index=137
plot_shap_waterfall(shap_ori, index=index)
plot_shap_waterfall(shap_bad, index=index)