In [11]:
import os
import json

import pandas as pd
from pandas import json_normalize 


def read_and_concat_json(files):
    """
    Load several JSON result files and stack them row-wise into one DataFrame.

    Each file is parsed with ``json.load`` and flattened with
    ``pd.json_normalize``, so nested keys become dotted column names
    (e.g. ``Rules.violation``). Only the columns listed in ``target_columns``
    are kept, and a violation-ratio column
    (``Rules.violation / Rules.total``) is appended.

    Parameters:
    - files: iterable of paths to the JSON files to read

    Returns:
    - DataFrame: the rows of every file, in input order, with a fresh
      0..n-1 index. When ``files`` is empty, an empty DataFrame carrying the
      same columns is returned instead of failing inside ``pd.concat``.

    Raises:
    - KeyError: if a file does not provide every column in ``target_columns``
    - OSError / json.JSONDecodeError: if a file cannot be opened or parsed
    """
    # Loop-invariant, so defined once rather than per file.
    target_columns = ['name', 'Accuracy', 'Precision', 'Recall', 'F1-score', 'Auc', 'len_U', 'Rules.violation', 'Rules.total']
    ratio_column = 'Violation ratio (= violation / total)'

    dfs = []  # one DataFrame per input file

    for file in files:
        # Read as UTF-8 explicitly instead of relying on the platform's
        # default text encoding.
        with open(file, 'r', encoding='utf-8') as f:
            data = json.load(f)

        # Keep only the columns of interest. The explicit copy makes the
        # column assignment below operate on an independent frame, which
        # avoids pandas' SettingWithCopyWarning on a column-subset slice.
        df = pd.json_normalize(data)[target_columns].copy()

        # Share of rules that were violated.
        df[ratio_column] = df['Rules.violation'] / df['Rules.total']

        dfs.append(df)

    if not dfs:
        # pd.concat raises "No objects to concatenate" on an empty list;
        # return an empty frame with the expected schema instead.
        return pd.DataFrame(columns=target_columns + [ratio_column])

    # Stack all per-file frames and renumber the rows.
    return pd.concat(dfs, ignore_index=True)

In [80]:
# Fold sub-directories whose results are averaged below.
directories = ['fold_0', 'fold_1', 'fold_2', 'fold_3', 'fold_4']

# Result files expected inside every fold directory.
json_files = [
    'result_1.json', 'result_2.json', 'result_3.json', 'result_4.json',
    'result_luka_1.json', 'result_luka_2.json', 'result_luka_3.json',
    'result_rulefit_1.json', 'result_rulefit_2.json',
]

output_dir_path = './../../outputs/pima_indian_diabetes_2/'

dfs = []                   # one results frame per fold
original_col_order = None  # column layout, captured from the first fold
tmp_name = None            # 'name' column, captured from the first fold
tmp_len_U = None           # 'len_U' column, captured from the first fold

for dir_name in directories:
    files_path = [
        os.path.join(output_dir_path, dir_name, file_name)
        for file_name in json_files
    ]
    df = read_and_concat_json(files_path)

    # Only the first fold supplies the non-aggregated columns and the layout.
    if not dfs:
        original_col_order = df.columns
        tmp_name = df['name']
        tmp_len_U = df['len_U']

    dfs.append(df)

# Keep each fold's own row index so that rows sharing an index value can be
# grouped across folds; 'name' and 'len_U' are dropped before aggregating
# and re-attached from the first fold afterwards.
combined_df = pd.concat(dfs).drop(columns=['name', 'len_U'])

# Mean over folds for every row position.
result_df_mean = (
    combined_df.groupby(combined_df.index)
    .mean()
    .assign(name=tmp_name, len_U=tmp_len_U)
)[original_col_order]

# Standard deviation over folds for every row position.
result_df_std = (
    combined_df.groupby(combined_df.index)
    .std()
    .assign(name=tmp_name, len_U=tmp_len_U)
)[original_col_order]

In [81]:
result_df_mean

Unnamed: 0,name,Accuracy,Precision,Recall,F1-score,Auc,len_U,Rules.violation,Rules.total,Violation ratio (= violation / total)
0,linear svm,0.777612,0.70662,0.545042,0.614569,0.839456,,32.4,46.0,0.704348
1,non-linear svm (rbf),0.771642,0.739439,0.468617,0.570778,0.825633,,28.0,46.0,0.608696
2,logistic regression,0.779104,0.70706,0.555639,0.620318,0.840584,,32.0,46.0,0.695652
3,random forest,0.783582,0.715369,0.572454,0.631111,0.831329,,33.8,46.0,0.734783
4,luka linear svm,0.762687,0.7201,0.440418,0.546356,0.797451,15.0,20.8,46.0,0.452174
5,luka linear svm loss,0.762687,0.7201,0.440418,0.546356,0.800063,15.0,20.8,46.0,0.452174
6,luka logistic regression loss,0.767164,0.721151,0.463979,0.563612,0.807159,15.0,22.2,46.0,0.482609
7,random forest (rulefit),0.71791,0.741313,0.200667,0.314169,0.807826,,20.4,46.0,0.443478
8,RuleFitClassifier,0.762687,0.675771,0.540138,0.598208,0.803497,,28.6,46.0,0.621739


In [82]:
result_df_std

Unnamed: 0,name,Accuracy,Precision,Recall,F1-score,Auc,len_U,Rules.violation,Rules.total,Violation ratio (= violation / total)
0,linear svm,0.042347,0.044474,0.041333,0.034771,0.045271,,3.209361,0.0,0.069769
1,non-linear svm (rbf),0.045577,0.102245,0.054765,0.055239,0.045847,,4.743416,0.0,0.103118
2,logistic regression,0.043706,0.069242,0.049377,0.041328,0.046465,,3.535534,0.0,0.076859
3,random forest,0.039135,0.071385,0.073789,0.03563,0.040772,,3.34664,0.0,0.072753
4,luka linear svm,0.037089,0.051562,0.023171,0.030995,0.054181,15.0,1.30384,0.0,0.028344
5,luka linear svm loss,0.037089,0.051562,0.023171,0.030995,0.053632,15.0,1.30384,0.0,0.028344
6,luka logistic regression loss,0.039979,0.055345,0.044435,0.040868,0.053523,15.0,2.588436,0.0,0.05627
7,random forest (rulefit),0.032271,0.113664,0.036309,0.051031,0.049412,,0.547723,0.0,0.011907
8,RuleFitClassifier,0.058142,0.06056,0.067079,0.049739,0.046009,,1.67332,0.0,0.036377


In [5]:
# Inspect fold_0 using only the three result_luka_* files; the other result
# files (result_1..4, result_rulefit_1..2) are left out of this view.
output_dir_path = './../../outputs/pima_indian_diabetes_2/fold_0'
json_files = [
    os.path.join(output_dir_path, file_name)
    for file_name in (
        'result_luka_1.json',
        'result_luka_2.json',
        'result_luka_3.json',
    )
]

combined_df = read_and_concat_json(json_files)

print('-')
combined_df

-


Unnamed: 0,name,Accuracy,Precision,Recall,F1-score,Auc,len_U,Rules.violation,Rules.total,Violation ratio (= violation / total)
0,luka linear svm,0.761194,0.730769,0.431818,0.542857,0.792929,15,22,46,0.478261
1,luka linear svm loss,0.761194,0.730769,0.431818,0.542857,0.791667,15,22,46,0.478261
2,luka logistic regression loss,0.761194,0.714286,0.454545,0.555556,0.817172,15,25,46,0.543478


In [7]:
# Inspect fold_1 using only the three result_luka_* files; the other result
# files (result_1..4, result_rulefit_1..2) are left out of this view.
output_dir_path = './../../outputs/pima_indian_diabetes_2/fold_1'
json_files = [
    os.path.join(output_dir_path, file_name)
    for file_name in (
        'result_luka_1.json',
        'result_luka_2.json',
        'result_luka_3.json',
    )
]

combined_df = read_and_concat_json(json_files)

print('-')
combined_df

-


Unnamed: 0,name,Accuracy,Precision,Recall,F1-score,Auc,len_U,Rules.violation,Rules.total,Violation ratio (= violation / total)
0,luka linear svm,0.708955,0.666667,0.408163,0.506329,0.721969,15,22,46,0.478261
1,luka linear svm loss,0.708955,0.666667,0.408163,0.506329,0.727251,15,22,46,0.478261
2,luka logistic regression loss,0.708955,0.666667,0.408163,0.506329,0.726771,15,23,46,0.5


In [8]:
# Inspect fold_2 using only the three result_luka_* files; the other result
# files (result_1..4, result_rulefit_1..2) are left out of this view.
output_dir_path = './../../outputs/pima_indian_diabetes_2/fold_2'
json_files = [
    os.path.join(output_dir_path, file_name)
    for file_name in (
        'result_luka_1.json',
        'result_luka_2.json',
        'result_luka_3.json',
    )
]

combined_df = read_and_concat_json(json_files)

print('-')
combined_df

-


Unnamed: 0,name,Accuracy,Precision,Recall,F1-score,Auc,len_U,Rules.violation,Rules.total,Violation ratio (= violation / total)
0,luka linear svm,0.768657,0.777778,0.456522,0.575342,0.778656,15,20,46,0.434783
1,luka linear svm loss,0.768657,0.777778,0.456522,0.575342,0.782609,15,20,46,0.434783
2,luka logistic regression loss,0.776119,0.807692,0.456522,0.583333,0.799901,15,19,46,0.413043


In [9]:
# Inspect fold_3 using only the three result_luka_* files; the other result
# files (result_1..4, result_rulefit_1..2) are left out of this view.
output_dir_path = './../../outputs/pima_indian_diabetes_2/fold_3'
json_files = [
    os.path.join(output_dir_path, file_name)
    for file_name in (
        'result_luka_1.json',
        'result_luka_2.json',
        'result_luka_3.json',
    )
]

combined_df = read_and_concat_json(json_files)

print('-')
combined_df

-


Unnamed: 0,name,Accuracy,Precision,Recall,F1-score,Auc,len_U,Rules.violation,Rules.total,Violation ratio (= violation / total)
0,luka linear svm,0.813433,0.666667,0.4375,0.528302,0.866728,15,19,46,0.413043
1,luka linear svm loss,0.813433,0.666667,0.4375,0.528302,0.871324,15,19,46,0.413043
2,luka logistic regression loss,0.820896,0.681818,0.46875,0.555556,0.876225,15,20,46,0.434783


In [10]:
# Inspect fold_4 using only the three result_luka_* files; the other result
# files (result_1..4, result_rulefit_1..2) are left out of this view.
output_dir_path = './../../outputs/pima_indian_diabetes_2/fold_4'
json_files = [
    os.path.join(output_dir_path, file_name)
    for file_name in (
        'result_luka_1.json',
        'result_luka_2.json',
        'result_luka_3.json',
    )
]

combined_df = read_and_concat_json(json_files)

print('-')
combined_df

-


Unnamed: 0,name,Accuracy,Precision,Recall,F1-score,Auc,len_U,Rules.violation,Rules.total,Violation ratio (= violation / total)
0,luka linear svm,0.761194,0.758621,0.468085,0.578947,0.826975,15,21,46,0.456522
1,luka linear svm loss,0.761194,0.758621,0.468085,0.578947,0.827464,15,21,46,0.456522
2,luka logistic regression loss,0.768657,0.735294,0.531915,0.617284,0.815725,15,24,46,0.521739
