In [2]:
import os
import json

import pandas as pd
from pandas import json_normalize 


def read_and_concat_json(files):
    """
    Read several JSON result files and return them stacked row-wise.

    Each file is parsed with ``json.load`` and flattened with
    ``pd.json_normalize``. Only the metric columns listed in
    ``target_columns`` are kept, and a derived column
    ``'Violation ratio (= violation / total)'``
    (``Rules.violation / Rules.total``) is appended.

    Parameters:
    - files: list of paths to the JSON files to read (any iterable works).

    Returns:
    - DataFrame: rows of all files concatenated in input order, with a
      fresh 0..n-1 index. If ``files`` is empty, an empty DataFrame that
      carries the same columns is returned instead of failing in
      ``pd.concat``.

    Raises:
    - KeyError: if a flattened file lacks one of the target columns.
    """
    # Loop-invariant: the columns kept from every file.
    target_columns = ['name', 'Accuracy', 'Precision', 'Recall', 'F1-score', 'Auc', 'len_U', 'Rules.violation', 'Rules.total']
    ratio_column = 'Violation ratio (= violation / total)'

    dfs = []  # one DataFrame per input file

    for file in files:
        # Explicit encoding so parsing does not depend on the platform's
        # locale default (JSON interchange text is UTF-8).
        with open(file, 'r', encoding='utf-8') as f:
            data = json.load(f)

        # Keep only the required columns; copy so the new column below is
        # written to an independent frame rather than a selection.
        df = pd.json_normalize(data)[target_columns].copy()

        # Share of violated rules among all rules.
        df[ratio_column] = df['Rules.violation'] / df['Rules.total']

        dfs.append(df)

    if not dfs:
        # pd.concat raises on an empty list; return an empty frame with the
        # expected schema so callers can still select columns by name.
        return pd.DataFrame(columns=target_columns + [ratio_column])

    # Stack the per-file frames row-wise and renumber the index.
    return pd.concat(dfs, ignore_index=True)

In [80]:
# Cross-validation folds to aggregate; each one is a sub-directory of the run.
directories = ['fold_0', 'fold_1', 'fold_2', 'fold_3', 'fold_4']

# Result files read from every fold directory, in this order.
json_files = [
    'result_1.json', 'result_2.json', 'result_3.json', 'result_4.json',
    'result_luka_1.json', 'result_luka_2.json', 'result_luka_3.json',
    'result_rulefit_1.json', 'result_rulefit_2.json',
]

output_dir_path = './../../outputs/pima_indian_diabetes_2/'

# Build one DataFrame per fold.
dfs = []
for dir_name in directories:
    files_path = [
        os.path.join(output_dir_path, dir_name, file_name)
        for file_name in json_files
    ]
    df = read_and_concat_json(files_path)
    dfs.append(df)

# Keep each fold's own row index so equal labels can be grouped across folds.
combined_df = pd.concat(dfs, ignore_index=False)

# Column order and the non-averaged columns are taken from the first fold.
first_fold_df = dfs[0]
original_col_order = first_fold_df.columns
tmp_name = first_fold_df['name']
tmp_len_U = first_fold_df['len_U']

# 'name' and 'len_U' are excluded from the statistics and re-attached below.
combined_df = combined_df.drop(columns=['name', 'len_U'])
per_row_groups = combined_df.groupby(combined_df.index)

# Mean over folds for every row label.
result_df_mean = per_row_groups.mean().assign(name=tmp_name, len_U=tmp_len_U)[original_col_order]

# Standard deviation over folds for every row label.
result_df_std = per_row_groups.std().assign(name=tmp_name, len_U=tmp_len_U)[original_col_order]

In [85]:
# Blank line plus caption, then the rich display of the per-fold mean table.
print('\nmean')
result_df_mean


mean


Unnamed: 0,name,Accuracy,Precision,Recall,F1-score,Auc,len_U,Rules.violation,Rules.total,Violation ratio (= violation / total)
0,linear svm,0.777612,0.70662,0.545042,0.614569,0.839456,,32.4,46.0,0.704348
1,non-linear svm (rbf),0.771642,0.739439,0.468617,0.570778,0.825633,,28.0,46.0,0.608696
2,logistic regression,0.779104,0.70706,0.555639,0.620318,0.840584,,32.0,46.0,0.695652
3,random forest,0.783582,0.715369,0.572454,0.631111,0.831329,,33.8,46.0,0.734783
4,luka linear svm,0.762687,0.7201,0.440418,0.546356,0.797451,15.0,20.8,46.0,0.452174
5,luka linear svm loss,0.762687,0.7201,0.440418,0.546356,0.800063,15.0,20.8,46.0,0.452174
6,luka logistic regression loss,0.767164,0.721151,0.463979,0.563612,0.807159,15.0,22.2,46.0,0.482609
7,random forest (rulefit),0.71791,0.741313,0.200667,0.314169,0.807826,,20.4,46.0,0.443478
8,RuleFitClassifier,0.762687,0.675771,0.540138,0.598208,0.803497,,28.6,46.0,0.621739


In [84]:
# Blank line plus caption, then the rich display of the per-fold std table.
print('\nstandard deviation')
result_df_std


standard deviation


Unnamed: 0,name,Accuracy,Precision,Recall,F1-score,Auc,len_U,Rules.violation,Rules.total,Violation ratio (= violation / total)
0,linear svm,0.042347,0.044474,0.041333,0.034771,0.045271,,3.209361,0.0,0.069769
1,non-linear svm (rbf),0.045577,0.102245,0.054765,0.055239,0.045847,,4.743416,0.0,0.103118
2,logistic regression,0.043706,0.069242,0.049377,0.041328,0.046465,,3.535534,0.0,0.076859
3,random forest,0.039135,0.071385,0.073789,0.03563,0.040772,,3.34664,0.0,0.072753
4,luka linear svm,0.037089,0.051562,0.023171,0.030995,0.054181,15.0,1.30384,0.0,0.028344
5,luka linear svm loss,0.037089,0.051562,0.023171,0.030995,0.053632,15.0,1.30384,0.0,0.028344
6,luka logistic regression loss,0.039979,0.055345,0.044435,0.040868,0.053523,15.0,2.588436,0.0,0.05627
7,random forest (rulefit),0.032271,0.113664,0.036309,0.051031,0.049412,,0.547723,0.0,0.011907
8,RuleFitClassifier,0.058142,0.06056,0.067079,0.049739,0.046009,,1.67332,0.0,0.036377


# rulefit treegenerator 修正 ver. これでも教師ありデータのサイズに比べると５分の１ぐらい

In [90]:
# Cross-validation folds to aggregate; each one is a sub-directory of the run.
directories = ['fold_0', 'fold_1', 'fold_2', 'fold_3', 'fold_4']

# Result files read from every fold directory, in this order.
json_files = [
    'result_1.json', 'result_2.json', 'result_3.json', 'result_4.json',
    'result_luka_1.json', 'result_luka_2.json', 'result_luka_3.json',
    'result_rulefit_1.json', 'result_rulefit_2.json',
]

output_dir_path = './../../outputs/pima_indian_diabetes_3/'

# Build one DataFrame per fold.
dfs = []
for dir_name in directories:
    files_path = [
        os.path.join(output_dir_path, dir_name, file_name)
        for file_name in json_files
    ]
    df = read_and_concat_json(files_path)
    dfs.append(df)

# Keep each fold's own row index so equal labels can be grouped across folds.
combined_df = pd.concat(dfs, ignore_index=False)

# Column order and the non-averaged columns are taken from the first fold.
first_fold_df = dfs[0]
original_col_order = first_fold_df.columns
tmp_name = first_fold_df['name']
tmp_len_U = first_fold_df['len_U']

# 'name' and 'len_U' are excluded from the statistics and re-attached below.
combined_df = combined_df.drop(columns=['name', 'len_U'])
per_row_groups = combined_df.groupby(combined_df.index)

# Mean over folds for every row label.
result_df_mean = per_row_groups.mean().assign(name=tmp_name, len_U=tmp_len_U)[original_col_order]

# Standard deviation over folds for every row label.
result_df_std = per_row_groups.std().assign(name=tmp_name, len_U=tmp_len_U)[original_col_order]

In [91]:
# Blank line plus caption, then the rich display of the per-fold mean table.
print('\nmean')
result_df_mean


mean


Unnamed: 0,name,Accuracy,Precision,Recall,F1-score,Auc,len_U,Rules.violation,Rules.total,Violation ratio (= violation / total)
0,linear svm,0.777612,0.70662,0.545042,0.614569,0.839456,,15.6,27.0,0.577778
1,non-linear svm (rbf),0.771642,0.739439,0.468617,0.570778,0.825633,,14.6,27.0,0.540741
2,logistic regression,0.779104,0.70706,0.555639,0.620318,0.840584,,15.0,27.0,0.555556
3,random forest,0.783582,0.715369,0.572454,0.631111,0.831329,,16.4,27.0,0.607407
4,luka linear svm,0.762687,0.7201,0.440418,0.546356,0.769145,15.0,13.8,27.0,0.511111
5,luka linear svm loss,0.762687,0.7201,0.440418,0.546356,0.790253,15.0,13.8,27.0,0.511111
6,luka logistic regression loss,0.768657,0.728866,0.459526,0.562795,0.80784,15.0,13.2,27.0,0.488889
7,random forest (rulefit),0.741791,0.627843,0.557706,0.582608,0.772322,,12.2,27.0,0.451852
8,RuleFitClassifier,0.783582,0.742431,0.521271,0.610252,0.815762,,11.6,27.0,0.42963


In [92]:
# Blank line plus caption, then the rich display of the per-fold std table.
print('\nstandard deviation')
result_df_std


standard deviation


Unnamed: 0,name,Accuracy,Precision,Recall,F1-score,Auc,len_U,Rules.violation,Rules.total,Violation ratio (= violation / total)
0,linear svm,0.042347,0.044474,0.041333,0.034771,0.045271,,2.50998,0.0,0.092962
1,non-linear svm (rbf),0.045577,0.102245,0.054765,0.055239,0.045847,,2.701851,0.0,0.100069
2,logistic regression,0.043706,0.069242,0.049377,0.041328,0.046465,,3.162278,0.0,0.117121
3,random forest,0.039135,0.071385,0.073789,0.03563,0.040772,,2.880972,0.0,0.106703
4,luka linear svm,0.037089,0.051562,0.023171,0.030995,0.049616,15.0,1.30384,0.0,0.04829
5,luka linear svm loss,0.037089,0.051562,0.023171,0.030995,0.058101,15.0,1.30384,0.0,0.04829
6,luka logistic regression loss,0.038052,0.052783,0.04017,0.038494,0.052367,15.0,1.095445,0.0,0.040572
7,random forest (rulefit),0.04995,0.094845,0.077244,0.037668,0.049073,,1.48324,0.0,0.054935
8,RuleFitClassifier,0.044776,0.048551,0.064326,0.044856,0.051082,,1.516575,0.0,0.056169


# len(U) = 100

In [93]:
# Cross-validation folds to aggregate; each one is a sub-directory of the run.
directories = ['fold_0', 'fold_1', 'fold_2', 'fold_3', 'fold_4']

# Result files read from every fold directory, in this order.
json_files = [
    'result_1.json', 'result_2.json', 'result_3.json', 'result_4.json',
    'result_luka_1.json', 'result_luka_2.json', 'result_luka_3.json',
    'result_rulefit_1.json', 'result_rulefit_2.json',
]

output_dir_path = './../../outputs/pima_indian_diabetes_4/'

# Build one DataFrame per fold.
dfs = []
for dir_name in directories:
    files_path = [
        os.path.join(output_dir_path, dir_name, file_name)
        for file_name in json_files
    ]
    df = read_and_concat_json(files_path)
    dfs.append(df)

# Keep each fold's own row index so equal labels can be grouped across folds.
combined_df = pd.concat(dfs, ignore_index=False)

# Column order and the non-averaged columns are taken from the first fold.
first_fold_df = dfs[0]
original_col_order = first_fold_df.columns
tmp_name = first_fold_df['name']
tmp_len_U = first_fold_df['len_U']

# 'name' and 'len_U' are excluded from the statistics and re-attached below.
combined_df = combined_df.drop(columns=['name', 'len_U'])
per_row_groups = combined_df.groupby(combined_df.index)

# Mean over folds for every row label.
result_df_mean = per_row_groups.mean().assign(name=tmp_name, len_U=tmp_len_U)[original_col_order]

# Standard deviation over folds for every row label.
result_df_std = per_row_groups.std().assign(name=tmp_name, len_U=tmp_len_U)[original_col_order]

In [94]:
# Blank line plus caption, then the rich display of the per-fold mean table.
print('\nmean')
result_df_mean


mean


Unnamed: 0,name,Accuracy,Precision,Recall,F1-score,Auc,len_U,Rules.violation,Rules.total,Violation ratio (= violation / total)
0,linear svm,0.777612,0.70662,0.545042,0.614569,0.839456,,15.6,27.0,0.577778
1,non-linear svm (rbf),0.771642,0.739439,0.468617,0.570778,0.825633,,14.6,27.0,0.540741
2,logistic regression,0.779104,0.70706,0.555639,0.620318,0.840584,,15.0,27.0,0.555556
3,random forest,0.783582,0.715369,0.572454,0.631111,0.831329,,16.4,27.0,0.607407
4,luka linear svm,0.762687,0.7201,0.440418,0.546356,0.801919,100.0,13.8,27.0,0.511111
5,luka linear svm loss,0.762687,0.7201,0.440418,0.546356,0.801396,100.0,13.8,27.0,0.511111
6,luka logistic regression loss,0.774627,0.728157,0.495878,0.587849,0.812874,100.0,13.0,27.0,0.481481
7,random forest (rulefit),0.741791,0.627843,0.557706,0.582608,0.772322,,12.2,27.0,0.451852
8,RuleFitClassifier,0.783582,0.742431,0.521271,0.610252,0.815762,,11.6,27.0,0.42963


In [95]:
# Blank line plus caption, then an explicit rich display of the std table.
print('\nstandard deviation')
display(result_df_std)


standard deviation


Unnamed: 0,name,Accuracy,Precision,Recall,F1-score,Auc,len_U,Rules.violation,Rules.total,Violation ratio (= violation / total)
0,linear svm,0.042347,0.044474,0.041333,0.034771,0.045271,,2.50998,0.0,0.092962
1,non-linear svm (rbf),0.045577,0.102245,0.054765,0.055239,0.045847,,2.701851,0.0,0.100069
2,logistic regression,0.043706,0.069242,0.049377,0.041328,0.046465,,3.162278,0.0,0.117121
3,random forest,0.039135,0.071385,0.073789,0.03563,0.040772,,2.880972,0.0,0.106703
4,luka linear svm,0.037089,0.051562,0.023171,0.030995,0.054558,100.0,1.30384,0.0,0.04829
5,luka linear svm loss,0.037089,0.051562,0.023171,0.030995,0.050741,100.0,1.30384,0.0,0.04829
6,luka logistic regression loss,0.045209,0.056586,0.063391,0.05061,0.049279,100.0,1.224745,0.0,0.045361
7,random forest (rulefit),0.04995,0.094845,0.077244,0.037668,0.049073,,1.48324,0.0,0.054935
8,RuleFitClassifier,0.044776,0.048551,0.064326,0.044856,0.051082,,1.516575,0.0,0.056169


# 矛盾したルールの除去　

In [5]:
# Cross-validation folds to aggregate; each one is a sub-directory of the run.
directories = ['fold_0', 'fold_1', 'fold_2', 'fold_3', 'fold_4']

# Result files read from every fold directory, in this order.
json_files = [
    'result_1.json', 'result_2.json', 'result_3.json', 'result_4.json',
    'result_luka_1.json', 'result_luka_2.json', 'result_luka_3.json',
    'result_rulefit_1.json', 'result_rulefit_2.json',
]

output_dir_path = './../../outputs/pima_indian_diabetes_5/'

# Build one DataFrame per fold.
dfs = []
for dir_name in directories:
    files_path = [
        os.path.join(output_dir_path, dir_name, file_name)
        for file_name in json_files
    ]
    df = read_and_concat_json(files_path)
    dfs.append(df)

# Keep each fold's own row index so equal labels can be grouped across folds.
combined_df = pd.concat(dfs, ignore_index=False)

# Column order and the non-averaged columns are taken from the first fold.
first_fold_df = dfs[0]
original_col_order = first_fold_df.columns
tmp_name = first_fold_df['name']
tmp_len_U = first_fold_df['len_U']

# 'name' and 'len_U' are excluded from the statistics and re-attached below.
combined_df = combined_df.drop(columns=['name', 'len_U'])
per_row_groups = combined_df.groupby(combined_df.index)

# Mean over folds for every row label.
result_df_mean = per_row_groups.mean().assign(name=tmp_name, len_U=tmp_len_U)[original_col_order]

# Standard deviation over folds for every row label.
result_df_std = per_row_groups.std().assign(name=tmp_name, len_U=tmp_len_U)[original_col_order]

# Show both summary tables, each preceded by a blank line and a caption.
print('\nmean')
display(result_df_mean)

print('\nstandard deviation')
display(result_df_std)


mean


Unnamed: 0,name,Accuracy,Precision,Recall,F1-score,Auc,len_U,Rules.violation,Rules.total,Violation ratio (= violation / total)
0,linear svm,0.777612,0.70662,0.545042,0.614569,0.839456,,13.8,24.0,0.575
1,non-linear svm (rbf),0.771642,0.739439,0.468617,0.570778,0.825633,,12.8,24.0,0.533333
2,logistic regression,0.779104,0.70706,0.555639,0.620318,0.840584,,13.0,24.0,0.541667
3,random forest,0.783582,0.715369,0.572454,0.631111,0.831329,,14.4,24.0,0.6
4,luka linear svm,0.762687,0.7201,0.440418,0.546356,0.799214,15.0,12.0,24.0,0.5
5,luka linear svm loss,0.762687,0.7201,0.440418,0.546356,0.797224,15.0,12.0,24.0,0.5
6,luka logistic regression loss,0.765672,0.72244,0.455468,0.557909,0.806728,15.0,12.2,24.0,0.508333
7,random forest (rulefit),0.741791,0.627843,0.557706,0.582608,0.772322,,11.6,24.0,0.483333
8,RuleFitClassifier,0.783582,0.742431,0.521271,0.610252,0.815827,,9.8,24.0,0.408333



standard deviation


Unnamed: 0,name,Accuracy,Precision,Recall,F1-score,Auc,len_U,Rules.violation,Rules.total,Violation ratio (= violation / total)
0,linear svm,0.042347,0.044474,0.041333,0.034771,0.045271,,2.774887,0.0,0.11562
1,non-linear svm (rbf),0.045577,0.102245,0.054765,0.055239,0.045847,,2.48998,0.0,0.103749
2,logistic regression,0.043706,0.069242,0.049377,0.041328,0.046465,,2.738613,0.0,0.114109
3,random forest,0.039135,0.071385,0.073789,0.03563,0.040772,,2.880972,0.0,0.120041
4,luka linear svm,0.037089,0.051562,0.023171,0.030995,0.054492,15.0,1.0,0.0,0.041667
5,luka linear svm loss,0.037089,0.051562,0.023171,0.030995,0.059547,15.0,1.0,0.0,0.041667
6,luka logistic regression loss,0.037906,0.050182,0.029853,0.028654,0.052321,15.0,1.30384,0.0,0.054327
7,random forest (rulefit),0.04995,0.094845,0.077244,0.037668,0.049073,,1.81659,0.0,0.075691
8,RuleFitClassifier,0.044776,0.048551,0.064326,0.044856,0.051854,,1.643168,0.0,0.068465
