In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

from src.rulefit import RuleFit, RuleFitClassifier

# RuleFit

__train_test_split__ の際に分割されたデータのインデックスがほしい

In [15]:
import os

# Load the discretized diabetes table; the first CSV column is used as the row index.
data = pd.read_csv("./data/diabetes_discretized.csv", index_col=0)

X = data.drop(["Outcome"], axis=1)
y = data["Outcome"]

# Take the names from the feature frame itself so they always match X's columns,
# instead of relying on "Outcome" being the first column of the table.
feature_names = X.columns.to_list()

random_state = 42
test_size = 0.2

X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=test_size,
                                                    random_state=random_state)

# Capture the row labels of each split while X_train / X_test are still
# DataFrames; they are replaced by plain arrays at the end of this cell.
train_index = X_train.index
test_index = X_test.index

# The project root defaults to the original location but can be overridden with
# the LUKASIEWICZ_PROJECT_DIR environment variable when running elsewhere.
project_dir_path = os.environ.get("LUKASIEWICZ_PROJECT_DIR",
                                  "/home/onoue/ws/lukasiewicz_1/")
data_dir_path = "inputs/pima_indian_diabetes"
save_dir_path = os.path.join(project_dir_path, data_dir_path)

# to_csv does not create missing directories, so make sure the target exists.
os.makedirs(save_dir_path, exist_ok=True)

file_path_1 = os.path.join(save_dir_path, 'train_index.csv')
pd.DataFrame(train_index).to_csv(file_path_1)

file_path_2 = os.path.join(save_dir_path, 'test_index.csv')
pd.DataFrame(test_index).to_csv(file_path_2)

# Convert the splits to NumPy arrays.
X_train = X_train.values
X_test = X_test.values
y_train = y_train.values
y_test = y_test.values


In [18]:
# Load the discretized diabetes table; the first CSV column is used as the row index.
data = pd.read_csv("./data/diabetes_discretized.csv", index_col=0)

# Build the feature frame once and take both the names and the matrix from it,
# so feature_names always lines up with the columns of X (a positional slice of
# data.columns is only correct while "Outcome" is the first column).
features_df = data.drop(["Outcome"], axis=1)
feature_names = features_df.columns.to_list()

X = np.array(features_df)
y = np.array(data["Outcome"])

random_state = 42
test_size = 0.2

X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=test_size,
                                                    random_state=random_state)

# Create the SVM model (linear kernel)
svm_linear = SVC(kernel='linear')

# Train the model
svm_linear.fit(X_train, y_train)

# Predict on the test data
y_pred_linear = svm_linear.predict(X_test)

# Compute the accuracy
accuracy_linear = accuracy_score(y_test, y_pred_linear)

# Compute the confusion matrix
cm_linear = confusion_matrix(y_test, y_pred_linear)

# Build the classification report
report_linear = classification_report(y_test, y_pred_linear)

# Print the results
print("SVM (Linear Kernel) Accuracy:", accuracy_linear)
print()
print()
print("SVM (Linear Kernel) Confusion Matrix:\n", cm_linear)
print()
print()
print("SVM (Linear Kernel) Classification Report:\n", report_linear)

SVM (Linear Kernel) Accuracy: 0.7761194029850746


SVM (Linear Kernel) Confusion Matrix:
 [[82  8]
 [22 22]]


SVM (Linear Kernel) Classification Report:
               precision    recall  f1-score   support

           0       0.79      0.91      0.85        90
           1       0.73      0.50      0.59        44

    accuracy                           0.78       134
   macro avg       0.76      0.71      0.72       134
weighted avg       0.77      0.78      0.76       134



In [19]:
# Fit an RBF-kernel SVM on the current training split (X_train / y_train come
# from the cell that performed the train/test split).
svm_rbf = SVC(kernel='rbf').fit(X_train, y_train)

# Predictions for the held-out split.
y_pred_rbf = svm_rbf.predict(X_test)

# Evaluation: accuracy, confusion matrix and per-class report.
accuracy_rbf = accuracy_score(y_test, y_pred_rbf)
cm_rbf = confusion_matrix(y_test, y_pred_rbf)
report_rbf = classification_report(y_test, y_pred_rbf)

# Print each result; the first two are followed by two blank lines.
print("SVM (RBF Kernel) Accuracy:", accuracy_rbf, end="\n\n\n")
print("SVM (RBF Kernel) Confusion Matrix:\n", cm_rbf, end="\n\n\n")
print("SVM (RBF Kernel) Classification Report:\n", report_rbf)

SVM (RBF Kernel) Accuracy: 0.7835820895522388


SVM (RBF Kernel) Confusion Matrix:
 [[79 11]
 [18 26]]


SVM (RBF Kernel) Classification Report:
               precision    recall  f1-score   support

           0       0.81      0.88      0.84        90
           1       0.70      0.59      0.64        44

    accuracy                           0.78       134
   macro avg       0.76      0.73      0.74       134
weighted avg       0.78      0.78      0.78       134



In [5]:
# Load the discretized diabetes table; the first CSV column is used as the row index.
data = pd.read_csv("./data/diabetes_discretized.csv", index_col=0)

# Build the feature frame once and take both the names and the matrix from it,
# so the names passed to RuleFit always line up with the columns of X
# (a positional slice of data.columns is only correct while "Outcome" is the
# first column).
features_df = data.drop(["Outcome"], axis=1)
feature_names = features_df.columns.to_list()

X = np.array(features_df)
y = np.array(data["Outcome"])

random_state = 42
test_size = 0.2

X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=test_size,
                                                    random_state=random_state)

rfmode = 'classify'
tree_generator = RandomForestClassifier(random_state=random_state)

rf = RuleFitClassifier(rfmode=rfmode,
                       tree_generator=tree_generator,
                       random_state=random_state)

rf.fit(X_train, y_train, feature_names=feature_names)

# Show the fitted estimator once. The model is only fitted in this cell;
# no test-set metrics are computed here.
display(rf)

RuleFitClassifier(random_state=42,
                  tree_generator=RandomForestClassifier(max_leaf_nodes=5,
                                                        n_estimators=552,
                                                        random_state=593))


In [6]:
# Inspect the tree generator held by the fitted RuleFitClassifier.
rf.tree_generator

In [20]:
# Show the list of feature names.
feature_names

['Pregnancies_Low',
 'Pregnancies_Medium',
 'Pregnancies_High',
 'Glucose_Low',
 'Glucose_Medium',
 'Glucose_High',
 'BloodPressure_Low',
 'BloodPressure_Medium',
 'BloodPressure_High',
 'SkinThickness_Low',
 'SkinThickness_Medium',
 'SkinThickness_High',
 'BMI_Low',
 'BMI_Medium',
 'BMI_High',
 'DiabetesPedigreeFunction_Low',
 'DiabetesPedigreeFunction_Medium',
 'DiabetesPedigreeFunction_High',
 'Age_Low',
 'Age_Medium',
 'Age_High']

In [21]:
# Raise the pandas display limits so long rule strings are not truncated
# and the rows of the table are not elided.
pd.set_option("display.max_rows", 999)
pd.set_option("display.max_colwidth", 999)

# Fetch the rule table via get_rules(exclude_zero_coef=True) and display it.
rules_df = rf.get_rules(exclude_zero_coef=True)
rules_df

Unnamed: 0,rule,type,coef,support,importance
63,Glucose_Low <= 0.5 & Age_Low <= 0.5 & BMI_Low <= 0.5,rule,0.072451,0.211144,0.029569
126,Glucose_High <= 0.5 & DiabetesPedigreeFunction_Medium <= 0.5,rule,-0.184762,0.603499,0.09038
129,Pregnancies_High <= 0.5 & Age_Low > 0.5,rule,-0.024991,0.645533,0.011955
146,Glucose_High <= 0.5 & BMI_Low > 0.5,rule,-0.032622,0.29683,0.014904
173,DiabetesPedigreeFunction_High <= 0.5 & Glucose_Low > 0.5,rule,-1.053859,0.211765,0.430564
174,Glucose_High <= 0.5 & Age_Medium <= 0.5 & SkinThickness_Low <= 0.5 & Glucose_Low <= 0.5 & DiabetesPedigreeFunction_Medium <= 0.5 & BloodPressure_Low <= 0.5,rule,-0.084519,0.127536,0.028193
185,Glucose_High <= 0.5 & Age_Low <= 0.5 & Pregnancies_Medium > 0.5,rule,-0.101664,0.093567,0.029607
189,Age_Medium <= 0.5 & BMI_Low > 0.5 & BloodPressure_Medium <= 0.5,rule,-0.058031,0.134328,0.019789
259,BMI_Low <= 0.5 & Glucose_Low <= 0.5 & DiabetesPedigreeFunction_Low <= 0.5,rule,0.033293,0.242604,0.014271
301,Glucose_Low <= 0.5 & Pregnancies_High <= 0.5 & DiabetesPedigreeFunction_High <= 0.5 & BMI_High <= 0.5 & Age_Medium <= 0.5,rule,-0.04683,0.468208,0.023368


In [22]:
# src に入れたい

class ArrangeRules:
    """Turn a table of RuleFit rules into a symbolic knowledge base (KB).

    Every rule is represented as a list of string tokens, for example
    ``['¬', 'A_Low', '⊗', 'B_High', '→', 'Outcome']``:

    * the conditions of a rule are joined with ``'⊗'``;
    * a condition whose comparison operator is ``<`` or ``<=`` is prefixed
      with ``'¬'``; any other condition (or a bare feature name) is kept
      un-negated;
    * ``'→'`` is followed by ``conclusion_name`` when the coefficient is
      positive and by ``'¬', conclusion_name`` when it is negative.

    In addition, features that share a prefix (the text before the last
    ``'_'``) are grouped into one rule joined with ``'⊕'``.

    Parameters
    ----------
    rules_df : DataFrame-like
        Table with a ``'rule'`` column (conditions separated by ``&``) and a
        ``'coef'`` column; both are read with ``rules_df[col].to_list()``.
    feature_names : sequence of str, optional
        Feature names used to build the ``'⊕'`` rules. When missing or
        empty, no such rules are generated.
    conclusion_name : str, optional
        Name used on the right-hand side of ``'→'``. Defaults to
        ``'Outcome'`` when not given (or empty).

    Attributes
    ----------
    rules_extracted : list of list of str, or None
        Result of the last ``extract_rules_from_df`` call.
    rules_additional : list of list of str, or None
        Result of the last ``generate_rules_from_df`` call.
    KB : list of list of str, or None
        Result of the last ``construct_KB`` call.
    """

    def __init__(self, rules_df, feature_names=None, conclusion_name=None):
        self.rules_df = rules_df
        self.feature_names = feature_names
        self.conclusion_name = conclusion_name if conclusion_name else 'Outcome'

        self.rules_extracted = None
        self.rules_additional = None
        self.KB = None

    @staticmethod
    def _split_conditions(rule):
        """Split a rule string into its non-empty, '&'-separated token groups."""
        conditions = []
        current = []
        # str.split() without an argument collapses repeated whitespace, so a
        # doubled space cannot shift the operator out of position 1.
        for token in rule.split():
            if token == '&':
                conditions.append(current)
                current = []
            else:
                current.append(token)
        conditions.append(current)
        # Drop empty groups (e.g. produced by a leading or trailing '&').
        return [condition for condition in conditions if condition]

    def extract_rules_from_df(self):
        """Convert every row of ``rules_df`` into a tokenized implication.

        Rows whose coefficient is neither positive nor negative (zero or NaN)
        and rows without any condition are skipped, because they would yield
        an implication with a missing side.

        Returns
        -------
        list of list of str
            The extracted rules, also stored in ``self.rules_extracted``.
        """
        rules_list = self.rules_df['rule'].to_list()
        coef_list = self.rules_df['coef'].to_list()

        self.rules_extracted = []
        for rule, coef in zip(rules_list, coef_list):
            # The sign of the coefficient decides the polarity of the conclusion.
            if coef > 0:
                conclusion = [self.conclusion_name]
            elif coef < 0:
                conclusion = ['¬', self.conclusion_name]
            else:
                continue

            conditions = self._split_conditions(rule)
            if not conditions:
                continue

            rule_new = []
            for position, condition in enumerate(conditions):
                if position > 0:
                    rule_new.append('⊗')
                # condition looks like [feature, operator, threshold] or [feature].
                if len(condition) > 1 and condition[1] in ('<', '<='):
                    rule_new.append('¬')
                rule_new.append(condition[0])

            rule_new.append('→')
            rule_new.extend(conclusion)
            self.rules_extracted.append(rule_new)

        return self.rules_extracted

    def generate_rules_from_df(self):
        """Build one ``'⊕'``-joined rule per group of related features.

        Features are grouped by the part of their name before the last
        ``'_'``; a name without ``'_'`` is grouped under the name itself.
        Groups keep the order in which their first member appears.

        Returns
        -------
        list of list of str
            The generated rules (empty when no feature names were given),
            also stored in ``self.rules_additional``.
        """
        self.rules_additional = []
        if self.feature_names is None or len(self.feature_names) == 0:
            return self.rules_additional

        groups = {}
        for name in self.feature_names:
            prefix = name.rsplit('_', 1)[0]
            groups.setdefault(prefix, []).append(name)

        for members in groups.values():
            # Interleave the tokens directly instead of joining and re-splitting
            # on spaces, so every feature name stays a single token.
            rule = []
            for position, name in enumerate(members):
                if position > 0:
                    rule.append('⊕')
                rule.append(name)
            self.rules_additional.append(rule)

        return self.rules_additional

    def construct_KB(self):
        """Build the KB: extracted rules followed by the generated ones.

        Returns
        -------
        list of list of str
            The knowledge base, also stored in ``self.KB``.
        """
        rules_extracted = self.extract_rules_from_df()
        rules_additional = self.generate_rules_from_df()

        self.KB = rules_extracted + rules_additional
        return self.KB

    def save_KB_as_txt(self, file_name):
        """Write the KB to ``file_name``, one space-joined rule per line.

        If the KB has not been constructed yet, ``construct_KB`` is called
        first. An empty KB produces an empty file. The file is written as
        UTF-8 because the rule symbols are not ASCII.
        """
        if self.KB is None:
            self.construct_KB()

        with open(file_name, 'w', encoding='utf-8') as file:
            for rule in self.KB:
                file.write("%s\n" % ' '.join(rule))

In [23]:
# Wrap the current rule table and feature names in an ArrangeRules helper.
rule_processor = ArrangeRules(
    rules_df,
    feature_names=feature_names,
    conclusion_name="Outcome",
)

# Build the knowledge base, then write it to a text file.
KB = rule_processor.construct_KB()
rule_processor.save_KB_as_txt("./data/rules.txt")

In [24]:
# Print every rule of the knowledge base on its own line.
for rule in KB:
    print(rule)

['¬', 'Glucose_Low', '⊗', '¬', 'Age_Low', '⊗', '¬', 'BMI_Low', '→', 'Outcome']
['¬', 'Glucose_High', '⊗', '¬', 'DiabetesPedigreeFunction_Medium', '→', '¬', 'Outcome']
['¬', 'Pregnancies_High', '⊗', 'Age_Low', '→', '¬', 'Outcome']
['¬', 'Glucose_High', '⊗', 'BMI_Low', '→', '¬', 'Outcome']
['¬', 'DiabetesPedigreeFunction_High', '⊗', 'Glucose_Low', '→', '¬', 'Outcome']
['¬', 'Glucose_High', '⊗', '¬', 'Age_Medium', '⊗', '¬', 'SkinThickness_Low', '⊗', '¬', 'Glucose_Low', '⊗', '¬', 'DiabetesPedigreeFunction_Medium', '⊗', '¬', 'BloodPressure_Low', '→', '¬', 'Outcome']
['¬', 'Glucose_High', '⊗', '¬', 'Age_Low', '⊗', 'Pregnancies_Medium', '→', '¬', 'Outcome']
['¬', 'Age_Medium', '⊗', 'BMI_Low', '⊗', '¬', 'BloodPressure_Medium', '→', '¬', 'Outcome']
['¬', 'BMI_Low', '⊗', '¬', 'Glucose_Low', '⊗', '¬', 'DiabetesPedigreeFunction_Low', '→', 'Outcome']
['¬', 'Glucose_Low', '⊗', '¬', 'Pregnancies_High', '⊗', '¬', 'DiabetesPedigreeFunction_High', '⊗', '¬', 'BMI_High', '⊗', '¬', 'Age_Medium', '→', '¬', 

# max_rules=50 としたものも採用して比較してみる

In [14]:
# Load the discretized diabetes table; the first CSV column is used as the row index.
data = pd.read_csv("./data/diabetes_discretized.csv", index_col=0)

# Build the feature frame once and take both the names and the matrix from it,
# so the names passed to RuleFit always line up with the columns of X
# (a positional slice of data.columns is only correct while "Outcome" is the
# first column).
features_df = data.drop(["Outcome"], axis=1)
feature_names = features_df.columns.to_list()

X = np.array(features_df)
y = np.array(data["Outcome"])

random_state = 42
test_size = 0.2

X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=test_size,
                                                    random_state=random_state)

rfmode = 'classify'
tree_generator = RandomForestClassifier(random_state=random_state)

# Pass the seeded generator built above rather than a fresh, unseeded
# RandomForestClassifier(), so the forest never depends on an unset seed.
# NOTE(review): RuleFitClassifier may override the generator's random_state
# itself - confirm in src.rulefit.
rf = RuleFitClassifier(max_rules=50,
                       rfmode=rfmode,
                       tree_generator=tree_generator,
                       random_state=random_state)

rf.fit(X_train, y_train, feature_names=feature_names)
display(rf)

# Evaluate on the held-out split.
y_pred = rf.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

# Print the results
print("離散データ（one-hot encoding）")
print("---------------------------")
print()
print("Accuracy:", accuracy)
print()
print()
print("Confusion Matrix:\n", conf_matrix)
print()
print()
print("Classification Report:\n", class_report)

離散データ（one-hot encoding）
---------------------------

Accuracy: 0.7761194029850746


Confusion Matrix:
 [[73 17]
 [13 31]]


Classification Report:
               precision    recall  f1-score   support

           0       0.85      0.81      0.83        90
           1       0.65      0.70      0.67        44

    accuracy                           0.78       134
   macro avg       0.75      0.76      0.75       134
weighted avg       0.78      0.78      0.78       134



In [16]:
# Fetch the rule table via get_rules(exclude_zero_coef=True) and display it.
rules_df = rf.get_rules(exclude_zero_coef=True)
rules_df

Unnamed: 0,rule,type,coef,support,importance
1,Pregnancies_Medium,linear,-0.481035,1.0,0.207774
2,Pregnancies_High,linear,0.136026,1.0,0.040943
3,Glucose_Low,linear,-0.187261,1.0,0.076873
5,Glucose_High,linear,0.181535,1.0,0.071525
7,BloodPressure_Medium,linear,0.124705,1.0,0.059211
8,BloodPressure_High,linear,-0.205998,1.0,0.077339
11,SkinThickness_High,linear,-0.186036,1.0,0.053121
12,BMI_Low,linear,-0.09169,1.0,0.043059
13,BMI_Medium,linear,0.174153,1.0,0.086727
15,DiabetesPedigreeFunction_Low,linear,-0.367591,1.0,0.17665


In [18]:
# Wrap the current rule table and feature names in an ArrangeRules helper.
rule_processor = ArrangeRules(
    rules_df,
    feature_names=feature_names,
    conclusion_name="Outcome",
)

# Build the knowledge base, then write it to a text file.
KB = rule_processor.construct_KB()
rule_processor.save_KB_as_txt("./data/rules_max_50.txt")

# Print every rule of the knowledge base on its own line.
for rule in KB:
    print(rule)

['Pregnancies_Medium', '→', '¬', 'Outcome']
['Pregnancies_High', '→', 'Outcome']
['Glucose_Low', '→', '¬', 'Outcome']
['Glucose_High', '→', 'Outcome']
['BloodPressure_Medium', '→', 'Outcome']
['BloodPressure_High', '→', '¬', 'Outcome']
['SkinThickness_High', '→', '¬', 'Outcome']
['BMI_Low', '→', '¬', 'Outcome']
['BMI_Medium', '→', 'Outcome']
['DiabetesPedigreeFunction_Low', '→', '¬', 'Outcome']
['DiabetesPedigreeFunction_High', '→', 'Outcome']
['¬', 'Glucose_High', '→', '¬', 'Outcome']
['Glucose_High', '→', 'Outcome']
['¬', 'BMI_High', '⊗', '¬', 'Pregnancies_Low', '⊗', '¬', 'BMI_Medium', '→', 'Outcome']
['¬', 'BMI_High', '⊗', 'Pregnancies_Low', '⊗', 'Age_Low', '⊗', '¬', 'Glucose_Low', '→', '¬', 'Outcome']
['BMI_High', '⊗', '¬', 'Glucose_Low', '⊗', 'BloodPressure_High', '→', 'Outcome']
['BMI_High', '⊗', 'Glucose_Low', '→', '¬', 'Outcome']
['¬', 'Glucose_High', '⊗', '¬', 'Age_Medium', '⊗', 'BloodPressure_Low', '→', '¬', 'Outcome']
['Age_Low', '⊗', 'BMI_Low', '→', '¬', 'Outcome']
['BMI_Lo