In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os

import numpy as np
import pandas as pd

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

from rulefit import RuleFit, RuleFitClassifier

# RuleFit

In [3]:
# Load the discretized (one-hot encoded) diabetes data; the first CSV column is the index.
data = pd.read_csv("./data/diabetes_discretized.csv", index_col=0)

# Select the feature columns by label rather than by position, so that
# feature_names and the columns of X stay aligned wherever "Outcome" sits.
feature_names = data.columns.drop("Outcome").to_list()

X = np.array(data[feature_names])
y = np.array(data["Outcome"])

random_state = 42
test_size = 0.2

X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=test_size,
                                                    random_state=random_state)

rfmode = 'classify'
tree_generator = RandomForestClassifier(random_state=random_state)

# Hand RuleFit the seeded forest defined above, so the tree ensemble is tied
# to the same random_state as the train/test split.
rf = RuleFitClassifier(rfmode=rfmode,
                       tree_generator=tree_generator,
                       random_state=random_state)

rf.fit(X_train, y_train, feature_names=feature_names)
display(rf)

y_pred = rf.predict(X_test)

# Evaluate on the held-out split.
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

# Print the results
print("離散データ（one-hot encoding）")
print("---------------------------")
print()
print("Accuracy:", accuracy)
print()
print()
print("Confusion Matrix:\n", conf_matrix)
print()
print()
print("Classification Report:\n", class_report)

離散データ（one-hot encoding）
---------------------------

Accuracy: 0.7388059701492538


Confusion Matrix:
 [[72 18]
 [17 27]]


Classification Report:
               precision    recall  f1-score   support

           0       0.81      0.80      0.80        90
           1       0.60      0.61      0.61        44

    accuracy                           0.74       134
   macro avg       0.70      0.71      0.71       134
weighted avg       0.74      0.74      0.74       134



In [16]:
# Show the feature names (the data columns other than the target) for inspection.
feature_names

['Pregnancies_Low',
 'Pregnancies_Medium',
 'Pregnancies_High',
 'Glucose_Low',
 'Glucose_Medium',
 'Glucose_High',
 'BloodPressure_Low',
 'BloodPressure_Medium',
 'BloodPressure_High',
 'SkinThickness_Low',
 'SkinThickness_Medium',
 'SkinThickness_High',
 'BMI_Low',
 'BMI_Medium',
 'BMI_High',
 'DiabetesPedigreeFunction_Low',
 'DiabetesPedigreeFunction_Medium',
 'DiabetesPedigreeFunction_High',
 'Age_Low',
 'Age_Medium',
 'Age_High']

In [4]:
# Fetch the rules from the fitted model; exclude_zero_coef=True asks it to
# leave out rules whose coefficient is zero.
rules = rf.get_rules(exclude_zero_coef=True)

pd.set_option("display.max_colwidth", 999) # keep long rule strings from being truncated in the display
pd.set_option("display.max_rows", 999) # allow up to 999 rows to be displayed
rules

Unnamed: 0,rule,type,coef,support,importance
63,Glucose_Low <= 0.5 & Age_Low <= 0.5 & BMI_Low <= 0.5,rule,0.072857,0.211144,0.029735
126,Glucose_High <= 0.5 & DiabetesPedigreeFunction_Medium <= 0.5,rule,-0.187202,0.603499,0.091574
129,Pregnancies_High <= 0.5 & Age_Low > 0.5,rule,-0.063791,0.645533,0.030515
146,Glucose_High <= 0.5 & BMI_Low > 0.5,rule,-0.015909,0.29683,0.007268
172,DiabetesPedigreeFunction_High <= 0.5 & Glucose_Low > 0.5,rule,-1.055592,0.211765,0.431272
173,Glucose_High <= 0.5 & Age_Medium <= 0.5 & SkinThickness_Low <= 0.5 & Glucose_Low <= 0.5 & DiabetesPedigreeFunction_Medium <= 0.5 & BloodPressure_Low <= 0.5,rule,-0.088789,0.127536,0.029618
184,Glucose_High <= 0.5 & Age_Low <= 0.5 & Pregnancies_Medium > 0.5,rule,-0.092192,0.093567,0.026849
188,Age_Medium <= 0.5 & BMI_Low > 0.5 & BloodPressure_Medium <= 0.5,rule,-0.050446,0.134328,0.017202
258,BMI_Low <= 0.5 & Glucose_Low <= 0.5 & DiabetesPedigreeFunction_Low <= 0.5,rule,0.032057,0.242604,0.013741
300,Glucose_Low <= 0.5 & Pregnancies_High <= 0.5 & DiabetesPedigreeFunction_High <= 0.5 & BMI_High <= 0.5 & Age_Medium <= 0.5,rule,-0.049542,0.468208,0.024721


In [11]:
def create_KB(rules_df, conclusion_name='Outcome'):
    """Convert a table of rules into logic formulas expressed as token lists.

    Every usable row of ``rules_df`` becomes one formula of the shape
    ``literal ⊗ literal ⊗ ... → conclusion``:

    * The rule text is split on whitespace and its conditions are separated
      by the ``&`` token.
    * A condition whose second token is ``<`` or ``<=`` becomes the negated
      literal ``¬ feature``. Any other condition, including a bare feature
      name with no operator, becomes the positive literal ``feature``. The
      threshold value itself is ignored.
    * A positive ``coef`` concludes ``conclusion_name``; a negative ``coef``
      concludes ``¬ conclusion_name``.

    Rows whose coefficient is neither positive nor negative (zero or NaN), and
    rows whose rule text contains no condition, are skipped because no
    well-formed formula can be built from them.

    Args:
        rules_df: Table with a ``rule`` column (string) and a ``coef`` column
            (numeric), both supporting ``.to_list()``.
        conclusion_name: Name of the atom used on the right-hand side.

    Returns:
        list[list[str]]: One token list per formula, in row order.
    """
    KB = []
    for rule, coef in zip(rules_df['rule'].to_list(), rules_df['coef'].to_list()):
        # The sign of the coefficient decides the polarity of the conclusion.
        if coef > 0:
            conclusion = [conclusion_name]
        elif coef < 0:
            conclusion = ['¬', conclusion_name]
        else:
            # Zero / NaN: there is no conclusion to attach, so emit nothing.
            continue

        # Group the tokens into conditions, using '&' as the separator.
        conditions = []
        current = []
        for token in rule.split():
            if token == '&':
                if current:
                    conditions.append(current)
                current = []
            else:
                current.append(token)
        if current:
            conditions.append(current)

        if not conditions:
            continue

        # Turn each condition into a (possibly negated) literal joined by '⊗'.
        premise = []
        for condition in conditions:
            if premise:
                premise.append('⊗')
            if len(condition) > 1 and condition[1] in ('<', '<='):
                premise.append('¬')
            premise.append(condition[0])

        KB.append(premise + ['→'] + conclusion)

    return KB

In [12]:
# Convert the rules table into formulas and print one token list per line.
KB = create_KB(rules)

for formula in KB:
    print(formula)

['¬', 'Glucose_Low', '⊗', '¬', 'Age_Low', '⊗', '¬', 'BMI_Low', '→', 'Outcome']
['¬', 'Glucose_High', '⊗', '¬', 'DiabetesPedigreeFunction_Medium', '→', '¬', 'Outcome']
['¬', 'Pregnancies_High', '⊗', 'Age_Low', '→', '¬', 'Outcome']
['¬', 'Glucose_High', '⊗', 'BMI_Low', '→', '¬', 'Outcome']
['¬', 'DiabetesPedigreeFunction_High', '⊗', 'Glucose_Low', '→', '¬', 'Outcome']
['¬', 'Glucose_High', '⊗', '¬', 'Age_Medium', '⊗', '¬', 'SkinThickness_Low', '⊗', '¬', 'Glucose_Low', '⊗', '¬', 'DiabetesPedigreeFunction_Medium', '⊗', '¬', 'BloodPressure_Low', '→', '¬', 'Outcome']
['¬', 'Glucose_High', '⊗', '¬', 'Age_Low', '⊗', 'Pregnancies_Medium', '→', '¬', 'Outcome']
['¬', 'Age_Medium', '⊗', 'BMI_Low', '⊗', '¬', 'BloodPressure_Medium', '→', '¬', 'Outcome']
['¬', 'BMI_Low', '⊗', '¬', 'Glucose_Low', '⊗', '¬', 'DiabetesPedigreeFunction_Low', '→', 'Outcome']
['¬', 'Glucose_Low', '⊗', '¬', 'Pregnancies_High', '⊗', '¬', 'DiabetesPedigreeFunction_High', '⊗', '¬', 'BMI_High', '⊗', '¬', 'Age_Medium', '→', '¬', 

In [15]:
# Show the rules table again for reference.
rules

Unnamed: 0,rule,type,coef,support,importance
63,Glucose_Low <= 0.5 & Age_Low <= 0.5 & BMI_Low <= 0.5,rule,0.072857,0.211144,0.029735
126,Glucose_High <= 0.5 & DiabetesPedigreeFunction_Medium <= 0.5,rule,-0.187202,0.603499,0.091574
129,Pregnancies_High <= 0.5 & Age_Low > 0.5,rule,-0.063791,0.645533,0.030515
146,Glucose_High <= 0.5 & BMI_Low > 0.5,rule,-0.015909,0.29683,0.007268
172,DiabetesPedigreeFunction_High <= 0.5 & Glucose_Low > 0.5,rule,-1.055592,0.211765,0.431272
173,Glucose_High <= 0.5 & Age_Medium <= 0.5 & SkinThickness_Low <= 0.5 & Glucose_Low <= 0.5 & DiabetesPedigreeFunction_Medium <= 0.5 & BloodPressure_Low <= 0.5,rule,-0.088789,0.127536,0.029618
184,Glucose_High <= 0.5 & Age_Low <= 0.5 & Pregnancies_Medium > 0.5,rule,-0.092192,0.093567,0.026849
188,Age_Medium <= 0.5 & BMI_Low > 0.5 & BloodPressure_Medium <= 0.5,rule,-0.050446,0.134328,0.017202
258,BMI_Low <= 0.5 & Glucose_Low <= 0.5 & DiabetesPedigreeFunction_Low <= 0.5,rule,0.032057,0.242604,0.013741
300,Glucose_Low <= 0.5 & Pregnancies_High <= 0.5 & DiabetesPedigreeFunction_High <= 0.5 & BMI_High <= 0.5 & Age_Medium <= 0.5,rule,-0.049542,0.468208,0.024721


In [39]:
class ArrangeRules:
    """Build a knowledge base of logic formulas from a table of rules.

    Two kinds of formulas are produced, each as a list of string tokens:

    * extracted rules, ``literal ⊗ literal ⊗ ... → conclusion``, one per
      usable row of ``rules_df``;
    * additional rules, ``name ⊕ name ⊕ ...``, one per group of feature names
      that share the text before their last underscore.

    Attributes:
        rules_df: Table with a ``rule`` and a ``coef`` column.
        feature_names: Feature names used for the additional rules, or None.
        conclusion_name: Atom used on the right-hand side of extracted rules.
        rules_extracted: Result of the last ``extract_rules_from_df`` call.
        rules_additional: Result of the last ``generate_rules_from_df`` call.
        KB: Result of the last ``construct_KB`` call.
    """

    def __init__(self, rules_df, feature_names=None, conclusion_name=None):
        """Store the inputs; nothing is computed until a method is called.

        Args:
            rules_df: Table with a ``rule`` column (string) and a ``coef``
                column (numeric), both supporting ``.to_list()``.
            feature_names: Optional sequence of feature names.
            conclusion_name: Name of the conclusion atom; a missing or empty
                value falls back to ``'Outcome'``.
        """
        self.rules_df = rules_df
        self.feature_names = feature_names
        self.conclusion_name = conclusion_name or 'Outcome'

        self.rules_extracted = None
        self.rules_additional = None
        self.KB = None

    @staticmethod
    def _split_conditions(rule):
        """Split a rule string into conditions, each a list of whitespace-separated tokens."""
        conditions = []
        current = []
        for token in rule.split():
            if token == '&':
                # '&' separates conditions; ignore empty ones (leading/doubled '&').
                if current:
                    conditions.append(current)
                current = []
            else:
                current.append(token)
        if current:
            conditions.append(current)
        return conditions

    def extract_rules_from_df(self):
        """Translate every usable row of ``rules_df`` into a formula.

        A condition whose second token is ``<`` or ``<=`` becomes ``¬ feature``;
        any other condition (including a bare feature name) becomes
        ``feature``. The threshold value is ignored. A positive ``coef``
        concludes ``conclusion_name``; a negative one concludes its negation.

        Rows whose coefficient is neither positive nor negative (zero or NaN),
        and rows without any condition, are skipped because no well-formed
        formula can be built from them.

        Returns:
            list[list[str]]: The formulas, also stored in ``rules_extracted``.
        """
        rules_list = self.rules_df['rule'].to_list()
        coef_list = self.rules_df['coef'].to_list()

        self.rules_extracted = []
        for rule, coef in zip(rules_list, coef_list):
            # The sign of the coefficient decides the polarity of the conclusion.
            if coef > 0:
                conclusion = [self.conclusion_name]
            elif coef < 0:
                conclusion = ['¬', self.conclusion_name]
            else:
                continue

            conditions = self._split_conditions(rule)
            if not conditions:
                continue

            premise = []
            for condition in conditions:
                if premise:
                    premise.append('⊗')
                if len(condition) > 1 and condition[1] in ('<', '<='):
                    premise.append('¬')
                premise.append(condition[0])

            self.rules_extracted.append(premise + ['→'] + conclusion)

        return self.rules_extracted

    def generate_rules_from_df(self):
        """Create one ``⊕``-joined formula per group of related feature names.

        Feature names are grouped by the text before their last underscore
        (``'Age_Low'`` and ``'Age_High'`` both belong to ``'Age'``); groups
        and their members keep the order of ``feature_names``.

        Returns:
            list[list[str]]: The formulas, also stored in
            ``rules_additional``; an empty list when no feature names are set.

        Raises:
            ValueError: If a feature name contains no underscore.
        """
        # len() rather than truthiness, so array-like inputs are accepted too.
        if self.feature_names is None or len(self.feature_names) == 0:
            self.rules_additional = []
            return self.rules_additional

        groups = {}
        for name in self.feature_names:
            prefix, separator, _ = name.rpartition('_')
            if not separator:
                raise ValueError(
                    "feature name %r has no '_' to separate the group from its level" % (name,)
                )
            groups.setdefault(prefix, []).append(name)

        self.rules_additional = []
        for members in groups.values():
            formula = []
            for member in members:
                if formula:
                    formula.append('⊕')
                formula.append(member)
            self.rules_additional.append(formula)

        return self.rules_additional

    def construct_KB(self):
        """Build the knowledge base: extracted rules followed by additional rules.

        Returns:
            list[list[str]]: The combined formulas, also stored in ``KB``.
        """
        rules_extracted = self.extract_rules_from_df()
        rules_additional = self.generate_rules_from_df()

        self.KB = rules_extracted + rules_additional
        return self.KB

    def save_KB_as_txt(self, file_name):
        """Write the knowledge base to ``file_name``, one formula per line.

        Tokens of a formula are joined by single spaces. If the knowledge
        base has not been built yet it is built first, so the target file is
        never truncated without something to write. An empty knowledge base
        produces an empty file.

        Args:
            file_name: Path of the text file to (over)write.
        """
        if self.KB is None:
            self.construct_KB()

        lines = [' '.join(formula) for formula in self.KB]

        # The logic symbols are non-ASCII, so fix the encoding instead of
        # relying on the platform default.
        with open(file_name, 'w', encoding='utf-8') as file:
            for line in lines:
                file.write("%s\n" % line)
            
            

In [40]:
# Build the knowledge base from the rules table and the feature names,
# then export it as a text file for the downstream project.
rule_processor = ArrangeRules(rules,
                              feature_names=feature_names,
                              conclusion_name="Outcome")

KB = rule_processor.construct_KB()

# NOTE(review): this relative path is resolved against the kernel's working
# directory; confirm it points at the intended project before running.
project_dir_path = './../../../lukasiewicz_1/'
file_name = os.path.join(project_dir_path, "rules.txt")
rule_processor.save_KB_as_txt(file_name)

In [41]:
# Print the knowledge base, one formula (token list) per line.
for rule in KB:
    print(rule)

['¬', 'Glucose_Low', '⊗', '¬', 'Age_Low', '⊗', '¬', 'BMI_Low', '→', 'Outcome']
['¬', 'Glucose_High', '⊗', '¬', 'DiabetesPedigreeFunction_Medium', '→', '¬', 'Outcome']
['¬', 'Pregnancies_High', '⊗', 'Age_Low', '→', '¬', 'Outcome']
['¬', 'Glucose_High', '⊗', 'BMI_Low', '→', '¬', 'Outcome']
['¬', 'DiabetesPedigreeFunction_High', '⊗', 'Glucose_Low', '→', '¬', 'Outcome']
['¬', 'Glucose_High', '⊗', '¬', 'Age_Medium', '⊗', '¬', 'SkinThickness_Low', '⊗', '¬', 'Glucose_Low', '⊗', '¬', 'DiabetesPedigreeFunction_Medium', '⊗', '¬', 'BloodPressure_Low', '→', '¬', 'Outcome']
['¬', 'Glucose_High', '⊗', '¬', 'Age_Low', '⊗', 'Pregnancies_Medium', '→', '¬', 'Outcome']
['¬', 'Age_Medium', '⊗', 'BMI_Low', '⊗', '¬', 'BloodPressure_Medium', '→', '¬', 'Outcome']
['¬', 'BMI_Low', '⊗', '¬', 'Glucose_Low', '⊗', '¬', 'DiabetesPedigreeFunction_Low', '→', 'Outcome']
['¬', 'Glucose_Low', '⊗', '¬', 'Pregnancies_High', '⊗', '¬', 'DiabetesPedigreeFunction_High', '⊗', '¬', 'BMI_High', '⊗', '¬', 'Age_Medium', '→', '¬', 

In [38]:
# Show the knowledge base as a nested list of tokens.
KB

[['¬',
  'Glucose_Low',
  '⊗',
  '¬',
  'Age_Low',
  '⊗',
  '¬',
  'BMI_Low',
  '→',
  'Outcome'],
 ['¬',
  'Glucose_High',
  '⊗',
  '¬',
  'DiabetesPedigreeFunction_Medium',
  '→',
  '¬',
  'Outcome'],
 ['¬', 'Pregnancies_High', '⊗', 'Age_Low', '→', '¬', 'Outcome'],
 ['¬', 'Glucose_High', '⊗', 'BMI_Low', '→', '¬', 'Outcome'],
 ['¬',
  'DiabetesPedigreeFunction_High',
  '⊗',
  'Glucose_Low',
  '→',
  '¬',
  'Outcome'],
 ['¬',
  'Glucose_High',
  '⊗',
  '¬',
  'Age_Medium',
  '⊗',
  '¬',
  'SkinThickness_Low',
  '⊗',
  '¬',
  'Glucose_Low',
  '⊗',
  '¬',
  'DiabetesPedigreeFunction_Medium',
  '⊗',
  '¬',
  'BloodPressure_Low',
  '→',
  '¬',
  'Outcome'],
 ['¬',
  'Glucose_High',
  '⊗',
  '¬',
  'Age_Low',
  '⊗',
  'Pregnancies_Medium',
  '→',
  '¬',
  'Outcome'],
 ['¬',
  'Age_Medium',
  '⊗',
  'BMI_Low',
  '⊗',
  '¬',
  'BloodPressure_Medium',
  '→',
  '¬',
  'Outcome'],
 ['¬',
  'BMI_Low',
  '⊗',
  '¬',
  'Glucose_Low',
  '⊗',
  '¬',
  'DiabetesPedigreeFunction_Low',
  '→',
  'Outcome

In [36]:
# List the entries of the export target directory.
os.listdir(project_dir_path)

['notebooks',
 'src',
 'README.MD',
 'log.md',
 'myenv',
 'inputs',
 'images',
 'docs',
 '.git',
 'requirements.txt',
 '.gitignore']

In [28]:
# Scratch check: concatenating a list with an empty list yields the same elements.
tmp = [1, 2, 3]
a = []

tmp + a

[1, 2, 3]

In [26]:
# Scratch check: both an empty dict and None are falsy, so each `elif not ...`
# branch below is the one that runs.
a = {}
b = None

if a:
    print(a)
elif not a:
    print("-------------")

if b:
    print("-------------")
elif not b:
    print(b)

-------------
None


# max_rules=50 としたものも採用して比較してみる

In [16]:
# Load the discretized (one-hot encoded) diabetes data; the first CSV column is the index.
data = pd.read_csv("./data/diabetes_discretized.csv", index_col=0)

# Select the feature columns by label rather than by position, so that
# feature_names and the columns of X stay aligned wherever "Outcome" sits.
feature_names = data.columns.drop("Outcome").to_list()

X = np.array(data[feature_names])
y = np.array(data["Outcome"])

random_state = 42
test_size = 0.2

X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=test_size,
                                                    random_state=random_state)

rfmode = 'classify'
tree_generator = RandomForestClassifier(random_state=random_state)

# Same setup as the default run, but with max_rules=50. Hand RuleFit the
# seeded forest defined above, so the tree ensemble is tied to the same
# random_state as the train/test split.
rf = RuleFitClassifier(max_rules=50,
                       rfmode=rfmode,
                       tree_generator=tree_generator,
                       random_state=random_state)

rf.fit(X_train, y_train, feature_names=feature_names)
display(rf)

y_pred = rf.predict(X_test)

# Evaluate on the held-out split.
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

# Print the results
print("離散データ（one-hot encoding）")
print("---------------------------")
print()
print("Accuracy:", accuracy)
print()
print()
print("Confusion Matrix:\n", conf_matrix)
print()
print()
print("Classification Report:\n", class_report)

離散データ（one-hot encoding）
---------------------------

Accuracy: 0.7761194029850746


Confusion Matrix:
 [[73 17]
 [13 31]]


Classification Report:
               precision    recall  f1-score   support

           0       0.85      0.81      0.83        90
           1       0.65      0.70      0.67        44

    accuracy                           0.78       134
   macro avg       0.75      0.76      0.75       134
weighted avg       0.78      0.78      0.78       134



In [17]:
# Fetch the rules of the max_rules=50 model; exclude_zero_coef=True asks it to
# leave out rules whose coefficient is zero.
rules = rf.get_rules(exclude_zero_coef=True)
rules

Unnamed: 0,rule,type,coef,support,importance
1,Pregnancies_Medium,linear,-0.481035,1.0,0.207774
2,Pregnancies_High,linear,0.136026,1.0,0.040943
3,Glucose_Low,linear,-0.187261,1.0,0.076873
5,Glucose_High,linear,0.181535,1.0,0.071525
7,BloodPressure_Medium,linear,0.124705,1.0,0.059211
8,BloodPressure_High,linear,-0.205998,1.0,0.077339
11,SkinThickness_High,linear,-0.186036,1.0,0.053121
12,BMI_Low,linear,-0.09169,1.0,0.043059
13,BMI_Medium,linear,0.174153,1.0,0.086727
15,DiabetesPedigreeFunction_Low,linear,-0.367591,1.0,0.17665
