In [1]:
# Enable IPython's autoreload extension in mode 2, so that imported modules are
# re-imported before each cell execution and edits to project code are picked
# up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report


import os
import sys

# Directory that must be on sys.path for the project-local import below to
# resolve. The location is machine specific, so it can be overridden through
# the LUKASIEWICZ_PROJECT_DIR environment variable; the previously hard-coded
# path stays as the default, so existing setups behave as before.
project_dir_path = os.environ.get('LUKASIEWICZ_PROJECT_DIR',
                                  '/Users/keisukeonoue/ws/lukasiewicz_1/')

# Re-running this cell must not keep appending the same entry to sys.path.
if project_dir_path not in sys.path:
    sys.path.append(project_dir_path)
from src.rulefit import RuleFit, RuleFitClassifier

# RuleFit

__train_test_split__ の際に分割されたデータのインデックスがほしい

In [3]:
# Load the discretised diabetes table; the first CSV column becomes the row index.
data = pd.read_csv("./data/diabetes_discretized.csv", index_col=0)

# Every column after the first one is kept as a feature name.
# NOTE(review): presumably "Outcome" is the first column, so that this list
# lines up with the columns of X below - confirm against the CSV.
feature_names = data.columns[1:].to_list()

# Features are all columns except the target; the target is "Outcome".
X, y = data.drop(columns=["Outcome"]), data["Outcome"]

random_state, test_size = 42, 0.2

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=test_size, random_state=random_state
)

# Remember which rows ended up in each split while the pandas index is still
# available (it is lost once the frames are turned into bare arrays below).
train_index, test_index = X_train.index, X_test.index

# Disabled: write the split indices to CSV files.
# import os
# project_dir_path = "/home/onoue/ws/lukasiewicz_1/"
# data_dir_path = "inputs/pima_indian_diabetes"
# save_dir_path = os.path.join(project_dir_path, data_dir_path)
#
# file_path_1 = os.path.join(save_dir_path, 'train_index.csv')
# pd.DataFrame(train_index).to_csv(file_path_1)
#
# file_path_2 = os.path.join(save_dir_path, 'test_index.csv')
# pd.DataFrame(test_index).to_csv(file_path_2)

# Continue with plain NumPy arrays.
X_train, X_test = X_train.values, X_test.values
y_train, y_test = y_train.values, y_test.values


In [4]:
# Load the discretised diabetes table; the first CSV column becomes the row index.
data = pd.read_csv("./data/diabetes_discretized.csv", index_col=0)

# Every column after the first one is kept as a feature name.
# NOTE(review): presumably "Outcome" is the first column, so that this list
# lines up with the columns of X below - confirm against the CSV.
feature_names = data.columns[1:].to_list()

# Plain NumPy arrays: all columns except "Outcome" as features, "Outcome" as target.
X = np.array(data.drop(columns=["Outcome"]))
y = np.array(data["Outcome"])

random_state, test_size = 42, 0.2

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=test_size, random_state=random_state
)

# RuleFit classifier whose rules are generated by a seeded random forest.
rfmode = 'classify'
tree_generator = RandomForestClassifier(random_state=random_state)
rf = RuleFitClassifier(
    rfmode=rfmode,
    tree_generator=tree_generator,
    random_state=random_state,
    exp_rand_tree_size=False,
)
rf.fit(X_train, y_train, feature_names=feature_names)

display(rf)
print(rf)

# Evaluate on the held-out split.
y_pred = rf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

# Report: the blank lines between sections are produced through `end`.
print("離散データ（one-hot encoding）")
print("---------------------------", end="\n\n")
print("Accuracy:", accuracy, end="\n\n\n")
print("Confusion Matrix:\n", conf_matrix, end="\n\n\n")
print("Classification Report:\n", class_report)

RuleFitClassifier(exp_rand_tree_size=False, random_state=42,
                  tree_generator=RandomForestClassifier(random_state=42))
離散データ（one-hot encoding）
---------------------------

Accuracy: 0.7985074626865671


Confusion Matrix:
 [[77 13]
 [14 30]]


Classification Report:
               precision    recall  f1-score   support

           0       0.85      0.86      0.85        90
           1       0.70      0.68      0.69        44

    accuracy                           0.80       134
   macro avg       0.77      0.77      0.77       134
weighted avg       0.80      0.80      0.80       134



In [5]:
# Rule table of the fitted model, requested with exclude_zero_coef=True.
rules_df = rf.get_rules(exclude_zero_coef=True)

# Raise pandas' display limits so that long rule strings and long tables are
# not abbreviated in the rendered output.
for display_option in ("display.max_colwidth", "display.max_rows"):
    pd.set_option(display_option, 999)

rules_df

Unnamed: 0,rule,type,coef,support,importance
0,Pregnancies_Low,linear,-0.151962,1.0,0.072427
2,Pregnancies_High,linear,0.305828,1.0,0.092052
3,Glucose_Low,linear,-1.478275,1.0,0.606849
5,Glucose_High,linear,1.206659,1.0,0.475425
12,BMI_Low,linear,-1.041453,1.0,0.489083
13,BMI_Medium,linear,0.020003,1.0,0.009961
15,DiabetesPedigreeFunction_Low,linear,-0.394431,1.0,0.189549
18,Age_Low,linear,-0.297883,1.0,0.138185
19,Age_Medium,linear,0.244737,1.0,0.104064
228,Pregnancies_Medium <= 0.5 & Glucose_High > 0.5 & BMI_Low <= 0.5 & DiabetesPedigreeFunction_Low <= 0.5 & BMI_Medium > 0.5,rule,0.099318,0.029412,0.01678


In [11]:
# TODO: move this class into src/

class ArrangeRules:
    """Convert a rules table into tokenised logical formulas (a knowledge base).

    The table must provide a ``rule`` column (strings such as
    ``"A_High > 0.5 & B_Low <= 0.5"`` or a bare feature name) and a ``coef``
    column (numbers). Each formula is a list of string tokens using the
    symbols ``¬`` (negation), ``⊗`` (conjunction), ``⊕`` (disjunction) and
    ``→`` (implication).

    Parameters
    ----------
    rules_df : table with ``rule`` and ``coef`` columns whose values are read
        through ``rules_df[column].to_list()``.
    feature_names : optional sequence of str
        Feature names; names sharing the text before their last ``_`` are
        grouped into one ``⊕`` formula by ``generate_rules_from_df``.
    conclusion_name : optional str
        Name placed on the right-hand side of every extracted rule.
        Falls back to ``'Outcome'`` when empty or ``None``.

    Attributes
    ----------
    rules_extracted : list of token lists from ``extract_rules_from_df`` (``None`` before it ran).
    rules_additional : list of token lists from ``generate_rules_from_df`` (``None`` before it ran
        or when no feature names were given).
    KB : concatenation of both lists, set by ``construct_KB`` (``None`` before it ran).
    """

    def __init__(self, rules_df, feature_names=None, conclusion_name=None):
        self.rules_df = rules_df
        self.feature_names = feature_names
        self.conclusion_name = conclusion_name or 'Outcome'

        self.rules_extracted = None
        self.rules_additional = None
        self.KB = None

    @staticmethod
    def _split_conditions(rule):
        """Split a rule string into its non-empty conditions, each a list of tokens."""
        conditions = []
        current = []
        # split() without an argument tolerates repeated or surrounding
        # whitespace, which would otherwise yield empty tokens.
        for token in rule.split():
            if token == '&':
                conditions.append(current)
                current = []
            else:
                current.append(token)
        conditions.append(current)
        # Drop empty conditions (e.g. a stray leading or trailing '&').
        return [condition for condition in conditions if condition]

    def extract_rules_from_df(self):
        """Translate every row of ``rules_df`` into an implication formula.

        A condition whose operator is ``<`` or ``<=`` becomes the negated
        feature; any other condition (including a bare feature name) becomes
        the feature itself. Conditions are joined with ``⊗``. The conclusion
        is ``conclusion_name`` for a positive coefficient and its negation for
        a negative one.

        Rows whose coefficient is neither positive nor negative (zero or NaN)
        and rows without any condition are skipped, because no well-formed
        implication can be built from them.

        Returns
        -------
        list of list of str
            The extracted formulas, also stored in ``self.rules_extracted``.
        """
        rules = self.rules_df['rule'].to_list()
        coefs = self.rules_df['coef'].to_list()

        self.rules_extracted = []
        for rule, coef in zip(rules, coefs):
            if coef > 0:
                conclusion = [self.conclusion_name]
            elif coef < 0:
                conclusion = ['¬', self.conclusion_name]
            else:
                # The sign of the coefficient decides the conclusion; without
                # a sign there is nothing to conclude.
                continue

            formula = []
            for condition in self._split_conditions(rule):
                if formula:
                    formula.append('⊗')
                if len(condition) > 1 and condition[1] in ('<', '<='):
                    formula.append('¬')
                formula.append(condition[0])

            if not formula:
                continue

            formula.append('→')
            formula.extend(conclusion)
            self.rules_extracted.append(formula)

        return self.rules_extracted

    def generate_rules_from_df(self):
        """Build one ``⊕`` formula per group of related feature names.

        Feature names are grouped by the text before their last ``_``
        (``Age_Low`` and ``Age_High`` share the group ``Age``); a name without
        ``_`` forms a group of its own. Groups keep first-appearance order.

        Returns
        -------
        list of list of str
            The generated formulas, also stored in ``self.rules_additional``;
            an empty list when no feature names were given.
        """
        # Explicit None / length check: truthiness is ambiguous for NumPy
        # arrays and pandas Index objects.
        if self.feature_names is None or len(self.feature_names) == 0:
            return []

        groups = {}
        for name in self.feature_names:
            parts = name.rsplit('_', 1)
            # The boolean keeps an underscore-free name such as "Age" from
            # being merged into the group of "Age_Low".
            key = (parts[0], True) if len(parts) == 2 else (name, False)
            groups.setdefault(key, []).append(name)

        self.rules_additional = []
        for members in groups.values():
            formula = []
            for member in members:
                if formula:
                    formula.append('⊕')
                formula.append(member)
            self.rules_additional.append(formula)

        return self.rules_additional

    def construct_KB(self):
        """Build the knowledge base: extracted rules followed by generated rules.

        Returns
        -------
        list of list of str
            The knowledge base, also stored in ``self.KB``.
        """
        rules_extracted = self.extract_rules_from_df()
        rules_additional = self.generate_rules_from_df()

        self.KB = rules_extracted + rules_additional
        return self.KB

    def save_KB_as_txt(self, file_name):
        """Write the knowledge base to ``file_name``, one space-joined formula per line.

        The knowledge base is built first when ``construct_KB`` has not been
        called yet. An empty knowledge base produces an empty file.
        """
        if self.KB is None:
            self.construct_KB()

        lines = [' '.join(rule) for rule in self.KB]

        # The logical symbols are non-ASCII, so do not depend on the
        # platform's default encoding.
        with open(file_name, 'w', encoding='utf-8') as file:
            for line in lines:
                file.write("%s\n" % line)

In [12]:
# Turn the rule table into a knowledge base and write it to a text file.
rule_processor = ArrangeRules(
    rules_df,
    feature_names=feature_names,
    conclusion_name="Outcome",
)
KB = rule_processor.construct_KB()
rule_processor.save_KB_as_txt("./data/rules_2.txt")

In [13]:
# Show every formula of the knowledge base on its own line
# (nothing is printed for an empty knowledge base).
if KB:
    print(*KB, sep="\n")

['Pregnancies_Low', '→', '¬', 'Outcome']
['Pregnancies_High', '→', 'Outcome']
['Glucose_Low', '→', '¬', 'Outcome']
['Glucose_High', '→', 'Outcome']
['BMI_Low', '→', '¬', 'Outcome']
['BMI_Medium', '→', 'Outcome']
['DiabetesPedigreeFunction_Low', '→', '¬', 'Outcome']
['Age_Low', '→', '¬', 'Outcome']
['Age_Medium', '→', 'Outcome']
['¬', 'Pregnancies_Medium', '⊗', 'Glucose_High', '⊗', '¬', 'BMI_Low', '⊗', '¬', 'DiabetesPedigreeFunction_Low', '⊗', 'BMI_Medium', '→', 'Outcome']
['¬', 'Glucose_Low', '⊗', '¬', 'DiabetesPedigreeFunction_Low', '⊗', 'Age_Medium', '⊗', 'BloodPressure_Medium', '⊗', '¬', 'BMI_Low', '⊗', 'Glucose_Medium', '→', 'Outcome']
['¬', 'Glucose_High', '⊗', '¬', 'Glucose_Low', '⊗', '¬', 'Pregnancies_High', '⊗', 'DiabetesPedigreeFunction_Low', '⊗', '¬', 'BloodPressure_Medium', '⊗', '¬', 'BloodPressure_Low', '⊗', '¬', 'Age_Low', '⊗', '¬', 'SkinThickness_Medium', '→', '¬', 'Outcome']
['¬', 'BMI_Low', '⊗', 'Glucose_Medium', '⊗', '¬', 'Pregnancies_High', '⊗', '¬', 'SkinThickness_Lo

# max_rules=50 としたものも採用して比較してみる

In [14]:
# Second experiment: same data and split as before, but with max_rules=50.

# Load the discretised diabetes table; the first CSV column becomes the row index.
data = pd.read_csv("./data/diabetes_discretized.csv", index_col=0)

# NOTE(review): presumably "Outcome" is the first column, so that this list
# lines up with the columns of X below - confirm against the CSV.
feature_names = data.columns.to_list()[1:]

X = np.array(data.drop(["Outcome"], axis=1))
y = np.array(data["Outcome"])

random_state = 42
test_size = 0.2

X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=test_size,
                                                    random_state=random_state)

rfmode = 'classify'
tree_generator = RandomForestClassifier(random_state=random_state)

# Pass the seeded generator defined above. Previously a fresh, unseeded
# RandomForestClassifier() was passed here, leaving `tree_generator` unused and
# the forest's randomness outside the control of `random_state`.
rf = RuleFitClassifier(max_rules=50,
                       rfmode=rfmode,
                       tree_generator=tree_generator,
                       random_state=random_state)

rf.fit(X_train, y_train, feature_names=feature_names)
display(rf)

# Evaluate on the held-out split.
y_pred = rf.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

# Print the results
print("離散データ（one-hot encoding）")
print("---------------------------")
print()
print("Accuracy:", accuracy)
print()
print()
print("Confusion Matrix:\n", conf_matrix)
print()
print()
print("Classification Report:\n", class_report)

離散データ（one-hot encoding）
---------------------------

Accuracy: 0.7761194029850746


Confusion Matrix:
 [[73 17]
 [13 31]]


Classification Report:
               precision    recall  f1-score   support

           0       0.85      0.81      0.83        90
           1       0.65      0.70      0.67        44

    accuracy                           0.78       134
   macro avg       0.75      0.76      0.75       134
weighted avg       0.78      0.78      0.78       134



In [16]:
# Fetch the rule table of the current `rf` model (exclude_zero_coef=True
# presumably drops rules whose coefficient is zero - confirm in src.rulefit)
# and show it as the cell's output.
rules_df = rf.get_rules(exclude_zero_coef=True)
rules_df

Unnamed: 0,rule,type,coef,support,importance
1,Pregnancies_Medium,linear,-0.481035,1.0,0.207774
2,Pregnancies_High,linear,0.136026,1.0,0.040943
3,Glucose_Low,linear,-0.187261,1.0,0.076873
5,Glucose_High,linear,0.181535,1.0,0.071525
7,BloodPressure_Medium,linear,0.124705,1.0,0.059211
8,BloodPressure_High,linear,-0.205998,1.0,0.077339
11,SkinThickness_High,linear,-0.186036,1.0,0.053121
12,BMI_Low,linear,-0.09169,1.0,0.043059
13,BMI_Medium,linear,0.174153,1.0,0.086727
15,DiabetesPedigreeFunction_Low,linear,-0.367591,1.0,0.17665


In [18]:
# Turn the rule table into a knowledge base and write it to a text file.
rule_processor = ArrangeRules(
    rules_df,
    feature_names=feature_names,
    conclusion_name="Outcome",
)
KB = rule_processor.construct_KB()
rule_processor.save_KB_as_txt("./data/rules_max_50.txt")

# Show every formula on its own line (nothing is printed for an empty knowledge base).
if KB:
    print(*KB, sep="\n")

['Pregnancies_Medium', '→', '¬', 'Outcome']
['Pregnancies_High', '→', 'Outcome']
['Glucose_Low', '→', '¬', 'Outcome']
['Glucose_High', '→', 'Outcome']
['BloodPressure_Medium', '→', 'Outcome']
['BloodPressure_High', '→', '¬', 'Outcome']
['SkinThickness_High', '→', '¬', 'Outcome']
['BMI_Low', '→', '¬', 'Outcome']
['BMI_Medium', '→', 'Outcome']
['DiabetesPedigreeFunction_Low', '→', '¬', 'Outcome']
['DiabetesPedigreeFunction_High', '→', 'Outcome']
['¬', 'Glucose_High', '→', '¬', 'Outcome']
['Glucose_High', '→', 'Outcome']
['¬', 'BMI_High', '⊗', '¬', 'Pregnancies_Low', '⊗', '¬', 'BMI_Medium', '→', 'Outcome']
['¬', 'BMI_High', '⊗', 'Pregnancies_Low', '⊗', 'Age_Low', '⊗', '¬', 'Glucose_Low', '→', '¬', 'Outcome']
['BMI_High', '⊗', '¬', 'Glucose_Low', '⊗', 'BloodPressure_High', '→', 'Outcome']
['BMI_High', '⊗', 'Glucose_Low', '→', '¬', 'Outcome']
['¬', 'Glucose_High', '⊗', '¬', 'Age_Medium', '⊗', 'BloodPressure_Low', '→', '¬', 'Outcome']
['Age_Low', '⊗', 'BMI_Low', '→', '¬', 'Outcome']
['BMI_Lo