In [1]:
%load_ext autoreload
%autoreload 2

In [3]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

from rulefit import RuleFit

In [16]:
data = pd.read_csv("./data/diabetes_discretized.csv", index_col=0)

# feature_names = data.columns.to_list()

X = data.drop(["Outcome"], axis=1)
y = data["Outcome"]

random_state = 42
test_size = 0.2

X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=test_size,
                                                    random_state=random_state)


from sklearn.ensemble import RandomForestClassifier

rf_model = RandomForestClassifier(random_state=random_state)
rf_model.fit(X_train, y_train)

y_pred = rf_model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

# Print the results
print("離散データ（one-hot encoding）")
print("---------------------------")
print()
print("Accuracy:", accuracy)
print()
print()
print("Confusion Matrix:\n", conf_matrix)
print()
print()
print("Classification Report:\n", class_report)

離散データ（one-hot encoding）
---------------------------

Accuracy: 0.746268656716418


Confusion Matrix:
 [[72 18]
 [16 28]]


Classification Report:
               precision    recall  f1-score   support

           0       0.82      0.80      0.81        90
           1       0.61      0.64      0.62        44

    accuracy                           0.75       134
   macro avg       0.71      0.72      0.72       134
weighted avg       0.75      0.75      0.75       134



In [34]:
import shap

explainer = shap.TreeExplainer(rf_model, data=X_train)
shap_values = explainer.shap_values(X_train)



In [17]:
from sklearn.svm import SVC

svm_model = SVC(kernel="linear", random_state=random_state)
svm_model.fit(X_train, y_train)

y_pred = svm_model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

# Print the results
print("離散データ（one-hot encoding）")
print("---------------------------")
print()
print("Accuracy:", accuracy)
print()
print()
print("Confusion Matrix:\n", conf_matrix)
print()
print()
print("Classification Report:\n", class_report)

離散データ（one-hot encoding）
---------------------------

Accuracy: 0.7761194029850746


Confusion Matrix:
 [[82  8]
 [22 22]]


Classification Report:
               precision    recall  f1-score   support

           0       0.79      0.91      0.85        90
           1       0.73      0.50      0.59        44

    accuracy                           0.78       134
   macro avg       0.76      0.71      0.72       134
weighted avg       0.77      0.78      0.76       134



In [18]:
from sklearn.svm import SVC

svm_model = SVC(kernel="rbf", random_state=random_state)
svm_model.fit(X_train, y_train)

y_pred = svm_model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

# Print the results
print("離散データ（one-hot encoding）")
print("---------------------------")
print()
print("Accuracy:", accuracy)
print()
print()
print("Confusion Matrix:\n", conf_matrix)
print()
print()
print("Classification Report:\n", class_report)

離散データ（one-hot encoding）
---------------------------

Accuracy: 0.7835820895522388


Confusion Matrix:
 [[79 11]
 [18 26]]


Classification Report:
               precision    recall  f1-score   support

           0       0.81      0.88      0.84        90
           1       0.70      0.59      0.64        44

    accuracy                           0.78       134
   macro avg       0.76      0.73      0.74       134
weighted avg       0.78      0.78      0.78       134



# RuleFit

In [93]:
data = pd.read_csv("./data/diabetes_discretized.csv", index_col=0)

feature_names = data.columns.to_list()[1:]

X = np.array(data.drop(["Outcome"], axis=1))
y = np.array(data["Outcome"])

random_state = 42
test_size = 0.2

X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=test_size,
                                                    random_state=random_state)

In [94]:
feature_names

['Pregnancies_Low',
 'Pregnancies_Medium',
 'Pregnancies_High',
 'Glucose_Low',
 'Glucose_Medium',
 'Glucose_High',
 'BloodPressure_Low',
 'BloodPressure_Medium',
 'BloodPressure_High',
 'SkinThickness_Low',
 'SkinThickness_Medium',
 'SkinThickness_High',
 'BMI_Low',
 'BMI_Medium',
 'BMI_High',
 'DiabetesPedigreeFunction_Low',
 'DiabetesPedigreeFunction_Medium',
 'DiabetesPedigreeFunction_High',
 'Age_Low',
 'Age_Medium',
 'Age_High']

In [95]:
from rulefit import RuleFitClassifier
from sklearn.ensemble import RandomForestClassifier


rfmode = 'classify'
tree_generator = RandomForestClassifier(random_state=random_state)

rf = RuleFitClassifier(rfmode=rfmode,
                       tree_generator=RandomForestClassifier(),
                       random_state=random_state)

rf.fit(X_train, y_train, feature_names=feature_names)

display(rf)

y_pred = rf.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

# Print the results
print("離散データ（one-hot encoding）")
print("---------------------------")
print()
print("Accuracy:", accuracy)
print()
print()
print("Confusion Matrix:\n", conf_matrix)
print()
print()
print("Classification Report:\n", class_report)

離散データ（one-hot encoding）
---------------------------

Accuracy: 0.7388059701492538


Confusion Matrix:
 [[72 18]
 [17 27]]


Classification Report:
               precision    recall  f1-score   support

           0       0.81      0.80      0.80        90
           1       0.60      0.61      0.61        44

    accuracy                           0.74       134
   macro avg       0.70      0.71      0.71       134
weighted avg       0.74      0.74      0.74       134



In [96]:
rules = rf.get_rules(exclude_zero_coef=True)

In [97]:
pd.set_option("display.max_colwidth", 999) #ruleが長すぎて表示が省略される場合用
pd.set_option("display.max_rows", 999) 
rules

Unnamed: 0,rule,type,coef,support,importance
63,Glucose_Low <= 0.5 & Age_Low <= 0.5 & BMI_Low <= 0.5,rule,0.074327,0.211144,0.030334
126,Glucose_High <= 0.5 & DiabetesPedigreeFunction_Medium <= 0.5,rule,-0.186842,0.603499,0.091398
129,Pregnancies_High <= 0.5 & Age_Low > 0.5,rule,-0.052975,0.645533,0.025341
146,Glucose_High <= 0.5 & BMI_Low > 0.5,rule,-0.094062,0.29683,0.042973
173,DiabetesPedigreeFunction_High <= 0.5 & Glucose_Low > 0.5,rule,-1.055865,0.211765,0.431383
174,Glucose_High <= 0.5 & Age_Medium <= 0.5 & SkinThickness_Low <= 0.5 & Glucose_Low <= 0.5 & DiabetesPedigreeFunction_Medium <= 0.5 & BloodPressure_Low <= 0.5,rule,-0.089146,0.127536,0.029737
185,Glucose_High <= 0.5 & Age_Low <= 0.5 & Pregnancies_Medium > 0.5,rule,-0.099373,0.093567,0.02894
189,Age_Medium <= 0.5 & BMI_Low > 0.5 & BloodPressure_Medium <= 0.5,rule,-0.050964,0.134328,0.017379
259,BMI_Low <= 0.5 & Glucose_Low <= 0.5 & DiabetesPedigreeFunction_Low <= 0.5,rule,0.030736,0.242604,0.013175
301,Glucose_Low <= 0.5 & Pregnancies_High <= 0.5 & DiabetesPedigreeFunction_High <= 0.5 & BMI_High <= 0.5 & Age_Medium <= 0.5,rule,-0.050065,0.468208,0.024982


In [98]:
rules['rule'].to_list()

['Glucose_Low <= 0.5 & Age_Low <= 0.5 & BMI_Low <= 0.5',
 'Glucose_High <= 0.5 & DiabetesPedigreeFunction_Medium <= 0.5',
 'Pregnancies_High <= 0.5 & Age_Low > 0.5',
 'Glucose_High <= 0.5 & BMI_Low > 0.5',
 'DiabetesPedigreeFunction_High <= 0.5 & Glucose_Low > 0.5',
 'Glucose_High <= 0.5 & Age_Medium <= 0.5 & SkinThickness_Low <= 0.5 & Glucose_Low <= 0.5 & DiabetesPedigreeFunction_Medium <= 0.5 & BloodPressure_Low <= 0.5',
 'Glucose_High <= 0.5 & Age_Low <= 0.5 & Pregnancies_Medium > 0.5',
 'Age_Medium <= 0.5 & BMI_Low > 0.5 & BloodPressure_Medium <= 0.5',
 'BMI_Low <= 0.5 & Glucose_Low <= 0.5 & DiabetesPedigreeFunction_Low <= 0.5',
 'Glucose_Low <= 0.5 & Pregnancies_High <= 0.5 & DiabetesPedigreeFunction_High <= 0.5 & BMI_High <= 0.5 & Age_Medium <= 0.5',
 'DiabetesPedigreeFunction_Low <= 0.5 & BMI_Medium > 0.5',
 'Glucose_High <= 0.5 & BloodPressure_Medium > 0.5 & Age_Medium <= 0.5',
 'Age_Low <= 0.5 & BloodPressure_Low <= 0.5 & SkinThickness_Medium > 0.5',
 'Glucose_High <= 0.5 & Pr

In [99]:
rules['coef'].to_list()

[0.07432722672293267,
 -0.18684215948425859,
 -0.052974770117989955,
 -0.09406229566792239,
 -1.0558648104402797,
 -0.08914640574339584,
 -0.0993733116859669,
 -0.05096406706986821,
 0.030736386193210674,
 -0.05006538633615083,
 0.27365126739682,
 -0.030813612958937246,
 0.09045800299717027,
 -0.2791335078579396,
 -0.021045513076233577,
 0.8090469488003349,
 -0.07418099769686855,
 -0.056767355647081894,
 0.11906372196269616,
 0.3705618840863954,
 0.0794033818877824,
 0.0240249388830329,
 -0.19836962168805894,
 -0.1681239593980275,
 0.6967254651076645,
 -0.06858991635501352,
 -0.032943579544958815,
 0.03547211148941087,
 -0.41571863133509973,
 -0.7299594159634366,
 -0.06190759866656954,
 -0.1221896449369362,
 0.04249586885041769,
 -0.044515460772968266,
 -0.09897582432101674,
 0.02132803536052671,
 -0.7396936654470444]

In [100]:
# rules_from_RuleFit = ['Glucose_Low <= 0.5 & Age_Low <= 0.5 & BMI_Low <= 0.5',
#                       'Glucose_High <= 0.5 & DiabetesPedigreeFunction_Medium <= 0.5',
#                       'Pregnancies_High <= 0.5 & Age_Low > 0.5',
#                       'Glucose_High <= 0.5 & BMI_Low > 0.5',
#                       'DiabetesPedigreeFunction_High <= 0.5 & Glucose_Low > 0.5']
rules_from_RuleFit = rules['rule'].to_list()
coefs_from_RuleFit = rules['coef'].to_list()

rules_list = [rule.split(' ') for rule in rules_from_RuleFit]

rules_list_new = []

for rule, coef in zip(rules_list, coefs_from_RuleFit):

    # '&' を目印にして複数のリストに分割する
    sublists = []
    current_sublist = []
    for item in rule:
        if item != '&':
            current_sublist.append(item)
        else:
            sublists.append(current_sublist)
            current_sublist = []

    # ループ終了後に最後のサブリストを追加
    sublists.append(current_sublist)

    rule_new = []
    cnt = 0
    for sublist in sublists:
        if len(sublist) <= 1:
            rule_new.append(sublist[0])
        else:
            if sublist[1] in ['<', '<=']:
                rule_new.append('¬')
                rule_new.append(sublist[0])
            else:
                rule_new.append(sublist[0])
        
        cnt +=1
        if cnt < len(sublists):
            rule_new.append('⊗')
        else:
            rule_new.append('→')

            # coef == 0 の rule は除外されているため
            if coef > 0:
                rule_new.append('Outcome')
            elif coef < 0:
                rule_new.append('¬')
                rule_new.append('Outcome')

    
    rules_list_new.append(rule_new)



for rule in rules_list_new:
    print(rule)

['¬', 'Glucose_Low', '⊗', '¬', 'Age_Low', '⊗', '¬', 'BMI_Low', '→', 'Outcome']
['¬', 'Glucose_High', '⊗', '¬', 'DiabetesPedigreeFunction_Medium', '→', '¬', 'Outcome']
['¬', 'Pregnancies_High', '⊗', 'Age_Low', '→', '¬', 'Outcome']
['¬', 'Glucose_High', '⊗', 'BMI_Low', '→', '¬', 'Outcome']
['¬', 'DiabetesPedigreeFunction_High', '⊗', 'Glucose_Low', '→', '¬', 'Outcome']
['¬', 'Glucose_High', '⊗', '¬', 'Age_Medium', '⊗', '¬', 'SkinThickness_Low', '⊗', '¬', 'Glucose_Low', '⊗', '¬', 'DiabetesPedigreeFunction_Medium', '⊗', '¬', 'BloodPressure_Low', '→', '¬', 'Outcome']
['¬', 'Glucose_High', '⊗', '¬', 'Age_Low', '⊗', 'Pregnancies_Medium', '→', '¬', 'Outcome']
['¬', 'Age_Medium', '⊗', 'BMI_Low', '⊗', '¬', 'BloodPressure_Medium', '→', '¬', 'Outcome']
['¬', 'BMI_Low', '⊗', '¬', 'Glucose_Low', '⊗', '¬', 'DiabetesPedigreeFunction_Low', '→', 'Outcome']
['¬', 'Glucose_Low', '⊗', '¬', 'Pregnancies_High', '⊗', '¬', 'DiabetesPedigreeFunction_High', '⊗', '¬', 'BMI_High', '⊗', '¬', 'Age_Medium', '→', '¬', 

In [101]:
rules_generated = []

for rule in rules_list_new[:10]:
    rules_generated.append(' '.join(rule))

rules_generated

['¬ Glucose_Low ⊗ ¬ Age_Low ⊗ ¬ BMI_Low → Outcome',
 '¬ Glucose_High ⊗ ¬ DiabetesPedigreeFunction_Medium → ¬ Outcome',
 '¬ Pregnancies_High ⊗ Age_Low → ¬ Outcome',
 '¬ Glucose_High ⊗ BMI_Low → ¬ Outcome',
 '¬ DiabetesPedigreeFunction_High ⊗ Glucose_Low → ¬ Outcome',
 '¬ Glucose_High ⊗ ¬ Age_Medium ⊗ ¬ SkinThickness_Low ⊗ ¬ Glucose_Low ⊗ ¬ DiabetesPedigreeFunction_Medium ⊗ ¬ BloodPressure_Low → ¬ Outcome',
 '¬ Glucose_High ⊗ ¬ Age_Low ⊗ Pregnancies_Medium → ¬ Outcome',
 '¬ Age_Medium ⊗ BMI_Low ⊗ ¬ BloodPressure_Medium → ¬ Outcome',
 '¬ BMI_Low ⊗ ¬ Glucose_Low ⊗ ¬ DiabetesPedigreeFunction_Low → Outcome',
 '¬ Glucose_Low ⊗ ¬ Pregnancies_High ⊗ ¬ DiabetesPedigreeFunction_High ⊗ ¬ BMI_High ⊗ ¬ Age_Medium → ¬ Outcome']

In [104]:
print(feature_names)
print()

tmp_dict = {}

for item in feature_names:
    key, value = item.rsplit('_', 1)
    if key not in tmp_dict:
        tmp_dict[key] = []
    
    tmp_dict[key].append(item)
    

rules_additional = list(tmp_dict.values())
rules_additional = [' ⊕ '.join(rule) for rule in rules_additional]


for rule in rules_additional:
    print(rule)


['Pregnancies_Low', 'Pregnancies_Medium', 'Pregnancies_High', 'Glucose_Low', 'Glucose_Medium', 'Glucose_High', 'BloodPressure_Low', 'BloodPressure_Medium', 'BloodPressure_High', 'SkinThickness_Low', 'SkinThickness_Medium', 'SkinThickness_High', 'BMI_Low', 'BMI_Medium', 'BMI_High', 'DiabetesPedigreeFunction_Low', 'DiabetesPedigreeFunction_Medium', 'DiabetesPedigreeFunction_High', 'Age_Low', 'Age_Medium', 'Age_High']

Pregnancies_Low ⊕ Pregnancies_Medium ⊕ Pregnancies_High
Glucose_Low ⊕ Glucose_Medium ⊕ Glucose_High
BloodPressure_Low ⊕ BloodPressure_Medium ⊕ BloodPressure_High
SkinThickness_Low ⊕ SkinThickness_Medium ⊕ SkinThickness_High
BMI_Low ⊕ BMI_Medium ⊕ BMI_High
DiabetesPedigreeFunction_Low ⊕ DiabetesPedigreeFunction_Medium ⊕ DiabetesPedigreeFunction_High
Age_Low ⊕ Age_Medium ⊕ Age_High


In [105]:
rules_generated + rules_additional

['¬ Glucose_Low ⊗ ¬ Age_Low ⊗ ¬ BMI_Low → Outcome',
 '¬ Glucose_High ⊗ ¬ DiabetesPedigreeFunction_Medium → ¬ Outcome',
 '¬ Pregnancies_High ⊗ Age_Low → ¬ Outcome',
 '¬ Glucose_High ⊗ BMI_Low → ¬ Outcome',
 '¬ DiabetesPedigreeFunction_High ⊗ Glucose_Low → ¬ Outcome',
 '¬ Glucose_High ⊗ ¬ Age_Medium ⊗ ¬ SkinThickness_Low ⊗ ¬ Glucose_Low ⊗ ¬ DiabetesPedigreeFunction_Medium ⊗ ¬ BloodPressure_Low → ¬ Outcome',
 '¬ Glucose_High ⊗ ¬ Age_Low ⊗ Pregnancies_Medium → ¬ Outcome',
 '¬ Age_Medium ⊗ BMI_Low ⊗ ¬ BloodPressure_Medium → ¬ Outcome',
 '¬ BMI_Low ⊗ ¬ Glucose_Low ⊗ ¬ DiabetesPedigreeFunction_Low → Outcome',
 '¬ Glucose_Low ⊗ ¬ Pregnancies_High ⊗ ¬ DiabetesPedigreeFunction_High ⊗ ¬ BMI_High ⊗ ¬ Age_Medium → ¬ Outcome',
 'Pregnancies_Low ⊕ Pregnancies_Medium ⊕ Pregnancies_High',
 'Glucose_Low ⊕ Glucose_Medium ⊕ Glucose_High',
 'BloodPressure_Low ⊕ BloodPressure_Medium ⊕ BloodPressure_High',
 'SkinThickness_Low ⊕ SkinThickness_Medium ⊕ SkinThickness_High',
 'BMI_Low ⊕ BMI_Medium ⊕ BMI_High',
 

In [106]:
len(rules_generated + rules_additional)

17

In [None]:
# ファイル名
file_name = './data/rules.txt'

# リストをtxtファイルに書き込む
with open(file_name, 'w') as file:
    for item in rules_generated + rules_additional:
        file.write("%s\n" % item)


In [26]:
from rulefit import RuleFitClassifier
from sklearn.ensemble import RandomForestClassifier


rfmode = 'classify'
tree_generator = RandomForestClassifier(random_state=random_state)

rf = RuleFitClassifier(max_rules=50,
                       rfmode=rfmode,
                       tree_generator=RandomForestClassifier(),
                       random_state=random_state)

rf.fit(X_train, y_train, feature_names=feature_names)

display(rf)

y_pred = rf.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

# Print the results
print("離散データ（one-hot encoding）")
print("---------------------------")
print()
print("Accuracy:", accuracy)
print()
print()
print("Confusion Matrix:\n", conf_matrix)
print()
print()
print("Classification Report:\n", class_report)

離散データ（one-hot encoding）
---------------------------

Accuracy: 0.7761194029850746


Confusion Matrix:
 [[73 17]
 [13 31]]


Classification Report:
               precision    recall  f1-score   support

           0       0.85      0.81      0.83        90
           1       0.65      0.70      0.67        44

    accuracy                           0.78       134
   macro avg       0.75      0.76      0.75       134
weighted avg       0.78      0.78      0.78       134



In [37]:
feature_names

['Outcome',
 'Pregnancies_Low',
 'Pregnancies_Medium',
 'Pregnancies_High',
 'Glucose_Low',
 'Glucose_Medium',
 'Glucose_High',
 'BloodPressure_Low',
 'BloodPressure_Medium',
 'BloodPressure_High',
 'SkinThickness_Low',
 'SkinThickness_Medium',
 'SkinThickness_High',
 'BMI_Low',
 'BMI_Medium',
 'BMI_High',
 'DiabetesPedigreeFunction_Low',
 'DiabetesPedigreeFunction_Medium',
 'DiabetesPedigreeFunction_High',
 'Age_Low',
 'Age_Medium',
 'Age_High']

In [27]:
rules = rf.get_rules(exclude_zero_coef=True)
rules

Unnamed: 0,rule,type,coef,support,importance
1,Pregnancies_Low,linear,-0.481035,1.0,0.207774
2,Pregnancies_Medium,linear,0.136026,1.0,0.040943
3,Pregnancies_High,linear,-0.187261,1.0,0.076873
5,Glucose_Medium,linear,0.181535,1.0,0.071525
7,BloodPressure_Low,linear,0.124705,1.0,0.059211
8,BloodPressure_Medium,linear,-0.205998,1.0,0.077339
11,SkinThickness_Medium,linear,-0.186036,1.0,0.053121
12,SkinThickness_High,linear,-0.09169,1.0,0.043059
13,BMI_Low,linear,0.174153,1.0,0.086727
15,BMI_High,linear,-0.367591,1.0,0.17665


In [28]:
rules.sort_values(by="importance", ascending=False)

Unnamed: 0,rule,type,coef,support,importance
26,BMI_Medium <= 0.5 & Outcome > 0.5 & DiabetesPedigreeFunction_High > 0.5 & Pregnancies_High <= 0.5,rule,-0.96273,0.340361,0.456171
67,Pregnancies_High > 0.5,rule,-0.986815,0.231672,0.416337
22,Glucose_Medium > 0.5,rule,1.024102,0.195906,0.406463
48,DiabetesPedigreeFunction_High > 0.5 & Pregnancies_High > 0.5,rule,-1.069032,0.166667,0.398405
63,Pregnancies_High <= 0.5 & DiabetesPedigreeFunction_High <= 0.5 & SkinThickness_High <= 0.5,rule,0.720004,0.211144,0.293848
40,SkinThickness_High > 0.5,rule,-0.582851,0.297059,0.266341
60,BMI_Medium > 0.5 & BloodPressure_Medium <= 0.5 & Outcome > 0.5 & BloodPressure_High <= 0.5,rule,-0.996665,0.047904,0.212851
1,Pregnancies_Low,linear,-0.481035,1.0,0.207774
23,BMI_Medium <= 0.5 & Outcome <= 0.5 & BMI_Low <= 0.5,rule,0.636929,0.105422,0.195599
15,BMI_High,linear,-0.367591,1.0,0.17665
