In [2]:
%load_ext autoreload
%autoreload 2

In [11]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

from rulefit import RuleFit, RuleFitClassifier

# RuleFit

In [14]:
data = pd.read_csv("./data/diabetes_discretized.csv", index_col=0)

feature_names = data.columns.to_list()[1:]

X = np.array(data.drop(["Outcome"], axis=1))
y = np.array(data["Outcome"])

random_state = 42
test_size = 0.2

X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=test_size,
                                                    random_state=random_state)

rfmode = 'classify'
tree_generator = RandomForestClassifier(random_state=random_state)

rf = RuleFitClassifier(rfmode=rfmode,
                       tree_generator=RandomForestClassifier(),
                       random_state=random_state)

rf.fit(X_train, y_train, feature_names=feature_names)
display(rf)

y_pred = rf.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

# Print the results
print("離散データ（one-hot encoding）")
print("---------------------------")
print()
print("Accuracy:", accuracy)
print()
print()
print("Confusion Matrix:\n", conf_matrix)
print()
print()
print("Classification Report:\n", class_report)

離散データ（one-hot encoding）
---------------------------

Accuracy: 0.7388059701492538


Confusion Matrix:
 [[72 18]
 [17 27]]


Classification Report:
               precision    recall  f1-score   support

           0       0.81      0.80      0.80        90
           1       0.60      0.61      0.61        44

    accuracy                           0.74       134
   macro avg       0.70      0.71      0.71       134
weighted avg       0.74      0.74      0.74       134



In [15]:
rules = rf.get_rules(exclude_zero_coef=True)

pd.set_option("display.max_colwidth", 999) #ruleが長すぎて表示が省略される場合用
pd.set_option("display.max_rows", 999) 
rules

Unnamed: 0,rule,type,coef,support,importance
63,Glucose_Low <= 0.5 & Age_Low <= 0.5 & BMI_Low <= 0.5,rule,0.072953,0.211144,0.029774
126,Glucose_High <= 0.5 & DiabetesPedigreeFunction_Medium <= 0.5,rule,-0.186848,0.603499,0.091401
129,Pregnancies_High <= 0.5 & Age_Low > 0.5,rule,-0.064627,0.645533,0.030914
173,DiabetesPedigreeFunction_High <= 0.5 & Glucose_Low > 0.5,rule,-1.056492,0.211765,0.431639
174,Glucose_High <= 0.5 & Age_Medium <= 0.5 & SkinThickness_Low <= 0.5 & Glucose_Low <= 0.5 & DiabetesPedigreeFunction_Medium <= 0.5 & BloodPressure_Low <= 0.5,rule,-0.088616,0.127536,0.02956
185,Glucose_High <= 0.5 & Age_Low <= 0.5 & Pregnancies_Medium > 0.5,rule,-0.092192,0.093567,0.026849
189,Age_Medium <= 0.5 & BMI_Low > 0.5 & BloodPressure_Medium <= 0.5,rule,-0.050136,0.134328,0.017096
259,BMI_Low <= 0.5 & Glucose_Low <= 0.5 & DiabetesPedigreeFunction_Low <= 0.5,rule,0.034002,0.242604,0.014575
301,Glucose_Low <= 0.5 & Pregnancies_High <= 0.5 & DiabetesPedigreeFunction_High <= 0.5 & BMI_High <= 0.5 & Age_Medium <= 0.5,rule,-0.050353,0.468208,0.025125
389,DiabetesPedigreeFunction_Low <= 0.5 & BMI_Medium > 0.5,rule,0.270746,0.22807,0.113602


In [None]:
def create_KB(rules_df):
    rules = rules_df['rule'].to_list()
    coef  = rules_df['coef'].to_list()

    ``

# max_iter=50 としたものも採用して比較してみる

In [16]:
data = pd.read_csv("./data/diabetes_discretized.csv", index_col=0)

feature_names = data.columns.to_list()[1:]

X = np.array(data.drop(["Outcome"], axis=1))
y = np.array(data["Outcome"])

random_state = 42
test_size = 0.2

X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=test_size,
                                                    random_state=random_state)

rfmode = 'classify'
tree_generator = RandomForestClassifier(random_state=random_state)

rf = RuleFitClassifier(max_rules=50,
                       rfmode=rfmode,
                       tree_generator=RandomForestClassifier(),
                       random_state=random_state)

rf.fit(X_train, y_train, feature_names=feature_names)
display(rf)

y_pred = rf.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

# Print the results
print("離散データ（one-hot encoding）")
print("---------------------------")
print()
print("Accuracy:", accuracy)
print()
print()
print("Confusion Matrix:\n", conf_matrix)
print()
print()
print("Classification Report:\n", class_report)

離散データ（one-hot encoding）
---------------------------

Accuracy: 0.7761194029850746


Confusion Matrix:
 [[73 17]
 [13 31]]


Classification Report:
               precision    recall  f1-score   support

           0       0.85      0.81      0.83        90
           1       0.65      0.70      0.67        44

    accuracy                           0.78       134
   macro avg       0.75      0.76      0.75       134
weighted avg       0.78      0.78      0.78       134



In [17]:
rules = rf.get_rules(exclude_zero_coef=True)
rules

Unnamed: 0,rule,type,coef,support,importance
1,Pregnancies_Medium,linear,-0.481035,1.0,0.207774
2,Pregnancies_High,linear,0.136026,1.0,0.040943
3,Glucose_Low,linear,-0.187261,1.0,0.076873
5,Glucose_High,linear,0.181535,1.0,0.071525
7,BloodPressure_Medium,linear,0.124705,1.0,0.059211
8,BloodPressure_High,linear,-0.205998,1.0,0.077339
11,SkinThickness_High,linear,-0.186036,1.0,0.053121
12,BMI_Low,linear,-0.09169,1.0,0.043059
13,BMI_Medium,linear,0.174153,1.0,0.086727
15,DiabetesPedigreeFunction_Low,linear,-0.367591,1.0,0.17665
