In [57]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [58]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

from rulefit import RuleFit

In [59]:
data_cleaned = pd.read_csv("./data/diabetes_cleaned.csv", index_col=0)
data_discretized = pd.read_csv("./data/diabetes_discretized.csv", index_col=0)

X_cleaned = data_cleaned.drop(["Outcome"], axis=1)
y_cleaned = data_cleaned["Outcome"]
X_discretized = data_discretized.drop(["Outcome"], axis=1)
y_discretized = data_discretized["Outcome"]


random_state = 42
test_size = 0.2
    
X1_train, X1_test, y1_train, y1_test = train_test_split(X_cleaned, y_cleaned, 
                                                        test_size=test_size, 
                                                        random_state=random_state)
X2_train, X2_test, y2_train, y2_test = train_test_split(X_discretized, y_discretized, 
                                                        test_size=test_size, 
                                                        random_state=random_state)

In [60]:
# Import the RandomForestClassifier
from sklearn.ensemble import RandomForestClassifier

# Create and train the random forest model using the cleaned data
rf_model = RandomForestClassifier(random_state=random_state)
rf_model.fit(X1_train, y1_train)

# Make predictions using the trained model
y1_pred = rf_model.predict(X1_test)

# Calculate accuracy
accuracy = accuracy_score(y1_test, y1_pred)

# Generate confusion matrix
conf_matrix = confusion_matrix(y1_test, y1_pred)

# Generate classification report
class_report = classification_report(y1_test, y1_pred)

# Print the results
print("元データ（cleaning 済み）")
print("---------------------------")
print()
print("Accuracy:", accuracy)
print()
print()
print("Confusion Matrix:\n", conf_matrix)
print()
print()
print("Classification Report:\n", class_report)

元データ（cleaning 済み）
---------------------------

Accuracy: 0.8059701492537313


Confusion Matrix:
 [[74 16]
 [10 34]]


Classification Report:
               precision    recall  f1-score   support

           0       0.88      0.82      0.85        90
           1       0.68      0.77      0.72        44

    accuracy                           0.81       134
   macro avg       0.78      0.80      0.79       134
weighted avg       0.81      0.81      0.81       134



In [10]:
from sklearn.ensemble import RandomForestClassifier

rf_model = RandomForestClassifier(random_state=random_state)
rf_model.fit(X2_train, y2_train)

y2_pred = rf_model.predict(X2_test)

accuracy = accuracy_score(y2_test, y2_pred)

conf_matrix = confusion_matrix(y2_test, y2_pred)

class_report = classification_report(y2_test, y2_pred)

print("離散データ")
print("---------------------------")
print()
print("Accuracy:", accuracy)
print()
print()
print("Confusion Matrix:\n", conf_matrix)
print()
print()
print("Classification Report:\n", class_report)

離散データ
---------------------------

Accuracy: 0.753731343283582


Confusion Matrix:
 [[72 18]
 [15 29]]


Classification Report:
               precision    recall  f1-score   support

           0       0.83      0.80      0.81        90
           1       0.62      0.66      0.64        44

    accuracy                           0.75       134
   macro avg       0.72      0.73      0.73       134
weighted avg       0.76      0.75      0.76       134



# RuleFit

In [61]:
data_cleaned = pd.read_csv("./data/diabetes_cleaned.csv", index_col=0)
data_discretized = pd.read_csv("./data/diabetes_discretized.csv", index_col=0)

X_cleaned = data_cleaned.drop(["Outcome"], axis=1)
y_cleaned = data_cleaned["Outcome"]
X_discretized = data_discretized.drop(["Outcome"], axis=1)
y_discretized = data_discretized["Outcome"]


random_state = 42
test_size = 0.2
    
X1_train, X1_test, y1_train, y1_test = train_test_split(X_cleaned, y_cleaned, 
                                                        test_size=test_size, 
                                                        random_state=random_state)
X2_train, X2_test, y2_train, y2_test = train_test_split(X_discretized, y_discretized, 
                                                        test_size=test_size, 
                                                        random_state=random_state)

In [62]:
feature_names = data_cleaned.columns
feature_names

Index(['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'BMI',
       'DiabetesPedigreeFunction', 'Age', 'Outcome'],
      dtype='object')

In [63]:
from rulefit import RuleFitClassifier

X1_train = np.array(X1_train)
y1_train = np.array(y1_train)
X1_test = np.array(X1_test)
y1_test = np.array(y1_test)

rfmode = 'classify'
tree_generator = RandomForestClassifier(random_state=random_state)

rf = RuleFitClassifier(rfmode=rfmode,
                       tree_generator=tree_generator,
                       random_state=random_state)
rf.fit(X1_train, y1_train, feature_names=feature_names)

display(rf)

y1_pred = rf.predict(X1_test)

# Calculate accuracy
accuracy = accuracy_score(y1_test, y1_pred)

# Generate confusion matrix
conf_matrix = confusion_matrix(y1_test, y1_pred)

# Generate classification report
class_report = classification_report(y1_test, y1_pred)

# Print the results
print("元データ（cleaning 済み）")
print("---------------------------")
print()
print("Accuracy:", accuracy)
print()
print()
print("Confusion Matrix:\n", conf_matrix)
print()
print()
print("Classification Report:\n", class_report)

元データ（cleaning 済み）
---------------------------

Accuracy: 0.7164179104477612


Confusion Matrix:
 [[69 21]
 [17 27]]


Classification Report:
               precision    recall  f1-score   support

           0       0.80      0.77      0.78        90
           1       0.56      0.61      0.59        44

    accuracy                           0.72       134
   macro avg       0.68      0.69      0.69       134
weighted avg       0.72      0.72      0.72       134



In [64]:
rules_1 = rf.get_rules()
rules_1.to_csv("./data/rulefit_original.csv")
display(rules_1)

Unnamed: 0,rule,type,coef,support,importance
0,Pregnancies,linear,-0.002143,1.000000,0.006682
1,Glucose,linear,0.000164,1.000000,0.004737
2,BloodPressure,linear,0.000911,1.000000,0.009835
3,SkinThickness,linear,-0.001175,1.000000,0.017821
4,BMI,linear,0.001253,1.000000,0.007869
...,...,...,...,...,...
1479,Glucose > 124.5 & Glucose > 180.5,rule,0.035398,0.047478,0.007528
1480,BMI > 27.84999942779541 & Pregnancies <= 6.5 &...,rule,0.746782,0.117647,0.240606
1481,BMI > 27.84999942779541 & Pregnancies <= 6.5 &...,rule,-0.479987,0.367647,0.231433
1482,BMI > 27.84999942779541 & Pregnancies <= 6.5 &...,rule,-0.230056,0.076471,0.061137


In [50]:
from itertools import compress
import re

def arrange_conditions(rules, feature_names, n_round=3):
    """
    ルールを整理する関数    
    """
    rule_output = []
    num = re.compile(r'[+-]?(?:\d+\.?\d*|\.\d+)')

    for rule in rules:    #個別の決定規則について
        conds = rule.split(' & ') #決定規則を個別の条件に分ける

        #まとめられる条件の整理
        for f in feature_names:
            f_index = [f in s for s in conds] #特定の特徴量が登場する要素番号の特定
            while sum(f_index) >= 2:    #決定規則内に同じ特徴量名が2度以上登場するとき（整理の余地があるとき）
                redundancy = list(compress(conds, f_index))
                le_index = ['<=' in s for s in redundancy]
                gt_index = ['>' in s for s in redundancy]

                if sum(le_index) >= 2:    #「<=」の条件が重複するとき
                    th = []
                    for cond in list(compress(redundancy, le_index)):
                        th.append(float(num.findall(cond)[0]))
                        conds.remove(cond)
                    min_th = min(th)
                    conds.insert(le_index[0], f+' <= '+str(min_th))
                if sum(gt_index) >= 2:    #「>」の条件が重複するとき
                    th = []
                    for cond in list(compress(redundancy, gt_index)):
                        th.append(float(num.findall(cond)[0]))
                        conds.remove(cond)
                    max_th = max(th)
                    conds.insert(gt_index[0], f+' > '+str(max_th))
                if sum(le_index) == 1 and sum(gt_index) == 1: #「<=」と「>」があるとき
                    le_cond = list(compress(redundancy, le_index))[0]
                    gt_cond = list(compress(redundancy, gt_index))[0]
                    min_th = float(num.findall(gt_cond)[0])
                    max_th = float(num.findall(le_cond)[0])
                    conds.remove(le_cond)
                    conds.remove(gt_cond)
                    conds.insert(min(le_index[0], gt_index[0]), str(min_th)+' < '+f+' <= '+str(max_th))

                f_index = [f in s for s in conds] #特徴量が登場する要素番号の更新

        #閾値の丸め
        for i,cond in enumerate(conds):
            elem = cond.split(' ')
            for j,s in enumerate(elem):
                try:
                    elem[j] = str(round(float(s),n_round))
                except:
                    pass                    
            conds[i] = ' '.join(elem)

        rule_output.append(' & '.join(conds))
    return rule_output

# # ルールを整理
# rules.rule = arrange_conditions(rules.rule, features, n_round=3)
# rules.head(10)

In [54]:
rules_1_arranged = arrange_conditions(rules_1.rule, feature_names, n_round=3)
display(rules_1_arranged)

['Pregnancies',
 'Glucose',
 'BloodPressure',
 'SkinThickness',
 'BMI',
 'DiabetesPedigreeFunction',
 'Age',
 'BMI <= 29.6',
 'BMI > 29.6',
 'BMI <= 27.45',
 'BMI > 27.45 & Glucose <= 94.5 & Pregnancies <= 6.5',
 '27.45 < BMI <= 31.1 & 94.5 < Glucose <= 157.5 & Pregnancies <= 6.5',
 '31.1 < BMI <= 32.35 & 94.5 < Glucose <= 157.5 & Pregnancies <= 6.5',
 '94.5 < Glucose <= 157.5 & BMI > 32.35 & Pregnancies <= 6.5 & DiabetesPedigreeFunction <= 0.219',
 '94.5 < Glucose <= 157.5 & BMI > 32.35 & Pregnancies <= 6.5 & DiabetesPedigreeFunction > 0.219',
 'BMI > 27.45 & Pregnancies <= 6.5 & Glucose > 157.5',
 'BMI > 27.45 & Pregnancies > 6.5',
 'Age <= 26.5 & Glucose <= 133.0',
 'Age <= 26.5 & Glucose > 133.0 & Pregnancies <= 1.5',
 'Age <= 26.5 & Glucose > 133.0 & Pregnancies > 1.5',
 'Age > 26.5',
 'BMI <= 30.05',
 'BMI > 30.05 & Age <= 30.5',
 'BMI > 30.05 & Age > 30.5',
 'BMI > 30.05',
 'Glucose <= 139.5',
 'Glucose > 139.5',
 'Age <= 28.5',
 'Age > 28.5',
 'BMI <= 30.05 & Glucose <= 161.5',

In [53]:
rules.rule

0                                             Pregnancies
1                                                 Glucose
2                                           BloodPressure
3                                           SkinThickness
4                                                     BMI
                              ...                        
1479                    Glucose > 124.5 & Glucose > 180.5
1480    BMI > 27.84999942779541 & Pregnancies <= 6.5 &...
1481    BMI > 27.84999942779541 & Pregnancies <= 6.5 &...
1482    BMI > 27.84999942779541 & Pregnancies <= 6.5 &...
1483          BMI > 27.84999942779541 & Pregnancies > 6.5
Name: rule, Length: 1484, dtype: object

In [55]:
from rulefit import RuleFitClassifier

X2_train = np.array(X2_train)
y2_train = np.array(y2_train)
X2_test = np.array(X2_test)
y2_test = np.array(y2_test)

rfmode = 'classify'
tree_generator = RandomForestClassifier(random_state=random_state)

rf = RuleFitClassifier(rfmode=rfmode,
                       tree_generator=tree_generator,
                       random_state=random_state)
rf.fit(X2_train, y2_train, feature_names=feature_names)

display(rf)

y2_pred = rf.predict(X2_test)

# Calculate accuracy
accuracy = accuracy_score(y2_test, y2_pred)

# Generate confusion matrix
conf_matrix = confusion_matrix(y2_test, y2_pred)

# Generate classification report
class_report = classification_report(y2_test, y2_pred)

# Print the results
print("離散データ")
print("---------------------------")
print()
print("Accuracy:", accuracy)
print()
print()
print("Confusion Matrix:\n", conf_matrix)
print()
print()
print("Classification Report:\n", class_report)

離散データ
---------------------------

Accuracy: 0.7611940298507462


Confusion Matrix:
 [[72 18]
 [14 30]]


Classification Report:
               precision    recall  f1-score   support

           0       0.84      0.80      0.82        90
           1       0.62      0.68      0.65        44

    accuracy                           0.76       134
   macro avg       0.73      0.74      0.74       134
weighted avg       0.77      0.76      0.76       134



In [56]:
rules = rf.get_rules()
rules.to_csv('./data/rulefit_discrete.csv')