In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
! pwd

/Users/keisukeonoue/ws/lukasiewicz_1/notebooks/pima_indian_diabetes


In [3]:
import os 
import sys 
# project_dir_path = '/home/onoue/ws/lukasiewicz_1'
project_dir_path = '/Users/keisukeonoue/ws/lukasiewicz_1/'
sys.path.append(project_dir_path)
import shutil

import pandas as pd
import numpy as np
import cvxpy as cp 

from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

from src.setup_problem_primal_modular import Setup

In [3]:
data = pd.read_csv('./data/diabetes_cleaned.csv', index_col=0)
X = data.drop('Outcome', axis=1)

#####################################################3
y = data['Outcome']
y.replace(0, -1, inplace=True)
y

0      1
1     -1
2      1
3     -1
5     -1
      ..
763   -1
764   -1
765   -1
766    1
767   -1
Name: Outcome, Length: 670, dtype: int64

In [3]:
import os
from typing import Dict, Any, List
import json

# from .setup_problem import Setup
class Setup_:
    """
    型ヒント用（circular import の回避のため）
    """
    def __init__(self):
        pass

import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, roc_auc_score, precision_score, recall_score, f1_score
# from sklearn.metrics import confusion_matrix, classification_report

from src.misc import is_symbol
from src.operators import negation



class EvaluateModel:
    def __init__(self,
                 path_cleaned: str,
                 path_discretized: str,
                 model: object,
                 KB_origin: List[List[str]],
                 random_state: int = 42,
                 test_size: float = 0.2,
                 name: str = None,
                 note: str = None) -> None:

        self.path_cleaned = path_cleaned
        self.path_discretized = path_discretized
        self.model = model
        self.KB_origin = KB_origin
        self.random_state = random_state 
        self.test_size = test_size

        self.result_dict = {
            'name'     : name,
            'note'     : note,
            'Accuracy' : None,
            'Precision': None,
            'Recall'   : None,
            'F1-score' : None,
            'Auc'      : None,
            'len_U': None,
            'Rules'    : {'violation': 0, 'total': len(self.KB_origin)},
            'Rules_detail': {}
        }

    def calculate_scores(self) -> None:
        # まずは連続データで計算
        data = pd.read_csv(self.path_cleaned, index_col=0)
        X = data.drop('Outcome', axis=1)

        #####################################################3
        y = data['Outcome']
        y.replace(0, -1, inplace=True)

        X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                            test_size=self.test_size, 
                                                            random_state=self.random_state)
        self.model.fit(X_train, y_train)
        y_pred = self.model.predict(X_test)

        y_pred_interpreted = np.where(y_pred == 0, -1, y_pred)
        
        y_pred = self.model.predict_proba(X_test)[:, 1]

        # 精度等の一般的な評価指標の計算
        accuracy = accuracy_score(y_test, y_pred_interpreted)
        # conf_matrix = confusion_matrix(y_test, y_pred_interpreted)
        precision = precision_score(y_test, y_pred_interpreted)
        recall = recall_score(y_test, y_pred_interpreted)
        f1 = f1_score(y_test, y_pred_interpreted)
        # class_report = classification_report(y_test, y_pred_interpreted)
        roc_auc = roc_auc_score(y_test, y_pred)

        self.result_dict['Accuracy'] = float(accuracy)
        # self.result_dict['Confusion_matrix'] = conf_matrix.tolist()
        self.result_dict['Precision'] = float(precision)
        self.result_dict['Recall'] = float(recall)
        self.result_dict['F1-score'] = float(f1)
        # self.result_dict['Classification_report'] = class_report
        self.result_dict['Auc'] = float(roc_auc)

        # ルール違反の計算の前に X_test を離散化（discretized のほうを読み込む）
        data = pd.read_csv(self.path_discretized, index_col=0)

        X = data.drop('Outcome', axis=1)

    
        ###########################################################
        y = data['Outcome']
        y.replace(0, -1, inplace=True)

        
        X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                            test_size=self.test_size, 
                                                            random_state=self.random_state)
        
        # ルール違反
        rules_tmp = []
        for rule in self.KB_origin:
            if "Outcome" in rule:
                tmp = {}
                for idx, item in enumerate(rule):
                    if not is_symbol(item):
                        if idx == 0 or rule[idx - 1] != '¬':
                            tmp[item] = 1
                        elif item != "Outcome":
                            tmp[item] = 0
                        else:
                            tmp[item] = -1
                rules_tmp.append(tmp)

        idx_tmp = X_test.index
        y_pred_interpreted = pd.DataFrame(y_pred_interpreted, index=idx_tmp)
        

        for i, rule in enumerate(rules_tmp):
            outcome = rule["Outcome"]
            condition = " & ".join([f"{column} == {value}" for column, value in rule.items() if column != "Outcome"])

            tmp = y_pred_interpreted.loc[X_test.query(condition).index]

            violation_tmp = int((tmp != outcome).sum().iloc[0])
            self.result_dict['Rules']['violation'] += violation_tmp
            self.result_dict['Rules']['total'] += tmp.shape[0]
            self.result_dict['Rules_detail'][i] = {
                'violation': violation_tmp,
                'total': tmp.shape[0]
            }

    def save_result_as_json(self, file_path) -> None:
        with open(file_path, 'w') as f:
            json.dump(self.result_dict, f, indent=4)

    def evaluate(self, save_file_path: str = './result_1.json') -> None:
        self.calculate_scores()
        self.save_result_as_json(file_path=save_file_path)

In [4]:
# KB_origin の生成
data_dir_path = os.path.join(project_dir_path, 'inputs/pima_indian_diabetes')
file_list = os.listdir(os.path.join(data_dir_path, 'train'))

L_files = [filename for filename in file_list 
           if filename.startswith('L') and filename.endswith('.csv')]

U_files = [filename for filename in file_list 
           if filename.startswith('U') and filename.endswith('.csv')]

file_names_dict = {
    'supervised': L_files,
    'unsupervised': U_files,
    'rule': ['rules.txt']
}

problem_instance = Setup(data_dir_path, 
                         file_names_dict, 
                         specimen_construct_objective_function)

problem_instance.load_data()
problem_instance.load_rules()

load_data took 0.10036587715148926 seconds!
load_rules took 0.0002872943878173828 seconds!


# non-linear svm (rbf)

In [10]:
path_cleaned = "./data/diabetes_cleaned.csv"
path_discretized = "./data/diabetes_discretized.csv"
random_state = 42
test_size = 0.2


model = SVC(kernel='rbf',
            random_state=random_state,
            probability=True) 


#########################################
save_file_path = "./result_3.json"
model_name = "non-linear svm (rbf)"
note = None

evaluate_model = EvaluateModel(path_cleaned=path_cleaned,
                               path_discretized=path_discretized,
                               model=model,
                               name=model_name,
                               random_state=random_state,
                               test_size=test_size,
                               KB_origin=problem_instance.KB_origin)

evaluate_model.evaluate(save_file_path=save_file_path)

# linear svm

In [8]:
path_cleaned = "./data/diabetes_cleaned.csv"
path_discretized = "./data/diabetes_discretized.csv"
random_state = 42
test_size = 0.2


model = SVC(kernel='linear',
            random_state=random_state,
            probability=True) 


#########################################
save_file_path = "./result_4.json"
model_name = "linear svm"
note = None

evaluate_model = EvaluateModel(path_cleaned=path_cleaned,
                               path_discretized=path_discretized,
                               model=model,
                               name=model_name,
                               random_state=random_state,
                               test_size=test_size,
                               KB_origin=problem_instance.KB_origin)

evaluate_model.evaluate(save_file_path=save_file_path)

# logistic regression

In [5]:
path_cleaned = "./data/diabetes_cleaned.csv"
path_discretized = "./data/diabetes_discretized.csv"
random_state = 42
test_size = 0.2

model = SVC(kernel='linear',
            random_state=random_state,
            probability=True) 
model = LogisticRegression(random_state=random_state)

#########################################
save_file_path = "./result_5.json"
model_name = "logistic regression"
note = None

evaluate_model = EvaluateModel(path_cleaned=path_cleaned,
                               path_discretized=path_discretized,
                               model=model,
                               name=model_name,
                               test_size=test_size,
                               random_state=random_state,
                               KB_origin=problem_instance.KB_origin)

evaluate_model.evaluate(save_file_path=save_file_path)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


# random forest

In [7]:
path_cleaned = "./data/diabetes_cleaned.csv"
path_discretized = "./data/diabetes_discretized.csv"
random_state = 42
test_size = 0.2

model = RandomForestClassifier(random_state=random_state)


#########################################
save_file_path = "./result_6.json"
model_name = "random forest"
note = None

evaluate_model = EvaluateModel(path_cleaned=path_cleaned,
                               path_discretized=path_discretized,
                               model=model,
                               name=model_name,
                               random_state=random_state,
                               test_size=test_size,
                               KB_origin=problem_instance.KB_origin)

evaluate_model.evaluate(save_file_path=save_file_path)