In [1]:
# Enable IPython's autoreload extension in mode 2: imported modules are reloaded
# before each cell executes, so edits to the project's source files take effect
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import os
import sys

# Directory appended to sys.path (presumably the project root that holds the
# `src` package imported further down -- confirm against the repository layout).
# The default is the original author's local checkout; set the
# LUKASIEWICZ_PROJECT_DIR environment variable to run the notebook elsewhere
# without editing this cell.
project_dir_path = os.environ.get(
    'LUKASIEWICZ_PROJECT_DIR',
    '/Users/keisukeonoue/ws/lukasiewicz_1/',
)

# Re-running this cell must not stack duplicate entries on sys.path.
if project_dir_path not in sys.path:
    sys.path.append(project_dir_path)

In [5]:
import os
import shutil

import pandas as pd
import numpy as np
import cvxpy as cp
from sklearn.model_selection import KFold

from src.setup_problem_primal_modular import Setup
from src.objective_function import linear_svm
from src.objective_function import linear_svm_loss
from src.objective_function import logistic_regression_loss
from src.evaluation import EvaluateModel

In [14]:
# Settings passed to prepare_data. All paths are relative, so they resolve
# against the working directory the notebook kernel runs in.
# (A 'test_size': 0.2 entry is currently commented out and therefore absent.)
setting_dict = dict(
    seed=42,
    n_splits=5,  # number of KFold splits
    # Source data set and rule file.
    source_path='./../../data/pima_indian_diabetes',
    source_data_file_name='diabetes_discretized.csv',
    source_rule_file_name='rules.txt',
    # Destination of the generated per-fold input files.
    input_path='./../../inputs/pima_indian_diabetes_cv',
    unsupervised_file_name='U.csv',
    unsupervised_shape=(15, 21),  # (data_num, data_dim)
    output_path='./../../outputs/pima_indian_diabetes_2',
)

# Registry of the objective configurations, keyed by a short identifier.
# Each entry carries a display name, the objective callable imported from
# src.objective_function, its `c1`/`c2` parameters, and the switches selecting
# which constraint groups are enabled. Every entry gets its own nested dicts,
# so mutating one configuration never leaks into another.
objectives_dict = dict(
    luka_1=dict(
        model_name='luka linear svm',
        model=linear_svm,
        params=dict(c1=10, c2=10),
        constraints_flag=dict(pointwise=True, logical=True, consistency=True),
    ),
    luka_2=dict(
        model_name='luka linear svm loss',
        model=linear_svm_loss,
        params=dict(c1=10, c2=10),
        constraints_flag=dict(pointwise=False, logical=False, consistency=True),
    ),
    luka_3=dict(
        model_name='luka logistic regression loss',
        model=logistic_regression_loss,
        params=dict(c1=10, c2=10),
        constraints_flag=dict(pointwise=False, logical=False, consistency=True),
    ),
)

In [12]:
def _write_labelled_csv(features: pd.DataFrame, target: pd.Series, name: str, out_dir: str) -> None:
    """Write `features` plus a `target` column to `<out_dir>/L_<name>.csv`.

    Zeros in `target` are replaced by -1; every other value is kept as is.
    `features` itself is not modified.
    """
    labelled = features.copy()
    labelled['target'] = target.replace(0, -1)
    labelled.to_csv(os.path.join(out_dir, 'L_' + name + '.csv'))


def prepare_data(setting: dict) -> None:
    """Create the per-fold input files for cross-validation.

    The source CSV is read, split into `n_splits` folds with `KFold`, and for
    every fold `i` the following files are written below
    `<input_path>/fold_<i>/`:

    * `train/L_Outcome.csv` -- training features with `Outcome` as `target`.
    * `train/L_<feature>.csv` -- one file per feature column, where that
      column (still present among the features) is also used as `target`.
    * `train/<unsupervised_file_name>` -- a random 0/1 matrix of shape
      `unsupervised_shape`.
    * `train/<source_rule_file_name>` -- a copy of the source rule file.
    * `test/L_Outcome.csv` -- test features with `Outcome` as `target`.

    In every `target` column zeros are replaced by -1.

    Args:
        setting: Mapping with the keys `seed`, `n_splits`, `source_path`,
            `source_data_file_name`, `source_rule_file_name`, `input_path`,
            `unsupervised_file_name` and `unsupervised_shape`.

    Raises:
        KeyError: If a required key is missing from `setting`.
        FileNotFoundError: If the source data or rule file does not exist.
    """
    n_splits = setting['n_splits']
    input_path = setting['input_path']
    unsupervised_shape = setting['unsupervised_shape']
    rule_file_name = setting['source_rule_file_name']

    source_data_path = os.path.join(setting['source_path'], setting['source_data_file_name'])
    source_rule_path = os.path.join(setting['source_path'], rule_file_name)

    # One generator seeded from the settings: the unsupervised matrices differ
    # between folds but are identical from one run to the next. (Previously the
    # seed was read and then ignored, so U.csv changed on every execution.)
    rng = np.random.default_rng(setting['seed'])

    # Positional 0..n-1 index so that the indices produced by KFold address rows directly.
    data = pd.read_csv(source_data_path, index_col=0).reset_index(drop=True)

    # Deliberately left unshuffled so the fold membership stays the same as before.
    kf = KFold(n_splits=n_splits)

    for i, (train_index, test_index) in enumerate(kf.split(data)):
        fold_path = os.path.join(input_path, f'fold_{i}')

        # ---- training part -------------------------------------------------
        input_train_path = os.path.join(fold_path, 'train')
        os.makedirs(input_train_path, exist_ok=True)

        train_data = data.loc[train_index, :]
        train_features = train_data.drop(['Outcome'], axis=1)

        _write_labelled_csv(train_features, train_data['Outcome'], 'Outcome', input_train_path)
        for feature_name in train_features.columns:
            _write_labelled_csv(
                train_features, train_features[feature_name], feature_name, input_train_path
            )

        # Random binary matrix standing in for the unsupervised data.
        unsupervised_path = os.path.join(input_train_path, setting['unsupervised_file_name'])
        pd.DataFrame(rng.integers(2, size=unsupervised_shape)).to_csv(unsupervised_path)

        shutil.copy(source_rule_path, os.path.join(input_train_path, rule_file_name))

        # ---- test part -----------------------------------------------------
        input_test_path = os.path.join(fold_path, 'test')
        os.makedirs(input_test_path, exist_ok=True)

        test_data = data.loc[test_index, :]
        test_features = test_data.drop(['Outcome'], axis=1)

        _write_labelled_csv(test_features, test_data['Outcome'], 'Outcome', input_test_path)

In [15]:
# Generate the per-fold train/test input files described by setting_dict.
prepare_data(setting=setting_dict)