In [7]:
import json
import os
import copy
import numpy as np
import pandas as pd
from tqdm import tqdm
import time
import sys
sys.path.append("../")

from cfmining.algorithms import MAPOFCEM
from cfmining.predictors import GeneralClassifier_Shap, MonotoneClassifier
from cfmining.action_set import ActionSet
from cfmining.utils import get_data_model, FakeOutlierDetection
from cfmining.baselines import Bruteforce, MAPOCAM, Nice, Dice
import cfmining.algorithms as algs


%load_ext autoreload
%autoreload 2
%load_ext line_profiler

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
The line_profiler extension is already loaded. To reload it, use:
  %reload_ext line_profiler


In [2]:
# Random seed passed as `random_state` when sampling individuals, so the
# sampled subset is the same on every run.
SEED = 0

## Helper

In [3]:
def run_experiments(
        method,
        individuals,
        model,
        output_file=None,
    ):
    """Fit ``method`` on every row of ``individuals`` and record time and solutions.

    Parameters
    ----------
    method : object
        Must expose ``fit(x)`` and, after fitting, a ``solutions`` attribute
        that supports ``len``.
    individuals : pandas.DataFrame
        One row per individual. Each row is handed to ``method.fit`` and to
        ``model.predict_proba`` as a NumPy array (``row.values``).
    model : object
        Must expose ``predict_proba(x)``. If it also has a callable
        ``clear_cache`` attribute, that is invoked before each fit so the
        timing of one individual does not benefit from the previous one.
    output_file : str or os.PathLike, optional
        CSV path. When given, missing parent directories are created, the
        results gathered so far are rewritten after every individual (so a
        crashed run still leaves partial results on disk) and once more at the
        end.

    Returns
    -------
    pandas.DataFrame or None
        The results table (columns ``individual``, ``prob``, ``time``,
        ``n_solutions``, ``solutions``) when ``output_file`` is ``None``;
        otherwise ``None``, since the results were written to disk.
    """
    records = []

    if output_file is not None:
        folder = os.path.dirname(output_file)
        # A bare file name has no directory part; os.makedirs("") would raise.
        if folder:
            os.makedirs(folder, exist_ok=True)

    # Not every model has a cache; look the hook up explicitly instead of
    # hiding every possible error behind a bare ``except``.
    clear_cache = getattr(model, "clear_cache", None)

    for i in tqdm(range(len(individuals))):
        individual = individuals.iloc[i]
        if callable(clear_cache):
            clear_cache()

        # perf_counter is monotonic, so the interval cannot be distorted by
        # system clock adjustments.
        start = time.perf_counter()
        method.fit(individual.values)
        elapsed = time.perf_counter() - start

        solutions = method.solutions

        records.append({
            "individual": individual.values.tolist(),
            "prob": model.predict_proba(individual.values),
            "time": elapsed,
            "n_solutions": len(solutions),
            "solutions": solutions,
        })

        # Checkpoint after every individual.
        if output_file is not None:
            pd.DataFrame(records).to_csv(output_file, index=False)

    results = pd.DataFrame(records)
    if output_file is not None:
        # Also covers the case of zero individuals, where the loop never wrote.
        results.to_csv(output_file, index=False)
        return None
    return results

## German

In [5]:
# Load the "german" dataset bundle paired with a "LogisticRegression" model.
# NOTE(review): the meaning of the five returned objects is taken from the
# names they are unpacked into — confirm against get_data_model.
X_train, Y_train, model, outlier_detection, individuals = get_data_model(
    "german",
    "LogisticRegression",
)
# Keep a seeded random subset of 50 individuals for the experiments below.
individuals = individuals.sample(n=50, random_state=SEED)

In [8]:
# Features excluded from the set handed to ActionSet as mutable.
not_mutable_features = [
    'Age',
    'OwnsHouse',
    'isMale',
    'JobClassIsSkilled',
    'Single',
    'ForeignWorker',
    'RentsHouse',
]
# Every remaining training column, in the original column order.
mutable_features = [
    name for name in X_train.columns if name not in not_mutable_features
]

continuous_features = [
    'Age',
    'LoanDuration',
    'LoanAmount',
    'LoanRateAsPercentOfIncome',
    'YearsAtCurrentHome',
]
# Complement of the continuous columns.
categoric_features = [
    name for name in X_train.columns if name not in continuous_features
]

action_set = ActionSet(
    X=X_train,
    default_step_size=0.1,
    mutable_features=mutable_features,
)

# NOTE(review): despite its name, `model_shap` is a MonotoneClassifier wrapper,
# not a GeneralClassifier_Shap. The name is kept because later cells use it.
# FakeOutlierDetection() is presumably a stand-in that disables outlier
# checks — confirm in cfmining.utils.
model_shap = MonotoneClassifier(
    model,
    FakeOutlierDetection(),
    X_train,
    threshold=0.5,
)

In [10]:
# MAPOFCEM instance using the "lp" comparison; this is the object profiled by
# the cells that follow.
# NOTE(review): max_changes=3 presumably caps how many features one solution
# may change — confirm in cfmining.algorithms.
method = MAPOFCEM(
    action_set=action_set,
    classifier=model_shap,
    compare="lp",
    max_changes=3,
    estimate_proba_max=False,
    estimate_outlier=False,
)

In [11]:
def prof_function():
    """Fit the global ``method`` on the first row of the global ``individuals``.

    Takes no arguments so that it can be used directly as the statement
    executed by a profiler; both globals are looked up when it is called.
    """
    first_individual = individuals.values[0]
    method.fit(first_individual)

In [13]:
# Line-by-line profile of MAPOFCEM.find_candidates during a single fit of the
# current `method` (the compare="lp" configuration).
%lprun -f MAPOFCEM.find_candidates prof_function()

Timer unit: 1e-09 s

Total time: 16.4859 s
File: /work/giovani.valdrighi/cfmining/notebooks/../cfmining/algorithms.py
Function: find_candidates at line 671

Line #      Hits         Time  Per Hit   % Time  Line Contents
   671                                               def find_candidates(self, solution=None, size=0, changes=0):
   672      1174    1262335.0   1075.2      0.0          next_idx = self.sequence[size]
   673      1174     685654.0    584.0      0.0          next_name = self.names[next_idx]
   674                                           
   675                                                   # Looking for a candidate following features
   676      4115    7436633.0   1807.2      0.0          for value in self.feas_grid[next_name]:
   677      2941   13893641.0   4724.1      0.1              new_solution = solution.copy()
   678      2941    3159328.0   1074.2      0.0              new_solution[next_idx] = value
   679                                           
   68

In [14]:
# Rebuild `method` with the "percentile" comparison; every other argument
# matches the earlier compare="lp" configuration, so the two profiles are
# directly comparable.
method = MAPOFCEM(
    action_set=action_set,
    classifier=model_shap,
    compare="percentile",
    max_changes=3,
    estimate_proba_max=False,
    estimate_outlier=False,
)


# NOTE(review): this repeats a definition from an earlier cell. `method` is a
# global resolved when the function is called, so the earlier definition would
# already pick up the new `method`; consider dropping this copy.
def prof_function():
    """Fit the global ``method`` on the first row of the global ``individuals``."""
    first_individual = individuals.values[0]
    method.fit(first_individual)

In [15]:
# Same line-by-line profile of MAPOFCEM.find_candidates, now for the
# compare="percentile" configuration of `method`.
%lprun -f MAPOFCEM.find_candidates prof_function()

Timer unit: 1e-09 s

Total time: 16.3681 s
File: /work/giovani.valdrighi/cfmining/notebooks/../cfmining/algorithms.py
Function: find_candidates at line 671

Line #      Hits         Time  Per Hit   % Time  Line Contents
   671                                               def find_candidates(self, solution=None, size=0, changes=0):
   672      1174    1254159.0   1068.3      0.0          next_idx = self.sequence[size]
   673      1174     681498.0    580.5      0.0          next_name = self.names[next_idx]
   674                                           
   675                                                   # Looking for a candidate following features
   676      4115    7339197.0   1783.5      0.0          for value in self.feas_grid[next_name]:
   677      2941   11468274.0   3899.4      0.1              new_solution = solution.copy()
   678      2941    2931963.0    996.9      0.0              new_solution[next_idx] = value
   679                                           
   68