# Basic use of MIDS

In [1]:
import os

import numpy as np
from typing import List, Dict
import pandas as pd

from data_structures.rules.multi_target_class_association_rule import MCAR

from mdrsl.toy_data.titanic import prepare_data_titanic

from mdrsl.rule_generation.association_rule_mining.mlext_impl.mlext_interaction import mine_MCARs_mlext
from mdrsl.rule_models.mids.model_fitting.mids_with_value_reuse import MIDSValueReuse

%load_ext autoreload
%autoreload 2

## Loading the Titanic toy dataset

In [2]:
from mdrsl.project_info import project_dir
# Directory the toy datasets are read from: the 'data/external' folder under the project root.
data_dir = os.path.join(project_dir, 'data/external')

In [3]:
# Declared up front so the types of the unpacked values below are visible to readers and tooling.
df_train: pd.DataFrame
df_test: pd.DataFrame

# Load the Titanic data from `data_dir` as a train/test pair plus the dataset's name.
# NOTE(review): `prop=0.25` is presumably the fraction of rows held out for testing — confirm
# against `prepare_data_titanic`.
df_train, df_test, dataset_name = prepare_data_titanic(data_dir, prop=0.25)
# Last expression of the cell: displays the first rows of the training frame.
df_train.head()

Unnamed: 0,Passenger_Cat,Age_Cat,Gender,Survived
0,3rd_class,adult,male,0
1,3rd_class,adult,female,0
2,crew,adult,male,0
3,crew,adult,male,0
4,2nd_class,adult,male,0


In [4]:
# Column names of the training frame as a plain NumPy array; the bare name on the
# last line makes the notebook display it.
cols: np.ndarray = df_train.columns.to_numpy()
cols

array(['Passenger_Cat', 'Age_Cat', 'Gender', 'Survived'], dtype=object)

## Mining multi-target association rules

In [5]:
# Thresholds handed to the rule miner.
min_support = 0.01
min_confidence = 0.5
# Maximum number of mined rules kept as candidates for MIDS.
rule_cutoff = 150

cars: List[MCAR]
time_info: Dict[str, float]

cars, time_info = mine_MCARs_mlext(
    df_train,
    min_support=min_support,
    min_confidence=min_confidence,
)
# Keep at most `rule_cutoff` rules; slicing leaves a shorter list's contents untouched.
cars = cars[:rule_cutoff]

In [6]:
from mdrsl.data_structures.rules.pretty_printing import mids_mcar_to_pretty_string

# Print one human-readable line per candidate rule, in mining order.
for pretty_rule in map(mids_mcar_to_pretty_string, cars):
    print(pretty_rule)

Age_Cat=adult -> Gender=male
Gender=male -> Age_Cat=adult
Survived=0 -> Age_Cat=adult
Age_Cat=adult -> Survived=0
Survived=0 -> Gender=male
Gender=male -> Survived=0
Age_Cat=adult, Survived=0 -> Gender=male
Gender=male, Survived=0 -> Age_Cat=adult
Age_Cat=adult, Gender=male -> Survived=0
Survived=0 -> Age_Cat=adult, Gender=male
Age_Cat=adult -> Gender=male, Survived=0
Gender=male -> Age_Cat=adult, Survived=0
Passenger_Cat=3rd_class -> Age_Cat=adult
Passenger_Cat=3rd_class -> Survived=0
Passenger_Cat=3rd_class -> Gender=male
Passenger_Cat=3rd_class, Survived=0 -> Age_Cat=adult
Age_Cat=adult, Passenger_Cat=3rd_class -> Survived=0
Passenger_Cat=3rd_class -> Age_Cat=adult, Survived=0
Age_Cat=adult, Passenger_Cat=3rd_class -> Gender=male
Gender=male, Passenger_Cat=3rd_class -> Age_Cat=adult
Passenger_Cat=3rd_class -> Age_Cat=adult, Gender=male
Passenger_Cat=3rd_class, Survived=0 -> Gender=male
Gender=male, Passenger_Cat=3rd_class -> Survived=0
Passenger_Cat=3rd_class -> Gender=male, Survive

## Fitting a MIDS model

You can choose whether to cache the interactions between rules that are used by the objective function.
If we cache these interactions, we might find a solution faster.
Note that caching might not be an option when using a lot of rules, as it might require too much memory.

In [7]:
from mdrsl.rule_models.mids.objective_function.mids_objective_function_abstract import AbstractMIDSObjectiveFunction

from mdrsl.utils.value_collection import ValueCollector

# Class-level switch set before fitting.
# NOTE(review): presumably this makes the objective function record the sub-objective values
# that are printed from `stat_collector` after fitting — confirm in `ValueCollector`.
ValueCollector.collect_values = True
# The next two values are forwarded as keyword arguments to `mids.fit(...)` in the fitting cell.
use_targets_from_rule_set = False
debug=False
# Class-level switch for the f2/f3 cache of the objective function.
# NOTE(review): presumably this is the rule-interaction cache discussed in the text above
# (faster optimization at the cost of extra memory) — confirm.
AbstractMIDSObjectiveFunction.should_cache_f2_f3 = True


You can choose between different submodular maximization algorithms:

In [8]:
from submodmax.value_reuse.randomized_double_greedy_search import RandomizedDoubleGreedySearch

# Name of the submodular maximization algorithm handed to `fit`.
algorithm = "RDGS"
# NOTE(review): presumably the number of randomized runs the search performs — confirm.
RandomizedDoubleGreedySearch.N_TRIES = 1

mids = MIDSValueReuse()
# Deliberately left as the cell's last expression so the notebook displays the call's result.
mids.fit(
    df_train,
    use_targets_from_rule_set=use_targets_from_rule_set,
    class_association_rules=cars,
    algorithm=algorithm,
    debug=debug,
)

The following columns are not stringly typed:
	Survived: int64 Maybe you did not discretize all numerical attributes?
CONVERTED THESE COLUMNS TO STRING!
  Passenger_Cat Age_Cat Gender Survived
0          crew   adult   male        0


overlap cache prepared
INITIALIZE f2 f3 cache
max nb of integers necessary: 44700
rough estimate nb of bytes necessary: 1072800
FINISHED INITIALIZATION f2 f3 cache


<mdrsl.rule_models.mids.model_fitting.mids_with_value_reuse.MIDSValueReuse at 0x7f867c08e670>

## Inspecting the fitted model

In [9]:
# `print` already converts its argument with `str`, so the summary text is unchanged.
print(mids.classifier)

MIDS classifier (13 rules)
	Rule combination stategy: RuleCombiningStrategy.WEIGHTED_VOTE
	Default value strategy: DefaultClassStrategy.MAJORITY_VALUE_OVER_WHOLE_TRAINING_SET
		Default predictions:{'Gender': 'male', 'Survived': '0', 'Age_Cat': 'adult', 'Passenger_Cat': 'crew'}



In [10]:
# Print the rules selected by the fitted classifier, one per line and sorted alphabetically,
# as "<rule>\ts: <support>, c: <confidence>". The rule text is padded to a common width so
# the statistics line up. The classifier's default predictions are printed afterwards.
tree_mids_rules = mids.classifier.rules

# Pretty-print every rule exactly once; the strings are reused for both the width
# computation and the output lines.
pretty_rules = [mids_mcar_to_pretty_string(mids_rule.car) for mids_rule in tree_mids_rules]

# `default=0` keeps this cell from raising ValueError when the fitted model selected no rules.
max_len: int = max(map(len, pretty_rules), default=0)

mids_rule_strs = sorted(
    f"{pretty_rule.ljust(max_len)}\ts: {mids_rule.car.support:0.3f}, c: {mids_rule.car.confidence:0.3f}"
    for pretty_rule, mids_rule in zip(pretty_rules, tree_mids_rules)
)

for s in mids_rule_strs:
    print(s)
print()
print("Default predictions:\n", mids.classifier.default_predictions)
print("---")


Age_Cat=adult -> Gender=male, Survived=0                           	s: 0.597, c: 0.626
Age_Cat=adult, Gender=male -> Passenger_Cat=crew                   	s: 0.379, c: 0.506
Age_Cat=adult, Passenger_Cat=1st_class, Survived=1 -> Gender=female	s: 0.062, c: 0.726
Age_Cat=adult, Passenger_Cat=crew -> Survived=0                    	s: 0.294, c: 0.755
Age_Cat=child, Gender=female -> Passenger_Cat=3rd_class            	s: 0.015, c: 0.741
Gender=female -> Age_Cat=adult                                     	s: 0.202, c: 0.908
Gender=female -> Survived=1                                        	s: 0.159, c: 0.714
Gender=male -> Age_Cat=adult, Survived=0                           	s: 0.597, c: 0.767
Gender=male, Passenger_Cat=2nd_class -> Age_Cat=adult, Survived=0  	s: 0.076, c: 0.877
Gender=male, Passenger_Cat=2nd_class -> Survived=0                 	s: 0.076, c: 0.877
Gender=male, Survived=0 -> Age_Cat=adult                           	s: 0.597, c: 0.975
Passenger_Cat=1st_class -> Survived=1      

We can print statistics about the values of the sub-objective functions (f0–f6) and of the total objective, as collected during optimization:

In [11]:
# Prints the statistics collector attached to the fitted model's objective function
# (rendered as a text table of per-sub-objective statistics).
print(mids.objective_function.stat_collector)

type              f0          f1          f2          f3         f4          f5          f6     f_total
--------  ----------  ----------  ----------  ----------  ---------  ----------  ----------  ----------
count     302         302         302         302         302        302         302          302
sum       204.633     226.205     300.752     301.913     215.625    297.786     220.14      1767.05
min         0           0.21        0.980919    0.99876     0          0.963       0            4.70586
avg         0.677594    0.749023    0.995868    0.999713    0.71399    0.986045    0.728942     5.85118
max         1           1           1           1           0.75       1           0.803179     6.34336
last_val    0.913333    0.935       0.999601    0.999954    0.75       0.99301     0.75246      6.34336


## Evaluating the MIDS classifier on test data



### Interpretability statistics
We can calculate metrics capturing the interpretability of the selected rule set:

In [12]:
from mdrsl.rule_models.mids.model_evaluation.mids_interpretability_metrics import MIDSInterpretabilityStatisticsCalculator
from mdrsl.rule_models.mids.mids_ruleset import MIDSRuleSet

metrics_calc = MIDSInterpretabilityStatisticsCalculator()

# Wrap the rules selected by the fitted classifier and compute the rule-set statistics
# on the test data, for the classifier's target attributes.
fitted_classifier = mids.classifier
selected_rule_set = MIDSRuleSet(fitted_classifier.rules)
rule_set_stats = metrics_calc.calculate_ruleset_statistics(
    selected_rule_set,
    df_test,
    target_attributes=fitted_classifier.target_attrs,
)
print(rule_set_stats)

Rule length stats: count=13, sum=39, min=2, average=3.0, max=4
Fraction bodily overlap: 0.08787878787878788
Fraction uncovered examples: 0.0
Avg fraction predicted classes: 0.75
Fraction predicted classs by target:
	{'Gender': 1.0, 'Survived': 1.0, 'Age_Cat': 0.5, 'Passenger_Cat': 0.5}



  boolean_array: np.ndarray = value_array == value


### Predictive performance
The predictive performance of the MIDS classifier can be evaluated for each attribute in the dataset.

In [14]:
from sklearn.metrics import f1_score
import functools

# Micro-averaged F1: counts are pooled over all classes before the score is computed.
micro_avged_f1_score = functools.partial(f1_score, average='micro')

# Scoring function used below; any scikit-learn style `metric(y_true, y_pred)` can be dropped in.
metric = micro_avged_f1_score

# Maps each column name to the score obtained when that column is the prediction target.
col_to_acc = {}
for target_attribute in df_test.columns:
    predicted_values = mids.predict(df_test, target_attribute)
    actual_values = df_test[target_attribute].values
    print("column:", target_attribute)
    # scikit-learn metrics take (y_true, y_pred) in that order. Micro-averaged F1 is symmetric
    # in its arguments, so the scores are unaffected here, but the correct order matters as
    # soon as `metric` is replaced by an asymmetric score (e.g. precision or recall).
    score = metric(actual_values, predicted_values)
    print("\tscore:", score)
    print("---")
    col_to_acc[target_attribute] = score

print(col_to_acc)



column: Passenger_Cat
	score: 0.45
---
column: Age_Cat
	score: 0.9363636363636364
---
column: Gender
	score: 0.8227272727272728
---
column: Survived
	score: 0.756818181818182
---
{'Passenger_Cat': 0.45, 'Age_Cat': 0.9363636363636364, 'Gender': 0.8227272727272728, 'Survived': 0.756818181818182}
