# Basic use of the Round Robin rule set learner on the Titanic dataset

In [1]:
%load_ext autoreload
%autoreload 2

import sys
import os

import numpy as np
import pandas as pd

from typing import Dict, List, Set
from mdrsl.data_structures.rules.multi_target_class_association_rule import MCAR
from mdrsl.evaluation.predictive_performance_metrics import ScoreInfo
from mdrsl.rule_models.mids.mids_rule import MIDSRule
from mdrsl.rule_generation.association_rule_mining.mlext_impl.mlext_interaction import mine_MCARs_mlext
from mdrsl.rule_models.rr.io_rr_rule_set_learner import store_greedy_naive_classifier, load_greedy_naive_classifier
from mdrsl.rule_models.rr.scoring import score_mt_clf_on_its_targets_without_nans

import matplotlib.pyplot as plt
import seaborn as sns

# Type aliases used in the annotations of the cells below.
TargetAttr = str  # name of a target attribute (a column of the dataset)
RuleSetScore = float  # score of a rule set for a single target attribute

## Loading the Titanic toy dataset

In [2]:
from mdrsl.project_info import project_dir
from mdrsl.toy_data.titanic import prepare_data_titanic

# Directory holding the external data files, resolved against the project root.
data_dir = os.path.join(project_dir, 'data/external')

# Bare annotations: they only declare the expected types of the names unpacked below.
df_train: pd.DataFrame
df_test: pd.DataFrame
# NOTE(review): `prop` is presumably the train/test split proportion — confirm in prepare_data_titanic.
df_train, df_test, dataset_name = prepare_data_titanic(data_dir, prop=0.25)
# Last expression of the cell: renders the first rows of the training frame.
df_train.head()

Unnamed: 0,Passenger_Cat,Age_Cat,Gender,Survived
0,3rd_class,adult,male,0
1,crew,adult,male,0
2,crew,adult,male,0
3,2nd_class,adult,male,0
4,crew,adult,male,0


In [3]:
# Column labels of the training frame as a NumPy array; shown as the cell output.
cols: np.ndarray = df_train.columns.values
cols

array(['Passenger_Cat', 'Age_Cat', 'Gender', 'Survived'], dtype=object)

## Mining multi-target association rules

In [4]:
# Mining thresholds, and the maximum number of mined rules kept as candidates.
min_support = 0.01
min_confidence = 0.5
rule_cutoff = 150

cars: List[MCAR]
time_info: Dict[str, float]

cars, time_info = mine_MCARs_mlext(
    df_train,
    min_support=min_support,
    min_confidence=min_confidence,
)
# Keep at most the first `rule_cutoff` rules; slicing is a no-op on the contents
# when fewer rules were mined.
cars = cars[:rule_cutoff]

In [5]:
from mdrsl.data_structures.rules.pretty_printing import mids_mcar_to_pretty_string
# Print every mined rule, one per line, in its human-readable form.
for mined_car in cars:
    pretty_rule = mids_mcar_to_pretty_string(mined_car)
    print(pretty_rule)

Age_Cat=adult -> Gender=male
Gender=male -> Age_Cat=adult
Age_Cat=adult -> Survived=0
Survived=0 -> Age_Cat=adult
Gender=male -> Survived=0
Survived=0 -> Gender=male
Age_Cat=adult, Gender=male -> Survived=0
Age_Cat=adult, Survived=0 -> Gender=male
Gender=male, Survived=0 -> Age_Cat=adult
Age_Cat=adult -> Gender=male, Survived=0
Gender=male -> Age_Cat=adult, Survived=0
Survived=0 -> Age_Cat=adult, Gender=male
Passenger_Cat=3rd_class -> Age_Cat=adult
Passenger_Cat=3rd_class -> Survived=0
Passenger_Cat=3rd_class -> Gender=male
Age_Cat=adult, Passenger_Cat=3rd_class -> Survived=0
Passenger_Cat=3rd_class, Survived=0 -> Age_Cat=adult
Passenger_Cat=3rd_class -> Age_Cat=adult, Survived=0
Age_Cat=adult, Passenger_Cat=3rd_class -> Gender=male
Gender=male, Passenger_Cat=3rd_class -> Age_Cat=adult
Passenger_Cat=3rd_class -> Age_Cat=adult, Gender=male
Gender=male, Passenger_Cat=3rd_class -> Survived=0
Passenger_Cat=3rd_class, Survived=0 -> Gender=male
Passenger_Cat=3rd_class -> Gender=male, Survive

## Fitting a Round Robin model

In [6]:
# Wrap each mined association rule in a MIDSRule, preserving the mining order.
mids_rules = [MIDSRule(car) for car in cars]

In [None]:
from mdrsl.rule_models.rr.rr_rule_set_learner import GreedyRoundRobinTargetRuleClassifier

# Build the learner over all columns of the training frame.
# NOTE(review): the first positional argument is presumably the list of target
# attributes — confirm against GreedyRoundRobinTargetRuleClassifier.
clf = GreedyRoundRobinTargetRuleClassifier(
    df_train.columns,
    verbose=True,
    keep_score_evolution=True,
)

selected_set: Set[MIDSRule]
selected_set_scores: Dict[TargetAttr, RuleSetScore]

# Fit on the training data using the wrapped candidate rules.
selected_set, selected_set_scores = clf.fit(df_train, mids_rules)

Calculating init score for Passenger_Cat...
Calculating init score for Age_Cat...
Calculating init score for Gender...
Calculating init score for Survived...
=== start rule Learning ===
Initial target scores (empty set): {'Passenger_Cat': 0.40045420136260407, 'Age_Cat': 0.9515518546555639, 'Gender': 0.7834973504920515, 'Survived': 0.6691900075700227}
It 1 - target: Age_Cat  (1/4)
Found no improvement for Age_Cat
It 1 - target: Gender  (2/4)
for target Gender, 6 rules lead to an improved prediction
best rule results in score 0.8160484481453444
6 rules are being considered as best...
	considering score for Age_Cat: 0.0
	considering score for Age_Cat: 0.0
for target Gender, best rule least hurting others results in score 0.8160 (improvement of 0.0326)
resetting improvement found flags for Gender ...
End it. 2
-------------------------
It 2 - target: Survived  (1/4)
for target Survived, 13 rules lead to an improved prediction
best rule results in score 0.7645722937168812
13 rules are being

The following columns are not stringly typed:
	Survived: int64 Maybe you did not discretize all numerical attributes?
CONVERTED THESE COLUMNS TO STRING!
  Passenger_Cat Age_Cat Gender Survived
0          crew   adult   male        0


## Inspecting the fitted model

In [None]:
# Each section is a heading, its value, and a blank separator line.
print("Rule set scores:", selected_set_scores, "", sep="\n")
print("Defaults:", clf.default_predictions, "", sep="\n")

# The selected rules, one per line.
print("Rules: ")
for selected_rule in selected_set:
    print(selected_rule)

In [None]:
# Print the learned rules as aligned, alphabetically sorted lines of the form
# "<rule>\ts: <support>, c: <confidence>", followed by the default predictions.
greedy_clf_rules = clf.learned_rule_set

# Render every rule exactly once; the string is needed both for the column width
# and for the printed line.
rendered_rules = [
    (mids_mcar_to_pretty_string(mids_rule.car), mids_rule.car)
    for mids_rule in greedy_clf_rules
]

# `default=0` keeps the cell runnable when the learner selected no rules at all
# (a bare max() over an empty sequence raises ValueError).
max_len: int = max((len(pretty) for pretty, _car in rendered_rules), default=0)

greedy_rule_strs = sorted(
    pretty.ljust(max_len) + "\t" + f"s: {car.support:0.3f}, c: {car.confidence:0.3f}"
    for pretty, car in rendered_rules
)
for rule_str in greedy_rule_strs:
    print(rule_str)
print("Defaults", clf.default_predictions)
print("---")


In [None]:
# Display the default predictions held by the fitted classifier.
clf.default_predictions

In [None]:
# Score the fitted classifier on the held-out test frame; the result is keyed
# by target attribute.
target_to_score_info: Dict[TargetAttr, ScoreInfo]
target_to_score_info = score_mt_clf_on_its_targets_without_nans(
    clf,
    df_test,
    filter_nans=True,
)

In [None]:
# For each target attribute: its name, then its score info rendered with a tab
# passed to to_str.
for target_attr, score_info in target_to_score_info.items():
    print(target_attr)
    score_text = score_info.to_str("\t")
    print(score_text)

### Visualization of the F1-score evolution during the fitting of the model

In [None]:
# Score evolution kept by the classifier, as a DataFrame; the plotting cell reads its
# 'time_point', 'score_value' and 'target_attribute' columns.
df_score_evolution = clf.score_evolution_to_dataframe()
df_score_evolution.head()

In [None]:
# Line plot of the rule set score per target attribute over the fitting time points.
sns.set(style="whitegrid")

width = 16
height = 3
figsize = (width, height)
fig, axes = plt.subplots(figsize=figsize)

# Draw on the axes created above explicitly instead of relying on pyplot's
# implicit "current axes" state.
ax = sns.lineplot(
    x='time_point',
    y='score_value',
    hue='target_attribute',
    data=df_score_evolution,
    ax=axes,
)
ax.set_title("Evolution of rule set score when adding 1 rule at a time")

# Anchor the legend's upper-left corner just outside the right edge of the axes;
# the trailing semicolon suppresses the Legend repr in the cell output.
ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left', borderaxespad=0.);
