In [55]:
from pyarc.data_structures import ClassAssocationRule
from pyids.data_structures import IDSRule
from pyids.model_selection.utils import encode_label
from sklearn.metrics import roc_auc_score
import logging
import numpy as np

logging.basicConfig(level=logging.INFO)

In [2]:
def print_cars(ids_cars):
    cars = ids_cars
    is_ids = False
    if type(ids_cars[0]) == IDSRule:
        is_ids = [ r.f1 for r in ids_cars ]
        cars = [ r.car for r in ids_cars ]
    latex = []
    for idx, car in enumerate(cars):
        if is_ids:
            latex.append(car_to_latex(car, is_ids=is_ids[idx]))
        else:
            latex.append(car_to_latex(car))
        
    return "\n\n".join(latex)
        
def car_to_latex(car, is_ids=False):
    trans = {
        "-":  r"\-",
        "_":  r"\_",
        "]":  r"\]",
        "\\": r"\\",
        "^":  r"\^",
        "$":  r"\$",
        "*":  r"\*",
        ".":  r"\."
    }
    latex = []
    
    latex.append("If")
    
    for idx, predicate in enumerate(car.antecedent):
       
        name, interval = predicate
        name = name.translate(str.maketrans(trans))
        
        interval_string = interval.translate(str.maketrans(trans))
        
        if idx != 0:
            latex.append("and")
        latex.append(
            f"\\textcolor{{blue}}{{{name}}}=\\textcolor{{gray}}{{{interval_string}}}"
        )

        
    class_name, class_value = car.consequent
    
    latex.append("then")
    latex.append(
        f"\\textcolor{{red}}{{{class_name}}}=\\textcolor{{gray}}{{{class_value}}}"
    )
    
    if is_ids:
        latex.append(
            f"$\mid$ f1-score: \\textit{{{round(is_ids, 2)}}}"
        )
    
    return " ".join(latex)

# IDS Demonstration

## Importing required libraries

In [3]:
import pandas as pd
import pyarc
from pyids import IDS
from pyids.algorithms.ids_classifier import mine_CARs
from pyids.rule_mining import RuleMiner
from pyarc.qcba.data_structures import QuantitativeDataFrame

from sklearn.model_selection import train_test_split

## Loading the test and train data

In [4]:
data = pd.read_csv("../data/titanic.csv").sample(frac=1).reset_index(drop=True)


data_train, data_test = train_test_split(data, test_size=0.8)

In [5]:
data.head()

Unnamed: 0,Passenger_Cat,Age_Cat,Gender,Died
0,3rd_class,adult,male,1
1,crew,adult,male,1
2,3rd_class,child,female,1
3,crew,adult,male,1
4,2nd_class,adult,male,1


In [6]:
rm = RuleMiner()
rules = rm.mine_rules(data_train)

[CAR {Age_Cat=adult} => {Died=1} sup: 0.95 conf: 0.00 len: 2, id: 24, CAR {Age_Cat=adult} => {Died=0} sup: 0.95 conf: 0.00 len: 2, id: 25, CAR {Gender=male} => {Died=1} sup: 0.78 conf: 0.00 len: 2, id: 20, CAR {Gender=male} => {Died=0} sup: 0.78 conf: 0.00 len: 2, id: 21, CAR {Gender=male,Age_Cat=adult} => {Died=1} sup: 0.74 conf: 0.00 len: 3, id: 22, CAR {Gender=male,Age_Cat=adult} => {Died=0} sup: 0.74 conf: 0.00 len: 3, id: 23, CAR {Passenger_Cat=crew} => {Died=1} sup: 0.38 conf: 0.00 len: 2, id: 14, CAR {Passenger_Cat=crew} => {Died=0} sup: 0.38 conf: 0.00 len: 2, id: 15, CAR {Passenger_Cat=crew,Age_Cat=adult} => {Died=1} sup: 0.38 conf: 0.00 len: 3, id: 12, CAR {Passenger_Cat=crew,Age_Cat=adult} => {Died=0} sup: 0.38 conf: 0.00 len: 3, id: 13, CAR {Gender=male,Passenger_Cat=crew} => {Died=1} sup: 0.36 conf: 0.00 len: 3, id: 18, CAR {Gender=male,Passenger_Cat=crew} => {Died=0} sup: 0.36 conf: 0.00 len: 3, id: 19, CAR {Gender=male,Passenger_Cat=crew,Age_Cat=adult} => {Died=1} sup: 0

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item_labels[indexer[info_axis]]] = value


In [7]:
print(print_cars(rules))

If \textcolor{blue}{Age\_Cat}=\textcolor{gray}{adult} then \textcolor{red}{Died}=\textcolor{gray}{1}

If \textcolor{blue}{Age\_Cat}=\textcolor{gray}{adult} then \textcolor{red}{Died}=\textcolor{gray}{0}

If \textcolor{blue}{Gender}=\textcolor{gray}{male} then \textcolor{red}{Died}=\textcolor{gray}{1}

If \textcolor{blue}{Gender}=\textcolor{gray}{male} then \textcolor{red}{Died}=\textcolor{gray}{0}

If \textcolor{blue}{Gender}=\textcolor{gray}{male} and \textcolor{blue}{Age\_Cat}=\textcolor{gray}{adult} then \textcolor{red}{Died}=\textcolor{gray}{1}

If \textcolor{blue}{Gender}=\textcolor{gray}{male} and \textcolor{blue}{Age\_Cat}=\textcolor{gray}{adult} then \textcolor{red}{Died}=\textcolor{gray}{0}

If \textcolor{blue}{Passenger\_Cat}=\textcolor{gray}{crew} then \textcolor{red}{Died}=\textcolor{gray}{1}

If \textcolor{blue}{Passenger\_Cat}=\textcolor{gray}{crew} then \textcolor{red}{Died}=\textcolor{gray}{0}

If \textcolor{blue}{Passenger\_Cat}=\textcolor{gray}{crew} and \textcolor{bl

In [8]:
quant_dataframe_train = QuantitativeDataFrame(data_train)
quant_dataframe_test = QuantitativeDataFrame(data_test)

In [9]:
data_train.sample(len(data_train)).head(20)

Unnamed: 0,Passenger_Cat,Age_Cat,Gender,Died
389,crew,adult,male,1
1472,1st_class,adult,male,0
523,3rd_class,adult,male,1
509,3rd_class,adult,female,0
335,crew,adult,male,0
1724,crew,adult,male,1
1694,3rd_class,adult,male,1
927,3rd_class,adult,male,1
1206,1st_class,adult,male,1
792,crew,adult,male,0


## Mining the Class Association Rules (CARs)

## Training the IDS Model

In [10]:
ids = IDS()
ids.fit(class_association_rules=rules, quant_dataframe=quant_dataframe_train, random_seed=5)

<pyids.algorithms.ids.IDS at 0x26abff5a860>

In [11]:
print(print_cars(ids.clf.rules))

If \textcolor{blue}{Age\_Cat}=\textcolor{gray}{adult} then \textcolor{red}{Died}=\textcolor{gray}{1} $\mid$ f1-score: \textit{0.61}

If \textcolor{blue}{Gender}=\textcolor{gray}{male} and \textcolor{blue}{Age\_Cat}=\textcolor{gray}{adult} then \textcolor{red}{Died}=\textcolor{gray}{1} $\mid$ f1-score: \textit{0.57}

If \textcolor{blue}{Passenger\_Cat}=\textcolor{gray}{3rd\_class} and \textcolor{blue}{Age\_Cat}=\textcolor{gray}{adult} then \textcolor{red}{Died}=\textcolor{gray}{1} $\mid$ f1-score: \textit{0.2}

If \textcolor{blue}{Gender}=\textcolor{gray}{male} and \textcolor{blue}{Passenger\_Cat}=\textcolor{gray}{3rd\_class} then \textcolor{red}{Died}=\textcolor{gray}{1} $\mid$ f1-score: \textit{0.18}

If \textcolor{blue}{Gender}=\textcolor{gray}{male} and \textcolor{blue}{Age\_Cat}=\textcolor{gray}{adult} then \textcolor{red}{Died}=\textcolor{gray}{0} $\mid$ f1-score: \textit{0.18}

If \textcolor{blue}{Passenger\_Cat}=\textcolor{gray}{3rd\_class} then \textcolor{red}{Died}=\textcolor{

In [12]:
ids.clf.default_class

'0'

In [31]:
ids.clf.default_class_confidence

1.0

In [49]:
pred = np.array(ids.predict(quant_dataframe_train)).astype(float)
real = data_train["Died"].astype(float)

roc_auc_score(pred, real)

0.5726744186046512

## Evaluating the IDS model

In [42]:
print("Accuracy on train data: ", ids.score(quant_dataframe_train))
print("Accuracy on test data: ", ids.score(quant_dataframe_test))

Accuracy on train data:  0.6420454545454546
Accuracy on test data:  0.6834634492547906


In [43]:
print("AUC on train data: ", ids.score_auc(quant_dataframe_train, confidence_based=False))
print("AUC on test data: ", ids.score_auc(quant_dataframe_test, confidence_based=False))

AUC on train data:  0.5
AUC on test data:  0.5


In [44]:
print("AUC on train data: ", ids.score_auc(quant_dataframe_train, confidence_based=True))
print("AUC on test data: ", ids.score_auc(quant_dataframe_test, confidence_based=True))

AUC on train data:  0.4996839443742098
AUC on test data:  0.49391694740523917


In [17]:
ids.score_interpretability_metrics(quant_dataframe_train)

{'fraction_overlap': 0.1932334710743801,
 'fraction_classes': 1.0,
 'fraction_uncovered': 0.005681818181818121,
 'average_rule_width': 1.8181818181818181,
 'ruleset_length': 11}

In [18]:
ids.score_interpretability_metrics(quant_dataframe_test)

{'fraction_overlap': 0.19488999290276787,
 'fraction_classes': 1.0,
 'fraction_uncovered': 0.00709723207948898,
 'average_rule_width': 1.8181818181818181,
 'ruleset_length': 11}

In [88]:
def train_ids(lambda_array):
    ids = IDS()
    ids.fit(class_association_rules=rules, quant_dataframe=quant_dataframe_train, lambda_array=lambda_array)
    
    score_dict = dict()
    score_dict["acc_train"] = ids.score(quant_dataframe_train)
    score_dict["acc_test"] = ids.score(quant_dataframe_test)
    score_dict["auc_train_classbased"] = ids.score_auc(quant_dataframe_train, confidence_based=False)
    score_dict["auc_train_confbased"] = ids.score_auc(quant_dataframe_train, confidence_based=True)
    score_dict["auc_test_classbased"] = ids.score_auc(quant_dataframe_test, confidence_based=False) 
    score_dict["auc_test_confbased"] = ids.score_auc(quant_dataframe_test, confidence_based=True)
    
    pred1 = np.array(ids.predict(quant_dataframe_train))
    pred2 = np.array(ids.predict(quant_dataframe_train))
    
    pred_np = pred1.astype(float)
    real_np = data_train["Died"].astype(float)

    print(roc_auc_score(real_np, pred_np))
    
    real, pred = encode_label(data_train["Died"], pred2)
    print(pred)
    print(real)
    print(roc_auc_score(real, pred))
    
    print(pred_np == pred)
    
    score_dict.update(ids.score_interpretability_metrics(quant_dataframe_test))
    
    return score_dict
    

In [89]:
train_ids([1, 1, 1, 1, 1, 1, 1])

0.6449992976541649
[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]
[0 0 0 0 0 1 0 0 0 0 0 0 1 0 1 1 0 0 0 1 1 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0
 1 1 1 1 0 1 1 0 0 0 1 0 1 0 1 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 1 1 0 1 0 1
 1 1 1 1 1 0 0 1 0 0 0 1 0 0 0 0 1 0 0 1 0 0 0 0 0 1 0 1 1 1 1 1 1 0 0 0 0
 0 1 1 1 1 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 

{'acc_train': 0.7357954545454546,
 'acc_test': 0.7750177430801988,
 'auc_train_classbased': 0.5,
 'auc_train_confbased': 0.31517769349627756,
 'auc_test_classbased': 0.5,
 'auc_test_confbased': 0.31170610813497546,
 'fraction_overlap': 0.1332758795498327,
 'fraction_classes': 1.0,
 'fraction_uncovered': 0.0,
 'average_rule_width': 1.875,
 'ruleset_length': 8}

In [104]:
def encode_label(actual, predicted):
    levels = set(actual) | set(predicted)
    
    actual_copy = np.copy(actual)
    predicted_copy = np.copy(predicted)

    for idx, level in enumerate(levels):
        print(idx, level)
        actual_copy[actual == level] = idx
        predicted_copy[predicted == level] = idx

    actual_copy = actual_copy.astype(int)
    predicted_copy = predicted_copy.astype(int)
        
    return actual_copy, predicted_copy

In [97]:
ids = IDS()
ids.fit(class_association_rules=rules, quant_dataframe=quant_dataframe_train)

<pyids.algorithms.ids.IDS at 0x26acb5f8cc0>

In [105]:
pred_enc

array(['1', '1', '1', '1', '1', '1', '1', '1', '1', '0', '1', '1', '0',
       '1', '0', '1', '1', '1', '0', '1', '1', '1', '1', '1', '1', '1',
       '1', '1', '0', '1', '0', '1', '1', '1', '1', '0', '0', '1', '1',
       '0', '1', '1', '0', '1', '1', '1', '1', '0', '1', '0', '1', '0',
       '1', '0', '1', '1', '0', '0', '1', '1', '1', '1', '1', '1', '1',
       '1', '1', '1', '1', '1', '0', '1', '1', '0', '0', '0', '0', '1',
       '1', '1', '1', '0', '0', '1', '1', '1', '1', '1', '1', '1', '0',
       '1', '1', '1', '1', '1', '1', '1', '1', '0', '1', '0', '1', '1',
       '1', '0', '0', '1', '1', '1', '0', '1', '1', '0', '1', '0', '1',
       '1', '1', '0', '0', '1', '1', '1', '1', '1', '1', '1', '1', '1',
       '1', '1', '1', '1', '0', '1', '1', '1', '1', '1', '1', '1', '1',
       '1', '1', '0', '1', '1', '1', '1', '1', '1', '1', '1', '0', '1',
       '0', '1', '1', '1', '1', '0', '1', '0', '1', '1', '1', '1', '0',
       '1', '1', '1', '0', '1', '1', '0', '0', '1', '0', '1', '1

In [106]:
pred1 = np.array(ids.predict(quant_dataframe_train))
pred2 = np.array(ids.predict(quant_dataframe_train))

pred_np = pred1.astype(float)
real_np = data_train["Died"].astype(float)

print(roc_auc_score(real_np, pred_np))

real_enc, pred_enc = encode_label(data_train["Died"], pred2)
print(roc_auc_score(real_enc, pred_enc))

print(pred_np == pred_enc)

0.6964461300744487
0 1
1 0
0.6964461300744487
[False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 Fals

In [73]:
pred1 = np.array(ids.predict(quant_dataframe_train))
pred2 = np.array(ids.predict(quant_dataframe_train))
real = data_train["Died"]

In [76]:
pred1

array(['1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1',
       '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1',
       '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1',
       '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1',
       '1', '1', '1', '1', '0', '1', '1', '1', '1', '1', '1', '1', '1',
       '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '0', '1', '1',
       '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1',
       '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1',
       '1', '1', '0', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1',
       '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1',
       '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1',
       '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1',
       '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1',
       '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1

In [79]:
encode_label(real, pred)

(array([0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0,
        0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1,
        0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0,
        0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0,
        0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0,
        1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1,
        1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1,
        1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1,
        0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1,
        1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0,
        0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1,
        1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 

In [27]:
print(data_train["Died"].value_counts(True))
print(data_test["Died"].value_counts(True))

1    0.642045
0    0.357955
Name: Died, dtype: float64
1    0.676366
0    0.323634
Name: Died, dtype: float64
