In [75]:
import pandas as pd
from pyarc import CBA, TransactionDB
from pyarc.algorithms import top_rules, createCARs, M1Algorithm

from sklearn.metrics import roc_auc_score

import matplotlib.pyplot as plt

%matplotlib inline

In [19]:
# Load the Titanic passenger table from a CSV located two directories above the notebook.
balanced_data = pd.read_csv("../../titanic.csv")

# Show how many rows fall into each value of the "Died" label column.
balanced_data["Died"].value_counts()

1    1179
0     582
Name: Died, dtype: int64

In [49]:
def make_imbalanced_data(balanced_data, class_name, class_value, prop=9, random_state=None):
    """Skew a frame's class distribution by oversampling one class.

    Rows whose ``class_name`` column equals ``class_value`` are resampled with
    replacement and appended after the original rows.

    Parameters
    ----------
    balanced_data : pandas.DataFrame
        Source frame. It is not modified.
    class_name : str
        Name of the column holding the class label.
    class_value
        Label of the class whose rows are duplicated.
    prop : float, default 9
        Passed to ``DataFrame.sample`` as ``frac``: the number of extra rows
        to add, expressed as a multiple of the rows already in that class.
    random_state : optional, default None
        Forwarded unchanged to ``DataFrame.sample``. Pass an int to make the
        draw reproducible; ``None`` keeps the previous unseeded behaviour.

    Returns
    -------
    pandas.DataFrame
        The original rows followed by the sampled rows, re-indexed 0..n-1.
    """
    class_rows = balanced_data[balanced_data[class_name] == class_value]

    extra_rows = class_rows.sample(frac=prop, replace=True, random_state=random_state)

    # ignore_index gives the result unique row labels; without it the labels
    # of the appended rows collide with those of the original rows.
    return pd.concat([balanced_data, extra_rows], ignore_index=True)

In [67]:
# Oversample the Died == 1 rows using the helper's default prop=9.
# The helper draws its sample without a seed, so the exact rows picked differ between runs.
titanic_imbalanced = make_imbalanced_data(balanced_data, "Died", 1)

# Show the resulting class shares as fractions of all rows.
titanic_imbalanced["Died"].value_counts(normalize=True)

1    0.952958
0    0.047042
Name: Died, dtype: float64

In [68]:
# Convert the oversampled frame into pyarc's transaction representation.
txns = TransactionDB.from_DataFrame(titanic_imbalanced)

# NOTE(review): support/confidence are presumably the minimum thresholds used
# when mining rules — confirm against the pyarc documentation.
cba = CBA(support=0.01, confidence=0.6)
cba.fit(txns)

<pyarc.cba.CBA at 0x16ac142f6d8>

In [69]:
# Display the rules held by the fitted CBA classifier.
cba.clf.rules

[CAR {Passenger_Cat=2nd_class,Gender=male,Age_Cat=adult} => {Died=1} sup: 0.10 conf: 0.99 len: 4, id: 98,
 CAR {Passenger_Cat=3rd_class,Gender=male,Age_Cat=adult} => {Died=1} sup: 0.25 conf: 0.98 len: 4, id: 102,
 CAR {Passenger_Cat=3rd_class,Gender=male} => {Died=1} sup: 0.27 conf: 0.98 len: 3, id: 103,
 CAR {Gender=male,Age_Cat=adult} => {Died=1} sup: 0.84 conf: 0.97 len: 3, id: 110,
 CAR {Passenger_Cat=3rd_class,Age_Cat=adult} => {Died=1} sup: 0.31 conf: 0.97 len: 3, id: 104,
 CAR {Passenger_Cat=crew} => {Died=1} sup: 0.42 conf: 0.97 len: 2, id: 109,
 CAR {Passenger_Cat=3rd_class} => {Died=1} sup: 0.35 conf: 0.97 len: 2, id: 105]

In [70]:
# Scored on the same transactions the model was fit on (no held-out split),
# so this is a training-set figure. Class Died == 1 makes up about 95% of the
# rows, so compare the score against that majority-class share.
cba.rule_model_accuracy(txns)

0.9595861623019722

In [80]:
# Alternative pipeline: top_rules mines rules from the string form of the
# transactions (it logs the settings it tries), createCARs wraps the result,
# and M1Algorithm builds a classifier from those rules and the transactions.
cars = createCARs(top_rules(txns.string_representation))
clf = M1Algorithm(cars, txns).build()

# Bare expression so the notebook displays the classifier object.
clf

Running apriori with setting: confidence=0.5, support=0.0, minlen=2, maxlen=3, MAX_RULE_LEN=4
Rule count: 217, Iteration: 1
Increasing maxlen 4
Running apriori with setting: confidence=0.5, support=0.0, minlen=2, maxlen=4, MAX_RULE_LEN=4
Rule count: 312, Iteration: 2
Decreasing confidence to 0.45
Running apriori with setting: confidence=0.45, support=0.0, minlen=2, maxlen=4, MAX_RULE_LEN=4
Rule count: 314, Iteration: 3
Decreasing confidence to 0.4
Running apriori with setting: confidence=0.4, support=0.0, minlen=2, maxlen=4, MAX_RULE_LEN=4
Rule count: 314, Iteration: 4
Decreasing confidence to 0.35000000000000003
Running apriori with setting: confidence=0.35000000000000003, support=0.0, minlen=2, maxlen=4, MAX_RULE_LEN=4
Rule count: 315, Iteration: 5
Decreasing confidence to 0.30000000000000004
Running apriori with setting: confidence=0.30000000000000004, support=0.0, minlen=2, maxlen=4, MAX_RULE_LEN=4
Rule count: 316, Iteration: 6
Decreasing confidence to 0.25000000000000006
Running a

<pyarc.algorithms.classifier.Classifier at 0x16abd9ea748>

In [83]:
# Evaluate the M1-built classifier on the same transactions it was built from.
# NOTE(review): the returned value is presumably accuracy — confirm against pyarc.
clf.test_transactions(txns)

0.954898157129001