In [1]:
%run ../../main.py

In [59]:
from cba.algorithms import top_rules
from cba.data_structures import TransactionDB, Consequent, Antecedent, Item, ClassAssocationRule
from cba.algorithms import M1Algorithm, M2Algorithm
import pandas as pd
from sklearn.metrics import accuracy_score

# Root folder of the dataset files; the train/ and test/ CSV splits read
# below live under this path.
directory = "c:/code/python/machine_learning/assoc_rules"

def func(datasetname, max_rules=1000):
    """Train the M1 and M2 classifiers on one dataset fold and score both.

    Reads ``train/<datasetname>.csv`` and ``test/<datasetname>.csv`` from
    under the module-level ``directory``, obtains candidate rules from
    ``top_rules``, wraps each one in a ``ClassAssocationRule``, keeps at most
    ``max_rules`` of them after sorting, builds both classifiers on the
    training transactions and measures their accuracy on the test
    transactions.

    Parameters
    ----------
    datasetname : str
        File stem of the fold (the CSV name without extension).
    max_rules : int, optional
        Upper bound on the number of sorted rules handed to the classifiers.
        Defaults to 1000.

    Returns
    -------
    tuple
        ``(acc, accM2)``: test accuracy of the M1 classifier and of the M2
        classifier, in that order.
    """
    # Both splits share one layout: <directory>/<split>/<datasetname>.csv
    path_template = "{}/{}/{}.csv"

    pd_ds = pd.read_csv(path_template.format(directory, "train", datasetname))
    txns = TransactionDB.from_pandasdf(pd_ds)

    pd_ds_test = pd.read_csv(path_template.format(directory, "test", datasetname))
    txns_test = TransactionDB.from_pandasdf(pd_ds_test)

    rules = top_rules(txns.string_representation, appearance=txns.appeardict)

    # Kept from the original: list.sort is stable, so this ordering decides
    # how rules that compare equal are ordered by the sort of `cars` below.
    rules.sort(reverse=True)

    cars = []
    for con_tmp, ant_tmp, support, confidence in rules:
        # Consequent and antecedent items arrive as "attribute=value" strings.
        con = Consequent(*con_tmp.split("="))
        ant = Antecedent([Item(*i.split("=")) for i in ant_tmp])

        # NOTE(review): id_rule receives the antecedent length rather than a
        # unique identifier -- confirm this is intended.
        car = ClassAssocationRule(
            ant, con, support=support, confidence=confidence, id_rule=len(ant)
        )
        cars.append(car)

    cars.sort(reverse=True)
    cars = cars[:max_rules]

    print("len(rules)", len(cars))

    m1 = M1Algorithm(cars, txns)
    m2 = M2Algorithm(cars, txns)

    m1clf = m1.build()
    m2clf = m2.build()

    actual = [label.value for label in txns_test.class_labels]

    pred = m1clf.predict_all(txns_test)
    predM2 = m2clf.predict_all(txns_test)

    # accuracy_score expects (y_true, y_pred).
    acc = accuracy_score(actual, pred)
    accM2 = accuracy_score(actual, predM2)

    return acc, accM2



def mean_func(dataset_name, start=0, end=10):
    """Average the two accuracies returned by ``func`` over numbered folds.

    ``func`` is called once for every name ``dataset_name + str(i)`` with
    ``i`` in ``range(start, end)``; the two accuracies of each fold are
    printed as soon as that fold finishes.

    Returns the pair ``(mean of first accuracies, mean of second
    accuracies)``. An empty fold range ends in ``ZeroDivisionError``.
    """
    m1_scores, m2_scores = [], []

    for fold in range(start, end):
        fold_name = dataset_name + str(fold)
        m1_score, m2_score = func(fold_name)
        print("done", fold_name, m1_score)
        print("done m2", fold_name, m2_score)
        m1_scores.append(m1_score)
        m2_scores.append(m2_score)

    m1_mean = sum(m1_scores) / len(m1_scores)
    m2_mean = sum(m2_scores) / len(m2_scores)
    return m1_mean, m2_mean


                
# Names of the benchmark datasets.
# NOTE(review): this list is not used by the loop below, which iterates over a
# hard-coded ["sick"] instead -- confirm which set of datasets is intended.
datasets = ["breast-w", "anneal", "hypothyroid", "ionosphere", "lymph", "vehicle", "autos", "diabetes", "glass", "heart-h", "tic-tac-toe", "australian"]    

# Collected (dataset name, mean accuracy) pairs, one list per classifier.
means = []
meansM2 = []
# Only folds 3..9 are evaluated for each dataset here (start=3, end=10).
for dataset in ["sick"]:
    acc, accM2 = mean_func(dataset, start=3, end=10)
    print("*****")
    print("M1", dataset, acc)
    print("M2", dataset, accM2)
    print("******")
    
    means.append((dataset, acc))
    meansM2.append((dataset, accM2))
    
# Final summary of everything collected above.
print("M1")
print(means)
print("\nM2")
print(meansM2)

Running apriori with setting: confidence=0.5, support=0.0, minlen=2, maxlen=3, MAX_RULE_LEN=29
Rule count: 2827, Iteration: 1
Target rule count satisfied: 1000
len(rules) 1000
done sick3 0.957559681698
done m2 sick3 0.962864721485
Running apriori with setting: confidence=0.5, support=0.0, minlen=2, maxlen=3, MAX_RULE_LEN=29
Rule count: 2853, Iteration: 1
Target rule count satisfied: 1000
len(rules) 1000
done sick4 0.968169761273
done m2 sick4 0.968169761273
Running apriori with setting: confidence=0.5, support=0.0, minlen=2, maxlen=3, MAX_RULE_LEN=29
Rule count: 2848, Iteration: 1
Target rule count satisfied: 1000
len(rules) 1000
done sick5 0.973474801061
done m2 sick5 0.973474801061
Running apriori with setting: confidence=0.5, support=0.0, minlen=2, maxlen=3, MAX_RULE_LEN=29
Rule count: 2770, Iteration: 1
Target rule count satisfied: 1000
len(rules) 1000
done sick6 0.984084880637
done m2 sick6 0.984084880637
Running apriori with setting: confidence=0.5, support=0.0, minlen=2, maxlen=

In [60]:
"""
m2df = pd.DataFrame(dict(map(lambda i:  (i[0], [i[1]]), meansM2))).transpose()
m2df.columns = ["m2"]

m1df = pd.DataFrame(dict(map(lambda i:  (i[0], [i[1]]), means))).transpose()
m1df.columns = ["m1"]
"""

#acc_df = m1df.join(m2df)

#acc_df.to_csv("../data/accuracies.csv")

acc_df = pd.read_csv("../data/accuracies.csv")



0.8273412000811476
0.828698171981806


In [67]:
# The two commented-out statements below relabel the columns and make "name"
# the index of acc_df.
# NOTE(review): the stored output of this cell shows acc_df already indexed by
# "name", so they appear to have been run once and then disabled. Re-running
# them on a frame that has only the m1/m2 columns left would presumably fail
# (three labels for two columns) -- confirm before re-enabling.
#acc_df.columns = ["name", "m1", "m2"]

#acc_df.set_index("name", inplace=True)

# Display the accuracy table.
acc_df

Unnamed: 0_level_0,m1,m2
name,Unnamed: 1_level_1,Unnamed: 2_level_1
anneal,0.991061,0.989924
australian,0.80714,0.862259
autos,0.794512,0.644781
breast-w,0.951343,0.951406
diabetes,0.695437,0.756545
glass,0.676144,0.699997
heart-h,0.656617,0.663875
hypothyroid,0.985692,0.985161
ionosphere,0.925411,0.925411
lymph,0.789127,0.79621


In [83]:
# Compare the M1/M2 accuracies in acc_df with the ARC accuracies per dataset.
ARC_accs = pd.read_csv("../data/arc_accs3.csv")

# Keep only the accuracy column, keyed by dataset name and relabelled "ARC".
ARC_accs_mod = (
    ARC_accs[["dataset", "accuracy"]]
    .set_index("dataset")
    .rename(columns={"accuracy": "ARC"})
)

# DataFrame.join aligns on the index of both frames (left join by default).
accs = acc_df.join(ARC_accs_mod)

# Mean accuracy of each method over the rows of the joined table.
print("M1", accs.loc[:, "m1"].mean())
print("M2", accs.loc[:, "m2"].mean())
print("ARC", accs.loc[:, "ARC"].mean())

M1 0.8273412000811476
M2 0.828698171981806
ARC 0.8125
