In [2]:
# Execute the project's main.py (two directories above this notebook) inside
# the current IPython session before anything else in the notebook runs.
%run ../../main.py

In [4]:
import collections

In [3]:
from cba.algorithms import top_rules
from cba.data_structures import TransactionDB, Consequent, Antecedent, Item, ClassAssocationRule
from cba.algorithms import M1Algorithm, M2Algorithm
import pandas as pd
from sklearn.metrics import accuracy_score

# Root folder of the association-rule datasets; the train/ and test/ CSV splits
# that func() reads live under this path.
directory = "c:/code/python/machine_learning/assoc_rules"

def func(datasetname, data_dir=None, max_rules=1000):
    """Train the M1 and M2 CBA classifiers on one dataset fold and score both.

    Reads ``<data_dir>/train/<datasetname>.csv`` and
    ``<data_dir>/test/<datasetname>.csv``, mines candidate rules from the
    training transactions with ``top_rules``, wraps each one in a
    ``ClassAssocationRule``, sorts the rules in descending order, keeps at
    most ``max_rules`` of them, builds one classifier with ``M1Algorithm`` and
    one with ``M2Algorithm``, and evaluates both on the test transactions.

    Parameters
    ----------
    datasetname : str
        File stem of the fold to evaluate, e.g. ``"anneal0"``.
    data_dir : str, optional
        Folder that contains the ``train`` and ``test`` sub-folders. When
        omitted, the notebook-level ``directory`` is used (looked up at call
        time).
    max_rules : int or None, optional
        Upper bound on the number of sorted rules handed to the classifier
        builders (default 1000). ``None`` keeps every rule.

    Returns
    -------
    tuple
        ``(acc, accM2)`` -- the ``accuracy_score`` of the M1 classifier and of
        the M2 classifier on the test fold.
    """
    if data_dir is None:
        data_dir = directory

    train_path = "{}/train/{}.csv".format(data_dir, datasetname)
    test_path = "{}/test/{}.csv".format(data_dir, datasetname)

    txns = TransactionDB.from_pandasdf(pd.read_csv(train_path))
    txns_test = TransactionDB.from_pandasdf(pd.read_csv(test_path))

    rules = top_rules(txns.string_representation, appearance=txns.appeardict)

    # Kept from the original: the raw rule tuples are ordered before they are
    # converted, so the input order seen by the later cars.sort() is unchanged.
    rules.sort(reverse=True)

    cars = []
    for con_tmp, ant_tmp, support, confidence in rules:
        # Consequent and antecedent items arrive as "attribute=value" strings.
        con = Consequent(*con_tmp.split("="))
        ant = Antecedent([Item(*i.split("=")) for i in ant_tmp])
        cars.append(ClassAssocationRule(ant, con, support=support, confidence=confidence))

    cars.sort(reverse=True)

    # Slicing is a no-op when there are fewer than max_rules rules.
    cars = cars[:max_rules]

    print("len(rules)", len(cars))

    m1 = M1Algorithm(cars, txns)
    m2 = M2Algorithm(cars, txns)

    m1clf = m1.build()
    m2clf = m2.build()

    actual = [label.value for label in txns_test.class_labels]

    pred = m1clf.predict_all(txns_test)
    predM2 = m2clf.predict_all(txns_test)

    # accuracy_score is documented as (y_true, y_pred); the score itself is
    # symmetric, so the reported numbers do not change.
    accM2 = accuracy_score(actual, predM2)
    acc = accuracy_score(actual, pred)

    return acc, accM2



def mean_func(dataset_name, start=0, end=10):
    """Average the M1 and M2 accuracies of ``func`` over numbered folds.

    Fold names are ``dataset_name`` followed by each integer in
    ``range(start, end)`` (e.g. ``"anneal0"`` ... ``"anneal9"`` with the
    defaults). Every fold is evaluated with ``func`` and its two accuracies are
    printed as they become available.

    Returns
    -------
    tuple
        ``(mean M1 accuracy, mean M2 accuracy)`` across the evaluated folds.
    """
    fold_names = [dataset_name + repr(fold) for fold in range(start, end)]

    m1_scores, m2_scores = [], []

    for fold_name in fold_names:
        m1_acc, m2_acc = func(fold_name)
        print("done", fold_name, m1_acc)
        print("done m2", fold_name, m2_acc)
        m1_scores.append(m1_acc)
        m2_scores.append(m2_acc)

    m1_mean = sum(m1_scores) / len(m1_scores)
    m2_mean = sum(m2_scores) / len(m2_scores)

    return m1_mean, m2_mean


                
# Dataset names; with mean_func's defaults each one is read as the fold files
# <name>0 ... <name>9.
datasets = ["breast-w", "anneal", "hypothyroid", "ionosphere", "lymph", "vehicle", "autos", "diabetes", "glass", "heart-h", "tic-tac-toe", "australian"]    

# Collected (dataset, mean accuracy) pairs for the M1 and M2 classifiers.
means = []
meansM2 = []
# NOTE(review): this loop only evaluates "anneal"; the `datasets` list above is
# not iterated here.
for dataset in ["anneal"]:
    acc, accM2 = mean_func(dataset)
    print("*****")
    print("M1", dataset, acc)
    print("M2", dataset, accM2)
    print("******")
    
    means.append((dataset, acc))
    meansM2.append((dataset, accM2))
    
# Final summary of everything collected above.
print("M1")
print(means)
print("\nM2")
print(meansM2)

Running apriori with setting: confidence=0.5, support=0.0, minlen=2, maxlen=3, MAX_RULE_LEN=33
Rule count: 6309, Iteration: 1
Target rule count satisfied: 1000
len(rules) 1000
done anneal0 1.0
done m2 anneal0 1.0
Running apriori with setting: confidence=0.5, support=0.0, minlen=2, maxlen=3, MAX_RULE_LEN=33
Rule count: 6951, Iteration: 1
Target rule count satisfied: 1000
len(rules) 1000
done anneal1 1.0
done m2 anneal1 1.0
Running apriori with setting: confidence=0.5, support=0.0, minlen=2, maxlen=3, MAX_RULE_LEN=33
Rule count: 6535, Iteration: 1
Target rule count satisfied: 1000
len(rules) 1000
done anneal2 1.0
done m2 anneal2 1.0
Running apriori with setting: confidence=0.5, support=0.0, minlen=2, maxlen=3, MAX_RULE_LEN=33
Rule count: 6821, Iteration: 1
Target rule count satisfied: 1000
len(rules) 1000
done anneal3 0.989010989011
done m2 anneal3 0.989010989011
Running apriori with setting: confidence=0.5, support=0.0, minlen=2, maxlen=3, MAX_RULE_LEN=33
Rule count: 6745, Iteration: 1


In [4]:

# Turn the (dataset, mean accuracy) pairs into one-column frames indexed by
# dataset name: each pair becomes a single-element column, then the frame is
# transposed so datasets end up as rows.
m1df = pd.DataFrame({entry[0]: [entry[1]] for entry in means}).transpose()
m1df.columns = ["m1"]

m2df = pd.DataFrame({entry[0]: [entry[1]] for entry in meansM2}).transpose()
m2df.columns = ["m2"]

# Side-by-side M1 / M2 accuracies, persisted for later use.
acc_df = m1df.join(m2df)
acc_df.to_csv("../data/accuracies.csv")

# Presumably for reloading previously saved results instead of recomputing:
#acc_df = pd.read_csv("../data/accuracies.csv")



In [6]:
# Display the joined table of M1 and M2 accuracies.
acc_df

Unnamed: 0,m1,m2
anneal,0.991061,0.989924
australian,0.810039,0.855074
autos,0.800068,0.650337
breast-w,0.951343,0.951406
diabetes,0.707177,0.750085
glass,0.638078,0.699759
heart-h,0.660189,0.663875
hypothyroid,0.985692,0.985161
ionosphere,0.925411,0.925411
lymph,0.789127,0.79621


In [8]:
# Load the ARC accuracies and reduce them to a single "ARC" column indexed by
# dataset name so they can be joined onto the M1/M2 table.
ARC_accs = pd.read_csv("../data/arc_accs3.csv")

ARC_accs_mod = (
    ARC_accs[["dataset", "accuracy"]]
    .set_index("dataset")
    .rename(columns={"accuracy": "ARC"})
)

accs = acc_df.join(ARC_accs_mod)

# Mean accuracy of each classifier over the joined table.
for label, column in (("M1", "m1"), ("M2", "m2"), ("ARC", "ARC")):
    print(label, accs[column].mean())

M1 0.8252761948168871
M2 0.828101175891974
ARC 0.8125


In [9]:
# Display the M1 / M2 / ARC accuracy comparison table.
accs

Unnamed: 0,m1,m2,ARC
anneal,0.991061,0.989924,0.94
australian,0.810039,0.855074,0.86
autos,0.800068,0.650337,0.7
breast-w,0.951343,0.951406,0.97
diabetes,0.707177,0.750085,0.74
glass,0.638078,0.699759,0.71
heart-h,0.660189,0.663875,0.82
hypothyroid,0.985692,0.985161,0.97
ionosphere,0.925411,0.925411,0.91
lymph,0.789127,0.79621,0.81
