In [1]:
%run ../../main.py

In [3]:
from cba.algorithms import top_rules
from cba.data_structures import TransactionDB, Consequent, Antecedent, Item, ClassAssocationRule
from cba.algorithms import M1Algorithm, M2Algorithm, createCARs
import pandas as pd
from sklearn.metrics import accuracy_score

directory = "c:/code/python/machine_learning/assoc_rules"

def func(datasetname, data_dir=None, max_rules=1000):
    """Train the M1 and M2 classifiers on one train/test split and score both.

    Reads ``<data_dir>/train/<datasetname>.csv`` and
    ``<data_dir>/test/<datasetname>.csv``, mines rules from the training
    transactions with ``top_rules``, converts them with ``createCARs``, keeps
    at most ``max_rules`` of them after a descending sort, builds one
    classifier with ``M1Algorithm`` and one with ``M2Algorithm``, and measures
    the accuracy of each on the test transactions.

    Parameters
    ----------
    datasetname : str
        File stem (without ``.csv``) shared by the train and the test file.
    data_dir : str, optional
        Folder that contains the ``train`` and ``test`` sub-folders. When
        omitted, the notebook-level ``directory`` constant is used, which
        yields exactly the paths that were previously hard-coded here.
    max_rules : int, optional
        Upper bound on the number of class association rules handed to the
        classifier builders. Defaults to 1000.

    Returns
    -------
    tuple
        ``(acc, accM2)``: test accuracy of the M1 classifier followed by the
        test accuracy of the M2 classifier.
    """
    if data_dir is None:
        data_dir = directory

    # Plain "/" formatting (rather than os.path.join) keeps the resulting
    # paths byte-identical to the ones the notebook used before.
    train_path = "{}/train/{}.csv".format(data_dir, datasetname)
    test_path = "{}/test/{}.csv".format(data_dir, datasetname)

    pd_ds = pd.read_csv(train_path)
    txns = TransactionDB.from_pandasdf(pd_ds)
    txns_test = TransactionDB.from_pandasdf(pd.read_csv(test_path))

    rules = top_rules(txns.string_representation, appearance=txns.appeardict)
    rules.sort(reverse=True)

    cars = createCARs(rules)
    # Descending sort; the ordering itself is whatever the rule objects'
    # comparison operators define.
    cars.sort(reverse=True)

    if len(cars) > max_rules:
        cars = cars[:max_rules]

    # The label says "rules", but the number printed is the count of CARs
    # actually passed on; the text is left as-is so logs stay comparable.
    print("len(rules)", len(cars))

    # Both builders are constructed before either one is built, as before.
    m1 = M1Algorithm(cars, txns)
    m2 = M2Algorithm(cars, txns)

    m1clf = m1.build()
    m2clf = m2.build()

    actual = [label.value for label in txns_test.class_labels]

    pred = m1clf.predict_all(txns_test)
    predM2 = m2clf.predict_all(txns_test)

    # accuracy_score expects (y_true, y_pred); the score is the same either
    # way, but the documented order is used to avoid confusion.
    acc = accuracy_score(actual, pred)
    accM2 = accuracy_score(actual, predM2)

    return acc, accM2



def mean_func(dataset_name, start=0, end=10):
    """Average the M1 and M2 accuracies over the numbered splits of a dataset.

    For every index ``i`` in ``range(start, end)`` the split named
    ``dataset_name + str(i)`` is evaluated with ``func``; the two accuracies
    it returns are printed and collected.

    Parameters
    ----------
    dataset_name : str
        Common prefix of the split names.
    start, end : int
        Half-open range of split indices to evaluate.

    Returns
    -------
    tuple
        ``(mean M1 accuracy, mean M2 accuracy)`` as plain arithmetic means.

    Raises
    ------
    ZeroDivisionError
        If the index range is empty, because there is nothing to average.
    """
    m1_scores, m2_scores = [], []

    for fold in range(start, end):
        fold_name = dataset_name + str(fold)
        m1_acc, m2_acc = func(fold_name)

        print("done", fold_name, m1_acc)
        print("done m2", fold_name, m2_acc)

        m1_scores.append(m1_acc)
        m2_scores.append(m2_acc)

    m1_mean = sum(m1_scores) / len(m1_scores)
    m2_mean = sum(m2_scores) / len(m2_scores)
    return m1_mean, m2_mean


                
# Names of the benchmark datasets available for this experiment.
datasets = [
    "breast-w", "anneal", "hypothyroid", "ionosphere",
    "lymph", "vehicle", "autos", "diabetes",
    "glass", "heart-h", "tic-tac-toe", "australian",
    "sick", "segment", "spambase", "sonar",
    "vowel", "hepatitis", "credit-a", "mushroom",
    "house-votes-84", "soybean", "primary-tumor", "credit-g",
    "audiology", "breast-cancer", "balance-scale", "heart-c",
    "kr-vs-kp", "pima", "heart-statlog",
]

# Per-dataset (name, mean accuracy) pairs, one list per classifier.
means, meansM2 = [], []

# Only "mushroom" is evaluated here; `datasets` above is not iterated.
# NOTE(review): the saved outputs of later cells list other datasets, so they
# presumably come from a different run; confirm before relying on them.
for dataset in ["mushroom"]:
    acc, accM2 = mean_func(dataset)

    print("*****")
    print("M1", dataset, acc)
    print("M2", dataset, accM2)
    print("******")

    means.append((dataset, acc))
    meansM2.append((dataset, accM2))

# Final summary of everything collected above.
print("M1")
print(means)
print("\nM2")
print(meansM2)

Running apriori with setting: confidence=0.5, support=0.0, minlen=2, maxlen=3, MAX_RULE_LEN=23
Rule count: 4892, Iteration: 1
Target rule count satisfied: 1000
len(rules) 1000
done mushroom0 0.9987699877
done m2 mushroom0 0.993849938499
Running apriori with setting: confidence=0.5, support=0.0, minlen=2, maxlen=3, MAX_RULE_LEN=23
Rule count: 4890, Iteration: 1
Target rule count satisfied: 1000
len(rules) 1000
done mushroom1 0.9975399754
done m2 mushroom1 0.993849938499
Running apriori with setting: confidence=0.5, support=0.0, minlen=2, maxlen=3, MAX_RULE_LEN=23
Rule count: 4893, Iteration: 1
Target rule count satisfied: 1000
len(rules) 1000
done mushroom2 0.9987699877
done m2 mushroom2 0.9963099631
Running apriori with setting: confidence=0.5, support=0.0, minlen=2, maxlen=3, MAX_RULE_LEN=23
Rule count: 4890, Iteration: 1
Target rule count satisfied: 1000
len(rules) 1000
done mushroom3 1.0
done m2 mushroom3 0.9963099631
Running apriori with setting: confidence=0.5, support=0.0, minlen

In [17]:

# Turn each list of (dataset, mean accuracy) pairs into a one-column frame
# indexed by dataset name: the dict yields one single-row column per dataset,
# and the transpose flips that into one row per dataset.
m1df = pd.DataFrame({name: [score] for name, score in means}).T
m1df.columns = ["m1"]

m2df = pd.DataFrame({name: [score] for name, score in meansM2}).T
m2df.columns = ["m2"]

# Align both result columns on the dataset index and persist the table.
acc_df = m1df.join(m2df)
acc_df.to_csv("../data/accuracies.csv")

# To reuse the saved table instead of re-running the experiments
# (index_col=0 restores the dataset names as the index):
#acc_df = pd.read_csv("../data/accuracies.csv", index_col=0)



In [18]:
# Display the per-dataset accuracy table (columns "m1" and "m2").
acc_df

Unnamed: 0,m1,m2
anneal,0.991061,0.989924
audiology,0.492008,0.451806
australian,0.853562,0.855074
autos,0.804456,0.649543
balance-scale,0.742104,0.73729
breast-cancer,0.720501,0.696117
breast-w,0.952772,0.954222
credit-a,0.818922,0.833312
credit-g,0.71,0.74
diabetes,0.744805,0.750085


In [19]:
# Load the ARC accuracies and reduce them to a single "ARC" column indexed by
# dataset name, so they can be joined onto the M1/M2 table.
ARC_accs = pd.read_csv("../data/arc_accs3.csv")

ARC_accs_mod = ARC_accs.set_index("dataset")[["accuracy"]]
ARC_accs_mod.columns = ["ARC"]

# Index-aligned join: every row of acc_df is kept and gains its ARC value.
accs = acc_df.join(ARC_accs_mod)

# Mean accuracy of each classifier over the joined table.
for label, column in (("M1", "m1"), ("M2", "m2"), ("ARC", "ARC")):
    print(label, accs[column].mean())

M1 0.8061054835913428
M2 0.7880112605256341
ARC 0.7777419354838709


In [20]:
# Display the joined table: "m1" and "m2" accuracies next to the "ARC" column.
accs

Unnamed: 0,m1,m2,ARC
anneal,0.991061,0.989924,0.94
audiology,0.492008,0.451806,0.59
australian,0.853562,0.855074,0.86
autos,0.804456,0.649543,0.7
balance-scale,0.742104,0.73729,0.69
breast-cancer,0.720501,0.696117,0.73
breast-w,0.952772,0.954222,0.97
credit-a,0.818922,0.833312,0.85
credit-g,0.71,0.74,0.74
diabetes,0.744805,0.750085,0.74
