In [3]:
%run ../../main.py
%matplotlib inline

In [4]:
import pandas as pd

from cba.algorithms import M1Algorithm, M2Algorithm, top_rules, createCARs 
from cba.data_structures import TransactionDB

In [9]:
#
#
# =========================
# Run-time check as a function of the supplied rules / instances
# =========================
#
# The input data set is doubled on every step while the number of candidate
# rules is capped at `rule_count`; for each size the classifier build is timed
# for M1 (on both transaction databases) and for M2.
#

import time

rule_count = 100

# One list entry per measured data-set size; turned into a DataFrame later on.
benchmark_data = {
    "input rows": [],
    "input rules": [],
    "output rules M1 pyARC": [],
    "output rules M1 pyARC unique": [],
    "output rules M2 pyARC": [],
    "time M1 pyARC": [],
    "time M1 pyARC unique": [],
    "time M2 pyARC": []
}

# Once a single M2 build gets slower than `m2_time_limit` seconds, M2 is
# skipped for all larger data sets.
stop_m2 = False
m2_time_limit = 0.5

number_of_iterations = 30

# The data set is doubled this many times (one measurement per doubling).
doubling_steps = 11

directory = "c:/code/python/machine_learning/assoc_rules"

dataset_name_benchmark = "lymph0"

pd_ds = pd.read_csv("{}/train/{}.csv".format(directory, dataset_name_benchmark))


def _time_builds(algorithm_cls, cars, transactions, iterations):
    """Build a classifier `iterations` times and average the results.

    Parameters
    ----------
    algorithm_cls : callable
        Called as ``algorithm_cls(cars, transactions)``; the result must offer
        ``build()`` returning an object with a ``rules`` sequence.
    cars : candidate rules handed to the algorithm.
    transactions : transaction database handed to the algorithm.
    iterations : int
        Number of repeated builds (must be > 0).

    Returns
    -------
    (float, float)
        Mean seconds per build and mean classifier size.
    """
    sizes = []
    # perf_counter is a monotonic high-resolution clock, unlike time.time(),
    # which can jump when the system clock is adjusted.
    started = time.perf_counter()
    for _ in range(iterations):
        clf = algorithm_cls(cars, transactions).build()
        # +1 presumably accounts for the default rule -- TODO confirm.
        sizes.append(len(clf.rules) + 1)
    elapsed = time.perf_counter() - started

    return elapsed / iterations, sum(sizes) / len(sizes)


for _step in range(doubling_steps):
    # Double the row count; note the first measurement is therefore taken on
    # twice the size of the CSV file.
    pd_ds = pd.concat([pd_ds, pd_ds])

    # NOTE(review): `txns` is built with unique_transactions=True and
    # `txns_unique` with False -- the names look inverted relative to the
    # flag; confirm which one is intended.
    txns = TransactionDB.from_DataFrame(pd_ds, unique_transactions=True)
    txns_unique = TransactionDB.from_DataFrame(pd_ds, unique_transactions=False)

    rules = top_rules(txns.string_representation, appearance=txns.appeardict, target_rule_count=rule_count)

    cars = createCARs(rules)

    # Keep the number of input rules constant across data-set sizes.
    if len(cars) > rule_count:
        cars = cars[:rule_count]

    m1duration, outputrules_m1 = _time_builds(M1Algorithm, cars, txns, number_of_iterations)
    m1duration_unique, outputrules_m1_unique = _time_builds(M1Algorithm, cars, txns_unique, number_of_iterations)

    # Remember whether M2 ran in THIS step before `stop_m2` is updated, so the
    # measurement that crosses the time limit is still recorded and printed.
    m2_ran = not stop_m2
    if m2_ran:
        m2duration, outputrules_m2 = _time_builds(M2Algorithm, cars, txns, number_of_iterations)
        if m2duration > m2_time_limit:
            stop_m2 = True
    else:
        m2duration = None
        outputrules_m2 = None

    benchmark_data["input rows"].append(len(txns))
    benchmark_data["input rules"].append(rule_count)
    benchmark_data["output rules M1 pyARC"].append(outputrules_m1)
    benchmark_data["output rules M1 pyARC unique"].append(outputrules_m1_unique)
    benchmark_data["output rules M2 pyARC"].append(outputrules_m2)
    benchmark_data["time M1 pyARC"].append(m1duration)
    benchmark_data["time M1 pyARC unique"].append(m1duration_unique)
    benchmark_data["time M2 pyARC"].append(m2duration)

    print("data_count:", len(txns))
    print("M1 duration:", m1duration)
    print("M1 unique duration", m1duration_unique)
    print("M1 output rules", outputrules_m1)
    if m2_ran:
        print("M2 duration:", m2duration)
        print("M2 output rules", outputrules_m2)
    print("\n\n")

Running apriori with setting: confidence=0.5, support=0.0, minlen=2, maxlen=3, MAX_RULE_LEN=19
Rule count: 2266, Iteration: 1
Target rule count satisfied: 100
data_count: 260
M1 duration: 0.01953581174214681
M1 unique duration 0.021085572242736817
M1 output rules 25.0
M2 duration: 0.04892378648122152
M2 output rules 25.0



Running apriori with setting: confidence=0.5, support=0.0, minlen=2, maxlen=3, MAX_RULE_LEN=19
Rule count: 2266, Iteration: 1
Target rule count satisfied: 100
data_count: 520
M1 duration: 0.03185047308603923
M1 unique duration 0.026241016387939454
M1 output rules 25.0
M2 duration: 0.09020005861918132
M2 output rules 25.0



Running apriori with setting: confidence=0.5, support=0.0, minlen=2, maxlen=3, MAX_RULE_LEN=19
Rule count: 2266, Iteration: 1
Target rule count satisfied: 100
data_count: 1040
M1 duration: 0.059152650833129886
M1 unique duration 0.04903545379638672
M1 output rules 25.0
M2 duration: 0.17896058559417724
M2 output rules 25.0



Running apriori with 

KeyboardInterrupt: 

In [None]:
# Turn the per-size measurement lists into a table (one row per data-set size).
benchmark_df = pd.DataFrame(benchmark_data)

# `x` has to be a single column label: a list makes pandas raise
# "x must be a label or position".
benchmark_df.plot(x="input rows", y=["time M1 pyARC", "time M2 pyARC"])

# Uncomment to persist the measurements:
#benchmark_df.to_csv("../data/data_sensitivity.csv")

In [None]:
# Display the benchmark table as the cell output.
benchmark_df

In [None]:
# Load the reference timings, turn the row counts into strings and use them
# as the index so the table can be joined on "input rows".
R_benchmark = (
    pd.read_csv("../data/arc-data-size.csv")
    .astype({"input rows": str})
    .set_index("input rows")
)

In [None]:
# Preview the first rows of the table read from arc-data-size.csv
# (presumably timings of the R implementations -- verify against the file).
R_benchmark.head()

In [None]:
# Stringify the row counts and make them the index of the benchmark table.
benchmark_df = (
    benchmark_df
    .astype({"input rows": str})
    .set_index("input rows")
)

In [None]:
# Index-on-index join of the two benchmark tables; columns present in both
# get the "_py" (left) / "_R" (right) suffix.
benchmark_all = benchmark_df.join(
    other=R_benchmark,
    lsuffix="_py",
    rsuffix="_R",
)
benchmark_all

In [None]:
import matplotlib.pyplot as plt

# Column -> legend text, kept together so the legend cannot drift out of sync
# with the plotted series. The previous label list had the last two entries in
# the opposite order to the plotted columns.
# NOTE(review): assumes "time_acba" holds the arulesCBA timings and
# "time_rcba" the rCBA timings (inferred from the column names) -- confirm.
series_labels = {
    "time M1 pyARC": "pyARC - m1",
    "time M2 pyARC": "pyARC - m2",
    "time_arc": "arc",
    "time_acba": "arulesCBA",
    "time_rcba": "rCBA",
}

labels = list(series_labels.values())

ax = benchmark_all.plot(y=list(series_labels))
ax.legend(labels)

# Saves the current figure, i.e. the one created by the plot call above.
plt.savefig("../data/data_size_sensitivity_plot.png")

In [None]:
# Same comparison with the default legend (raw column names).
benchmark_all.plot(
    y=[
        "time M1 pyARC",
        "time M2 pyARC",
        "time_arc",
        "time_acba",
        "time_rcba",
    ]
)