In [3]:
%run ../../main.py
%matplotlib inline
#%run ../visualization/CBA_visualization.ipynb

import pandas as pd
import pyarc

from pyarc import TransactionDB

In [4]:
movies_train_undiscr = pd.read_csv("../data/movies.csv", sep=";", index_col=0)
movies_train_discr = pd.read_csv("../data/movies_discr.csv", sep=";", index_col=0)

movies_undiscr_txns = movies_train_undiscr.reset_index()
movies_discr_txns = TransactionDB.from_DataFrame(movies_train_discr)

movies_train_discr.head()

Unnamed: 0,estimated-budget,a-list-celebrities,class
0,<150;200),<0;2),box-office-bomb
1,<50;100),<0;2),box-office-bomb
2,<50;100),<0;2),box-office-bomb
3,<50;100),<2;4),box-office-bomb
4,<200;250),<0;2),box-office-bomb


In [5]:
movies_undiscr_txns

Unnamed: 0,estimated-budget,a-list-celebrities,class
0,160,1,box-office-bomb
1,55,2,box-office-bomb
2,78,1,box-office-bomb
3,66,3,box-office-bomb
4,223,2,box-office-bomb
5,152,1,box-office-bomb
6,10,1,box-office-bomb
7,202,3,box-office-bomb
8,154,4,box-office-bomb
9,132,2,box-office-bomb


In [6]:
from pyarc.data_structures import Transaction

class UndiscretizedTransaction(Transaction):
    
    def issuperset(self, other):
        return self.frozenset >= other.frozenset
        
    def issubset(self, other):
        return self.frozenset <= other.frozenset 
        
    def __ge__(self, other):
        return self.issuperset(other)
        
    def __le__(self, other):
        return self.issubset(other)
    
    pass

In [7]:
import re
import numpy as np


def make_intervalfunc(minv, maxv, left_inclusivity, right_inclusivity):
    def inner_func(value):
        if greaterthan(value, minv, left_inclusivity) and lesserthan(value, maxv, right_inclusivity):
            return True
        else:
            return False
        
    return inner_func
        
def greaterthan(a, b, inclusivity):
    if inclusivity:
        if a >= b: return True
    elif a > b: return True
    
    return False
        
def lesserthan(a, b, inclusivity):
    if inclusivity:
        if a <= b: return True
    elif a < b: return True
    
    return False


class Interval:

    interval_regex = re.compile("(<|\()(\d+);(\d+)(\)|>)")
    
    def __init__(self, interval_string="<0;0)"):
        try:
            args = Interval.interval_regex.findall(interval_string)[0]
            self.left_bracket, self.minval, self.maxval, self.right_bracket = args
            
            self.left_inclusive = True if self.left_bracket == "<" else False
            self.right_inclusive = True if self.right_bracket == ">" else False
            
            self.minval, self.maxval = float(self.minval), float(self.maxval)
            
            self.__membership_func = np.vectorize(
                make_intervalfunc(self.minval, self.maxval, self.left_inclusive, self.right_inclusive)
            )
            
        except Exception as e:
            raise e
            
    @classmethod        
    def from_scalars(clazz, minval, maxval, left_inc, right_inc):
        """rework this as the default constructor
        """
        interval_string = "{}{};{}{}".format(
            "<" if left_inc else "(",
            minval,
            maxval,
            ">" if right_inc else ")"
        )
        
        return clazz(interval_string)
    
    def __hash__(self):
        return hash(repr(self))
            
    def refit(self, vals):
        """refit values to grid
        """
        values = np.array(vals)
        
        mask = self.test_membership(values)
        new_array = values[mask]

        left, right = min(new_array), max(new_array)

        return Interval.from_scalars(left, right, True, True)
        
            
    def test_membership(self, value):
        return self.__membership_func(value)
        

    def string(self):
        return "{}{};{}{}".format(self.left_bracket, self.minval, self.maxval, self.right_bracket)
        
    def __repr__(self):
        return "Interval[{}{};{}{}]".format(self.left_bracket, self.minval, self.maxval, self.right_bracket)
        

class QCBAStep:
    
    def transform(self):
        pass
    


i1 = Interval("<30;40)")
i2 = Interval("(30;40>")

assert i1.test_membership(30) == True
assert i1.test_membership(35) == True
assert i1.test_membership(40) == False

assert i2.test_membership(30) == False
assert i2.test_membership(35) == True
assert i2.test_membership(40) == True


arr1 = [32, 33, 35, 36, 38]

i1.refit(arr1)

#i1.intersect_values(arr1)


Interval[<32.0;38.0>]

In [8]:
from pyarc import CBA
from pyarc.algorithms import generateCARs
import functools

rm = CBA(algorithm="m1", confidence=0.5, support=0.05).fit(movies_discr_txns)

rm.clf.rules

cars = generateCARs(movies_discr_txns)


movies_discr_txns.string_representation


def make_literal_dict(string_repr):
    literal_dict = {}
    
    for tid, transaction in enumerate(string_repr):
        for literal in transaction:
            if literal in literal_dict:
                literal_dict[literal].add(tid)
            else:
                literal_dict[literal] = set([tid])
                
                
    return literal_dict


def reduce_intersection(prev, curr):
    return prev.intersection(curr)


def calculate_supp_conf(ant_list, cons, litdict):
    
    fetch_from_litdict = lambda item: litdict[item]
    
    ant_tid = map(fetch_from_litdict, ant_list)
    cons_tid = litdict[cons]
    
    ant_all_tid = set(functools.reduce(reduce_intersection, ant_tid))
    
    supp_r = len(ant_all_tid.intersection(cons_tid))
    supp_ant = len(ant_all_tid)
    
    
    conf_r = supp_r / supp_ant
    
    return supp_r / n, conf_r
    
    
def prune_literals(rule, litdict, n):

    if len(rule.antecedent) <= 1:
        return rule
    
    idx_to_remove = []
    
    ant = list(rule.antecedent)
    ant = [ ":=:".join(item) for item in ant  ]
    
    cons = ":=:".join(rule.consequent)
    
    
    fetch_from_litdict = lambda item: litdict[item]
    
    ant_tid = [ litdict[item] for item in ant ]
    cons_tid = litdict[cons]
    
    ant_all_tid = set(functools.reduce(reduce_intersection, ant_tid))
    
    all_intersection_tid = ant_all_tid.intersection(cons_tid) 
    
    supp_r = len(all_intersection_tid)
    supp_ant = len(ant_all_tid)
    
    
    conf_r = supp_r / supp_ant
    
    new_supp_r = 0
    new_conf_r = 0
    

    for idx_literal, literal in enumerate(ant_tid):
        new_ant_all_tid = literal - ant_all_tid
        
        if not new_ant_all_tid:
            continue
        
        new_supp_r = literal - all_intersection_tid
        new_conf_r = len(all_intersection_tid) / len(new_ant_all_tid)
        
        if new_conf_r > conf_r:
            print("changed")
            conf_r = new_conf_r
            all_intersection_tid = new_supp_r
            ant_all_tid = new_ant_all_tid
    
            idx_to_remove.append(idx_literal)
        
    
    if not idx_to_remove:
        return rule
    
    
    
    antecedent = list(r.antecedent)
    new_items = []
    
    for idx in idx_to_remove:
        antecedent.pop(idx)
    
    for item in antecedent:
        new_items.append(pyarc.data_structures.Item(*item))
        
    new_antecedent = pyarc.data_structures.Antecedent(new_items)
    
    rule.antecedent = new_antecedent
    rule.support = len(new_supp_r) / n
    rule.confidence = new_conf_r
    
    return rule
    
    
    
    
    
    
    
    

litdict = make_literal_dict(movies_discr_txns.string_representation)
transaction_len = len(movies_discr_txns.string_representation) 


for idx, r in enumerate(cars):
    before = repr(r)
    after = repr(prune_literals(r, litdict, transaction_len))
    
    
    if before == after:
        print(idx, "same")
        print(before)
        print(after)
    else:
        print(idx, "different")
        print(before)
        print(after)
        
    print()

0 same
CAR {a-list-celebrities=<0;2)} => {class=box-office-bomb} sup: 0.31 conf: 1.00 len: 2, id: 48
CAR {a-list-celebrities=<0;2)} => {class=box-office-bomb} sup: 0.31 conf: 1.00 len: 2, id: 48

1 same
CAR {estimated-budget=<0;50),a-list-celebrities=<0;2)} => {class=box-office-bomb} sup: 0.11 conf: 1.00 len: 3, id: 41
CAR {estimated-budget=<0;50),a-list-celebrities=<0;2)} => {class=box-office-bomb} sup: 0.11 conf: 1.00 len: 3, id: 41

2 same
CAR {estimated-budget=<50;100),a-list-celebrities=<0;2)} => {class=box-office-bomb} sup: 0.09 conf: 1.00 len: 3, id: 47
CAR {estimated-budget=<50;100),a-list-celebrities=<0;2)} => {class=box-office-bomb} sup: 0.09 conf: 1.00 len: 3, id: 47

3 same
CAR {estimated-budget=<250;300)} => {class=main-stream-hit} sup: 0.06 conf: 1.00 len: 2, id: 25
CAR {estimated-budget=<250;300)} => {class=main-stream-hit} sup: 0.06 conf: 1.00 len: 2, id: 25

4 same
CAR {a-list-celebrities=<0;2),estimated-budget=<150;200)} => {class=box-office-bomb} sup: 0.06 conf: 1.00

In [21]:
from pyarc.data_structures import ClassAssocationRule

class QuantClassAssociationRule:
    
    def __init__(self, rule):
        
        if type(rule) != ClassAssocationRule:
            raise Exception("Type of rule must be: ClassAssocationRule")
        
        self.rule = rule
    
        antecedent_string = rule.antecedent
        print(antecedent_string)
        
        self.new_antecedent = []
        
        for item in antecedent_string:
            print()
            attr, val = item
            
            interval = Interval(val)
            
            print(attr, interval)
            
            self.new_antecedent.append((attr, interval))
    
        print("========", "=======", sep="\n")
        
        
    def __repr__(self):
        r = self.rule
        
        ant = self.new_antecedent
        ant_string_arr = [ key + "=" + val.string() for key, val in ant ]
        ant_string = "{" + ",".join(ant_string_arr) + "}"
        
        
        args = [ant_string, "{" + r.consequent.string() + "}", r.support, r.confidence, r.rulelen, r.rid]
        text = "CAR {} => {} sup: {:.2f} conf: {:.2f} len: {}, id: {}".format(*args)

        return text


rules = rm.clf.rules

quant_rules = [ QuantClassAssociationRule(r) for r in rules ]

quant_rules[0]

Antecedent(('a-list-celebrities', '<0;2)'))

a-list-celebrities Interval[<0.0;2.0)]
Antecedent(('estimated-budget', '<250;300)'))

estimated-budget Interval[<250.0;300.0)]
Antecedent(('estimated-budget', '<0;50)'), ('a-list-celebrities', '<4;6)'))

estimated-budget Interval[<0.0;50.0)]

a-list-celebrities Interval[<4.0;6.0)]
Antecedent(('estimated-budget', '<200;250)'))

estimated-budget Interval[<200.0;250.0)]
Antecedent(('estimated-budget', '<0;50)'))

estimated-budget Interval[<0.0;50.0)]
Antecedent(('a-list-celebrities', '<4;6)'))

a-list-celebrities Interval[<4.0;6.0)]


CAR {a-list-celebrities=<0.0;2.0)} => {class=box-office-bomb} sup: 0.31 conf: 1.00 len: 2, id: 19

In [10]:
quant_rules[0].new_antecedent

[('a-list-celebrities', Interval[<0.0;2.0)])]

In [11]:
rules

[CAR {a-list-celebrities=<0;2)} => {class=box-office-bomb} sup: 0.31 conf: 1.00 len: 2, id: 19,
 CAR {estimated-budget=<250;300)} => {class=main-stream-hit} sup: 0.06 conf: 1.00 len: 2, id: 0,
 CAR {estimated-budget=<0;50),a-list-celebrities=<4;6)} => {class=critical-success} sup: 0.06 conf: 1.00 len: 3, id: 10,
 CAR {estimated-budget=<200;250)} => {class=box-office-bomb} sup: 0.06 conf: 0.67 len: 2, id: 3,
 CAR {estimated-budget=<0;50)} => {class=box-office-bomb} sup: 0.14 conf: 0.62 len: 2, id: 14,
 CAR {a-list-celebrities=<4;6)} => {class=main-stream-hit} sup: 0.11 conf: 0.50 len: 2, id: 15]

In [12]:
class RuleReffiter:
    
    def __init__(self, undiscretized_values_df):
        self.undiscretized_values_df = undiscretized_values_df
        
    def refit(self, quant_rules):
        for quant_rule in quant_rules:
            print("=======")
            self.process_rule(quant_rule)
            print("=======")
            
        return quant_rules
            
    
    
    def process_rule(self, quant_rule):
        for idx, (attr, interval) in enumerate(quant_rule.new_antecedent):
            
            current_attribute_values = self.undiscretized_values_df[[attr]].values

            refitted_interval = interval.refit(current_attribute_values)

            quant_rule.new_antecedent[idx] = attr, refitted_interval
            
            a, i = quant_rule.new_antecedent[idx]
        
        
rr = RuleReffiter(movies_undiscr_txns)

rr.refit(quant_rules)

0 a-list-celebrities Interval[<1.0;1.0>]

0 estimated-budget Interval[<260.0;264.0>]

0 estimated-budget Interval[<10.0;45.0>]

1 a-list-celebrities Interval[<4.0;5.0>]

0 estimated-budget Interval[<202.0;223.0>]

0 estimated-budget Interval[<10.0;45.0>]

0 a-list-celebrities Interval[<4.0;5.0>]



[CAR {a-list-celebrities=<1.0;1.0>} => {class=box-office-bomb} sup: 0.31 conf: 1.00 len: 2, id: 19,
 CAR {estimated-budget=<260.0;264.0>} => {class=main-stream-hit} sup: 0.06 conf: 1.00 len: 2, id: 0,
 CAR {estimated-budget=<10.0;45.0>,a-list-celebrities=<4.0;5.0>} => {class=critical-success} sup: 0.06 conf: 1.00 len: 3, id: 10,
 CAR {estimated-budget=<202.0;223.0>} => {class=box-office-bomb} sup: 0.06 conf: 0.67 len: 2, id: 3,
 CAR {estimated-budget=<10.0;45.0>} => {class=box-office-bomb} sup: 0.14 conf: 0.62 len: 2, id: 14,
 CAR {a-list-celebrities=<4.0;5.0>} => {class=main-stream-hit} sup: 0.11 conf: 0.50 len: 2, id: 15]

In [62]:
qr = quant_rules[0]

ds = movies_undiscr_txns

def calc_support(q_rule, dataset):
    
    cumulated_column_mask = np.array(len(dataset.iloc[:, 0]) * [True])
    
    dataset_len = len(cumulated_column_mask)
    
    for attr, interval in q_rule.new_antecedent:
        relevant_column = dataset[[attr]]
        column_mask = interval.test_membership(relevant_column)
        column_mask = column_mask.reshape(dataset_len)
        
        cumulated_column_mask = cumulated_column_mask & column_mask
        
    # consequent
    conseq_attr, conseq_val = q_rule.rule.consequent
    
    dataset_subset = dataset.iloc[cumulated_column_mask]
    conseq_val_mask = dataset_subset.iloc[:, -1] == conseq_val
    dataset_conseq_subset = dataset_subset[conseq_val_mask]
    
    conseq_satisf_cnt = len(dataset_conseq_subset)
    
    absolute_support = np.sum(cumulated_column_mask)
    
    support = absolute_support / dataset_len
    confidence = conseq_satisf_cnt / absolute_support
    
    
    return support, confidence
    
    
[ print(r.support, r.confidence) for r in rules ]
[ print(calc_support(qr, ds)) for qr in quant_rules ]


quant_rules[4]

0.3142857142857143 1.0
0.05714285714285714 1.0
0.05714285714285714 1.0
0.05714285714285714 0.6666666666666666
0.14285714285714285 0.625
0.11428571428571428 0.5
(0.17142857142857143, 1.0)
(0.057142857142857141, 1.0)
(0.085714285714285715, 1.0)
(0.085714285714285715, 0.66666666666666663)
(0.22857142857142856, 0.625)
(0.34285714285714286, 0.5)


CAR {estimated-budget=<0.0;50.0)} => {class=box-office-bomb} sup: 0.14 conf: 0.62 len: 2, id: 14

In [None]:
class RuleLiteralPruner:
    
    def __init__(self, undiscretized_values_df):
        self.undiscretized_values_df = undiscretized_values_df
        
    def refit(self, quant_rules):
        for quant_rule in quant_rules:
            print("=======")
            self.process_rule(quant_rule)
            print("=======")
            
    
    
    def process_rule(self, quant_rule):
        for idx, (attr, interval) in enumerate(quant_rule.new_antecedent):
            
            current_attribute_values = self.undiscretized_values_df[[attr]].values

            refitted_interval = interval.refit(current_attribute_values)

            quant_rule.new_antecedent[idx] = attr, refitted_interval
            
            a, i = quant_rule.new_antecedent[idx]
            
            print(idx, a, i)
            print()
    

In [None]:
class RuleTrimmer:
    
    def __init__(self, rules):
        self.rules = rules
        
    
    
r1 = RuleTrimmer()

In [None]:
[ print(r) for r in rules ]
print()
[ print(r) for r in quant_rules ]

In [54]:
a = np.array([True, False, True, False, False, True])
b = np.array([True, False, True, False, False, True])

qr = quant_rules[3]

attr, interval = qr.new_antecedent[0]

relevant_column = ds[[attr]]

mask = interval.test_membership(relevant_column).reshape(len(relevant_column))

sup = len(relevant_column[mask]) / len(relevant_column)

print(qr.rule.support, sup)


print(interval)
print(3 / len(relevant_column))
relevant_column

0.05714285714285714 0.08571428571428572
Interval[<200.0;250.0)]
0.08571428571428572


Unnamed: 0,estimated-budget
0,160
1,55
2,78
3,66
4,223
5,152
6,10
7,202
8,154
9,132
