In [1]:
import pandas as pd
import pyarc

from pyarc import TransactionDB

In [2]:
# Read both variants of the movies data set: "movies.csv" and its discretized
# counterpart "movies_discr.csv". Both are ';'-separated and use their first
# column as the index.
movies_train_undiscr = pd.read_csv("../data/movies.csv", sep=";", index_col=0)
movies_train_discr = pd.read_csv("../data/movies_discr.csv", sep=";", index_col=0)

# Wrap each frame in a pyarc TransactionDB.
movies_undiscr_txns = TransactionDB.from_DataFrame(movies_train_undiscr)
movies_discr_txns = TransactionDB.from_DataFrame(movies_train_discr)

# Preview the discretized frame (interval-valued columns plus the class label).
movies_train_discr.head()

Unnamed: 0,estimated-budget,a-list-celebrities,class
0,<150;200),<0;2),box-office-bomb
1,<50;100),<0;2),box-office-bomb
2,<50;100),<0;2),box-office-bomb
3,<50;100),<2;4),box-office-bomb
4,<200;250),<0;2),box-office-bomb


In [3]:
# Display the transactions as lists of "attribute:=:value" strings, one inner list per row.
movies_discr_txns.string_representation

[['estimated-budget:=:<150;200)',
  'a-list-celebrities:=:<0;2)',
  'class:=:box-office-bomb'],
 ['estimated-budget:=:<50;100)',
  'a-list-celebrities:=:<0;2)',
  'class:=:box-office-bomb'],
 ['estimated-budget:=:<50;100)',
  'a-list-celebrities:=:<0;2)',
  'class:=:box-office-bomb'],
 ['estimated-budget:=:<50;100)',
  'a-list-celebrities:=:<2;4)',
  'class:=:box-office-bomb'],
 ['estimated-budget:=:<200;250)',
  'a-list-celebrities:=:<0;2)',
  'class:=:box-office-bomb'],
 ['estimated-budget:=:<150;200)',
  'a-list-celebrities:=:<0;2)',
  'class:=:box-office-bomb'],
 ['estimated-budget:=:<0;50)',
  'a-list-celebrities:=:<0;2)',
  'class:=:box-office-bomb'],
 ['estimated-budget:=:<200;250)',
  'a-list-celebrities:=:<2;4)',
  'class:=:box-office-bomb'],
 ['estimated-budget:=:<150;200)',
  'a-list-celebrities:=:<2;4)',
  'class:=:box-office-bomb'],
 ['estimated-budget:=:<100;150)',
  'a-list-celebrities:=:<0;2)',
  'class:=:box-office-bomb'],
 ['estimated-budget:=:<0;50)',
  'a-list-celeb

In [40]:
from pyarc.data_structures import Transaction

class UndiscretizedTransaction(Transaction):
    """Transaction whose subset/superset tests compare ``frozenset`` attributes.

    NOTE(review): ``self.frozenset`` is never assigned in this class; it is
    presumably provided by pyarc's ``Transaction`` base class -- confirm.
    """

    def issubset(self, other):
        """Return ``self.frozenset <= other.frozenset``."""
        return self.frozenset <= other.frozenset

    def issuperset(self, other):
        """Return ``self.frozenset >= other.frozenset``."""
        return self.frozenset >= other.frozenset

    def __le__(self, other):
        """``self <= other`` delegates to :meth:`issubset`."""
        return self.issubset(other)

    def __ge__(self, other):
        """``self >= other`` delegates to :meth:`issuperset`."""
        return self.issuperset(other)

In [8]:
import numpy as np

# Sample values (non-decreasing, with repeats) used to try out the interval helpers below.
a = np.array([0, 1, 2, 3, 4, 5, 6, 6, 6, 7, 7])

def make_intervalfunc(minv, maxv, leftinc, rightinc):
    """Build a predicate telling whether a value lies between ``minv`` and ``maxv``.

    ``leftinc`` / ``rightinc`` decide whether the lower / upper bound itself
    counts as inside. The individual bound checks are delegated to
    ``greaterthan`` and ``lesserthan``.

    Returns:
        A one-argument function returning ``True`` or ``False``.
    """
    def inner_func(value):
        # The upper bound is only consulted once the lower bound has passed.
        if not greaterthan(value, minv, leftinc):
            return False
        return lesserthan(value, maxv, rightinc)

    return inner_func
        
def greaterthan(a, b, inc):
    """Return ``True`` when ``a`` is above ``b``; equality counts only if ``inc`` is truthy."""
    passed = (a >= b) if inc else (a > b)
    return True if passed else False
        
def lesserthan(a, b, inc):
    """Return ``True`` when ``a`` is below ``b``; equality counts only if ``inc`` is truthy."""
    passed = (a <= b) if inc else (a < b)
    return True if passed else False


# Element-wise membership test for the window (-2, 5]: lower bound excluded, upper bound included.
vectorized_func = np.vectorize(make_intervalfunc(-2, 5, False, True))

def refit(values, minval, maxval, r_inc, l_inc):
    """Shrink the window ``minval``..``maxval`` to the values it actually contains.

    Args:
        values: Sequence or array of comparable numbers. The first and the
            last selected element are returned, so the input is assumed to be
            sorted in ascending order -- TODO confirm with callers.
        minval: Lower bound of the window.
        maxval: Upper bound of the window.
        r_inc: Truthy if ``maxval`` itself belongs to the window.
        l_inc: Truthy if ``minval`` itself belongs to the window.
            Note that ``r_inc`` comes *before* ``l_inc`` in the signature;
            passing both flags by keyword avoids transposing them.

    Returns:
        Tuple ``(left, right)``: the first and last element of ``values`` that
        lie inside the window.

    Raises:
        IndexError: If no element of ``values`` lies inside the window.
    """
    data = np.asarray(values)

    # Build the mask from the arguments themselves rather than from
    # module-level state, so the result depends only on what was passed in.
    above = data >= minval if l_inc else data > minval
    below = data <= maxval if r_inc else data < maxval
    inside = data[above & below]

    if inside.size == 0:
        raise IndexError("no values fall inside the requested window")

    return inside[0], inside[-1]

# Element-wise mask of the sample values that fall inside (-2, 5].
mask = vectorized_func(a)

# Values selected by the mask.
a[mask]

# refit declares r_inc *before* l_inc, so positional booleans are easy to
# transpose. Naming them keeps this call on the same (-2, 5] window as
# vectorized_func: lower bound excluded, upper bound included.
refit(a, -2, 5, r_inc=True, l_inc=False)

(0, 5)

In [41]:
import re
import numpy


def make_intervalfunc(minv, maxv, left_inclusivity, right_inclusivity):
    """Return a predicate that checks a value against the bounds ``minv`` and ``maxv``.

    ``left_inclusivity`` / ``right_inclusivity`` state whether the lower /
    upper bound itself is accepted. The comparisons are performed by
    ``greaterthan`` and ``lesserthan``.
    """
    def inner_func(value):
        # `and` short-circuits: the upper bound is skipped if the lower one fails.
        return (
            greaterthan(value, minv, left_inclusivity)
            and lesserthan(value, maxv, right_inclusivity)
        )

    return inner_func
        
def greaterthan(a, b, inclusivity):
    """Lower-bound check: ``a >= b`` when ``inclusivity`` is truthy, otherwise ``a > b``."""
    if inclusivity:
        return bool(a >= b)
    return bool(a > b)
        
def lesserthan(a, b, inclusivity):
    """Upper-bound check: ``a <= b`` when ``inclusivity`` is truthy, otherwise ``a < b``."""
    if inclusivity:
        return bool(a <= b)
    return bool(a < b)


class Interval:
    """Numeric interval with independently open or closed ends.

    The textual form is ``<bracket>min;max<bracket>`` where ``<`` / ``>`` mark
    an inclusive bound and ``(`` / ``)`` an exclusive one, e.g. ``"<30;40)"``.

    Attributes set on every instance: ``minval`` and ``maxval`` (floats),
    ``left_inclusive`` and ``right_inclusive`` (bools), ``left_bracket`` and
    ``right_bracket`` (the bracket characters used by ``repr``).
    """

    # Signed decimal number, optionally with an exponent. Only non-capturing
    # groups are used so that interval_regex keeps exactly four groups.
    _NUMBER = r"[-+]?(?:\d+(?:\.\d*)?|\.\d+)(?:[eE][-+]?\d+)?"

    # Raw strings keep the backslashes intact for the regex engine.
    interval_regex = re.compile(
        r"(<|\()(" + _NUMBER + r");(" + _NUMBER + r")(\)|>)"
    )

    def __init__(self, interval_string="<0;0)"):
        """Parse ``interval_string`` (the first interval found in it is used).

        Raises:
            ValueError: If no interval can be found in ``interval_string``.
                Failing here avoids handing out a half-initialised object
                that would break later in ``repr`` or ``test_membership``.
        """
        try:
            match = Interval.interval_regex.search(interval_string)
        except TypeError as err:
            raise ValueError(
                "Wrong format of interval: {!r}".format(interval_string)
            ) from err

        if match is None:
            raise ValueError(
                "Wrong format of interval: {!r}".format(interval_string)
            )

        left_bracket, minval, maxval, right_bracket = match.groups()
        self._set_bounds(minval, maxval, left_bracket == "<", right_bracket == ">")

    def _set_bounds(self, minval, maxval, left_inclusive, right_inclusive):
        """Store the bounds and derive the bracket characters from the flags."""
        self.minval, self.maxval = float(minval), float(maxval)
        self.left_inclusive = bool(left_inclusive)
        self.right_inclusive = bool(right_inclusive)
        self.left_bracket = "<" if self.left_inclusive else "("
        self.right_bracket = ">" if self.right_inclusive else ")"

    @classmethod
    def from_scalars(clazz, minval, maxval, left_inc, right_inc):
        """Create an interval directly from numeric bounds and inclusivity flags.

        The bounds are stored as given (converted to ``float``) instead of
        being formatted to text and parsed again, so negative and fractional
        values are preserved exactly.
        """
        interval = clazz.__new__(clazz)
        interval._set_bounds(minval, maxval, left_inc, right_inc)
        return interval

    def __eq__(self, other):
        """Intervals are equal when bounds and inclusivity flags all match."""
        if not isinstance(other, Interval):
            return NotImplemented
        return (
            self.minval == other.minval
            and self.maxval == other.maxval
            and self.left_inclusive == other.left_inclusive
            and self.right_inclusive == other.right_inclusive
        )

    def __hash__(self):
        # repr is fully determined by the fields compared in __eq__, so equal
        # intervals hash equally.
        return hash(repr(self))

    def refit(self, vals):
        """Return a closed interval spanning the values of ``vals`` inside this one.

        The first and the last contained value become the new bounds, so
        ``vals`` is assumed to be sorted in ascending order -- TODO confirm
        with callers.

        Raises:
            IndexError: If none of ``vals`` lies inside this interval.
        """
        values = np.array(vals)
        inside = values[self.test_membership(values)]

        if inside.size == 0:
            raise IndexError("no values fall inside {!r}".format(self))

        return Interval.from_scalars(inside[0], inside[-1], True, True)

    def test_membership(self, value):
        """Test element-wise whether ``value`` lies inside the interval.

        Accepts a scalar or an array-like (including an empty one) and returns
        a boolean NumPy array of the same shape (0-d for a scalar).
        """
        data = np.asarray(value)
        above = data >= self.minval if self.left_inclusive else data > self.minval
        below = data <= self.maxval if self.right_inclusive else data < self.maxval
        return np.asarray(np.logical_and(above, below))

    def __repr__(self):
        return "Interval[{}{};{}{}]".format(
            self.left_bracket, self.minval, self.maxval, self.right_bracket
        )
        

class QCBAStep:
    """Placeholder class; ``transform`` is an empty stub."""

    def transform(self):
        """Do nothing and return ``None``."""
        return None
    


# Same bounds, opposite bracket types: i1 is closed on the left, i2 on the right.
i1 = Interval("<30;40)")
i2 = Interval("(30;40>")

# "<" includes the lower bound, ")" excludes the upper bound.
assert i1.test_membership(30) == True
assert i1.test_membership(35) == True
assert i1.test_membership(40) == False

# "(" excludes the lower bound, ">" includes the upper bound.
assert i2.test_membership(30) == False
assert i2.test_membership(35) == True
assert i2.test_membership(40) == True


# Values that all lie inside i1.
arr1 = [32, 33, 35, 36, 38]

# Shrink i1 to the first and last value of arr1 that it contains.
i1.refit(arr1)

#i1.intersect_values(arr1)


Interval[<32.0;38.0>]

In [67]:
from pyarc.data_structures import ClassAssocationRule

class QuantClassAssociationRule:
    """Class association rule whose antecedent values are parsed into ``Interval`` objects.

    Attributes:
        new_antecedent: List of ``(attribute, Interval)`` pairs, one per item
            of the wrapped rule's antecedent, in the antecedent's iteration order.
    """

    def __init__(self, rule):
        """Build the interval-based antecedent from ``rule``.

        Args:
            rule: A ``ClassAssocationRule`` (or subclass) instance whose
                antecedent yields ``(attribute, value)`` pairs.

        Raises:
            TypeError: If ``rule`` is not a ``ClassAssocationRule``.
        """
        if not isinstance(rule, ClassAssocationRule):
            raise TypeError("Type of rule must be: ClassAssocationRule")

        # Read the antecedent from the argument itself. A comprehension's loop
        # variable in the calling cell is not visible here, so relying on a
        # global name would pick up whatever rule happens to be bound to it.
        self.new_antecedent = [
            (attr, Interval(val)) for attr, val in rule.antecedent
        ]
        
    


# NOTE(review): `rm` is not defined in this cell or in any cell above it in the
# file; it is assigned in the `In [6]` cell further down, so this cell only
# works after that cell has been executed.
rules = rm.clf.rules

# Wrap every classifier rule in a QuantClassAssociationRule.
quant_rules = [ QuantClassAssociationRule(r) for r in rules ]

In [None]:
class RuleReffiter:
    """Empty placeholder class; it defines no attributes or methods yet."""

In [38]:
class RuleTrimmer:
    """Holds a collection of rules in ``self.rules``; no trimming logic is implemented yet."""

    def __init__(self, rules=None):
        """Store ``rules``.

        Args:
            rules: Collection of rules to keep a reference to. Optional, so a
                trimmer can be created before any rules exist; in that case
                the instance starts with its own empty list.
        """
        self.rules = [] if rules is None else rules


r1 = RuleTrimmer()

In [6]:
from pyarc import CBA
from pyarc.algorithms import generateCARs
import functools

# Fit a CBA model (algorithm "m1", confidence=0.5, support=0.05) on the discretized transactions.
rm = CBA(algorithm="m1", confidence=0.5, support=0.05).fit(movies_discr_txns)

# Rules of the fitted classifier (not displayed: this is not the cell's last expression).
rm.clf.rules

# Rules generated directly from the same transactions; these are pruned further below.
cars = generateCARs(movies_discr_txns)


movies_discr_txns.string_representation


def make_literal_dict(string_repr):
    """Index transactions by the literals they contain.

    Args:
        string_repr: Iterable of transactions, each an iterable of hashable
            literals (e.g. ``"attribute:=:value"`` strings).

    Returns:
        Dict mapping every literal to the set of positions (0-based) of the
        transactions in which it occurs.
    """
    tids_by_literal = {}

    for tid, transaction in enumerate(string_repr):
        for literal in transaction:
            # setdefault creates the set on first sight of a literal.
            tids_by_literal.setdefault(literal, set()).add(tid)

    return tids_by_literal


def reduce_intersection(prev, curr):
    """Folding step for ``functools.reduce``: the elements of ``prev`` also found in ``curr``."""
    shared = prev.intersection(curr)
    return shared


def calculate_supp_conf(ant_list, cons, litdict, n=None):
    """Compute relative support and confidence of the rule ``ant_list => cons``.

    Args:
        ant_list: Non-empty iterable of antecedent literals (keys of ``litdict``).
        cons: Consequent literal (a key of ``litdict``).
        litdict: Dict mapping each literal to the set of ids of the
            transactions that contain it.
        n: Total number of transactions. When omitted it is derived from
            ``litdict`` as the number of distinct transaction ids, i.e. the
            transactions that contain at least one literal.

    Returns:
        Tuple ``(support, confidence)``. ``support`` is the share of the ``n``
        transactions matching antecedent and consequent; ``confidence`` is the
        share of antecedent-matching transactions that also match the
        consequent, or ``0.0`` if no transaction matches the antecedent.

    Raises:
        KeyError: If a literal is missing from ``litdict``.
        TypeError: If ``ant_list`` is empty.
    """
    if n is None:
        n = len(set().union(*litdict.values()))

    # Transactions containing every antecedent literal.
    tid_sets = [set(litdict[item]) for item in ant_list]
    ant_all_tid = set.intersection(*tid_sets)

    supp_r = len(ant_all_tid.intersection(litdict[cons]))
    supp_ant = len(ant_all_tid)

    # An antecedent that matches nothing has no meaningful confidence; report
    # 0.0 instead of dividing by zero.
    conf_r = supp_r / supp_ant if supp_ant else 0.0

    return supp_r / n, conf_r
    
    
def _covered_tids(literal_keys, litdict):
    """Return the ids of the transactions that contain every literal in ``literal_keys``."""
    tid_sets = [set(litdict[key]) for key in literal_keys]
    return set.intersection(*tid_sets) if tid_sets else set()


def _matching_and_confidence(ant_tids, cons_tids):
    """Return ``(ant_tids & cons_tids, confidence)``; confidence is 0.0 for an empty cover."""
    matching = ant_tids.intersection(cons_tids)
    confidence = len(matching) / len(ant_tids) if ant_tids else 0.0
    return matching, confidence


def prune_literals(rule, litdict, n):
    """Greedily drop antecedent literals whose removal raises the rule's confidence.

    Literals are tried one at a time in antecedent order. A literal is dropped
    when the rule built from the remaining literals has a strictly higher
    confidence than the current rule; later literals are then judged against
    the already shortened rule. At least one literal is always kept.

    Args:
        rule: Rule object with ``antecedent`` (iterable of ``(attribute, value)``
            string pairs), ``consequent`` (one such pair), ``support`` and
            ``confidence``. It is modified in place when literals are dropped.
        litdict: Dict mapping ``"attribute:=:value"`` literals to the set of
            ids of the transactions that contain them.
        n: Total number of transactions, used to normalise the support.

    Returns:
        The same ``rule`` object, pruned or untouched.
    """
    if len(rule.antecedent) <= 1:
        return rule

    items = list(rule.antecedent)
    keys = [":=:".join(item) for item in items]
    cons_tids = litdict[":=:".join(rule.consequent)]

    # Positions (into `items`) of the literals that are still part of the rule.
    # Working with a list of kept positions avoids the index shifting that
    # removing elements one by one would cause.
    kept = list(range(len(items)))
    rule_tids, conf_r = _matching_and_confidence(
        _covered_tids(keys, litdict), cons_tids
    )
    pruned = False

    for idx in range(len(items)):
        if len(kept) == 1:
            break

        # Removing a literal widens the cover to the intersection of the
        # remaining literals' transactions.
        trial = [pos for pos in kept if pos != idx]
        trial_tids, trial_conf = _matching_and_confidence(
            _covered_tids([keys[pos] for pos in trial], litdict), cons_tids
        )

        if trial_conf > conf_r:
            print("changed")
            kept, rule_tids, conf_r = trial, trial_tids, trial_conf
            pruned = True

    if not pruned:
        return rule

    new_items = [pyarc.data_structures.Item(*items[pos]) for pos in kept]

    rule.antecedent = pyarc.data_structures.Antecedent(new_items)
    # Support and confidence describe the accepted rule, not the last trial.
    rule.support = len(rule_tids) / n
    rule.confidence = conf_r

    return rule
    
    
    
    
    
    
    
    

# Literal -> transaction-id index and the number of transactions, both taken
# from the discretized transactions.
litdict = make_literal_dict(movies_discr_txns.string_representation)
transaction_len = len(movies_discr_txns.string_representation)


# Prune every rule and report whether its textual form changed. `before` is
# captured first because prune_literals may modify the rule object in place.
for idx, r in enumerate(cars):
    before = repr(r)
    after = repr(prune_literals(r, litdict, transaction_len))

    verdict = "same" if before == after else "different"

    print(idx, verdict)
    print(before)
    print(after)
    print()

0 same
CAR {a-list-celebrities=<0;2)} => {class=box-office-bomb} sup: 0.31 conf: 1.00 len: 2, id: 48
CAR {a-list-celebrities=<0;2)} => {class=box-office-bomb} sup: 0.31 conf: 1.00 len: 2, id: 48

1 same
CAR {a-list-celebrities=<0;2),estimated-budget=<0;50)} => {class=box-office-bomb} sup: 0.11 conf: 1.00 len: 3, id: 41
CAR {a-list-celebrities=<0;2),estimated-budget=<0;50)} => {class=box-office-bomb} sup: 0.11 conf: 1.00 len: 3, id: 41

2 same
CAR {a-list-celebrities=<0;2),estimated-budget=<50;100)} => {class=box-office-bomb} sup: 0.09 conf: 1.00 len: 3, id: 47
CAR {a-list-celebrities=<0;2),estimated-budget=<50;100)} => {class=box-office-bomb} sup: 0.09 conf: 1.00 len: 3, id: 47

3 same
CAR {estimated-budget=<250;300)} => {class=main-stream-hit} sup: 0.06 conf: 1.00 len: 2, id: 25
CAR {estimated-budget=<250;300)} => {class=main-stream-hit} sup: 0.06 conf: 1.00 len: 2, id: 25

4 same
CAR {a-list-celebrities=<0;2),estimated-budget=<150;200)} => {class=box-office-bomb} sup: 0.06 conf: 1.00