In [2]:
import pandas as pd
import pyarc

from pyarc import TransactionDB

In [3]:
movies_train_undiscr = pd.read_csv("../data/movies.csv", sep=";", index_col=0)
movies_train_discr = pd.read_csv("../data/movies_discr.csv", sep=";", index_col=0)

movies_undiscr_txns = TransactionDB.from_DataFrame(movies_train_undiscr)
movies_discr_txns = TransactionDB.from_DataFrame(movies_train_discr)

movies_train_discr.head()

Unnamed: 0,estimated-budget,a-list-celebrities,class
0,<150;200),<0;2),box-office-bomb
1,<50;100),<0;2),box-office-bomb
2,<50;100),<0;2),box-office-bomb
3,<50;100),<2;4),box-office-bomb
4,<200;250),<0;2),box-office-bomb


In [60]:
movies_discr_txns.string_representation

[['estimated-budget:=:<150;200)',
  'a-list-celebrities:=:<0;2)',
  'class:=:box-office-bomb'],
 ['estimated-budget:=:<50;100)',
  'a-list-celebrities:=:<0;2)',
  'class:=:box-office-bomb'],
 ['estimated-budget:=:<50;100)',
  'a-list-celebrities:=:<0;2)',
  'class:=:box-office-bomb'],
 ['estimated-budget:=:<50;100)',
  'a-list-celebrities:=:<2;4)',
  'class:=:box-office-bomb'],
 ['estimated-budget:=:<200;250)',
  'a-list-celebrities:=:<0;2)',
  'class:=:box-office-bomb'],
 ['estimated-budget:=:<150;200)',
  'a-list-celebrities:=:<0;2)',
  'class:=:box-office-bomb'],
 ['estimated-budget:=:<0;50)',
  'a-list-celebrities:=:<0;2)',
  'class:=:box-office-bomb'],
 ['estimated-budget:=:<200;250)',
  'a-list-celebrities:=:<2;4)',
  'class:=:box-office-bomb'],
 ['estimated-budget:=:<150;200)',
  'a-list-celebrities:=:<2;4)',
  'class:=:box-office-bomb'],
 ['estimated-budget:=:<100;150)',
  'a-list-celebrities:=:<0;2)',
  'class:=:box-office-bomb'],
 ['estimated-budget:=:<0;50)',
  'a-list-celeb

In [71]:
import re

class Interval:

    interval_regex = re.compile("(<|\()(\d+);(\d+)(\)|>)")
    
    def __init__(self, interval_string="<0;0)"):
        try:
            args = Interval.interval_regex.findall(interval_string)[0]
            self.left_bracket, self.minval, self.maxval, self.right_bracket = args
        except Exception as e:
            print("Error: Wrong format of interval")
            
        self.minval, self.maxval = float(self.minval), float(self.maxval)    
            
            
    def intersect_values(self, value_array):
        value_array = sorted(map(float, value_array))
        value_array_len = len(value_array)
        
        value_arr_first = value_array[0]
        value_arr_last = value_array[value_array_len - 1]
        
        leftvalue = self.minval if self.minval in value_array else value_arr_first
        rightvalue = self.maxval if self.maxval in value_array else value_arr_last
        
        #leftvalue = self.minval if value_arr_first < self.minval else value_arr_first
        #rightvalue = self.maxval if value_arr_last < self.maxval else value_arr_last
        
        minidx = value_array.index(leftvalue)
        maxidx = value_array.index(rightvalue)
        
        return value_array[minidx : maxidx]
        
            
    def is_in(self, value):
        if value == self.minval and self.left_bracket == "<":
            return True
        
        if value == self.maxval and self.right_bracket == ">":
            return True
        
        if value > self.minval and value < self.maxval:
            return True
            
        return False
        

    def __repr__(self):
        return "Interval[{}{};{}{}]".format(self.left_bracket, self.minval, self.maxval, self.right_backet)
        

class QCBAStep:
    
    def transform(self):
        pass
    


i1 = Interval("<30;40)")
i2 = Interval("(30;40>")

assert i1.is_in(30) == True
assert i1.is_in(35) == True
assert i1.is_in(40) == False

assert i2.is_in(30) == False
assert i2.is_in(35) == True
assert i2.is_in(40) == True


arr1 = [32, 33, 35, 36, 38]
i1.intersect_values(arr1)

[32.0, 33.0, 35.0, 36.0]

In [1]:
import numpy as np

a = np.array([0, 1, 2, 3, 4, 5, 6, 6, 6, 7, 7])

def make_intervalfunc(minv, maxv, leftinc, rightinc):
    def inner_func(value):
        if greaterthan(value, minv, leftinc) and lesserthan(value, maxv, rightinc):
            return True
        else:
            return False
        
    return inner_func
        
def greaterthan(a, b, inc):
    if inc:
        if a >= b: return True
    elif a > b: return True
    
    return False
        
def lesserthan(a, b, inc):
    if inc:
        if a <= b: return True
    elif a < b: return True
    
    return False



vectorized_func = np.vectorize(make_intervalfunc(-2, 5, False, True))

def refit(values, minval, maxval, r_inc, l_inc):
    mask = vectorized_func(a)
    new_array = a[mask]
    
    left, right = new_array[0], new_array[-1]
    
    return left, right

mask = vectorized_func(a)

a[mask]

refit(a, -2, 5, False, True)

(0, 5)

In [107]:
from pyarc import CBA
from pyarc.algorithms import generateCARs
import functools

rm = CBA(algorithm="m1", confidence=0.5, support=0.05).fit(movies_discr_txns)

rm.clf.rules

cars = generateCARs(movies_discr_txns)


movies_discr_txns.string_representation


def make_literal_dict(string_repr):
    literal_dict = {}
    
    for tid, transaction in enumerate(string_repr):
        for literal in transaction:
            if literal in literal_dict:
                literal_dict[literal].add(tid)
            else:
                literal_dict[literal] = set([tid])
                
                
    return literal_dict


def reduce_intersection(prev, curr):
    return prev.intersection(curr)


def calculate_supp_conf(ant_list, cons, litdict):
    
    fetch_from_litdict = lambda item: litdict[item]
    
    ant_tid = map(fetch_from_litdict, ant_list)
    cons_tid = litdict[cons]
    
    ant_all_tid = set(functools.reduce(reduce_intersection, ant_tid))
    
    supp_r = len(ant_all_tid.intersection(cons_tid))
    supp_ant = len(ant_all_tid)
    
    
    conf_r = supp_r / supp_ant
    
    return supp_r / n, conf_r
    
    
def prune_literals(rule, litdict, n):

    if len(rule.antecedent) <= 1:
        return rule
    
    idx_to_remove = []
    
    ant = list(rule.antecedent)
    ant = [ ":=:".join(item) for item in ant  ]
    
    cons = ":=:".join(rule.consequent)
    
    
    fetch_from_litdict = lambda item: litdict[item]
    
    ant_tid = [ litdict[item] for item in ant ]
    cons_tid = litdict[cons]
    
    ant_all_tid = set(functools.reduce(reduce_intersection, ant_tid))
    
    all_intersection_tid = ant_all_tid.intersection(cons_tid) 
    
    supp_r = len(all_intersection_tid)
    supp_ant = len(ant_all_tid)
    
    
    conf_r = supp_r / supp_ant
    
    new_supp_r = 0
    new_conf_r = 0
    

    for idx_literal, literal in enumerate(ant_tid):
        new_ant_all_tid = literal - ant_all_tid
        
        if not new_ant_all_tid:
            continue
        
        new_supp_r = literal - all_intersection_tid
        new_conf_r = len(all_intersection_tid) / len(new_ant_all_tid)
        
        if new_conf_r > conf_r:
            print("changed")
            conf_r = new_conf_r
            all_intersection_tid = new_supp_r
            ant_all_tid = new_ant_all_tid
    
            idx_to_remove.append(idx_literal)
        
    
    if not idx_to_remove:
        return rule
    
    
    
    antecedent = list(r.antecedent)
    new_items = []
    
    for idx in idx_to_remove:
        antecedent.pop(idx)
    
    for item in antecedent:
        new_items.append(pyarc.data_structures.Item(*item))
        
    new_antecedent = pyarc.data_structures.Antecedent(new_items)
    
    rule.antecedent = new_antecedent
    rule.support = len(new_supp_r) / n
    rule.confidence = new_conf_r
    
    return rule
    
    
    
    
    
    
    
    

litdict = make_literal_dict(movies_discr_txns.string_representation)
transaction_len = len(movies_discr_txns.string_representation) 


for r in cars:
    before = repr(r)
    after = repr(prune_literals(r, litdict, transaction_len))
    
    
    if before == after:
        print("same")
    else:
        print("different")

same
same
same
same
same
same
same
same
same
same
same
same
same
same
same
same
same
same
same
same
same
changed
different
same
changed
different
changed
different
same
same
same
same
