In [113]:
%run ../../main.py

In [114]:
class RuleBuilderAlgorithm:
    """Shared base for the classifier-building algorithms in this notebook.

    It only stores the inputs and offers one helper for maintaining a
    class distribution while rules are being selected.
    """

    def __init__(self, rules, dataset):
        """Keep references to the candidate rules and the dataset.

        arguments:
            rules: candidate rules (stored as-is, not copied)
            dataset: object exposing a ``class_labels`` attribute
        """
        self.rules, self.dataset = rules, dataset
        self.y = dataset.class_labels

    def update_class_distr(self, classdist, rule):
        """Return ``classdist`` with the cases covered by ``rule`` removed.

        The result is ``classdist - rule.class_cases_covered``. Callers in
        this file pass ``collections.Counter`` objects, for which ``-``
        builds a new counter and drops counts that are zero or negative.
        """
        covered_per_class = rule.class_cases_covered
        remaining = classdist - covered_per_class
        return remaining

In [115]:
import collections
from cba.algorithms import Classifier

import time

class M1Algorithm(RuleBuilderAlgorithm):
    """Greedy, database-coverage style classifier builder.

    Rules are visited from highest to lowest precedence. A rule is kept
    when it correctly classifies at least one still-uncovered case; all
    cases it covers are then removed from further consideration. After
    every kept rule the total error (rule errors so far + errors of the
    default class on what is left) is recorded, and the rule list is
    finally cut at the first position with the minimal total error.
    """

    def build(self):
        """Build and return a ``Classifier``.

        Side effects:
            - ``self.rules`` is sorted in place (descending).
            - ``rule.marked`` is set to True on every rule that is kept.
            - the list of total errors is printed.

        returns:
            Classifier with ``rules`` and ``default_class`` set. When no
            rule qualifies, ``rules`` is empty and ``default_class`` is
            the majority class of the dataset.
        """
        # Highest-precedence rules first (ordering comes from the rules' own comparison).
        self.rules.sort(reverse=True)

        # Cases not yet covered by any kept rule.
        remaining = set(self.dataset)

        classifier = []
        default_classes = []
        total_errors = []
        rule_errors_so_far = 0

        for rule in self.rules:
            if not remaining:
                break

            covered = {case for case in remaining if rule.antecedent <= case}
            correct_cnt = sum(
                1 for case in covered if rule.consequent == case.class_val
            )

            # Decide from the coverage computed in *this* pass. Testing the
            # persistent ``rule.marked`` attribute instead would also pick up
            # marks left on the rule objects by an earlier run.
            if correct_cnt == 0:
                continue

            rule.marked = True
            classifier.append(rule)
            remaining -= covered

            # Majority class of what is left becomes the default class
            # that goes with the classifier truncated at this rule.
            default_label, default_cnt = self._majority_class(remaining)
            default_classes.append(default_label)

            # errors of the rule => all covered - correctly covered
            rule_errors_so_far += len(covered) - correct_cnt
            default_errors = len(remaining) - default_cnt

            total_errors.append(default_errors + rule_errors_so_far)

        print(total_errors)

        clf = Classifier()

        if not total_errors:
            # No rule was kept; min() on an empty list would raise, so fall
            # back to a rule-less classifier that predicts the majority class.
            clf.rules = []
            clf.default_class = self._majority_class(remaining)[0]
            return clf

        # First position with the lowest total error.
        idx_to_cut = total_errors.index(min(total_errors))

        clf.rules = classifier[:idx_to_cut + 1]
        clf.default_class = default_classes[idx_to_cut]

        return clf

    @staticmethod
    def _majority_class(cases):
        """Return ``(label, count)`` of the most frequent class among ``cases``.

        For an empty collection the placeholder label ``"None"`` (a string)
        and a count of 0 are returned.
        """
        counts = collections.Counter(case.class_val.value for case in cases)
        most_common = counts.most_common(1)

        if not most_common:
            return "None", 0

        return most_common[0]

In [116]:
from cba.algorithms import Classifier
from cba.data_structures import ClassAssocationRule, Antecedent, Consequent

import collections

class M2Algorithm(RuleBuilderAlgorithm):
    """Three-stage classifier builder.

    NOTE(review): the stage layout and the names Q, U, A, cRule and wRule
    look like the CBA-CB "M2" procedure -- confirm against the reference
    you are following.

    Stage 1 finds, for every case, the highest-precedence rule that
    classifies it correctly (cRule) and the highest-precedence rule that
    classifies it wrongly (wRule). Stage 2 resolves the cases where the
    wRule does not rank below the cRule. Stage 3 walks the selected rules
    in precedence order, accumulates errors and cuts the list at the
    first position with the minimal total error.
    """

    def build(self):
        """Run the three stages and return the resulting ``Classifier``.

        Side effects: sorts ``self.rules`` in place and mutates the rule
        objects (``marked``, ``class_cases_covered``, ``replace``,
        ``support_count``) as well as ``alreadycovered`` on data cases.
        """
        self.rules.sort(reverse=True)

        # NOTE: despite the name this is the same object as self.dataset,
        # no copy or freezing takes place.
        self.dataset_frozen = self.dataset
        self.dataset_len = len(self.dataset_frozen)

        # set of crules that have higher precedence
        # than their corresponding wrules
        self.Q = set()

        # set of all crules
        self.U = set()

        # set of conflicting (datacase, class, crule, wrule) structures
        self.A = set()

        self.classifier = []

        self.stage1()
        self.stage2()
        self.stage3()

        clf = Classifier()
        clf.rules = self.classifier
        clf.default_class = self.default_class

        return clf

    def stage1(self):
        """Fill U, Q and A by looking up the cRule/wRule of every data case."""
        for datacase in self.dataset_frozen:
            # finds the highest precedence crule and wrule
            crule, wrule = self.maxcoverrule(datacase, self.rules)

            # A missing rule is replaced by a fresh empty rule so that the
            # comparison and bookkeeping below always have an object.
            if crule is None:
                crule = self.emptyrule()

            if wrule is None:
                wrule = self.emptyrule()

            self.U.add(crule)

            crule.class_cases_covered.update([datacase.class_val.value])

            if crule > wrule:
                self.Q.add(crule)
                crule.marked = True
            else:
                structure = (datacase, datacase.class_val.value, crule, wrule)
                self.A.add(structure)

    def stage2(self):
        """Resolve the conflicting structures collected in ``self.A``."""
        for conflicting_struct in self.A:
            datacase, clazz, crule, wrule = conflicting_struct

            if wrule.marked:
                # The wrule is already marked: move the case from the
                # crule's counts to the wrule's.
                crule.class_cases_covered[clazz] -= 1
                wrule.class_cases_covered[clazz] += 1
            else:
                # Otherwise every rule from U that could replace the crule
                # for this case remembers the replacement and joins Q.
                wset = self.allcover_rules(self.U, datacase, crule)
                for w in wset:
                    w.replace.add((crule, datacase, clazz))
                    w.class_cases_covered[clazz] += 1

                self.Q = self.Q.union(wset)

    def stage3(self):
        """Choose the final rule list and default class.

        Sets ``self.classifier`` and ``self.default_class`` and prints the
        list of total errors. When no rule qualifies, the classifier is
        empty and the default class is the most common class of the
        dataset (None for an empty dataset).
        """
        Qlist = sorted(self.Q, reverse=True)

        rule_errors = 0
        rule_supcount = 0
        total_errors_list = []
        default_classes_list = []
        rules_list = []

        # class distribution
        classdist = collections.Counter(map(lambda d: d.class_val.value, self.dataset_frozen))

        # Most common class before any rule is applied; used as a fallback.
        overall_default_label = self.select_default_class(classdist)[0]

        for rule in Qlist:
            if rule.class_cases_covered[rule.consequent.value] > 0:
                for (rule_replace, dcase, clazz) in rule.replace:
                    if dcase.alreadycovered:
                        rule.class_cases_covered[clazz] -= 1
                    else:
                        dcase.alreadycovered = True
                        rule_replace.class_cases_covered[clazz] -= 1

                rule_errors += self.errors_of_rule(rule)
                rule_supcount += rule.support_count

                classdist = self.update_class_distr(classdist, rule)

                default_class = self.select_default_class(classdist)
                default_class_count = default_class[1]
                default_class_label = default_class[0]

                default_errors = self.dataset_len - rule_supcount - default_class_count

                total_errors = rule_errors + default_errors

                rules_list.append(rule)
                default_classes_list.append(default_class_label)
                total_errors_list.append(total_errors)

        print(total_errors_list)

        if not total_errors_list:
            # No rule passed the filter above; min() would raise on the
            # empty list, so produce a rule-less classifier instead.
            self.classifier = []
            self.default_class = overall_default_label
            return

        # First position with the lowest total error.
        min_idx = total_errors_list.index(min(total_errors_list))

        final_classifier = list(rules_list[:min_idx + 1])
        default_class = default_classes_list[min_idx]

        if default_class is None:
            # No class was left at the cut point. Look at earlier cut points
            # only (never wrap around to later ones via negative indices)
            # and fall back to the overall majority class if none has one.
            for earlier_label in reversed(default_classes_list[:min_idx]):
                if earlier_label is not None:
                    default_class = earlier_label
                    break
            else:
                default_class = overall_default_label

        self.classifier = final_classifier
        self.default_class = default_class

    def emptyrule(self):
        """Return a placeholder rule with an empty antecedent and no class."""
        return ClassAssocationRule(Antecedent([]), Consequent(None, None), 0, 0)

    def maxcoverrule(self, datacase, rules):
        """
        finds the highest precedence rules that cover
        the case d


        arguments:
            datacase: instance d
            rules: rules sorted by descending precedence

        returns:
            (crule, wrule): the first covering rule whose consequent
            equals the class of the case, and the first covering rule
            whose consequent differs; either one is None when no such
            rule exists

        """
        crule, wrule = None, None

        for rule in rules:
            if not (rule.antecedent <= datacase):
                continue

            if rule.consequent == datacase.class_val:
                if crule is None:
                    # save cRule
                    crule = rule
            elif wrule is None:
                # save wRule
                wrule = rule

            # Both found: later rules have lower precedence, stop early.
            if crule is not None and wrule is not None:
                break

        return crule, wrule

    def allcover_rules(self, U, datacase, crule):
        """Return the rules in ``U`` that outrank ``crule``, cover
        ``datacase`` and predict a class different from the case's own."""
        wset = set()

        for replacingrule in U:
            if replacingrule > crule and replacingrule.antecedent <= datacase and replacingrule.consequent.value != datacase.class_val.value:
                wset.add(replacingrule)

        return wset

    def errors_of_rule(self, rule):
        """Store the rule's support count and return its number of errors.

        The support count is the sum of ``class_cases_covered``; the errors
        are that sum minus the count for the rule's own class.
        """
        rule.support_count = sum(rule.class_cases_covered.values())
        return rule.support_count - rule.class_cases_covered[rule.consequent.value]

    def select_default_class(self, classdist):
        """Return ``(label, count)`` of the most common class in
        ``classdist``, or ``(None, 0)`` when it is empty."""
        most_common = classdist.most_common(1)

        if not most_common:
            return (None, 0)

        return most_common[0]
    


In [117]:
from cba.data_structures import ComparableItemSet
from cba.data_structures import Item


class Transaction(ComparableItemSet):
    """One data case: a list of ``Item`` objects plus its class item.

    Hashing is based on the items only (the class item is not part of the
    hash), and equality is defined as hash equality between item sets.
    """

    # Counter used to hand out sequential transaction ids.
    id_ = 0

    def __init__(self, row, header, class_item):
        """
        arguments:
            row: attribute values of the case (class value excluded)
            header: attribute labels, indexed in parallel with ``row``
            class_item: class of the case; must unpack into (key, value)
        """
        self.class_val = class_item
        self.items = []
        self.tid = Transaction.id_
        Transaction.id_ += 1

        # Flags initialised here and toggled by code outside this class.
        self.alreadycovered = False
        self.hidden = False

        # "label:=:value" strings, one per attribute, class entry last
        self.string_items = []

        for idx, val in enumerate(row):
            header_label = header[idx]

            item = Item(header_label, val)

            self.string_items.append("{}:=:{}".format(header_label, val))

            self.items.append(item)

        key, val = self.class_val
        self.string_items.append("{}:=:{}".format(key, val))

        # Snapshot of the iterated contents of this transaction.
        self.frozenset = frozenset(self)

    def __repr__(self):
        string = ", ".join(self.string_items)
        return "{" + string + "}"

    def __hash__(self):
        return hash(tuple(self.items))

    def __eq__(self, other):
        """Hash-based equality, restricted to item-set objects.

        Without the type guard, comparing against an unhashable object
        raised TypeError, and any unrelated object whose hash happened to
        coincide (for example an int) compared equal. Returning
        NotImplemented lets Python fall back to its default handling.

        The guard uses the base class rather than the name ``Transaction``
        because that name is re-imported from ``cba.data_structures``
        further down in this notebook.
        """
        if not isinstance(other, ComparableItemSet):
            return NotImplemented

        return hash(self) == hash(other)

    def __getitem__(self, idx):
        return self.items[idx]

    def getclass(self):
        """Return the class item of this transaction."""
        return self.class_val
    
    
    
class UniqueTransaction(Transaction):
    """Transaction that hashes on its ``tid`` instead of its items, so
    two transactions with identical items still hash differently as long
    as their ids differ."""

    def __hash__(self):
        transaction_id = self.tid
        return hash(transaction_id)

In [118]:
from cba.data_structures import Appearance, Transaction, Item

class TransactionDB:
    """In-memory collection of transactions built from tabular rows.

    The last column of every row is taken as the class; the remaining
    columns become the items of the transaction.
    """

    def __init__(self, dataset, header, unique_transactions=False):
        """
        arguments:
        - dataset: [[primitive]] - rows, class value in the last position
        - header: [string] - feature labels, class label in the last position
        - unique_transactions: build UniqueTransaction objects instead of
          Transaction objects

        expected (not checked here):
        - len(header) == len(row) for every row

        """
        if unique_transactions:
            make_transaction = UniqueTransaction
        else:
            make_transaction = Transaction

        self.header = header
        self.class_labels = []

        transactions = []

        for row in dataset:
            label_item = Item(header[-1], row[-1])
            transaction = make_transaction(row[:-1], header[:-1], label_item)

            self.class_labels.append(label_item)
            transactions.append(transaction)

        self.data = transactions

        # second component of every class item
        self.classes = [label[1] for label in self.class_labels]

        # list of per-transaction string item lists
        self.string_representation = [txn.string_items for txn in self]

    @property
    def appeardict(self):
        """Dictionary of an ``Appearance`` in which every distinct class
        item has been registered through ``add_to_RHS``."""
        appearance = Appearance()

        for class_item in set(self.class_labels):
            appearance.add_to_RHS(class_item)

        return appearance.dictionary

    def __getitem__(self, idx):
        return self.data[idx]

    @classmethod
    def from_DataFrame(clazz, df, unique_transactions=False):
        """
        Build an instance from a pandas DataFrame; the column labels
        become the header and ``df.values`` the rows.
        """
        return clazz(
            df.values,
            list(df.columns.values),
            unique_transactions=unique_transactions,
        )

    def __repr__(self):
        return repr(self.string_representation)

    def __len__(self):
        return len(self.data)
        


In [174]:
from cba.algorithms import Classifier
import pandas as pd
from cba.algorithms import generateCARs
import sklearn.metrics as skmetrics

# Name of the dataset; only used to build the test-set path below.
dsname = "credit-g0"

# Training data: the movies file from the repository's data directory.
# NOTE(review): the variables are still called iris_* although the file
# read here is movies_discr.csv.
#iris_train = pd.read_csv("c:/code/python/machine_learning/assoc_rules/train/{}.csv".format(dsname))
iris_train = pd.read_csv("../data/movies_discr.csv", sep=";")
iris_train = iris_train.set_index("Unnamed: 0")
iris_txns = TransactionDB.from_DataFrame(iris_train, unique_transactions=True)

# NOTE(review): the test set is read from an absolute, machine-specific path
# and from a different dataset (dsname) than the training data above -- this
# cell fails on any other machine and the two sets do not belong together.
iris_test = pd.read_csv("c:/code/python/machine_learning/assoc_rules/test/{}.csv".format(dsname))
iris_txns_test = TransactionDB.from_DataFrame(iris_test, unique_transactions=True)



In [175]:
iris_train

Unnamed: 0_level_0,estimated-budget,a-list-celebrities,class
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,<150;200),<0;2),box-office-bomb
1,<50;100),<0;2),box-office-bomb
2,<50;100),<0;2),box-office-bomb
3,<50;100),<2;4),box-office-bomb
4,<200;250),<0;2),box-office-bomb
5,<150;200),<0;2),box-office-bomb
6,<0;50),<0;2),box-office-bomb
7,<200;250),<2;4),box-office-bomb
8,<150;200),<2;4),box-office-bomb
9,<100;150),<0;2),box-office-bomb


In [176]:
from cba.algorithms import top_rules, createCARs
import fim

def generateCARs(transactionDB, support=1, confidence=50, maxlen=10, **kwargs):
    """Mine rules with ``fim.apriori`` and convert them with ``createCARs``.

    arguments:
        transactionDB: object providing ``appeardict`` and
            ``string_representation``
        support: forwarded to apriori as ``supp``
        confidence: forwarded to apriori as ``conf``
        maxlen: forwarded to apriori as ``zmax``
        kwargs: any further keyword arguments for ``fim.apriori``

    Prints "done" once mining has finished.
    """
    mined_rules = fim.apriori(
        transactionDB.string_representation,
        supp=support,
        conf=confidence,
        target="r",
        report="sc",
        appear=transactionDB.appeardict,
        zmax=maxlen,
        **kwargs
    )

    print("done")

    return createCARs(mined_rules)



In [180]:
# Mine the candidate rules once and build a classifier with each algorithm.
# NOTE(review): support=-1 presumably relies on pyfim treating a negative
# supp as an absolute transaction count -- confirm.
cars = generateCARs(iris_txns, support=-1, maxlen=4)

# NOTE(review): both builders receive the same `cars` list and the same rule
# objects. M1Algorithm.build sorts the list in place and sets `marked` on the
# rules it keeps, and M2Algorithm.build reads `marked` in stage2, so the
# second build is not independent of the first.
m1clf = M1Algorithm(cars, iris_txns).build()
m2clf = M2Algorithm(cars, iris_txns).build()

print(m1clf.inspect())

print(m2clf.inspect())

# Evaluation on the test transactions is currently disabled.
#print(m1clf.test_transactions(iris_txns_test))
#print(m2clf.test_transactions(iris_txns_test))

done
[15, 13, 13, 12, 12, 11, 10, 9, 9, 9, 8, 8]
[15, 13, 13, 12, 12, 11, 10, 9, 9, 9, 8, 8]
                                                  lhs                     rhs  \
0                          {a-list-celebrities=<0;2)}   class=box-office-bomb   
1                        {estimated-budget=<250;300)}   class=main-stream-hit   
2   {a-list-celebrities=<4;6),estimated-budget=<0;...  class=critical-success   
3                          {a-list-celebrities=<6;8)}  class=critical-success   
4   {estimated-budget=<100;150),a-list-celebrities...   class=main-stream-hit   
5   {a-list-celebrities=<4;6),estimated-budget=<15...   class=main-stream-hit   
6                        {estimated-budget=<200;250)}   class=box-office-bomb   
7   {a-list-celebrities=<4;6),estimated-budget=<50...  class=critical-success   
8                           {estimated-budget=<0;50)}   class=box-office-bomb   
9                        {estimated-budget=<100;150)}   class=main-stream-hit   
10  {a-list-cele

In [181]:
# Compare the rule lists chosen by the two builders (list equality, so the
# order of the rules matters as well).
m1clf.rules == m2clf.rules

True