In [2]:
from utils import *

In [12]:
class LabeledSet:
    '''
        Growable collection of examples (rows of x) with their labels (rows of y),
        plus the declaration of which attributes are ordinal.
    '''

    def __init__(self, input_dimension, ordinal_attributes):
        '''
            input_dimension : number of attributes per example
            ordinal_attributes : {index of ordinal attribute: ordered values}
            create an empty labeled set
        '''
        self.ordinal_attributes = ordinal_attributes
        self.input_dimension = input_dimension
        self.nb_examples = 0

    def addExample(self, vector, label):
        '''
            vector : attribute values of the new example
            label : label of the new example
            append the example at the end of the set
        '''
        if self.nb_examples:
            # later examples are stacked under the existing rows
            self.x = np.vstack((self.x, vector))
            self.y = np.vstack((self.y, label))
        else:
            # first example: the arrays x and y are created here
            self.x = np.array([vector])
            self.y = np.array([label])

        self.nb_examples += 1

    def getInputDimension(self):
        ''' return the dimension of the input space '''
        return self.input_dimension

    def size(self):
        ''' return the number of examples in the set '''
        return self.nb_examples

    def getX(self, i):
        ''' return the attribute vector x_i '''
        return self.x[i]

    def getY(self, i):
        ''' return the label y_i '''
        return self.y[i]

In [46]:
# Scratch example: a 2-attribute set in which attribute 1 is declared ordinal
# with the ordered values 'a', 'b', 'c'.
val = dict()
val[1] = ['a', 'b', 'c']
a_set = LabeledSet(2, val)

# Three examples with the same label; every vector mixes an int and a str.
a_set.addExample([0, 'a'], 1)
a_set.addExample([0, 'b'], 1)
a_set.addExample([0, 'c'], 1)

# The values end up stored as strings (see the '<U21' dtype printed further
# down), so this is a string comparison against '3', not a numeric one.
a_set.x[0][0] < '3'

True

In [78]:
# Work on a copy of x and append each example's row index as an extra last column.
values = a_set.x.copy()
ind = np.transpose(np.array([[i for i in range(a_set.size())]]))
values = np.hstack((values, ind))

In [79]:
# Display the stacked array (attribute columns followed by the row-index column).
values

array([['0', 'a', '0'],
       ['0', 'b', '1'],
       ['0', 'c', '2']], dtype='<U21')

In [81]:
# Value of attribute 1 for example 1; ind is reset to an empty list.
v = a_set.getX(1)[1]
ind = []

In [57]:
# Position of example 2's attribute-1 value inside the declared ordinal order.
v = a_set.getX(2)[1]
att_values = a_set.ordinal_attributes[1]
ind_v = att_values.index(v)
# Attempt to rank every row at once: list.index() is handed the whole numpy
# column instead of a single value, and the cell fails with the TypeError shown below.
att_values.index(values[:,1]) >= ind_v

TypeError: cannot perform reduce with flexible type

In [15]:
class F_layer:
    '''
        object-wise local monotonicity measure

        Base class: it builds the equal / dominant sets shared by all measures,
        subclasses implement value().
        Every *_sets_* method returns {object index: set of object indices}; the
        indices are plain Python ints, so they can be used directly as keys of
        the returned dictionaries whatever the dtype of the data.
    '''

    def value(self, w_i, labeled_set, esa, esl):
        '''
            w_i : index of object in labeled_set
            labeled_set : labeled set
            esa : sets generated by the attribute, {index: set of indices}
            esl : sets generated by the label, {index: set of indices}
            to be overridden by each measure
        '''
        raise NotImplementedError

    @staticmethod
    def _index_sets(keys, n, dominant):
        '''
            keys : 1-D numpy array with one comparable key per object
            n : number of objects
            dominant : True -> objects whose key is >= the key of w,
                       False -> objects whose key is == the key of w
            return {w: set of indices of the selected objects}
        '''
        sets = dict()
        for i in range(n):
            mask = (keys >= keys[i]) if dominant else (keys == keys[i])
            # flatnonzero gives the row numbers directly: no index column has to
            # be stacked next to the data (which used to turn indices into
            # strings or floats)
            sets[i] = set(np.flatnonzero(mask).tolist())
        return sets

    @staticmethod
    def _label_keys(labeled_set):
        '''
            return the labels as a 1-D array (y may be stored as a column or,
            for a single example, as a flat array)
        '''
        return np.asarray(labeled_set.y).reshape(labeled_set.size(), -1)[:, 0]

    def equal_sets_attribute(self, labeled_set, a_j):
        '''
            a_j : index of attribute in labeled_set
            labeled_set : labeled set
            for each object w in labeled_set, return its equal set generated by a_j
        '''
        return self._index_sets(labeled_set.x[:, a_j], labeled_set.size(), dominant=False)

    def equal_sets_label(self, labeled_set):
        '''
            labeled_set : labeled set
            for each object w in labeled_set, return its equal set generated by its label
        '''
        return self._index_sets(self._label_keys(labeled_set), labeled_set.size(), dominant=False)

    def dominant_sets_attribute(self, labeled_set, a_j):
        '''
            a_j : index of attribute in labeled_set
            labeled_set : labeled set
            for each object w in labeled_set, return its dominant set generated by a_j

            If a_j is declared in labeled_set.ordinal_attributes, values are compared
            through their position in the declared order; otherwise they are compared
            directly. A value missing from the declared order raises ValueError.
        '''
        column = labeled_set.x[:, a_j]
        # labeled sets built without an ordinal declaration are treated as fully numeric
        ordinal = getattr(labeled_set, 'ordinal_attributes', None) or {}

        if a_j in ordinal:
            order = list(ordinal[a_j])
            # rank each value individually (list.index cannot take a whole column)
            column = np.array([order.index(v) for v in column])

        return self._index_sets(column, labeled_set.size(), dominant=True)

    def dominant_sets_label(self, labeled_set):
        '''
            labeled_set : labeled set
            for each object w in labeled_set, return its dominant set generated by its label
        '''
        return self._index_sets(self._label_keys(labeled_set), labeled_set.size(), dominant=True)
    
class Ds(F_layer):
    '''
        ds measure, computed from the equal sets of an object
    '''
    def value(self, w_i, labeled_set, esa, esl):
        '''
            w_i : index of object in labeled_set
            labeled_set : labeled set (not read by this measure)
            esa : equal sets generated by the attribute, {index: set of indices}
            esl : equal sets generated by the label, {index: set of indices}
            return the share of esa[w_i] that also belongs to esl[w_i]
        '''
        attribute_set = esa[w_i]
        shared = attribute_set.intersection(esl[w_i])

        return len(shared) * 1.0 / len(attribute_set)

class Dsr(F_layer):
    '''
        dsr measure, computed from the dominant sets of an object
    '''

    def value(self, w_i, labeled_set, dsa, dsl):
        '''
            w_i : index of object in labeled_set
            labeled_set : labeled set (not read by this measure)
            dsa : dominant sets generated by the attribute, {index: set of indices}
            dsl : dominant sets generated by the label, {index: set of indices}
            return the share of dsa[w_i] that also belongs to dsl[w_i]
        '''
        attribute_set = dsa[w_i]
        shared = attribute_set.intersection(dsl[w_i])

        return len(shared) * 1.0 / len(attribute_set)
    
class Minds(F_layer):
    '''
        minds measure, computed from the equal sets of an object
    '''

    def value(self, w_i, labeled_set, esa, esl):
        '''
            w_i : index of object in labeled_set
            labeled_set : labeled set (not read by this measure)
            esa : equal sets generated by the attribute, {index: set of indices}
            esl : equal sets generated by the label, {index: set of indices}
            return min over w_h in esa[w_i] of |esa[w_h] ∩ esl[w_h]|, divided by |esa[w_i]|
        '''
        equal_set = esa[w_i]
        smallest = min(len(esa[w_h].intersection(esl[w_h])) for w_h in equal_set)

        return smallest * 1.0 / len(equal_set)
    
class Mindsr(F_layer):
    '''
        mindsr measure, computed from the dominant sets of an object
    '''
    def value(self, w_i, labeled_set, dsa, dsl):
        '''
            w_i : index of object in labeled_set
            labeled_set : labeled set (not read by this measure)
            dsa : dominant sets generated by the attribute, {index: set of indices}
            dsl : dominant sets generated by the label, {index: set of indices}
            return min over w_h in dsa[w_i] of |dsa[w_h] ∩ dsl[w_h]|, divided by |dsa[w_i]|
        '''
        dominant_set = dsa[w_i]
        smallest = min(len(dsa[w_h].intersection(dsl[w_h])) for w_h in dominant_set)

        return smallest * 1.0 / len(dominant_set)
    
class Maxdsr(F_layer):
    '''
        maxdsr measure, computed from the dominant sets of an object
    '''
    def value(self, w_i, labeled_set, dsa, dsl):
        '''
            w_i : index of object in labeled_set
            labeled_set : labeled set (not read by this measure)
            dsa : dominant sets generated by the attribute, {index: set of indices}
            dsl : dominant sets generated by the label, {index: set of indices}
            return max over w_h in dsa[w_i] of |dsa[w_h] ∩ dsl[w_h]|, divided by |dsa[w_i]|
        '''
        dominant_set = dsa[w_i]
        largest = max(len(dsa[w_h].intersection(dsl[w_h])) for w_h in dominant_set)

        return largest * 1.0 / len(dominant_set)
    
class Maxds(F_layer):
    '''
        maxds measure, computed from the equal sets of an object
    '''

    def value(self, w_i, labeled_set, esa, esl):
        '''
            w_i : index of object in labeled_set
            labeled_set : labeled set (not read by this measure)
            esa : equal sets generated by the attribute, {index: set of indices}
            esl : equal sets generated by the label, {index: set of indices}
            return max over w_h in esa[w_i] of |esa[w_h] ∩ esl[w_h]|, divided by |esa[w_i]|
        '''
        equal_set = esa[w_i]
        largest = max(len(esa[w_h].intersection(esl[w_h])) for w_h in equal_set)

        return largest * 1.0 / len(equal_set)

class Avgdsr(F_layer):
    '''
        avgdsr measure, computed from the dominant sets of an object
    '''
    def value(self, w_i, labeled_set, dsa, dsl):
        '''
            w_i : index of object in labeled_set
            labeled_set : labeled set (not read by this measure)
            dsa : dominant sets generated by the attribute, {index: set of indices}
            dsl : dominant sets generated by the label, {index: set of indices}
            return the mean over w_h in dsa[w_i] of |dsa[w_h] ∩ dsl[w_h]|, divided by |dsa[w_i]|
        '''
        dominant_set = dsa[w_i]
        total = np.sum([len(dsa[w_h].intersection(dsl[w_h])) for w_h in dominant_set])
        size = len(dominant_set)

        # mean intersection size, then normalised once more by the size of the set
        return (1.0 / size * total) * 1.0 / size

class Avgds(F_layer):
    '''
        avgds measure, computed from the equal sets of an object
    '''
    def value(self, w_i, labeled_set, esa, esl):
        '''
            w_i : index of object in labeled_set
            labeled_set : labeled set (not read by this measure)
            esa : equal sets generated by the attribute, {index: set of indices}
            esl : equal sets generated by the label, {index: set of indices}
            return the mean over w_h in esa[w_i] of |esa[w_h] ∩ esl[w_h]|, divided by |esa[w_i]|
        '''
        equal_set = esa[w_i]
        total = np.sum([len(esa[w_h].intersection(esl[w_h])) for w_h in equal_set])
        size = len(equal_set)

        # mean intersection size, then normalised once more by the size of the set
        return (1.0 / size * total) * 1.0 / size

In [5]:
class G_layer:
    '''
        object-wise local non-monotonicity measure

        Base class: subclasses turn the value computed by an F layer
        into a non-monotonicity value.
    '''

    def value(self, f_value):
        '''
            f_value : value computed by f_layer
            to be overridden by subclasses
        '''
        raise NotImplementedError()
        
class Log(G_layer):
    '''
        G layer based on the base-2 logarithm
    '''
    def value(self, f_value):
        '''
            f_value : value computed by f_layer
            return the opposite of the base-2 logarithm of f_value
        '''
        log_2 = log(f_value, 2)
        return -log_2
    
class One_minus(G_layer):
    '''
        G layer based on the complement to one
    '''
    def value(self, f_value):
        '''
            f_value : value computed by f_layer
            return the complement of f_value to 1
        '''
        complement = 1 - f_value
        return complement

In [6]:
class H_layer:
    '''
        aggregated local non-monotonicity measure

        Base class: subclasses combine the object-wise values of a G layer
        into a single value for the labeled set.
    '''

    def value(self, g_values, labeled_set):
        '''
            g_values : values computed by g_layer, one per object
            labeled_set : labeled set
            to be overridden by subclasses
        '''
        raise NotImplementedError()
        
class Sum(H_layer):
    '''
        H layer averaging the object-wise values over the labeled set
    '''
    def value(self, g_values, labeled_set):
        '''
            g_values : values computed by g_layer, one per object
            labeled_set : labeled set
            return the sum of g_values divided by the number of examples
        '''
        weight = 1.0 / labeled_set.size()
        return weight * np.sum(g_values)

In [7]:
class Gdm:
    '''
        Generic rank discrimination measure

        The measure of an attribute is h(g(f(w_i)) for every object w_i).
    '''
    def __init__(self, h, g, f):
        '''
            h : aggregated local non-monotonicity measure (H_layer)
            g : object-wise local non-monotonicity measure (G_layer)
            f : object-wise local monotonicity measure (F_layer)
        '''
        self.h = h
        self.g = g
        self.f = f

    def value(self, labeled_set, a_j):
        '''
            labeled_set : labeled set
            a_j : index of attribute in labeled_set
            return the value of the measure for attribute a_j
        '''
        # every measure written for dominant sets must receive dominant sets,
        # not only Dsr; the other measures work on equal sets
        if isinstance(self.f, (Dsr, Mindsr, Maxdsr, Avgdsr)):
            sa = self.f.dominant_sets_attribute(labeled_set, a_j)
            sl = self.f.dominant_sets_label(labeled_set)
        else:
            sa = self.f.equal_sets_attribute(labeled_set, a_j)
            sl = self.f.equal_sets_label(labeled_set)

        g_f = [self.g.value(self.f.value(i, labeled_set, sa, sl))
               for i in range(labeled_set.size())]

        return self.h.value(g_f, labeled_set)

In [8]:
def discretize(H, labeled_set, a_j):
    '''
        H : discrimination measure, used as H.value(labeled_set, a_j)
        labeled_set : labeled set
        a_j : index of the numeric attribute to discretize

        Candidate thresholds are the midpoints between two consecutive distinct
        values of a_j (in sorted order) whose examples carry different labels.
        Each candidate is scored by H on a copy of the set where a_j is replaced
        by 0 (value below the threshold) or 1 (value above).

        return (threshold, H value) of the candidate with the smallest H value
        (the first one in case of ties), or (None, inf) when there is no candidate
    '''
    import copy

    n = labeled_set.size()
    column = labeled_set.x[:, a_j]
    order = np.argsort(column)  # only the column being discretized needs sorting

    # Shallow copy instead of a constructor call: works whatever the signature
    # of the labeled set class is, and keeps its other attributes.
    binary_set = copy.copy(labeled_set)
    binary_set.nb_examples = n
    binary_set.x = labeled_set.x.copy()
    binary_set.x[:, a_j] = 1
    binary_set.y = labeled_set.y

    best_threshold = None
    best_value = float('inf')

    for i in range(n - 1):
        here, after = order[i], order[i + 1]
        # the example leaves the "above" side before the next candidate is scored
        binary_set.x[here, a_j] = 0

        current, lookahead = column[here], column[after]
        same_label = np.array_equal(labeled_set.getY(here), labeled_set.getY(after))
        if current == lookahead or same_label:
            continue

        h_value = H.value(binary_set, a_j)
        if best_threshold is None or h_value < best_value:
            best_threshold = (current + lookahead) / 2.0
            best_value = h_value

    return (best_threshold, best_value)

In [None]:
def majority_class(labeled_set, labels):
    '''
        labeled_set : labeled set
        labels : sequence of the possible classes
        return the class of labels with the most examples in labeled_set
        (the first such class in case of ties)
    '''
    y = np.asarray(labeled_set.y)
    class_sizes = [np.count_nonzero(y == label) for label in labels]

    return labels[int(np.argmax(class_sizes))]

In [10]:
def divide(Lset, att, threshold):
    '''
        Lset : labeled_set
        att : index of attribute to divide
        threshold : threshold value
        return (E1, E2): the examples whose value of att is <= threshold,
        and the examples whose value is > threshold, in their original order
    '''
    def new_subset():
        # Same class as the input; the ordinal declaration is forwarded when
        # the labeled set carries one (both constructor signatures are supported).
        if hasattr(Lset, 'ordinal_attributes'):
            return type(Lset)(Lset.getInputDimension(), Lset.ordinal_attributes)
        return type(Lset)(Lset.getInputDimension())

    E1 = new_subset()
    E2 = new_subset()

    # Separate data according to threshold
    for i in range(Lset.size()):
        example = Lset.getX(i)
        side = E1 if example[att] <= threshold else E2
        side.addExample(example, Lset.getY(i))

    return E1, E2

In [None]:
class GenericTree:
    '''
        Generic tree
        deal with both numeric and ordinal attributes
        deal with multi-class classification

        A node is a leaf (label set, attribute None), a categorical node
        (children indexed by category) or a binary node (threshold, inf, sup).
    '''
    def __init__(self):
        self.attribute = None
        self.children = None
        self.label = None

        # binary tree
        self.threshold = None
        self.inf = None
        self.sup = None

    def isLeaf(self):
        """
            return True if tree is a leaf
        """
        return self.attribute is None

    def add_children(self, children, att):
        """
            children: dictionary key=category, value=tree
            att: index of attribute
        """
        self.attribute = att
        self.children = children

    def add_children_binary(self, inf, sup, att, threshold):
        """
            inf, sup : trees
            att : index of attribute
            threshold : threshold value
        """
        self.attribute = att
        self.threshold = threshold
        self.inf = inf
        self.sup = sup

    def addLeaf(self, label):
        """
            add leaf corresponding to label
        """
        self.label = label

    def classify(self, example):
        """
            example : numpy array in labeled set
            classify example
            return the predicted label, or None when the example's category
            has no child in a categorical node
        """
        if self.isLeaf():
            return self.label

        if self.threshold is None:
            # categorical node: follow the child whose category matches
            for category, child in self.children.items():
                if category == example[self.attribute]:
                    return child.classify(example)
            return None

        if example[self.attribute] <= self.threshold:
            return self.inf.classify(example)
        return self.sup.classify(example)

    def to_graph(self, g, prefix='A'):
        """
            build a representation of the tree
            g : graph object providing node() and edge()
            prefix : identifier of the node created for this tree
            return g
        """
        if self.isLeaf():
            g.node(prefix, str(self.label), shape='box')
            return g

        g.node(prefix, str(self.attribute))

        if self.threshold is None:
            for k, (category, child) in enumerate(self.children.items()):
                # index-based identifiers stay unique whatever the category text is
                child_id = prefix + '_' + str(k)
                child.to_graph(g, child_id)
                g.edge(prefix, child_id, str(category))
        else:
            self.inf.to_graph(g, prefix + "l")
            self.sup.to_graph(g, prefix + "r")
            g.edge(prefix, prefix + "l", '<=' + str(self.threshold))
            g.edge(prefix, prefix + "r", '>' + str(self.threshold))
        return g

In [None]:
def build_DT(labeled_set, H, H_stop, measureThreshold, maxDepth, percMinSize, labels, current_depth):
    '''
        labeled_set : labeled set
        H : rank discrimination measure used for discretization
        H_stop : discrimination measure (shannon, gini ...)
        measureThreshold : lower bound for H_stop
        maxDepth : maximum length of a path from the root to a leaf node
        percMinSize : minimum size of the current object set labeled_set
        labels : sequence of the possible classes
        current_depth : depth of the node being built (0 for the root)

        return a GenericTree built recursively with binary splits on numeric attributes

        NOTE(review): H_stop and measureThreshold are accepted but not used yet;
        the draft never called them and the calling convention of H_stop is not
        visible here. TODO: add the corresponding stopping test.
        TODO: non-numeric (ordinal) attributes are skipped, their split was never written.
    '''
    import numbers

    def leaf_for(subset):
        leaf = GenericTree()
        leaf.addLeaf(majority_class(subset, labels))
        return leaf

    # stopping criteria on depth and on the size of the node
    if current_depth >= maxDepth or labeled_set.size() < percMinSize:
        return leaf_for(labeled_set)

    best_value = float('inf')
    attribute = None
    threshold = None

    for a_j in range(labeled_set.getInputDimension()):
        if not isinstance(labeled_set.getX(0)[a_j], numbers.Real):
            continue
        try:
            candidate, h = discretize(H, labeled_set, a_j)
        except ValueError:
            # no candidate threshold for this attribute
            continue
        if candidate is None:
            continue
        if attribute is None or h < best_value:
            best_value, attribute, threshold = h, a_j, candidate

    if attribute is None:
        # nothing left to split on
        return leaf_for(labeled_set)

    inf, sup = divide(labeled_set, attribute, threshold)

    if inf.size() < percMinSize or sup.size() < percMinSize:
        # as in the draft, the leaf takes the majority class of the other side
        return leaf_for(inf if sup.size() < percMinSize else sup)

    tree = GenericTree()
    tree.add_children_binary(
        build_DT(inf, H, H_stop, measureThreshold, maxDepth, percMinSize, labels, current_depth + 1),
        build_DT(sup, H, H_stop, measureThreshold, maxDepth, percMinSize, labels, current_depth + 1),
        attribute,
        threshold,
    )
    return tree

In [None]:
from math import *
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import graphviz as gv
import numbers

########## LABELEDSET ##########

class LabeledSet:
    '''
        Growable collection of examples (rows of x) with their labels (rows of y).
    '''
    def __init__(self, input_dimension):
        '''
            input_dimension : number of attributes per example
            create an empty labeled set
        '''
        self.nb_examples = 0
        self.input_dimension = input_dimension

    def addExample(self, vector, label):
        '''
            vector : attribute values of the new example
            label : label of the new example
            append the example at the end of the set
        '''
        if self.nb_examples:
            # later examples are stacked under the existing rows
            self.x = np.vstack((self.x, vector))
            self.y = np.vstack((self.y, label))
        else:
            # first example: the arrays x and y are created here
            self.x = np.array([vector])
            self.y = np.array([label])

        self.nb_examples += 1

    def getInputDimension(self):
        ''' return the dimension of the input space '''
        return self.input_dimension

    def size(self):
        ''' return the number of examples in the set '''
        return self.nb_examples

    def getX(self, i):
        ''' return the attribute vector x_i '''
        return self.x[i]

    def getY(self, i):
        ''' return the label y_i '''
        return self.y[i]
    
        
##################################

########## CLASSIFIER ##########

class Classifier:
    """ Base class of the classifiers: subclasses implement __init__, predict and train """

    def __init__(self, input_dimension):
        """ Constructor, to be overridden """
        raise NotImplementedError("Please Implement this method")

    def predict(self, x):
        """ Compute the prediction for x and return a score; to be overridden """
        raise NotImplementedError("Please Implement this method")

    def train(self, labeledSet):
        """ Train the model on a labeled data set; to be overridden """
        raise NotImplementedError("Please Implement this method")

    def accuracy(self, set):
        """ Return the rate of good classification on set: an example counts as
            well classified when its score times its label is positive
        """
        hits = sum(
            1
            for i in range(set.size())
            if self.predict(set.getX(i)) * set.getY(i) > 0
        )
        return hits / (set.size() * 1.0)
    
def plot_frontiere(set, classifier, step=10):
    """ LabeledSet * Classifier * int -> NoneType
        Note: the 3rd argument is optional and gives the "resolution" of the plot
        display the decision boundary associated with the classifier
    """
    lower = set.x.min(0)
    upper = set.x.max(0)
    axis_1 = np.linspace(lower[0], upper[0], step)
    axis_2 = np.linspace(lower[1], upper[1], step)
    x1grid, x2grid = np.meshgrid(axis_1, axis_2)
    grid = np.hstack((x1grid.reshape(x1grid.size, 1), x2grid.reshape(x2grid.size, 1)))

    # compute the prediction for each point of the grid
    scores = np.array([classifier.predict(point) for point in grid])
    res = scores.reshape(x1grid.shape)
    # draw the boundaries
    plt.contourf(x1grid, x2grid, res, colors=["red", "cyan"], levels=[-1000, 0, 1000], linewidth=2)

######################################################    
    
########### Entropy ###########  

def majority_class(labeled_set, labels):
    '''
        labeled_set : labeled set
        labels : sequence of the possible classes
        return the class of labels with the most examples in labeled_set
        (the first such class in case of ties)
    '''
    y = np.asarray(labeled_set.y)
    class_sizes = [np.count_nonzero(y == label) for label in labels]

    return labels[int(np.argmax(class_sizes))]

################# TREE REPRESENTATION ###############

class GenericTree:
    '''
        Generic tree
        deal with both numeric and ordinal attributes
        deal with multi-class classification

        A node is a leaf (label set, attribute None), a categorical node
        (children indexed by category) or a binary node (threshold, inf, sup).
    '''
    def __init__(self):
        self.attribute = None
        self.children = None
        self.label = None

        # binary tree
        self.threshold = None
        self.inf = None
        self.sup = None

    def isLeaf(self):
        """
            return True if tree is a leaf
        """
        return self.attribute is None

    def add_children(self, children, att):
        """
            children: dictionary key=category, value=tree
            att: index of attribute
        """
        self.attribute = att
        self.children = children

    def add_children_binary(self, inf, sup, att, threshold):
        """
            inf, sup : trees
            att : index of attribute
            threshold : threshold value
        """
        self.attribute = att
        self.threshold = threshold
        self.inf = inf
        self.sup = sup

    def addLeaf(self, label):
        """
            add leaf corresponding to label
        """
        self.label = label

    def classify(self, example):
        """
            example : numpy array in labeled set
            classify example
            return the predicted label, or None when the example's category
            has no child in a categorical node
        """
        if self.isLeaf():
            return self.label

        if self.threshold is None:
            # categorical node: follow the child whose category matches
            for category, child in self.children.items():
                if category == example[self.attribute]:
                    return child.classify(example)
            return None

        if example[self.attribute] <= self.threshold:
            return self.inf.classify(example)
        return self.sup.classify(example)

    def to_graph(self, g, prefix='A'):
        """
            build a representation of the tree
            g : graph object providing node() and edge()
            prefix : identifier of the node created for this tree
            return g
        """
        if self.isLeaf():
            g.node(prefix, str(self.label), shape='box')
            return g

        g.node(prefix, str(self.attribute))

        if self.threshold is None:
            for k, (category, child) in enumerate(self.children.items()):
                # index-based identifiers stay unique whatever the category text is
                child_id = prefix + '_' + str(k)
                child.to_graph(g, child_id)
                g.edge(prefix, child_id, str(category))
        else:
            self.inf.to_graph(g, prefix + "l")
            self.sup.to_graph(g, prefix + "r")
            g.edge(prefix, prefix + "l", '<=' + str(self.threshold))
            g.edge(prefix, prefix + "r", '>' + str(self.threshold))
        return g

def build_DT(labeled_set, H, measureThreshold, maxDepth, percMinSize, labels, current_depth):
    '''
        labeled_set : labeled set
        H : discrimination measure to minimize for splitting
        measureThreshold : lower bound for the discrimination measure
        maxDepth : maximum length of a path from the root to a leaf node
        percMinSize : minimum size of the current object set labeled_set
        labels : sequence of the possible classes
        current_depth : depth of the node being built (0 for the root)

        return a GenericTree built recursively with binary splits on numeric attributes

        NOTE(review): measureThreshold is accepted but not used yet; the draft
        only sketched an entropy-based stopping test whose helpers are not
        visible here. TODO: add the corresponding stopping test.
        TODO: non-numeric (ordinal) attributes are skipped, their split was never written.
    '''
    import numbers

    def leaf_for(subset):
        leaf = GenericTree()
        leaf.addLeaf(majority_class(subset, labels))
        return leaf

    # stopping criteria on depth and on the size of the node
    if current_depth >= maxDepth or labeled_set.size() < percMinSize:
        return leaf_for(labeled_set)

    best_value = float('inf')
    attribute = None
    threshold = None

    for attr in range(labeled_set.getInputDimension()):
        if not isinstance(labeled_set.getX(0)[attr], numbers.Real):
            continue
        try:
            candidate, h = discretize(H, labeled_set, attr)
        except ValueError:
            # no candidate threshold for this attribute
            continue
        if candidate is None:
            continue
        if attribute is None or h < best_value:
            best_value, attribute, threshold = h, attr, candidate

    if attribute is None:
        # nothing left to split on
        return leaf_for(labeled_set)

    inf, sup = divide(labeled_set, attribute, threshold)

    if inf.size() < percMinSize or sup.size() < percMinSize:
        # as in the draft, the leaf takes the majority class of the other side
        return leaf_for(inf if sup.size() < percMinSize else sup)

    tree = GenericTree()
    tree.add_children_binary(
        build_DT(inf, H, measureThreshold, maxDepth, percMinSize, labels, current_depth + 1),
        build_DT(sup, H, measureThreshold, maxDepth, percMinSize, labels, current_depth + 1),
        attribute,
        threshold,
    )
    return tree
        
#############################################################

########################## RDMT #############################
    
class RDMT(Classifier):
    '''
        Rank discrimination measure tree
    '''
    def __init__(self, H, measureThreshold, maxDepth, percMinSize, labels):
        '''
            H : discrimination measure to minimize for splitting
            measureThreshold : lower bound for the discrimination measure H
            maxDepth : maximum length of a path from the root to a leaf node
            percMinSize : minimum size of the current object set
            labels : set of classes
        '''
        # the tree only exists once train() has been called
        self.root = None
        self.labels = labels
        self.percMinSize = percMinSize
        self.maxDepth = maxDepth
        self.measureThreshold = measureThreshold
        self.H = H

    def predict(self, x):
        '''
            x : example to classify
            return the label given to x by the tree
        '''
        return self.root.classify(x)

    def train(self, set):
        '''
            set : training set
            keep a reference to set and build the tree from it, starting at depth 0
        '''
        self.set = set
        self.root = build_DT(
            set,
            self.H,
            self.measureThreshold,
            self.maxDepth,
            self.percMinSize,
            self.labels,
            0,
        )

    def plot(self):
        '''
            return the graph representation of the tree (png format)
        '''
        return self.root.to_graph(gv.Digraph(format='png'))