In [2]:
# Scientific papers
## Induction of Classification Rules by Granular Computing by Prof. Y.Y.Yao

In [3]:
# Importing libraries
import pandas as pd
import numpy as np
#from bigtree import *

In [4]:
# Build the information table: one row per object, described by three
# attributes (Height, Hair, Eyes) and a binary class label.
d = dict(
    Object=['O1', 'O2', 'O3', 'O4', 'O5', 'O6', 'O7', 'O8'],
    Height=['short', 'short', 'tall', 'tall', 'tall', 'tall', 'tall', 'short'],
    Hair=['blond', 'blond', 'red', 'dark', 'dark', 'blond', 'dark', 'blond'],
    Eyes=['blue', 'brown', 'blue', 'blue', 'blue', 'blue', 'brown', 'brown'],
    Class=[1, 0, 1, 0, 0, 1, 0, 0],
)
df = pd.DataFrame(d)

In [5]:
# Display the full table, including the 'Object' identifier column
df

Unnamed: 0,Object,Height,Hair,Eyes,Class
0,O1,short,blond,blue,1
1,O2,short,blond,brown,0
2,O3,tall,red,blue,1
3,O4,tall,dark,blue,0
4,O5,tall,dark,blue,0
5,O6,tall,blond,blue,1
6,O7,tall,dark,brown,0
7,O8,short,blond,brown,0


In [6]:
# Information table: keep the three attributes and the class label,
# leaving out the 'Object' identifier column
information_table = df.loc[:, ['Height', 'Hair', 'Eyes', 'Class']]

In [7]:
# Display the information table (attribute columns and class label only)
information_table

Unnamed: 0,Height,Hair,Eyes,Class
0,short,blond,blue,1
1,short,blond,brown,0
2,tall,red,blue,1
3,tall,dark,blue,0
4,tall,dark,blue,0
5,tall,blond,blue,1
6,tall,dark,brown,0
7,short,blond,brown,0


In [8]:
# We define a function to create formulas and granules

def getFormulasGranules(column_attr, info_table):
    """Build the table of basic granules for the given attributes.

    For every attribute in ``column_attr`` and every distinct value that the
    attribute takes in ``info_table``, one row is produced holding the formula
    ``"<attribute>=<value>"`` and its granule, i.e. the list of index labels
    of the rows of ``info_table`` that satisfy the formula.

    Parameters
    ----------
    column_attr : iterable of str
        Names of the attribute columns to build formulas from.
    info_table : pandas.DataFrame
        Table containing (at least) the columns named in ``column_attr``.

    Returns
    -------
    pandas.DataFrame
        Columns ``'Formula'`` and ``'Granule'`` with a default 0..n-1 index.
        Rows are ordered by attribute, then by the group-key order produced
        by ``groupby``. Empty (but with both columns) when nothing is grouped.
    """
    # Collect all rows first and build the frame once, instead of growing a
    # DataFrame cell by cell.
    records = []

    for attr in column_attr:
        groups = info_table.groupby(by=attr).groups

        for value, labels in groups.items():
            records.append({
                # format() also handles non-string attribute values (e.g. ints),
                # which plain string concatenation would reject.
                'Formula': "{}={}".format(attr, value),
                'Granule': list(labels),
            })

    return pd.DataFrame(records, columns=['Formula', 'Granule'])

In [9]:
# We define a function to compute Generality measure
# G = number of objects in the granule / total number of objects

def getGenerality(basic_gr_table, info_table):
    """Add a ``'Generality'`` column to ``basic_gr_table`` (modified in place).

    The generality of a formula is the size of its granule divided by the
    number of rows in ``info_table``.

    Parameters
    ----------
    basic_gr_table : pandas.DataFrame
        Table with a ``'Granule'`` column whose cells are sized collections
        (lists) of objects. Any unique index is accepted.
    info_table : pandas.DataFrame
        Information table; only its number of rows is used.

    Returns
    -------
    None
        The result is written into ``basic_gr_table``.
    """
    # Size of the universe; does not change while looping.
    U = len(info_table)

    # Iterate over the actual index labels: .loc is label-based, so using
    # range(len(...)) would fail on a table whose index is not 0..n-1.
    for label in basic_gr_table.index:
        obj_in_granule = len(basic_gr_table.loc[label, 'Granule'])
        basic_gr_table.loc[label, 'Generality'] = obj_in_granule / U

In [10]:
# We define a function to compute confidence measure
# We first define a helper that counts how many objects of a granule belong to each class

def countClasseGr(objects_Gr, info_table):
    """Count how many of the given objects are labelled class 0 and class 1.

    Parameters
    ----------
    objects_Gr : iterable
        Index labels of rows in ``info_table``.
    info_table : pandas.DataFrame
        Table with a ``'Class'`` column.

    Returns
    -------
    tuple of (int, int)
        ``(class_0, class_1)``; objects with any other label are not counted.
    """
    # Look up every object's class label once, then tally each class.
    labels = [info_table.loc[obj, 'Class'] for obj in objects_Gr]

    class_0 = sum(1 for label in labels if label == 0)
    class_1 = sum(1 for label in labels if label == 1)

    return class_0, class_1

def getConfidence( basic_gr_table, info_table ):
    """Add ``'confidence_0'`` and ``'confidence_1'`` columns (in place).

    The confidence of a formula for class ``c`` is the fraction of the
    objects in its granule that are labelled ``c`` in ``info_table``.

    Parameters
    ----------
    basic_gr_table : pandas.DataFrame
        Table with a ``'Granule'`` column of index labels into ``info_table``.
        Any unique index is accepted.
    info_table : pandas.DataFrame
        Table with a ``'Class'`` column.

    Returns
    -------
    None
        The result is written into ``basic_gr_table``.
    """
    # Iterate over the real index labels (.loc is label-based), so tables
    # whose index is not 0..n-1 are handled as well.
    for label in basic_gr_table.index:
        obj_Gr = basic_gr_table.loc[label, 'Granule']
        granule_size = len(obj_Gr)

        class_0, class_1 = countClasseGr(obj_Gr, info_table)

        if granule_size == 0:
            # An empty granule supports no class; avoid dividing by zero.
            confidence_0 = confidence_1 = 0.0
        else:
            confidence_0 = class_0 / granule_size
            confidence_1 = class_1 / granule_size

        basic_gr_table.loc[label, 'confidence_0'] = confidence_0
        basic_gr_table.loc[label, 'confidence_1'] = confidence_1

In [11]:
# We define a function to compute the coverage

def getCoverage(basic_gr_table, info_table):
    """Add ``'coverage_0'`` and ``'coverage_1'`` columns (in place).

    The coverage of a formula for class ``c`` is the fraction of all objects
    labelled ``c`` in ``info_table`` that fall inside the formula's granule.
    When ``info_table`` contains no object of class ``c`` the coverage is 0.

    Parameters
    ----------
    basic_gr_table : pandas.DataFrame
        Table with a ``'Granule'`` column of index labels into ``info_table``.
        Any unique index is accepted.
    info_table : pandas.DataFrame
        Table with a ``'Class'`` column.

    Returns
    -------
    None
        The result is written into ``basic_gr_table``.
    """
    # Total number of objects per class, counted directly on the column
    # (no need to group the table twice just to measure group sizes).
    class_column = info_table['Class']
    class_0_count = int((class_column == 0).sum())
    class_1_count = int((class_column == 1).sum())

    # Iterate over the real index labels (.loc is label-based), so tables
    # whose index is not 0..n-1 are handled as well.
    for label in basic_gr_table.index:
        obj_Gr = basic_gr_table.loc[label, 'Granule']

        class_0, class_1 = countClasseGr(obj_Gr, info_table)

        basic_gr_table.loc[label, 'coverage_0']  = (class_0 / class_0_count if class_0_count != 0 else 0)
        basic_gr_table.loc[label, 'coverage_1']  = (class_1 / class_1_count if class_1_count != 0 else 0)


In [12]:
# We define a function to compute the entropy
## Measure of disorder tied with terms such as: chaos and randomness

## Shannon Entropy: measures the uncertainty of a probability distribution

def getEntropy(basic_gr_table, info_table):
    """Add an ``'entropy'`` column to ``basic_gr_table`` (modified in place).

    For every row the Shannon entropy (base 2) of the two-class distribution
    ``(confidence_0, confidence_1)`` is computed; a zero probability
    contributes nothing to the sum.

    Parameters
    ----------
    basic_gr_table : pandas.DataFrame
        Table with ``'confidence_0'`` and ``'confidence_1'`` columns.
        Any unique index is accepted.
    info_table : pandas.DataFrame
        Not used by the computation; kept so the signature matches the
        other measure functions.

    Returns
    -------
    None
        The result is written into ``basic_gr_table``.
    """
    # Iterate over the real index labels (.loc is label-based), so tables
    # whose index is not 0..n-1 are handled as well.
    for label in basic_gr_table.index:

        # Fresh accumulator per row.
        res = 0

        for j in range(2):
            p_ = basic_gr_table.loc[label, 'confidence_'+str(j)]
            # Skip p == 0: log2(0) is undefined, and the term is taken as 0.
            if p_ != 0:
                res += -( p_ * np.log2(p_) )

        basic_gr_table.loc[label, 'entropy'] = res


In [13]:
'''
    Goal: Trying to construct the granular network/tree
'''
# Objects already covered by the solution found so far (none yet)
covering_solution = []
# Attribute (feature) columns from which formulas are built
attributes = ['Height','Hair','Eyes']
# The universe: the information table holding the attributes and the objects
u_info_table = information_table
# Basic granules of the universe; their measures are added to this table later
u_B_Granules = getFormulasGranules(column_attr=attributes, info_table=u_info_table)

In [14]:
# Compute every measure on the basic granules, in this order:
# generality, confidence, coverage, then entropy.
# Entropy reads the confidence columns, so it has to run after confidence.
for compute_measure in (getGenerality, getConfidence, getCoverage, getEntropy):
    compute_measure(u_B_Granules, u_info_table)

In [15]:
# Display the basic granules together with their computed measures
u_B_Granules

Unnamed: 0,Formula,Granule,Generality,confidence_0,confidence_1,coverage_0,coverage_1,entropy
0,Height=short,"[0, 1, 7]",0.375,0.666667,0.333333,0.4,0.333333,0.918296
1,Height=tall,"[2, 3, 4, 5, 6]",0.625,0.6,0.4,0.6,0.666667,0.970951
2,Hair=blond,"[0, 1, 5, 7]",0.5,0.5,0.5,0.4,0.666667,1.0
3,Hair=dark,"[3, 4, 6]",0.375,1.0,0.0,0.6,0.0,0.0
4,Hair=red,[2],0.125,0.0,1.0,0.0,0.333333,0.0
5,Eyes=blue,"[0, 2, 3, 4, 5]",0.625,0.4,0.6,0.4,1.0,0.970951
6,Eyes=brown,"[1, 6, 7]",0.375,1.0,0.0,0.6,0.0,0.0


In [16]:
# Rank the formulas: lowest entropy first, ties broken by highest generality,
# then renumber the rows 0..n-1.
u_B_Granules = (
    u_B_Granules
    .sort_values(by=['entropy', 'Generality'], ascending=[True, False])
    .reset_index(drop=True)
)

In [17]:
# Display the granules after sorting by entropy and generality
u_B_Granules

Unnamed: 0,Formula,Granule,Generality,confidence_0,confidence_1,coverage_0,coverage_1,entropy
0,Hair=dark,"[3, 4, 6]",0.375,1.0,0.0,0.6,0.0,0.0
1,Eyes=brown,"[1, 6, 7]",0.375,1.0,0.0,0.6,0.0,0.0
2,Hair=red,[2],0.125,0.0,1.0,0.0,0.333333,0.0
3,Height=short,"[0, 1, 7]",0.375,0.666667,0.333333,0.4,0.333333,0.918296
4,Height=tall,"[2, 3, 4, 5, 6]",0.625,0.6,0.4,0.6,0.666667,0.970951
5,Eyes=blue,"[0, 2, 3, 4, 5]",0.625,0.4,0.6,0.4,1.0,0.970951
6,Hair=blond,"[0, 1, 5, 7]",0.5,0.5,0.5,0.4,0.666667,1.0


In [18]:
# Adding the objects to the covering_solution list.
# A formula with zero entropy is pure (all objects of its granule share one
# class), so its granule is covered. The list is rebuilt from the table rather
# than extended with hand-copied ids, so re-running the cell does not
# duplicate entries and the ids cannot drift from the data.
pure_formulas = u_B_Granules[u_B_Granules['entropy'] == 0]
covering_solution = [obj for granule in pure_formulas['Granule'] for obj in granule]
print(covering_solution)

[3, 4, 6, 1, 6, 7, 2]


In [19]:
# Collecting the remaining (not yet covered) objects.
# Walking the table index keeps the result in table order; converting a set
# difference to a list gives no ordering guarantee.
covered_objs = set(covering_solution)
remaining_objs = [obj for obj in information_table.index if obj not in covered_objs]
print(remaining_objs)

[0, 5]


In [20]:
# Dropping the formulas already included in the covering_solution:
# keep only the formulas that are not pure (non-zero entropy) instead of
# dropping hard-coded row labels. The original row labels are preserved.
remainining_formulas = u_B_Granules[u_B_Granules['entropy'] != 0]

In [21]:
# Display the formulas left after removing the ones already in the covering solution
remainining_formulas

Unnamed: 0,Formula,Granule,Generality,confidence_0,confidence_1,coverage_0,coverage_1,entropy
3,Height=short,"[0, 1, 7]",0.375,0.666667,0.333333,0.4,0.333333,0.918296
4,Height=tall,"[2, 3, 4, 5, 6]",0.625,0.6,0.4,0.6,0.666667,0.970951
5,Eyes=blue,"[0, 2, 3, 4, 5]",0.625,0.4,0.6,0.4,1.0,0.970951
6,Hair=blond,"[0, 1, 5, 7]",0.5,0.5,0.5,0.4,0.666667,1.0


## Jaccard index or Jaccard similarity coefficient

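The Jaccard index measures the similarity between two lists (treated as sets A and B) based on the overlap of their elements. Here it is used to compare the granule of each remaining formula with the list of objects that are not yet covered.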

Mathematically, the Jaccard index (J) is defined as:

J = |A ∩ B| / |A ∪ B|

where:

- |A ∩ B| represents the size of the intersection of sets A and B.
- |A ∪ B| represents the size of the union of sets A and B.

The resulting Jaccard index ranges from 0 to 1, where 0 indicates no similarity (no common elements) and 1 indicates complete similarity (all elements are the same).

In [22]:
def jaccard_index(list_A, list_B):
    """Return the Jaccard index |A ∩ B| / |A ∪ B| of two collections.

    Both inputs are converted to sets, so duplicates and element order are
    ignored.

    Parameters
    ----------
    list_A, list_B : iterable of hashable
        The two collections to compare.

    Returns
    -------
    float
        A value between 0 and 1. When both collections are empty the union is
        empty and the ratio is undefined; 0.0 is returned in that case
        (no common elements) instead of raising ``ZeroDivisionError``.
    """
    # Build each set once and reuse it for both the union and the intersection.
    set_A, set_B = set(list_A), set(list_B)

    coverage_union = len(set_A | set_B)
    if coverage_union == 0:
        return 0.0

    coverage_inter = len(set_A & set_B)
    return coverage_inter / coverage_union

In [23]:
# Report how well each remaining formula's granule matches the uncovered objects
for formula, granule in zip(remainining_formulas['Formula'], remainining_formulas['Granule']):
    print("For formula", formula, "Jaccard index = ", jaccard_index(granule, remaining_objs))

For formula Height=short Jaccard index =  0.25
For formula Height=tall Jaccard index =  0.16666666666666666
For formula Eyes=blue Jaccard index =  0.4
For formula Hair=blond Jaccard index =  0.5


In [24]:
# Refine the formula whose granule best matches the still-uncovered objects
# (highest Jaccard index), instead of picking a hard-coded row label.
jaccard_scores = remainining_formulas['Granule'].map(
    lambda granule: jaccard_index(granule, remaining_objs)
)
best_formula = jaccard_scores.idxmax()
new_granule = remainining_formulas.loc[best_formula, 'Granule']
# Granules store index *labels*, so rows must be selected with .loc
# (label-based); .iloc (position-based) is only right while labels == positions.
new_inf_table = information_table.loc[new_granule, :]

In [25]:
# Display the information table restricted to the objects of the selected granule
new_inf_table

Unnamed: 0,Height,Hair,Eyes,Class
0,short,blond,blue,1
1,short,blond,brown,0
5,tall,blond,blue,1
7,short,blond,brown,0


In [26]:
# Rebuild the basic granules for the restricted table, then compute every
# measure in this order: generality, confidence, coverage, entropy
# (entropy reads the confidence columns, so it has to run after confidence).
u_B_Granules = getFormulasGranules(attributes, new_inf_table)
for compute_measure in (getGenerality, getConfidence, getCoverage, getEntropy):
    compute_measure(u_B_Granules, new_inf_table)

In [27]:
# Show the formulas ranked by lowest entropy first, ties broken by highest generality.
# The sorted result is only displayed: it is not assigned and inplace is not set.
u_B_Granules.sort_values(by = ['entropy', 'Generality'], ascending = [True, False] )

Unnamed: 0,Formula,Granule,Generality,confidence_0,confidence_1,coverage_0,coverage_1,entropy
3,Eyes=blue,"[0, 5]",0.5,0.0,1.0,0.0,1.0,0.0
4,Eyes=brown,"[1, 7]",0.5,1.0,0.0,1.0,0.0,0.0
1,Height=tall,[5],0.25,0.0,1.0,0.0,0.5,0.0
0,Height=short,"[0, 1, 7]",0.75,0.666667,0.333333,1.0,0.5,0.918296
2,Hair=blond,"[0, 1, 5, 7]",1.0,0.5,0.5,1.0,1.0,1.0
