# Rule similarity based on a Bag-of-Words representation

In [None]:
# Import libraries
import pandas as pd
import re
import numpy as np
import seaborn as sns;
import matplotlib
import matplotlib.pyplot as plt
from sklearn.metrics.pairwise import cosine_similarity
# Never truncate printed DataFrames: show every column and every row.
pd.options.display.max_columns = None
pd.options.display.max_rows = None

# Both paths are used as plain string prefixes (prefix + file name), so a
# non-empty value must end with a path separator.
# Folder the ruleset CSV files are read from.
rulepath = ""
# Folder the Excel output is written to.
filepath = ""

In [None]:
# Load the data
def _after_colon(text):
    """Return what follows the first ': ' separator in *text* (e.g. 'label: 0.2' -> '0.2')."""
    return text[text.find(':') + 2:]


def _parse_ruleset(raw, set_name):
    """Turn a raw rule table (as read from CSV with header=None) into the tidy layout.

    The first three columns of *raw* must hold, in order:
      * the rule text, containing ``IF <conditions> THEN`` and the class as
        the single character that follows the first double quote;
      * the covering, written as ``<label>: <value>``;
      * the error, written as ``<label>: <value>``.

    Returns a new DataFrame in which those three columns are replaced by
    ``n`` (1-based rule number, string dtype), ``Rule``, ``Covering``,
    ``Error``, ``Class`` and ``Set`` (= *set_name*), in that order.
    *raw* itself is left untouched.
    """
    raw_cols = list(raw.columns[:3])
    text_col, covering_col, error_col = raw_cols

    parsed = raw.copy()
    # Same dtype for both rulesets, so the concatenated 'n' column is homogeneous.
    parsed['n'] = pd.Series(parsed.index + 1, index=parsed.index).astype("string")
    rule_text = parsed[text_col]
    # Conditions are the text between 'IF ' and ' THEN'.
    parsed['Rule'] = rule_text.map(lambda s: s[s.find('IF') + 3:s.find('THEN') - 1])
    parsed['Covering'] = parsed[covering_col].map(_after_colon)
    parsed['Error'] = parsed[error_col].map(_after_colon)
    # Class = the one character right after the first double quote.
    parsed['Class'] = rule_text.map(lambda s: s[s.find('"') + 1:s.find('"') + 2])
    parsed = parsed.drop(columns=raw_cols)
    parsed['Set'] = set_name
    return parsed


# real dataset
raw_r = pd.read_csv(rulepath + "ruleset1.csv", header=None)
print(raw_r.iloc[:, 0])
print(raw_r.iloc[:, 1])
print(raw_r.iloc[:, 2])
ruleset_r = _parse_ruleset(raw_r, 'real')
print(ruleset_r)

# synthetic dataset
raw_s = pd.read_csv(rulepath + "ruleset2.csv", header=None)
ruleset_s = _parse_ruleset(raw_s, 'synth')
print(ruleset_s)

# Concatenate real and synthetic rulesets and keep rules with covering above 15%.
# .copy() makes the filtered table an independent frame, so the column
# assignments done on it afterwards do not trigger SettingWithCopyWarning.
ruleset = pd.concat([ruleset_r, ruleset_s], ignore_index=True)
ruleset = ruleset[ruleset["Covering"].astype('float') > 0.15].copy()


In [None]:
def convert_litteral(classe):
    """Return the class label: 'HL' for the string '1', 'noHL' for anything else."""
    return 'HL' if classe == '1' else 'noHL'

In [None]:
# Replace the raw class character with its label ('HL' / 'noHL'), then show the table.
ruleset['Class'] = ruleset['Class'].map(convert_litteral)
ruleset

In [None]:
# Renumber the rows 0..n-1 in place (the previous index is discarded).
ruleset.index = pd.RangeIndex(len(ruleset))

In [None]:
# Extract the "word" (feature name + comparison sign, blanks removed) of every condition
from collections import Counter

words = []
unilateral_conditions = []
for rule_text in ruleset['Rule']:
    # a rule is a conjunction of conditions joined by ' AND '
    for cond in rule_text.split(' AND '):
        lt_pos = cond.find('<')
        if cond.count('<') == 2:
            # interval n1 < x < n2: split it into two unilateral conditions
            upper = cond[lt_pos + 2:]      # second part of the interval, "x < n2"
            upper_lt = upper.find('<')
            feat = upper[:upper_lt]        # feature name
            # NOTE(review): the lower bound is always rewritten with a strict
            # '>' (an inclusive "n1 <= x" is not distinguished). The loop that
            # fills the word columns derives its keys the same way, so the two
            # must be changed together.
            lower = feat + '> ' + cond[:lt_pos]   # first part, n1 < x  ->  x > n1
            # words keep feature + sign only; the cut-off value is dropped
            words.append(upper[:upper_lt + 2].replace(" ", ""))
            words.append(lower[:lower.find('>') + 2].replace(" ", ""))
            unilateral_conditions.extend([upper, lower])
        elif cond.find('>') > 0:
            # unilateral condition of the form x > n
            unilateral_conditions.append(cond)
            words.append(cond[:cond.find('>') + 2].replace(" ", ""))
        else:
            # unilateral condition of the form x < n
            unilateral_conditions.append(cond)
            words.append(cond[:lt_pos + 2].replace(" ", ""))
print(words)
# distinct words with their number of occurrences (insertion-ordered)
counter = Counter(words)
print(counter)

In [None]:
# For every word add three columns, in this order: the presence flag (0/1),
# the raw cut-off value and the normalized cut-off value.
for word in counter:
    value_col = word + 'Value'
    norm_col = word + 'ValueNorm'
    ruleset.loc[:, word] = 0
    ruleset[value_col] = 0.0
    ruleset[norm_col] = 0.0

In [None]:
# For each rule, set the word column to 1 for every word the rule contains and
# store the corresponding cut-off in the word's 'Value' column.
def _cutoff(text, sign):
    """Return as float the number that follows *sign* (or *sign* + '=') in *text*."""
    return float(text.split(sign, 1)[1].lstrip('=').strip())


def _condition_terms(cond):
    """Return the ``(word, cutoff)`` pairs encoded by a single rule condition.

    A word is the feature name followed by the comparison sign, with blanks
    removed (e.g. ``'x<='``). An interval ``n1 < x <= n2`` yields two pairs
    (lower bound first); a unilateral condition yields one.

    Raises:
        ValueError: if *cond* contains no ``<`` / ``>`` comparison.
    """
    lt_pos = cond.find('<')
    if cond.count('<') == 2:
        # interval n1 < x < n2
        upper = cond[lt_pos + 2:]
        upper_lt = upper.find('<')
        feat = upper[:upper_lt]
        # NOTE(review): the lower bound is always keyed with a strict '>',
        # mirroring how the word list (counter) is built; an inclusive
        # "n1 <= x" is not distinguished. Change both places together.
        lower = feat + '> ' + cond[:lt_pos]
        return [
            (lower[:lower.find('>') + 2].replace(" ", ""), _cutoff(lower, '>')),
            # works for both "x <= n2" and "x < n2"
            (upper[:upper_lt + 2].replace(" ", ""), _cutoff(upper, '<')),
        ]
    gt_pos = cond.find('>')
    if gt_pos > 0:
        # unilateral x > n or x >= n
        return [(cond[:gt_pos + 2].replace(" ", ""), _cutoff(cond, '>'))]
    if lt_pos >= 0:
        # unilateral x < n or x <= n
        return [(cond[:lt_pos + 2].replace(" ", ""), _cutoff(cond, '<'))]
    raise ValueError("Cannot parse rule condition: %r" % cond)


# Each rule is parsed once; only words known to `counter` (i.e. that have
# their columns) are written.
for index, rule_text in ruleset['Rule'].items():
    for cond in rule_text.split(' AND '):
        for word, cutoff in _condition_terms(cond):
            if word in counter:
                ruleset.at[index, word] = 1
                ruleset.at[index, word + 'Value'] = cutoff
print(ruleset)

In [None]:
# Normative minimum and maximum of each feature, used to scale the cut-off
# values; the extremes are assumed never to appear as a cut-off in a rule.
feature_names = ['srt', 'Age', 'trials', 'correct', '%correct', 'avg_reaction_time', 'total_test_time', 'volume']
minVet = [-25, 0, 20, 15, 0, 0.5, 100, 1]
# NOTE(review): for 'volume' the minimum (1) is greater than the maximum (0.9) - confirm these bounds.
maxVet = [25, 99, 150, 125, 100, 10, 180, 0.9]

for c in counter:
    # Features whose name occurs exactly once in the word. The last match
    # wins, so '%correct' takes precedence over 'correct'.
    matches = [feat for feat in feature_names if c.count(feat) == 1]
    if not matches:
        # Without this check the bounds of the previously processed word
        # would silently be reused.
        raise ValueError("No normative range defined for word %r" % c)
    feat_pos = feature_names.index(matches[-1])
    minFeat = minVet[feat_pos]
    maxFeat = maxVet[feat_pos]

    # Presence is read from the word column itself, so a genuine cut-off of 0
    # is scaled like any other value; rows without the word get 0.
    present = ruleset[c] != 0
    scaled = (ruleset[c + 'Value'] - minFeat) / (maxFeat - minFeat)
    ruleset[c + 'ValueNorm'] = scaled.where(present, 0.0)

# Drop the raw cut-off columns and keep only the normalized ones.
ruleset.drop(columns=[c + 'Value' for c in counter], inplace=True)
print(ruleset)

In [None]:
# Split the rules by output class: only rules describing the same class are compared.
is_HL = ruleset['Class'].eq('HL')
is_noHL = ruleset['Class'].eq('noHL')
ruleset_HL = ruleset.loc[is_HL]
ruleset_noHL = ruleset.loc[is_noHL]
print(ruleset_HL)

In [None]:
# Bag-of-words matrices as NumPy arrays: everything from the 7th column on
# (the columns before it are not part of the BOW representation).
matrix_noHL = ruleset_noHL.iloc[:, 6:].to_numpy()
matrix_HL = ruleset_HL.iloc[:, 6:].to_numpy()

In [None]:
# Pairwise cosine similarity between the rows of each BOW matrix, one matrix
# per output class (with a single argument the rows are compared with themselves).
cos_sim_noHL = cosine_similarity(matrix_noHL)
cos_sim_HL = cosine_similarity(matrix_HL)

In [None]:
# Cosine-similarity matrices between rules; the maximum similarity is 1 (same rule).
# Cell (i, j) contains the similarity coefficient between rule i and rule j.
# Rows and columns are labelled "<Set> - Rule: <n>".
labels_noHL = ruleset_noHL['Set'] + ' - Rule: ' + ruleset_noHL['n'].astype("string")
labels_HL = ruleset_HL['Set'] + ' - Rule: ' + ruleset_HL['n'].astype("string")

similarity_matrix_noHL = pd.DataFrame(cos_sim_noHL, index=labels_noHL, columns=labels_noHL)
similarity_matrix_HL = pd.DataFrame(cos_sim_HL, index=labels_HL, columns=labels_HL)

print(similarity_matrix_noHL)
print(similarity_matrix_HL)

In [None]:
# Adjust rule similarity by the covering difference between rules:
# maximum similarity = 1 (same rule structure and same covering).
def _adjust_by_covering(similarity, covering):
    """Return *similarity* with each cell (i, j) scaled by 1 - |cov[i] - cov[j]|.

    similarity: square DataFrame of similarity coefficients.
    covering:   covering of each rule, in the same order as the rows of
                *similarity*; converted to float.
    Cells equal to 0 are left untouched. Index and columns are preserved.
    """
    cov = covering.astype('float').to_numpy()
    weight = 1 - np.abs(cov[:, None] - cov[None, :])
    values = similarity.to_numpy()
    adjusted = np.where(values != 0, values * weight, values)
    return pd.DataFrame(adjusted, index=similarity.index, columns=similarity.columns)


def _plot_lower_triangle(similarity, title):
    """Draw *similarity* as an annotated heatmap and return ``(figure, axes)``.

    The matrix is symmetric, so the upper triangle (diagonal included) is masked.
    """
    mask = np.triu(np.ones(similarity.shape, dtype=bool))
    with sns.axes_style("white"):
        fig, axes = plt.subplots(figsize=(20, 15))
        axes = sns.heatmap(similarity, mask=mask, square=True,
                           cbar_kws={'label': 'Rule similarity coef'},
                           vmin=0, vmax=1, annot=True)
        axes.set_title(title)
    return fig, axes


# NOTE: the similarity matrices are replaced by their adjusted version, so
# running this cell twice applies the covering weight twice.

# no HL
ruleset_noHL.reset_index(drop=True, inplace=True)
similarity_matrix_noHL = _adjust_by_covering(similarity_matrix_noHL, ruleset_noHL['Covering'])

#%matplotlib notebook
sns.set_theme()
# 'sans-serif' is a valid generic family; 'normal' is not a font family and
# only makes matplotlib fall back to its default font with a warning.
font = {'family': 'sans-serif', 'weight': 'bold', 'size': 10}
matplotlib.rc('font', **font)

# Plot the rule similarity matrix for the no HL rules
f, ax = _plot_lower_triangle(similarity_matrix_noHL,
                             'Rule similarity matrix adjusted by covering (no HL rules)')

# HL
ruleset_HL.reset_index(drop=True, inplace=True)
similarity_matrix_HL = _adjust_by_covering(similarity_matrix_HL, ruleset_HL['Covering'])

# Plot the rule similarity matrix for the HL rules
f, ax = _plot_lower_triangle(similarity_matrix_HL,
                             'Rule similarity matrix adjusted by covering (HL rules)')

In [None]:
# Save both similarity matrices to one Excel workbook, one sheet per class.
# A single ExcelWriter is required: calling to_excel twice with the same file
# path rewrites the file, so only the last sheet would survive.
with pd.ExcelWriter(filepath + 'Rule_similarity.xlsx') as writer:
    similarity_matrix_HL.to_excel(writer, sheet_name='HL', index=True)
    similarity_matrix_noHL.to_excel(writer, sheet_name='noHL', index=True)