# A SAT-Based Approach for Mining Association Rules

The following program takes in input the *Congressional Voting Records* dataset (https://archive.ics.uci.edu/ml/datasets/Congressional+Voting+Records) and computes its Association Rules using SAT, as proposed by Abdelhamid Boudane et al.

In [21]:
import numpy as np
import pandas as pd
from pysat.solvers import Minisat22
from itertools import combinations
from pysat.pb import *
import time
import datetime

In [22]:
alpha = 0.5
beta = 1
size = 232

ARM = Minisat22()

## Data preparation

In [23]:
colnames = ["class_name","handicapped_infants","water_project_cost_sharing","adoption_of_the_budget_resolution",
            "physician_fee_freeze","el_salvador_aid","religious_groups_in_schools","anti_satellite_test_ban",
            "aid_to_nicaraguan_contras","mx_missile","immigration","synfuels_corporation_cutback",
            "education_spending","superfund_right_to_sue","crime","duty_free_exports",
            "export_administration_act_south_africa"]

data = pd.read_csv("/home/laura/Scrivania/KDMproj/house-votes-84.txt", sep = ',',na_values = '?',
                   header = None, names = colnames)
data = pd.DataFrame(data)

data["republican"] = 0
data.loc[data.class_name=="republican","republican"] = pd.Series(np.ones(len(data.class_name=="republican"),dtype=int))
data["democrat"] = 0
data.loc[data.class_name=="democrat","democrat"] = pd.Series(np.ones(len(data.class_name=="democrat"),dtype=int))
del data["class_name"]
del colnames[0]

items = list(data.columns)

data.replace(to_replace = "n", value = 0, inplace = True)
data.replace(to_replace = "y", value = 1, inplace = True)
data = data.dropna(axis = 0) #working with the assumption: complete data 
data = data.astype("int32")

print(len(data))
data.head()

232


Unnamed: 0,handicapped_infants,water_project_cost_sharing,adoption_of_the_budget_resolution,physician_fee_freeze,el_salvador_aid,religious_groups_in_schools,anti_satellite_test_ban,aid_to_nicaraguan_contras,mx_missile,immigration,synfuels_corporation_cutback,education_spending,superfund_right_to_sue,crime,duty_free_exports,export_administration_act_south_africa,republican,democrat
5,0,1,1,0,1,1,0,0,0,0,0,0,1,1,1,1,0,1
8,0,1,0,1,1,1,0,0,0,0,0,1,1,1,0,1,1,0
19,1,1,1,0,0,0,1,1,1,0,1,0,0,0,1,1,0,1
23,1,1,1,0,0,0,1,1,1,0,0,0,0,0,1,1,0,1
25,1,0,1,0,0,0,1,1,1,1,0,0,0,0,1,1,0,1


In [24]:
data_small = data.iloc[:size,]

In [25]:
#creates a dictionary that associates to each item "a" a unique integer value.

items_ID = dict(zip([x for x in range(1,len(items)+1)],items))
items_ID

{1: 'handicapped_infants',
 2: 'water_project_cost_sharing',
 3: 'adoption_of_the_budget_resolution',
 4: 'physician_fee_freeze',
 5: 'el_salvador_aid',
 6: 'religious_groups_in_schools',
 7: 'anti_satellite_test_ban',
 8: 'aid_to_nicaraguan_contras',
 9: 'mx_missile',
 10: 'immigration',
 11: 'synfuels_corporation_cutback',
 12: 'education_spending',
 13: 'superfund_right_to_sue',
 14: 'crime',
 15: 'duty_free_exports',
 16: 'export_administration_act_south_africa',
 17: 'republican',
 18: 'democrat'}

## Encoding and insertion

$x_a$: if the item a is in the set $X$ or not. The encoding will be computed with the a's ID.

$y_a$: if the item a is in the set $Y$ or not. The encoding will be computed with the a's ID + 17.

$p_i$: if the i-th transaction contains the set $X$.

$q_i$: if the i-th transaction contains the $X \cup Y$.

In [26]:
def x(a):
    return a

In [27]:
def y(a):
    return len(data_small.columns)+a

In [28]:
def p(i):
    return len(data_small.columns)*2+i

In [29]:
def q(i):
    return len(data_small)+len(data_small.columns)*2+i

In [30]:
# Start point
start_time = time.time()
print("Start time: ", datetime.datetime.now())

Start time:  2019-10-14 21:32:13.002640


### Rule 1

In [31]:
#rule 1
for a in range(1,len(data_small.columns)+1):
    ARM.add_clause([-x(a),-y(a)])


### Rule 2

In [32]:
#first part rule 2 - CNF
for i in range(1,np.shape(data_small)[0]+1):
    l = [x(a) for a in items_ID.keys() if data.iloc[i-1,a-1] == 0] 
    l.append(p(i))
    ARM.add_clause(l)

In [33]:
#second part rule 2 - CNF
for i in range(1,np.shape(data_small)[0]+1):
    for a in items_ID.keys():
        if data_small.iloc[i-1,a-1] == 0:  
            Minisat22().add_clause([-x(a),-p(i)])

### Rule 3

In [34]:
#first part rule 3 - CNF
for i in range(1,np.shape(data_small)[0]+1):
    l = [y(a) for a in items_ID.keys() if data_small.iloc[i-1,a-1] == 0] 
    l.append(-p(i))
    l.append(q(i))
    ARM.add_clause(l)
    ARM.add_clause([-q(i),p(i)])

In [35]:
#second part rule 3 - CNF 

for i in range(1,np.shape(data_small)[0]+1):
    for a in items_ID.keys():
        if data_small.iloc[i-1,a-1] == 0:  
            ARM.add_clause([-y(a),-q(i)])

### Rule 4

In [36]:
#PSEUDO BOOLEAN

cnf = PBEnc.atleast(lits = [q(i) for i in range(1,np.shape(data_small)[0]+1)], 
                    bound = int(len(data_small)*alpha))

for clause in cnf.clauses:
    ARM.add_clause(clause)

print("\n{} variables currently added \n".format(ARM.nof_vars()))


4877 variables currently added 



### Rule 5

In [37]:
#PSEUDO BOOLEAN

cnf = PBEnc.atleast(lits = [q(i) for i in range(1,np.shape(data_small)[0]+1)]+[p(i) for i in range(1,np.shape(data_small)[0]+1)],
                    weights = [100 for i in range(1,np.shape(data_small)[0]+1)]+[-int(beta*100) for i in range(1,np.shape(data_small)[0]+1)],
                    bound = 0, top_id = ARM.nof_vars())

for clause in cnf.clauses:
    ARM.add_clause(clause)


58933 variables currently added 



### Rule 6

In [38]:
#(not in the paper) - to avoid empty sets as X or Y

X = [ x(a) for a in range(1,len(data_small.columns)+1)]
Y = [ y(a) for a in range(1,len(data_small.columns)+1)]

ARM.add_clause(X)
ARM.add_clause(Y)

In [39]:
print("\n{} variables currently added \n".format(ARM.nof_vars()))

print("\n{} clauses currently added \n".format(ARM.nof_clauses()))


58933 variables currently added 


119110 clauses currently added 



## Association rule extraction

In [40]:
print(ARM.solve())
if not ARM.solve(): print(ARM.get_core())

res = []

while ARM.solve():
    model = np.array(ARM.get_model())
    model = model[abs(model) <= len(data_small.columns)*2] #extracting the rule
    ARM.add_clause([-int(model[i]) for i in range(len(model))])
    #print(model)
    res.append(model[model>0])
    
print(ARM.nof_clauses())
res.sort(key = len)
print("\n{} rules found \n".format(len(res)))
res

True


KeyboardInterrupt: 

In [None]:
#finds models with minimal X, when Y is the same

n = len(data_small.columns)

min_models_pred = []
minimality = [True for i in range(len(res))]

for i in range(len(res)-1):
    for j in range(i+1,len(res)):
        if set(res[i][res[i]>n]) == set(res[j][res[j]>n]) and set(res[i][res[i]<=n]) < set(res[j][res[j]<=n]):
            minimality[j] = False
            
for i, x in enumerate(minimality):
    if x: min_models_pred.append(res[i])

min_models_pred
    

In [None]:
#finds models with maximal Y, when X is the same

models = []
maximality = [True for i in range(len(min_models_pred))]

for i in range(len(min_models_pred)-1,0,-1):
    for j in range(i-1,-1,-1):
        if set(min_models_pred[i][min_models_pred[i]<=n]) == set(min_models_pred[j][min_models_pred[j]<=n]) and set(min_models_pred[i][min_models_pred[i]>n]) > set(min_models_pred[j][min_models_pred[j]>n]):
            maximality[j] = False
            
for i, x in enumerate(maximality):
    if x: models.append(res[i])

models

In [None]:
# End point
end_time = time.time()
uptime = end_time - start_time
human_uptime = datetime.timedelta(seconds=uptime)

print("End time: ", datetime.datetime.now())
print("Uptime :" ,human_uptime)