### DDW - Association Rules Mining

In [None]:
# ! pip install pandas

In [11]:
import pandas as pd

#### Basic Operations

In [12]:
# Read the CSV file into a data frame
import pandas as pd
df = pd.read_csv('data/bank-data.csv')

# Print the first rows of the data frame (df.tail() would show the last rows)
print(df.head()) # df.tail()

# Select a subset of columns by name
print(df[['age', 'car']])

# Select by integer position: rows 3-5 and columns 5-8.
# Column positions are counted while the "id" column is still present.
print(df.iloc[3:6,5:9])

# Delete the "id" column in place and show the result
del df["id"]
print(df.head())

# Discretize the continuous "income" values into 10 equal-width bins,
# replacing the numbers with categorical intervals
df["income"] = pd.cut(df["income"],10)
print(df.head())

        id  age     sex      region   income married  children  car save_act  \
0  ID12101   48  FEMALE  INNER_CITY  17546.0      NO         1   NO       NO   
1  ID12102   40    MALE        TOWN  30085.1     YES         3  YES       NO   
2  ID12103   51  FEMALE  INNER_CITY  16575.4     YES         0  YES      YES   
3  ID12104   23  FEMALE        TOWN  20375.4     YES         3   NO       NO   
4  ID12105   57  FEMALE       RURAL  50576.3     YES         0   NO      YES   

  current_act mortgage  pep  
0          NO       NO  YES  
1         YES      YES   NO  
2         YES       NO   NO  
3         YES       NO   NO  
4          NO       NO   NO  
     age  car
0     48   NO
1     40  YES
2     51  YES
3     23   NO
4     57   NO
..   ...  ...
595   61  YES
596   30  YES
597   31  YES
598   29   NO
599   38  YES

[600 rows x 2 columns]
  married  children car save_act
3     YES         3  NO       NO
4     YES         0  NO      YES
5     YES         2  NO      YES
   age     sex 

### Apriori algorithm implementation

In [13]:
from collections import Counter

def frequentItems(transactions, support):
    """Find the frequent 1-itemsets of a transaction list.

    Parameters
    ----------
    transactions : sized collection of iterables
        Each transaction is an iterable of hashable items.
    support : float
        Minimum fraction of transactions that must contain an item for it
        to be reported as frequent (inclusive threshold).

    Returns
    -------
    tuple (set, Counter)
        The set of frequent single-item ``frozenset``s, and a ``Counter``
        mapping every single-item ``frozenset`` seen to the number of
        transactions that contain it (infrequent items included).
    """
    counter = Counter()
    for trans in transactions:
        # Support is the share of transactions containing the item, so an
        # item repeated inside one transaction must be counted only once.
        counter.update(frozenset([t]) for t in set(trans))
    total = len(transactions)
    # An empty transaction list leaves the counter empty, so no division happens.
    frequent = {item for item, count in counter.items() if count / total >= support}
    return frequent, counter

def generateCandidates(L, k):
    """Build candidate itemsets of size ``k`` by joining itemsets from ``L``.

    Every ordered pair of distinct itemsets in ``L`` is unioned; the union
    is kept only when it contains exactly ``k`` items.

    Parameters
    ----------
    L : iterable of frozenset
        Itemsets to combine pairwise.
    k : int
        Required size of each generated candidate.

    Returns
    -------
    set of frozenset
        The distinct unions of size ``k``.
    """
    return {
        first | second
        for first in L
        for second in L
        if first != second and len(first | second) == k
    }

def filterCandidates(transactions, itemsets, support):
    """Count candidate itemsets and keep those that reach the support threshold.

    Parameters
    ----------
    transactions : sized collection of iterables
        The transactions to scan.
    itemsets : iterable of frozenset
        Candidate itemsets to count.
    support : float
        Minimum fraction of transactions that must contain an itemset
        (inclusive threshold).

    Returns
    -------
    tuple (set, Counter)
        The candidates meeting the threshold, and a ``Counter`` with the
        number of containing transactions for every candidate found at
        least once.
    """
    occurrences = Counter()
    for basket in transactions:
        for candidate in itemsets:
            if candidate.issubset(basket):
                occurrences[candidate] += 1
    frequent = set()
    for candidate in occurrences:
        if occurrences[candidate] / len(transactions) >= support:
            frequent.add(candidate)
    return frequent, occurrences

def apriori(transactions, support, maxlen=4):
    """Mine frequent itemsets level by level (Apriori).

    Parameters
    ----------
    transactions : sized collection of iterables
        Each transaction is an iterable of hashable items.
    support : float
        Minimum fraction of transactions that must contain an itemset.
    maxlen : int, optional
        Largest itemset size to generate. Values below 2 return only the
        frequent single items.

    Returns
    -------
    tuple (list, dict)
        A list of the frequent itemsets (``frozenset``s) of size 1 up to
        ``maxlen``, and a dict mapping every itemset counted along the way
        to its relative support (count divided by the number of transactions).
    """
    total = len(transactions)
    result = list()
    resultc = Counter()
    candidates, counter = frequentItems(transactions, support)
    result += candidates
    resultc += counter
    k = 2
    # Check the size limit before generating a level, so that maxlen < 2
    # does not produce 2-itemsets.
    while candidates and k <= maxlen:
        candidates = generateCandidates(candidates, k)
        candidates, counter = filterCandidates(transactions, candidates, support)
        result += candidates
        resultc += counter
        k += 1
    # Convert absolute counts into relative supports.
    resultc = {item: count / total for item, count in resultc.items()}
    return result, resultc

#### Frequent item sets

In [14]:
# Toy market-basket data: one list of purchased items per transaction
dataset = [
    ['bread', 'milk'],
    ['bread', 'diaper', 'beer', 'egg'],
    ['milk', 'diaper', 'beer', 'cola'],
    ['bread', 'milk', 'diaper', 'beer'],
    ['bread', 'milk', 'diaper', 'cola'],
]

# Mine the itemsets with support of at least 0.1 and list each with its support
frequentItemsets, supports = apriori(dataset, 0.1)
for itemset in frequentItemsets:
    print(f"{itemset} - {supports[itemset]}")

"frozenset({'beer'}) - 0.6"
"frozenset({'bread'}) - 0.8"
"frozenset({'milk'}) - 0.8"
"frozenset({'egg'}) - 0.2"
"frozenset({'cola'}) - 0.4"
"frozenset({'diaper'}) - 0.8"
"frozenset({'egg', 'bread'}) - 0.2"
"frozenset({'diaper', 'beer'}) - 0.6"
"frozenset({'milk', 'diaper'}) - 0.6"
"frozenset({'egg', 'beer'}) - 0.2"
"frozenset({'bread', 'diaper'}) - 0.6"
"frozenset({'milk', 'beer'}) - 0.4"
"frozenset({'cola', 'milk'}) - 0.4"
"frozenset({'cola', 'bread'}) - 0.2"
"frozenset({'bread', 'beer'}) - 0.4"
"frozenset({'cola', 'beer'}) - 0.2"
"frozenset({'egg', 'diaper'}) - 0.2"
"frozenset({'milk', 'bread'}) - 0.6"
"frozenset({'cola', 'diaper'}) - 0.4"
"frozenset({'diaper', 'bread', 'beer'}) - 0.4"
"frozenset({'diaper', 'cola', 'beer'}) - 0.2"
"frozenset({'cola', 'bread', 'diaper'}) - 0.2"
"frozenset({'cola', 'milk', 'diaper'}) - 0.4"
"frozenset({'egg', 'bread', 'beer'}) - 0.2"
"frozenset({'bread', 'cola', 'milk'}) - 0.2"
"frozenset({'diaper', 'milk', 'beer'}) - 0.4"
"frozenset({'egg', 'diaper', 

#### Rules

In [15]:
from pprint import pprint as print 

def generateRules(frequentItemsets, supports, minConfidence):
    """Derive association rules with a single-item consequent.

    For every frequent itemset with at least two items, each item in turn
    is used as the consequent and the remaining items as the antecedent.

    Parameters
    ----------
    frequentItemsets : iterable of frozenset
        Itemsets to derive rules from.
    supports : mapping
        Relative support for each itemset; it must contain every itemset
        in ``frequentItemsets`` and every antecedent derived from them.
    minConfidence : float
        A rule is kept only when its confidence is strictly greater than
        this value.

    Returns
    -------
    pandas.DataFrame
        One row per rule with the columns ``antecedent`` (a set),
        ``consequent``, ``support``, ``confidence`` and ``len`` (size of
        the originating itemset). The columns are present even when no
        rule qualifies.
    """
    columns = ["antecedent", "consequent", "support", "confidence", "len"]
    result = []
    for fi in frequentItemsets:
        if len(fi) < 2:
            # A rule needs at least one antecedent item and one consequent.
            continue
        # The rule's support is that of the whole itemset; it does not
        # depend on which item is chosen as the consequent.
        support = supports[fi]
        for consequent in fi:
            antecedent = set(fi).difference([consequent])
            confidence = support / supports[frozenset(antecedent)]
            if confidence > minConfidence:
                result.append({
                    "antecedent": antecedent,
                    "consequent": consequent,
                    "support": support,
                    "confidence": confidence,
                    "len": len(fi)
                })
    # Explicit columns keep the frame's schema stable when `result` is empty,
    # so callers can still index e.g. rules['len'] without a KeyError.
    return pd.DataFrame(result, columns=columns)

# Rules for the itemsets mined so far; the returned frame is not assigned here
generateRules(frequentItemsets, supports, 0.5)

# Bank data set: preprocessing
import pandas as pd
df = pd.read_csv("data/bank-data.csv")
df = df.drop(columns=["id"])
df["income"] = pd.cut(df["income"], bins=10)  # split income into 10 equal-width intervals
column_names = list(df)
dataset = []
# Encode every cell as "column=value" so that equal values coming from
# different columns remain distinguishable
for row_label, record in df.iterrows():
    dataset.append([name + "=" + str(record[name]) for name in column_names])
frequentItemsets, supports = apriori(dataset, 0.3)
generateRules(frequentItemsets, supports, 0.5)

# Sample of the resulting rules (antecedent => consequent, support, confidence):
# ...
# {'car=YES'} => married=YES, 0.3233333333333333, 0.6554054054054054
# ...
# {'married=YES', 'save_act=YES'} => current_act=YES, 0.3433333333333333, 0.7436823104693141
# ...

Unnamed: 0,antecedent,consequent,support,confidence,len
0,{sex=MALE},save_act=YES,0.346667,0.693333,2
1,{save_act=YES},sex=MALE,0.346667,0.502415,2
2,{pep=YES},mortgage=NO,0.303333,0.664234,2
3,{car=YES},save_act=YES,0.348333,0.706081,2
4,{save_act=YES},car=YES,0.348333,0.504831,2
...,...,...,...,...,...
56,"{married=YES, current_act=YES}",save_act=YES,0.343333,0.703072,3
57,"{married=YES, save_act=YES}",current_act=YES,0.343333,0.743682,3
58,"{current_act=YES, mortgage=NO}",save_act=YES,0.353333,0.704319,3
59,"{save_act=YES, mortgage=NO}",current_act=YES,0.353333,0.785185,3


In [16]:
# Rank the rules: highest support first, ties broken by higher confidence
rules = generateRules(frequentItemsets, supports, 0.5)
rules.loc[rules['len'] > 1].sort_values(by=["support", "confidence"], ascending=False)

Unnamed: 0,antecedent,consequent,support,confidence,len
20,{save_act=YES},current_act=YES,0.531667,0.770531,2
19,{current_act=YES},save_act=YES,0.531667,0.701099,2
5,{mortgage=NO},current_act=YES,0.501667,0.769821,2
6,{current_act=YES},mortgage=NO,0.501667,0.661538,2
35,{married=YES},current_act=YES,0.488333,0.739899,2
...,...,...,...,...,...
50,"{married=YES, mortgage=NO}",save_act=YES,0.306667,0.704981,3
49,"{save_act=YES, mortgage=NO}",married=YES,0.306667,0.681481,3
51,"{married=YES, save_act=YES}",mortgage=NO,0.306667,0.664260,3
2,{pep=YES},mortgage=NO,0.303333,0.664234,2


In [18]:
# Zoo data set: encode every cell as "column=value" and mine rules
df = pd.read_csv("data/zoo.csv")
column_names = list(df)
dataset = []
for row_label, record in df.iterrows():
    dataset.append([name + "=" + str(record[name]) for name in column_names])
fi, supports = apriori(dataset, 0.1)
rules = generateRules(fi, supports, 0.3)

# Items that must all appear in a rule's antecedent
test = {"legs=0"}
# Keep rules that come from itemsets with more than one item and whose
# consequent is a "type=" item
is_multi_item = rules['len'] > 1
predicts_type = rules["consequent"].str.contains("type=", regex=False)
cr = rules[is_multi_item & predicts_type].copy()
for required_item in test:
    # NOTE(review): the antecedent cells are Python sets, not strings; this
    # relies on .str.contains(..., regex=False) applying `in` to each cell -
    # confirm on the installed pandas version.
    has_item = cr["antecedent"].str.contains(required_item, regex=False)
    cr = cr[has_item]
# Highest support first, ties broken by higher confidence
cr.sort_values(by=["support", "confidence"], ascending=False)

Unnamed: 0,antecedent,consequent,support,confidence,len
17888,"{eggs=True, fins=True, legs=0}",type=fish,0.128713,1.000000,4
20805,"{fins=True, legs=0, milk=False}",type=fish,0.128713,1.000000,4
36602,"{breathes=False, fins=True, legs=0}",type=fish,0.128713,1.000000,4
18288,"{toothed=True, breathes=False, legs=0}",type=fish,0.128713,0.928571,4
22056,"{backbone=True, breathes=False, legs=0}",type=fish,0.128713,0.928571,4
...,...,...,...,...,...
34506,"{domestic=False, hair=False, legs=0}",type=fish,0.118812,0.571429,4
1912,"{domestic=False, legs=0}",type=fish,0.118812,0.545455,3
26613,"{domestic=False, feathers=False, legs=0}",type=fish,0.118812,0.545455,4
30390,"{domestic=False, airborne=False, legs=0}",type=fish,0.118812,0.545455,4
