In [13]:
import numpy as np
import matplotlib.pyplot as plt

% matplotlib inline

## Basic function

In [14]:
def loadTestDataSet():
    return [[1, 3, 4], [2, 3, 5], [1, 2, 3, 5], [2, 5]]

def createC1(dataList):
    C1 = []
    for transaction in dataList:
        for item in transaction:
            if not [item] in C1:
                C1.append([item])
    C1.sort()
    return map(frozenset, C1)

def scanD(DataSet, Ck, minSupport): # generate LK
    ssCnt = {}
    for transaction in DataSet:
        for candidate in Ck:
            if candidate.issubset(transaction):
                if not ssCnt.has_key(candidate):
                    ssCnt[candidate] = 1
                else:
                    ssCnt[candidate] += 1
   
    numItems = float(len(DataSet))
    retList = []
    supportVals = {}
    for key in ssCnt.keys():
        support = ssCnt[key] / numItems
        if support >= minSupport:
            retList.insert(0, key)
        supportVals[key] = support
    
    return retList, supportVals

In [15]:
dataSet = loadTestDataSet()
print(type(dataSet))
print(np.shape(dataSet))
print("\n")

C1 = createC1(dataSet)
print(C1)
print("\n")

D = map(set, dataSet)
print(D)
print("\n")

L1, supportVals = scanD(D, C1, 0.5)
print(L1)
print(supportVals)

<type 'list'>
(4,)


[frozenset([1]), frozenset([2]), frozenset([3]), frozenset([4]), frozenset([5])]


[set([1, 3, 4]), set([2, 3, 5]), set([1, 2, 3, 5]), set([2, 5])]


[frozenset([1]), frozenset([3]), frozenset([2]), frozenset([5])]
{frozenset([4]): 0.25, frozenset([5]): 0.75, frozenset([2]): 0.75, frozenset([3]): 0.75, frozenset([1]): 0.5}


## Apriori

In [60]:
def generateCandidateSet(Lk): # generate Ck from Lk-1
    #print("haha",Lk)
    Ck = []
    lenLK = len(Lk)
    for i in range(lenLK):
        for j in range(i+1, lenLK):
            k = len(Lk[i])
            L1 = list(Lk[i])[:k-1]
            L2 = list(Lk[j])[:k-1]
            L1.sort(); L2.sort()
            if L1 == L2:
                Ck.append(Lk[i] | Lk[j])
    return Ck

def apriori(dataSet, minSupport = 0.5):
    C1 = createC1(dataSet)
    D = map(set, dataSet)
    L1, supportVals = scanD(D, C1, minSupport)
    L = [L1]
    while( len(L[-1]) > 0 ):
        Ck = generateCandidateSet(L[-1])
        Lk, supportK = scanD(D, Ck, minSupport)
        supportVals.update(supportK)
        L.append(Lk)
    
    return L, supportVals

In [61]:
dataSet = loadTestDataSet()
L, supportVals = apriori(dataSet)
print(L)
print(supportVals)

[[frozenset([1]), frozenset([3]), frozenset([2]), frozenset([5])], [frozenset([1, 3]), frozenset([2, 5]), frozenset([2, 3]), frozenset([3, 5])], [frozenset([2, 3, 5])], []]
{frozenset([5]): 0.75, frozenset([3]): 0.75, frozenset([2, 3, 5]): 0.5, frozenset([1, 2]): 0.25, frozenset([1, 5]): 0.25, frozenset([3, 5]): 0.5, frozenset([4]): 0.25, frozenset([2, 3]): 0.5, frozenset([2, 5]): 0.75, frozenset([1]): 0.5, frozenset([1, 3]): 0.5, frozenset([2]): 0.75}


In [18]:
dataSet = loadTestDataSet()
L, supportVals = apriori(dataSet, 0.7)
print(L)
print(supportVals)

[[frozenset([3]), frozenset([2]), frozenset([5])], [frozenset([2, 5])], []]
{frozenset([5]): 0.75, frozenset([3]): 0.75, frozenset([3, 5]): 0.5, frozenset([4]): 0.25, frozenset([2, 3]): 0.5, frozenset([2, 5]): 0.75, frozenset([1]): 0.5, frozenset([2]): 0.75}


## asocciation rules: P->H

In [77]:
def calConf(freqSet, H, supportVals, bigRuleList, minConf = 0.7):
    # H is the item list of freqSet
    prunedH = []
    for conseq in H:
        conf = supportVals[freqSet] / supportVals[freqSet - conseq]
        if conf >= minConf:
            print( freqSet - conseq, '--->', conseq, ' conf: ', conf )
            bigRuleList.append((freqSet - conseq, conseq, conf))
            prunedH.append(conseq)
    return prunedH
        
def rulesFromConseq(freqSet, H, supportVals, bigRuleList, minConf=0.7):
    m = len(H[0])
    if (len(freqSet) > m+1):
        Hmp1 = generateCandidateSet(H) # try to merge the element of H to generate larger H
        Hmp1 = calConf(freqSet, Hmp1, supportVals, bigRuleList, minConf)
        if len(Hmp1) > 1:
            rulesFromConseq(freqSet, Hmp1, supportVals, bigRuleList, minConf)

def generateRules(L, supportVals, minConf = 0.7):
    bigRuleList = []
    for i in range(1, len(L)):
        for freqSet in L[i]: # frequent set with order i+1
            #print('freaSet:', freqSet)
            H1 = [frozenset([item]) for item in freqSet]
            #print("H1", H1)
            if i > 1 :
                rulesFromConseq(freqSet, H1, supportVals, bigRuleList, minConf)
            else:
                calConf(freqSet, H1, supportVals, bigRuleList, minConf)
    return bigRuleList

In [51]:
dataSet = loadTestDataSet()
L, supportVals = apriori(dataSet)
print(L)
print(len(L))
print("\n")
print(supportVals)

[[frozenset([1]), frozenset([3]), frozenset([2]), frozenset([5])], [frozenset([1, 3]), frozenset([2, 5]), frozenset([2, 3]), frozenset([3, 5])], [frozenset([2, 3, 5])], []]
4


{frozenset([5]): 0.75, frozenset([3]): 0.75, frozenset([2, 3, 5]): 0.5, frozenset([1, 2]): 0.25, frozenset([1, 5]): 0.25, frozenset([3, 5]): 0.5, frozenset([4]): 0.25, frozenset([2, 3]): 0.5, frozenset([2, 5]): 0.75, frozenset([1]): 0.5, frozenset([1, 3]): 0.5, frozenset([2]): 0.75}


In [75]:
rules = generateRules(L, supportVals)

(frozenset([1]), '--->', frozenset([3]), ' conf: ', 1.0)
(frozenset([5]), '--->', frozenset([2]), ' conf: ', 1.0)
(frozenset([2]), '--->', frozenset([5]), ' conf: ', 1.0)


In [76]:
rules = generateRules(L, supportVals, 0.5)

(frozenset([3]), '--->', frozenset([1]), ' conf: ', 0.6666666666666666)
(frozenset([1]), '--->', frozenset([3]), ' conf: ', 1.0)
(frozenset([5]), '--->', frozenset([2]), ' conf: ', 1.0)
(frozenset([2]), '--->', frozenset([5]), ' conf: ', 1.0)
(frozenset([3]), '--->', frozenset([2]), ' conf: ', 0.6666666666666666)
(frozenset([2]), '--->', frozenset([3]), ' conf: ', 0.6666666666666666)
(frozenset([5]), '--->', frozenset([3]), ' conf: ', 0.6666666666666666)
(frozenset([3]), '--->', frozenset([5]), ' conf: ', 0.6666666666666666)
(frozenset([5]), '--->', frozenset([2, 3]), ' conf: ', 0.6666666666666666)
(frozenset([3]), '--->', frozenset([2, 5]), ' conf: ', 0.6666666666666666)
(frozenset([2]), '--->', frozenset([3, 5]), ' conf: ', 0.6666666666666666)


## A real case : mushroom

In [78]:
DATA_DIR = 'data/apriori/'

In [88]:
dataMushroom = np.array([line.split() for line in open(DATA_DIR+'mushroom.dat').readlines()])
print(np.shape(dataMushroom))
print(dataMushroom[:2,:])

(8124, 23)
[['1' '3' '9' '13' '23' '25' '34' '36' '38' '40' '52' '54' '59' '63' '67'
  '76' '85' '86' '90' '93' '98' '107' '113']
 ['2' '3' '9' '14' '23' '26' '34' '36' '39' '40' '52' '55' '59' '63' '67'
  '76' '85' '86' '90' '93' '99' '108' '114']]


In [89]:
L, supportVals = apriori(dataMushroom, minSupport=0.3)

In [96]:
for item in L[1]:
    if item.intersection('2'):
        print(item)

frozenset(['2', '59'])
frozenset(['39', '2'])
frozenset(['2', '67'])
frozenset(['2', '34'])
frozenset(['2', '23'])
frozenset(['2', '86'])
frozenset(['76', '2'])
frozenset(['90', '2'])
frozenset(['2', '53'])
frozenset(['93', '2'])
frozenset(['63', '2'])
frozenset(['2', '28'])
frozenset(['2', '85'])
frozenset(['2', '36'])


In [97]:
for item in L[2]:
    if item.intersection('2'):
        print(item)

frozenset(['90', '2', '23'])
frozenset(['39', '2', '53'])
frozenset(['2', '59', '34'])
frozenset(['2', '53', '85'])
frozenset(['23', '2', '34'])
frozenset(['2', '53', '34'])
frozenset(['93', '2', '86'])
frozenset(['39', '2', '59'])
frozenset(['2', '59', '36'])
frozenset(['39', '2', '36'])
frozenset(['2', '28', '53'])
frozenset(['86', '2', '59'])
frozenset(['90', '2', '59'])
frozenset(['2', '93', '63'])
frozenset(['90', '39', '2'])
frozenset(['39', '2', '93'])
frozenset(['76', '2', '34'])
frozenset(['39', '2', '85'])
frozenset(['2', '85', '67'])
frozenset(['63', '90', '2'])
frozenset(['93', '2', '36'])
frozenset(['2', '63', '34'])
frozenset(['2', '86', '23'])
frozenset(['2', '59', '23'])
frozenset(['39', '2', '67'])
frozenset(['63', '39', '2'])
frozenset(['2', '28', '85'])
frozenset(['28', '2', '59'])
frozenset(['2', '63', '23'])
frozenset(['2', '86', '53'])
frozenset(['2', '63', '85'])
frozenset(['2', '36', '34'])
frozenset(['2', '86', '67'])
frozenset(['28', '2', '86'])
frozenset(['93