In [1]:
import itertools
import numpy as np
import pandas as pd

## Task 1
Find the frequent itemsets from the given dataset. Minsup = 0.15 and minconf = 0.80


In [33]:
data = {'Transactions':['A, B, C, D, F', 'A, B, C, D', ' A, B, C, D', 'A, B', 'B, C, E']}
df = pd.DataFrame(data, index=[1,2,3,4,5])
print(df)
items = ['A', 'B', 'C', 'D', 'E', 'F']

    Transactions
1  A, B, C, D, F
2     A, B, C, D
3     A, B, C, D
4           A, B
5        B, C, E


Their are six unique items in the data set including A, B, C, D, E and F.


In [49]:
c_1 = list(itertools.combinations(items, 1))
print(c_1)

[('A',), ('B',), ('C',), ('D',), ('E',), ('F',)]


### First Iteration
For our first iteration we count the number of items of $ C_1 $ and find that none of our sets are below the minimum support threshold. Thus, none are pruned and $C_1 = L_1 $

In [50]:
c_1 = {'Itemset': ['{A}', '{B}', '{C}', '{D}', '{E}', '{F}'], 'sup': [4, 5, 4, 3, 1, 1], 'sup1': [4/5, 5/5, 4/5, 3/5, 1/5, 1/5]}
print(pd.DataFrame(c_1).to_string(index=False))
l_1 = ['A', 'B', 'C', 'D', 'E', 'F']

Itemset  sup  sup1
    {A}    4   0.8
    {B}    5   1.0
    {C}    4   0.8
    {D}    3   0.6
    {E}    1   0.2
    {F}    1   0.2


### Second Iteration
Create itemsets of size 2 using the items in $L_1$. This produces 15 subsets.

In [51]:
c_2 = list(itertools.combinations(items, 2))
print(c_2)

[('A', 'B'), ('A', 'C'), ('A', 'D'), ('A', 'E'), ('A', 'F'), ('B', 'C'), ('B', 'D'), ('B', 'E'), ('B', 'F'), ('C', 'D'), ('C', 'E'), ('C', 'F'), ('D', 'E'), ('D', 'F'), ('E', 'F')]


In [52]:
c_2 = {'Itemset': ['{A, B}', '{A, C}', '{A, D}', '{A, E}', '{A, F}', '{B, C}', '{B, D}', '{B, E}', '{B, F}', '{C, D}', '{C, E}', '{C, F}', '{D, E}', '{D, F}', '{E, F}'], 'sup':[4,3,3,0,1,4,3,1,1,3,1,1,0,1,0], 'sup1':[4/5,3/5,3/5,0/5,1/5,4/5,3/5,1/5,1/5,3/5,1/5,1/5,0/5,1/5,0/5]}
print(pd.DataFrame(c_2).to_string(index=False))

Itemset  sup  sup1
 {A, B}    4   0.8
 {A, C}    3   0.6
 {A, D}    3   0.6
 {A, E}    0   0.0
 {A, F}    1   0.2
 {B, C}    4   0.8
 {B, D}    3   0.6
 {B, E}    1   0.2
 {B, F}    1   0.2
 {C, D}    3   0.6
 {C, E}    1   0.2
 {C, F}    1   0.2
 {D, E}    0   0.0
 {D, F}    1   0.2
 {E, F}    0   0.0


Three sets above fall below the minsup threshold of 0.15. These include {A, E}, {D, E} and {E, F}. Thus, these are removed and we get $L_2$,

In [53]:
l_2 = {'Itemset': ['{A, B}', '{A, C}', '{A, D}', '{A, F}', '{B, C}', '{B, D}', '{B, E}', '{B, F}', '{C, D}', '{C, E}', '{C, F}', '{D, F}'], 'sup1':[4/5,3/5,3/5,1/5,4/5,3/5,1/5,1/5,3/5,1/5,1/5,1/5]}
print(pd.DataFrame(l_2).to_string(index=False))


Itemset  sup1
 {A, B}   0.8
 {A, C}   0.6
 {A, D}   0.6
 {A, F}   0.2
 {B, C}   0.8
 {B, D}   0.6
 {B, E}   0.2
 {B, F}   0.2
 {C, D}   0.6
 {C, E}   0.2
 {C, F}   0.2
 {D, F}   0.2


In [54]:
c_3 = list(itertools.combinations(items, 3))
toPrune = [('A', 'E'), ('D', 'E'), ('E', 'F')]
print(c_3)

[('A', 'B', 'C'), ('A', 'B', 'D'), ('A', 'B', 'E'), ('A', 'B', 'F'), ('A', 'C', 'D'), ('A', 'C', 'E'), ('A', 'C', 'F'), ('A', 'D', 'E'), ('A', 'D', 'F'), ('A', 'E', 'F'), ('B', 'C', 'D'), ('B', 'C', 'E'), ('B', 'C', 'F'), ('B', 'D', 'E'), ('B', 'D', 'F'), ('B', 'E', 'F'), ('C', 'D', 'E'), ('C', 'D', 'F'), ('C', 'E', 'F'), ('D', 'E', 'F')]


In [55]:
for i in c_3:
    print(i)

('A', 'B', 'C')
('A', 'B', 'D')
('A', 'B', 'E')
('A', 'B', 'F')
('A', 'C', 'D')
('A', 'C', 'E')
('A', 'C', 'F')
('A', 'D', 'E')
('A', 'D', 'F')
('A', 'E', 'F')
('B', 'C', 'D')
('B', 'C', 'E')
('B', 'C', 'F')
('B', 'D', 'E')
('B', 'D', 'F')
('B', 'E', 'F')
('C', 'D', 'E')
('C', 'D', 'F')
('C', 'E', 'F')
('D', 'E', 'F')


Then when generating subsets of size three we use these identified subsets of size 2 under the threshold to prune $C_3$ 

Thus, we remove {A, B, E}, {A, C, E}, {A, D, E}, {A, E, F}, {B, D, E}, {C, D, E}, {D, E, F} and {B, E, F} before continuing to the next iteration. 

Which leaves us with:

In [56]:
c_3 = [('A', 'B', 'C'),('A', 'B', 'D'),('A', 'B', 'F'),('A', 'C', 'D'),('A', 'C', 'F'),('A', 'D', 'F'),('B', 'C', 'D'),('B', 'C', 'E'),('B', 'C', 'F'),('B', 'D', 'F'),('C', 'D', 'F'),('C', 'E', 'F')]

### Third Iteration

In [57]:
c_3 = {'Itemset': c_3, 'sup': [3,3,1,3,1,1,3,0,1,1,1,0], 'sup1': [3/5,3/5,1/5,3/5,1/5,1/5,3/5,0/5,1/5,1/5,1/5,0/5]}
print(pd.DataFrame(c_3).to_string(index=False))

  Itemset  sup  sup1
(A, B, C)    3   0.6
(A, B, D)    3   0.6
(A, B, F)    1   0.2
(A, C, D)    3   0.6
(A, C, F)    1   0.2
(A, D, F)    1   0.2
(B, C, D)    3   0.6
(B, C, E)    0   0.0
(B, C, F)    1   0.2
(B, D, F)    1   0.2
(C, D, F)    1   0.2
(C, E, F)    0   0.0


In [58]:
c_4 = list(itertools.combinations(items, 4))
print(c_4)

[('A', 'B', 'C', 'D'), ('A', 'B', 'C', 'E'), ('A', 'B', 'C', 'F'), ('A', 'B', 'D', 'E'), ('A', 'B', 'D', 'F'), ('A', 'B', 'E', 'F'), ('A', 'C', 'D', 'E'), ('A', 'C', 'D', 'F'), ('A', 'C', 'E', 'F'), ('A', 'D', 'E', 'F'), ('B', 'C', 'D', 'E'), ('B', 'C', 'D', 'F'), ('B', 'C', 'E', 'F'), ('B', 'D', 'E', 'F'), ('C', 'D', 'E', 'F')]


{B, C, E} and {C, E, F} fall below the threshold. So we pruned these from the sets of size 4 above. This gives us c_4 below...

In [59]:
c_4 = [('A', 'B', 'C', 'D'),('A', 'B', 'C', 'F'),('A', 'B', 'D', 'E'),('A', 'B', 'D', 'F'),('A', 'B', 'E', 'F'),('A', 'C', 'D', 'E'),('A', 'C', 'D', 'F'),('A', 'D', 'E', 'F'),('B', 'C', 'D', 'E'),('B', 'C', 'D', 'F'),('B', 'D', 'E', 'F'),]

### Forth Iteration

In [60]:
c_4 = {'Itemset': c_4, 'sup': [3,1,0,1,0,0,1,0,0,1,0], 'sup1': [3/5,1/5,0,1/5,0,0,1/5,0,0,1/5,0]}
print(pd.DataFrame(c_4).to_string(index=False))

     Itemset  sup  sup1
(A, B, C, D)    3   0.6
(A, B, C, F)    1   0.2
(A, B, D, E)    0   0.0
(A, B, D, F)    1   0.2
(A, B, E, F)    0   0.0
(A, C, D, E)    0   0.0
(A, C, D, F)    1   0.2
(A, D, E, F)    0   0.0
(B, C, D, E)    0   0.0
(B, C, D, F)    1   0.2
(B, D, E, F)    0   0.0


Here we pruned all except {A, B, C, D}, {A, B, C, F}, {A, B, D, F}, {A, C, D, F} and {B, C, D, F}.

We prune subsets of size 5 that contain:
(A, B, D, E),
(A, B, E, F),
(A, C, D, E),
(A, D, E, F),
(B, C, D, E),
(B, D, E, F).

This leaves us with only one set remaining {A, B, C, D, F}.

In [61]:
c_5 = list(itertools.combinations(items, 5))
for c in c_5:
    print(c)

('A', 'B', 'C', 'D', 'E')
('A', 'B', 'C', 'D', 'F')
('A', 'B', 'C', 'E', 'F')
('A', 'B', 'D', 'E', 'F')
('A', 'C', 'D', 'E', 'F')
('B', 'C', 'D', 'E', 'F')


This set occurs once with a support of $1/5$.

In [62]:
c_5 = [('A', 'B', 'C', 'D', 'F')]

### Generate itemsets from remaining set

Find all sets of size from 1 to k from c_5 where k is the length of c_5.

In [63]:
s4 = list(itertools.combinations(['A', 'B', 'C', 'D', 'F'], 4))
s3 = list(itertools.combinations(['A', 'B', 'C', 'D', 'F'], 3))
s2 = list(itertools.combinations(['A', 'B', 'C', 'D', 'F'], 2))
s1 = list(itertools.combinations(['A', 'B', 'C', 'D', 'F'], 1))
itemsets = []
for setList in [s4, s3, s2, s1]:
    for s in setList:
        itemsets.append(s)

print(sets)
itemsets = {'Itemset': itemsets, 'sup': [0*30]}
print(pd.DataFrame(itemsets).to_string(index=False))

NameError: name 'sets' is not defined

For the rules generated from this frequent data set we determine if they sit above our confidence threshold.
Which is calculated as $super(i)/super(s)$. Which is (1/5) / super(s). We accept if this is > minconfig.

['A', 'B', 'C', 'D', 'E', 'F']


In [74]:
l_4 = [{'A', 'B', 'C', 'D'}, {'A', 'B', 'C', 'F'}, {'A', 'B', 'D', 'F'}, {'A', 'C', 'D', 'F'}, {'B', 'C', 'D', 'F'}]

def generateItemSets(items:set, k:int)->list:
    """
    Generates combinations of itemsets and returns a list of sets

    Parameters
    items (set): The set of items to generate combinations from
    k (int): The length of the generated sets

    Returns:
    list: List of the generated combinations of length k
    """
    pairs = list(itertools.combinations(items, 2))

    itemsets = []
    for pair in pairs:
        set1 = pair[0]
        set2 = pair[1]
        set3 = set1.union(set2)
        set3 = sorted(list(set3))
        if set3 not in itemsets and len(set3) == k:

            itemsets.append(set3)
    return itemsets
c_5 = generateItemSets(l_4, 5)

print(len(c_5))
print(c_5)

1
[['A', 'B', 'C', 'D', 'F']]
