In [1]:
import numpy as np
import pandas as pd
from itertools import permutations, combinations
from collections import Counter
import time
import random

In [2]:
# Mining thresholds used by the cells below.
# Absolute support count (number of transactions) handed to getItemsets
# to decide which itemsets are kept as frequent.
min_support = 1_000
# Lowest confidence, support(itemset) / support(antecedent), a rule needs
# in order to be reported.
min_confidence = 0.50

In [3]:
filename = 'data/T10I4D100K.dat'
def loadTransactions(filename):
    """Read a transaction database from a whitespace-separated text file.

    Every non-blank line is one transaction whose tokens are integer item ids.

    Parameters
    ----------
    filename : str or path-like
        Path of the file to read.

    Returns
    -------
    list of list of int
        One inner list per non-blank line, items in the order they appear.

    Raises
    ------
    ValueError
        If a token cannot be parsed as an integer.
    """
    transactions = []
    with open(filename) as f:
        # Stream the file instead of materialising every line with readlines().
        for line in f:
            # split() without a separator collapses runs of whitespace, so
            # repeated or trailing spaces never produce empty tokens
            # (int('') would raise ValueError).
            tokens = line.split()
            if not tokens:
                # Blank line (e.g. a trailing newline at end of file).
                continue
            transactions.append([int(token) for token in tokens])

    return transactions

In [4]:
def getItemsets(transactions, min_support):
    """Count itemset occurrences and keep the frequent ones.

    Parameters
    ----------
    transactions : iterable of iterables of hashable
        Each inner iterable holds the itemsets present in one transaction
        (bare items at level 1, item tuples at higher levels).
    min_support : int
        Minimum number of transactions an itemset must occur in to be kept.
        The bound is inclusive: a support equal to ``min_support`` qualifies.

    Returns
    -------
    (dict, list)
        ``Ls_k`` maps every frequent itemset to its support count and
        ``L_k`` lists the same itemsets (the keys of ``Ls_k``).
    """
    # set(items): an itemset repeated inside one transaction is counted once.
    Cs_k = Counter(item for items in transactions for item in set(items))
    # ">=" so that "minimum support" is an inclusive bound, consistent with
    # the inclusive min_confidence test used when rules are generated.
    Ls_k = {key: count for key, count in Cs_k.items() if count >= min_support}
    L_k = list(Ls_k)

    return Ls_k, L_k

In [6]:
def filterPairs(pair, k, L_k):
    """Tell whether every (k-1)-subset of a candidate is in ``L_k``.

    Parameters
    ----------
    pair : tuple
        Candidate itemset of size ``k``.
    k : int
        Size of the candidate.
    L_k : iterable
        Itemsets to test against: bare items when ``k`` is 2, item tuples
        when ``k`` is greater than 2.

    Returns
    -------
    bool
        True when all (k-1)-subsets of ``pair`` occur in ``L_k``.
    """
    sub_itemsets = combinations(pair, k - 1)
    if k > 2:
        # Higher levels are keyed by tuples, so the subsets are used as-is.
        required = set(sub_itemsets)
    else:
        # Level-1 itemsets are bare items: unwrap the 1-tuples.
        required = {sub[0] for sub in sub_itemsets}

    return required <= set(L_k)

In [5]:
def newTransactions(transactions_master, k, L_k):
    """Build, for every transaction, its candidate k-itemsets.

    A k-combination of a transaction's items is kept only when all of its
    (k-1)-subsets are in ``L_k``. This is the same test ``filterPairs``
    applies to a single candidate, done here against lookups that are built
    once per call instead of once per candidate.

    Parameters
    ----------
    transactions_master : iterable of sequences
        The original transactions (sequences of items).
    k : int
        Size of the candidates to generate; expected to be at least 2.
    L_k : iterable
        Frequent itemsets of the previous level: bare items when ``k`` is 2,
        (k-1)-tuples when ``k`` is greater than 2.

    Returns
    -------
    list of list of tuple
        One list of surviving candidates per transaction, in input order.
    """
    # Hoisted out of the loops: one hash lookup table for the whole call.
    frequent = set(L_k)
    if k > 2:
        frequent_items = {item for itemset in frequent for item in itemset}
    else:
        frequent_items = frequent

    transactions_k = []
    for transaction in transactions_master:
        # An item missing from every frequent (k-1)-itemset cannot belong to
        # a surviving candidate, so dropping it first shrinks the number of
        # combinations without changing the result.
        kept = [item for item in transaction if item in frequent_items]
        candidates = combinations(kept, k)
        if k > 2:
            # For k == 2 the item filter above already is the subset test.
            candidates = (
                candidate for candidate in candidates
                if all(sub in frequent for sub in combinations(candidate, k - 1))
            )
        transactions_k.append(list(candidates))

    return transactions_k

In [7]:
# Load the data set and prepare the container for itemsets of size >= 2.
transactions = loadTransactions(filename)
non_singleton = []

# Level 1: frequent single items, stored as item -> support count.
Ls_1, L_1 = getItemsets(transactions, min_support)
Ls_1 = sorted(Ls_1.items())
final_itemsets = dict(Ls_1)

# Level-wise search: derive candidates of size k from the itemsets kept at
# the previous level, and stop as soon as a level keeps nothing.
k, L_k = 2, L_1
while L_k:
    transactions_k = newTransactions(transactions, k, L_k)
    Ls_k, L_k = getItemsets(transactions_k, min_support)
    Ls_k = sorted(Ls_k.items())
    non_singleton.append(Ls_k)
    final_itemsets.update(Ls_k)
    k += 1

In [8]:
# Inspect the mined result: frequent item (or item tuple) -> support count.
final_itemsets

{1: 1535,
 4: 1394,
 5: 1094,
 6: 2149,
 8: 3090,
 10: 1351,
 12: 3415,
 17: 1683,
 21: 2666,
 25: 1395,
 27: 2165,
 28: 1454,
 31: 1666,
 32: 4248,
 33: 1460,
 35: 1984,
 37: 1249,
 38: 2402,
 39: 4258,
 41: 1353,
 43: 1721,
 45: 1728,
 48: 2472,
 51: 1612,
 52: 1983,
 54: 2595,
 55: 1959,
 57: 2743,
 58: 1330,
 68: 1601,
 69: 2370,
 70: 2411,
 71: 3507,
 72: 2852,
 73: 2179,
 75: 3151,
 78: 2471,
 85: 1555,
 90: 1875,
 93: 2777,
 94: 1201,
 97: 1466,
 100: 1749,
 104: 1158,
 105: 1100,
 110: 1801,
 111: 1171,
 112: 2680,
 115: 1775,
 116: 2193,
 120: 4973,
 122: 1081,
 125: 1287,
 126: 1075,
 129: 1547,
 130: 1711,
 132: 2641,
 140: 2687,
 143: 1417,
 145: 4559,
 147: 1383,
 151: 2611,
 154: 1447,
 157: 1140,
 161: 2320,
 162: 1450,
 163: 1256,
 168: 1538,
 170: 1203,
 171: 1097,
 173: 1080,
 175: 2791,
 177: 4629,
 181: 1235,
 183: 3883,
 185: 1529,
 192: 2004,
 196: 2096,
 197: 1230,
 198: 1461,
 201: 1029,
 204: 2174,
 205: 3605,
 207: 1214,
 208: 1483,
 210: 2009,
 214: 1893,
 21

In [9]:
# Derive association rules (antecedent -> consequent) from every frequent
# itemset of size >= 2, keeping those that reach min_confidence.
rules = []
for level_itemsets in non_singleton:
    for itemset, support in level_itemsets:
        items = set(itemset)
        # Every non-empty proper subset of the itemset is tried as antecedent.
        for size in range(1, len(itemset)):
            for lhs_items in combinations(items, size):
                # Single items are keyed by the bare item, larger itemsets
                # by their sorted tuple.
                if len(lhs_items) == 1:
                    key = lhs_items[0]
                else:
                    key = tuple(sorted(lhs_items))

                # confidence(A -> B) = support(A u B) / support(A)
                confidence = support / final_itemsets[key]
                if confidence < min_confidence:
                    continue
                lhs = set(lhs_items)
                rules.append((lhs, items - lhs, confidence))

In [22]:
# Maximum number of rules to print.
SHOW = 15
n_rules = len(rules)
# Number actually printed: never claim to show more rules than were found.
n_shown = min(SHOW, n_rules)
# Highest confidence first; the sort is in place, so `rules` stays ordered.
rules.sort(key=lambda tup: tup[2], reverse=True)
print("Found {} rules, showing {}".format(n_rules, n_shown))
for antecedent, consequent, confidence in rules[:n_shown]:
    print("{} -> {} (confidence: {})".format(antecedent, consequent, confidence))

Found 7 rules, showing 15
{704, 825} -> {39} (confidence: 0.9392014519056261)
{704, 39} -> {825} (confidence: 0.9349593495934959)
{825, 39} -> {704} (confidence: 0.8719460825610783)
{704} -> {39} (confidence: 0.617056856187291)
{704} -> {825} (confidence: 0.6142697881828316)
{227} -> {390} (confidence: 0.577007700770077)
{704} -> {825, 39} (confidence: 0.5769230769230769)
