In [1]:
import numpy as np
import pandas as pd
from itertools import permutations, combinations
from collections import Counter
import time

In [2]:
min_support = 1000
filename = 'data/T10I4D100K.dat'

In [3]:
def loadTransactions(filename):
    transactions = []
    with open(filename) as f:
        lines = f.readlines()
        for line in lines:
            transaction = line.rstrip().split(" ")
            items = [int(item) for item in transaction]
            transactions.append(items)
    return transactions

In [4]:
def getItemsets(transactions, min_support):
    Cs_k = Counter(item for items in transactions for item in set(items))
    Ls_k = {key: Cs_k[key] for key in Cs_k if Cs_k[key] > min_support}
    L_k = [key for key in Ls_k]
    
    return Ls_k, L_k

In [5]:
def filterPairs(pair, k, L_k):
    subset = set([p if k > 2 else p[0] for p in combinations(pair, k-1)])
    superset = set(L_k)
    return subset.issubset(superset)

In [6]:
def newTransactions(transactions_master, k, L_k):
    transactions_k = []
    for transaction in transactions_master:
        pairs = list(combinations(transaction, k))
        candidates = [pair for pair in pairs if filterPairs(pair, k, L_k)]
        transactions_k.append(candidates)
    
    return transactions_k

In [7]:
start = time.time()
transactions = loadTransactions(filename)
final_itemsets = {}
Ls_1, L_1 = getItemsets(transactions, min_support)
Ls_1 = sorted(Ls_1.items())
final_itemsets.update(Ls_1)
L_k = L_1
k = 2
while len(L_k) > 0:
    transactions_k = newTransactions(transactions, k, L_k)
    Ls_k, L_k = getItemsets(transactions_k, min_support)
    Ls_k = sorted(Ls_k.items())
    final_itemsets.update(Ls_k)
    k += 1
print(round(time.time() - start, 2))

136.48


In [8]:
len(final_itemsets)

385

In [17]:
118.09/22.35

5.2836689038031315

### DataFrame way

In [8]:
def getItemsets(transactions, min_support):
    Cs_k = Counter(item for items in transactions for item in set(items))
    Ls_k = {key: Cs_k[key] for key in Cs_k if Cs_k[key] > min_support}
    L_k = [key for key in Ls_k]
    
    return Ls_k, L_k

In [9]:
def filterPairs(pair, k, L_k):
    subset = set([p if k > 2 else p[0] for p in combinations(pair, k-1)])
    superset = set(L_k)
    return subset.issubset(superset)

In [12]:
def newTransactions(df, k, L_k):
    df['combinations'] = df['itemsets'].apply(lambda x: list(combinations(x, k)))
    df['combinations0'] = df['combinations'].apply(lambda x: [pair for pair in x if filterPairs(pair, k, L_k)])
    transactions_k = data['combinations0'].tolist()
    
    return transactions_k

In [15]:
start = time.time()
# Load dataframe
data = pd.read_csv(filename, header=None)
data = data.rename(columns={0:'itemsets'})
data['itemsets'] = data['itemsets'].str.rstrip()
data['itemsets'] = data['itemsets'].str.split(" ")

# Get frequent itemsets
transactions = data['itemsets'].tolist()
final_itemsets = {}
Ls_1, L_1 = getItemsets(transactions, min_support)
Ls_1 = sorted(Ls_1.items())
final_itemsets.update(Ls_1)
L_k = L_1
k = 2
while len(L_k) > 0:
    transactions_k = newTransactions(data, k, L_k)
    Ls_k, L_k = getItemsets(transactions_k, min_support)
    Ls_k = sorted(Ls_k.items())
    final_itemsets.update(Ls_k)
    k += 1
print(round(time.time() - start, 2))

152.62


In [15]:
len(final_itemsets)

385