In [1]:
import pickle
from time import perf_counter, time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from mlxtend.frequent_patterns import apriori, association_rules, fpgrowth, fpmax
from mlxtend.preprocessing import TransactionEncoder
from pyfim import pyeclat
from tqdm import tqdm

from PD_freqitems import freqitemsets

In [2]:
# Read product names and IDs from the tab-separated product listing.
# The context manager guarantees the file handle is closed once the lines
# have been read.
with open("products.txt", "rt", encoding="utf8") as products_file:
    lines = products_file.readlines()

# The first line is skipped (presumably a header row), so there is one slot
# per remaining line. The pids start at 1 and index 0 would otherwise never
# be filled, therefore the product with pid {pid} is stored at index {pid}-1.
products = [0] * (len(lines) - 1)
for lin in lines[1:]:
    # Each row must have exactly four tab-separated fields; only the product
    # id and name are used here.
    pid, pname, aid, did = lin.strip().split("\t")
    products[int(pid) - 1] = pname

# Read transactions.
# SECURITY: pickle.load can execute arbitrary code while deserialising, so
# this file must come from a trusted source.
with open("order_products.pickle", "rb") as orders_file:
    orders = pickle.load(orders_file)

# Check products on order 2.
# NOTE(review): names are stored at index pid-1 above, but the lookup below
# uses products[prod] directly. That is only correct if the item ids stored in
# `orders` are already 0-based -- confirm against how the pickle was built.
for prod in orders[2]:
    print(products[prod])

"Oikos Yogurt Drink, Strawberry"
Mild Italian Chicken Sausage
Organic Split Pea & Potato Soup
"Fajita Seasoning, Organic"
Belgian White Wheat Ale
Organic Green Leaf Lettuce
Zebra Cakes
Organic Fusilli No. 34
12 G. Protein Bar Coffee Chocolate


In [3]:
# Number of orders (transactions) loaded from the pickle.
len(orders)

3214874

In [4]:
# Number of entries in the product-name list (one per non-skipped line of products.txt).
len(products)

49688

In [5]:
# Number of cells a dense orders x products matrix would have; the encoding
# further down requests a sparse representation instead.
len(orders)*len(products)

159740659312

# Objective 1 - Analyze the itemset/rules generation procedure

In [6]:
# Fit the transaction encoder on every order so it learns the item vocabulary
# used for the one-hot columns.
encoder = TransactionEncoder().fit(orders.values())

In [7]:
# One-hot encode the orders; sparse=True asks the encoder for a sparse result
# rather than a dense array.
binary_orders = encoder.transform(orders.values(), sparse=True)

In [8]:
# Wrap the sparse matrix in a sparse-backed DataFrame for the mlxtend algorithms.
# NOTE: no `columns=` is passed, so the columns are labelled by position (the
# preview below shows 0..49676, i.e. fewer columns than len(products)); a
# column label is therefore NOT a product id / index into `products`.
# NOTE: this rebinds `binary_orders` from the sparse matrix to a DataFrame, so
# the cell is not idempotent -- re-run the transform cell before re-running it.
binary_orders = pd.DataFrame.sparse.from_spmatrix(binary_orders)

In [9]:
# Preview the first rows of the one-hot encoded orders.
binary_orders.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,49667,49668,49669,49670,49671,49672,49673,49674,49675,49676
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [10]:
# Raw transactions, passed to pyeclat and freqitemsets in the benchmark loop.
# NOTE(review): if `orders` is a dict this is a live view, not a list --
# confirm the callees only iterate over it and never index into it.
order_list = orders.values()

## Performance analysis of the approaches covered in class, up to a threshold level of support
Approaches considered in class:
 1. Apriori
 2. FP-Growth
 3. ECLAT
 4. Naive

In [13]:
# Empty benchmark table: one row per support threshold, holding the itemset
# count and the elapsed seconds measured for each algorithm.
results = pd.DataFrame(
    columns=[
        "threshold",
        "n_itemsets",
        "apriori",
        "fp-growth",
        "eclat",
        "naive",
    ]
)

In [16]:
# Support thresholds handed to every algorithm in the benchmark loop.
thresholds = [0.009]

In [17]:
def _timed(func, *args):
    """Call ``func(*args)`` and return ``(result, elapsed_seconds)``.

    Elapsed time is measured with ``perf_counter``, a monotonic
    high-resolution clock, so system clock adjustments during a long run
    cannot distort the measurement.
    """
    start = perf_counter()
    result = func(*args)
    return result, perf_counter() - start


# Benchmark every algorithm once per support threshold and store one row of
# timings (in seconds) per threshold in `results`.
for i, thresh in enumerate(thresholds):
    # The mlxtend implementations run on the one-hot encoded DataFrame.
    fi, time_apriori = _timed(apriori, binary_orders, thresh)
    fi, time_fp = _timed(fpgrowth, binary_orders, thresh)

    # pyeclat and the naive implementation receive the raw transactions.
    fi, time_eclat = _timed(pyeclat, order_list, thresh)
    fi, time_naive = _timed(freqitemsets, order_list, thresh)

    results.loc[i] = {
        "threshold": thresh,
        # `fi` is the output of the last call above, so this count reflects
        # the naive implementation only; the other results are not compared.
        "n_itemsets": len(fi),
        "apriori": time_apriori,
        "fp-growth": time_fp,
        "eclat": time_eclat,
        "naive": time_naive,
    }

    print(f"\nIteration {i}: Threshold = {thresh}")
    print(results.loc[i])

  return self.todense() == other
  return self.todense() == other
  return self.todense() == other
  return self.todense() == other



Iteration 0: Threshold = 0.009
threshold       0.009000
n_itemsets    138.000000
apriori       118.274740
fp-growth      46.570990
eclat          64.299316
naive          54.049022
Name: 0, dtype: float64
