# Laboratorium 1 - analiza koszykowa

## Przygotowanie

 * pobierz i wypakuj dataset: https://kaggle.com/datasets/rashikrahmanpritom/groceries-dataset-for-market-basket-analysismba?resource=download&select=basket.csv
   * alternatywnie, pobierz plik `basket.csv` z Teamsów
 * [opcjonalnie] Utwórz wirtualne środowisko
 `python3 -m venv ./recsyslab1`
 * zainstaluj potrzebne biblioteki:
 `pip install more-itertools`

## Część 1. - przygotowanie danych

In [1]:
# importujemy wszystkie potrzebne pakiety

from more_itertools import powerset

In [2]:
# define the constants

PATH = './basket.csv'  # CSV file with the baskets; passed to read_baskets
EPSILON = 0.001  # support threshold passed to get_supports; only itemsets whose support exceeds it are stored
K = 4  # read by get_supports: stored itemsets hold at most K + 1 products

In [3]:
# wczytujemy dane o koszykach

def read_baskets(path: str) -> list[set[str]]:
    """Load shopping baskets from a comma-separated text file.

    The first line of the file is treated as a header and skipped. Every
    other non-empty line becomes one basket: the line is split on commas,
    empty fields are dropped and the remaining product names are lower-cased.

    Args:
        path: Location of the CSV file to read.

    Returns:
        One set of lower-cased product names per non-empty data line, in
        file order.
    """
    # Explicit encoding so the result does not depend on the platform's
    # default locale.
    with open(path, encoding="utf-8") as f:
        raw = f.read()
    rows = raw.split('\n')[1:]  # [1:] drops the header line
    return [
        {field.lower() for field in row.split(',') if field}
        for row in rows
        if row
    ]

def unique_products(baskets: list[tuple[str]]) -> list[str]:
    """Return every distinct product found in ``baskets``, sorted ascending.

    Args:
        baskets: Iterable of baskets, each an iterable of product names.

    Returns:
        A sorted list in which each product appears exactly once.
    """
    distinct = {product for basket in baskets for product in basket}
    return sorted(distinct)

# Load the baskets from PATH, derive the sorted list of distinct products
# and preview the first ten baskets.
baskets = read_baskets(PATH)
products = unique_products(baskets)
baskets[:10]

[{'pastry', 'salty snack', 'whole milk'},
 {'sausage', 'semi-finished bread', 'whole milk', 'yogurt'},
 {'pickled vegetables', 'soda'},
 {'canned beer', 'misc. beverages'},
 {'hygiene articles', 'sausage'},
 {'rolls/buns', 'sausage', 'whole milk'},
 {'soda', 'whole milk'},
 {'frankfurter', 'soda', 'whipped/sour cream'},
 {'curd', 'frankfurter'},
 {'beef', 'white bread'}]

## Część 2. - obliczanie wskaźników

In [14]:
# obliczamy strukture danych (np. slownik albo graf) przechowujaca wszystkie interesujace wartosci `support`

def get_supports(baskets: list[tuple[str]], all_products: list[str], epsilon: float,
                 max_size: "int | None" = None) -> dict[frozenset[str], float]:
    """Compute the support of every frequent itemset built from ``all_products``.

    The support of an itemset is the fraction of baskets that contain all of
    its products. An itemset is stored only when its support is strictly
    greater than ``epsilon``, and only such itemsets are extended with further
    products (any superset can never have a higher support).

    Args:
        baskets: Iterable of baskets, each an iterable of product names.
        all_products: Products to build itemsets from; duplicates are ignored.
        epsilon: Support threshold; itemsets must exceed it to be kept.
        max_size: Largest itemset size to generate. When omitted, ``K + 1`` is
            used, where ``K`` is the module-level constant.

    Returns:
        A dict mapping ``frozenset`` itemsets to their support. At every
        level products are visited in descending sorted order, and each
        itemset is immediately followed by its own extensions. An empty
        ``baskets`` yields an empty dict.
    """
    if max_size is None:
        # Default bound comes from the module-level constant K.
        max_size = K + 1

    # De-duplicate: a repeated product must not be combined with itself.
    ordered_products = sorted(set(all_products))
    basket_sets = [set(basket) for basket in baskets]
    total_baskets = len(basket_sets)
    supports: dict[frozenset[str], float] = {}
    if total_baskets == 0:
        # No transactions: nothing is frequent (and nothing to divide by).
        return supports

    def extend(prefix: tuple, matching_baskets: list, start: int) -> None:
        """Record all frequent itemsets that add products[start:] to prefix."""
        if len(prefix) >= max_size:
            return
        # Recursion depth is bounded by max_size, not by the product count.
        for index in range(len(ordered_products) - 1, start - 1, -1):
            item = ordered_products[index]
            # Every basket in matching_baskets already contains all of
            # prefix, so testing the newly added item alone is sufficient.
            narrowed = [basket for basket in matching_baskets if item in basket]
            supp = len(narrowed) / total_baskets
            if supp > epsilon:
                itemset = prefix + (item,)
                supports[frozenset(itemset)] = supp
                extend(itemset, narrowed, index + 1)

    extend((), basket_sets, 0)
    return supports


# Build the support table for the loaded baskets, then show how many
# itemsets it holds and the table itself.
supports = get_supports(baskets, products, EPSILON)
print(len(supports))
supports

750


{frozenset({'zwieback'}): 0.004009891064626078,
 frozenset({'yogurt'}): 0.08587850030074183,
 frozenset({'whole milk'}): 0.15792287642852368,
 frozenset({'whole milk', 'yogurt'}): 0.011160863463209249,
 frozenset({'white wine'}): 0.011695515605159393,
 frozenset({'white wine', 'whole milk'}): 0.0012697988371315912,
 frozenset({'white bread'}): 0.023992514870012697,
 frozenset({'white bread', 'yogurt'}): 0.0010693042839002875,
 frozenset({'white bread', 'whole milk'}): 0.003141081333957094,
 frozenset({'whipped/sour cream'}): 0.043707812604424245,
 frozenset({'whipped/sour cream', 'yogurt'}): 0.0029405867807257902,
 frozenset({'whipped/sour cream', 'whole milk'}): 0.004611374724319989,
 frozenset({'whipped/sour cream', 'white bread'}): 0.0010024727661565194,
 frozenset({'waffles'}): 0.018512330415023724,
 frozenset({'waffles', 'yogurt'}): 0.0011361358016440553,
 frozenset({'waffles', 'whole milk'}): 0.0026064291920069507,
 frozenset({'vinegar'}): 0.003408407404932166,
 frozenset({'uht-m

In [15]:
# definiujemy funkcje obliczajace support, confidence i lift
from itertools import chain

def support(supports, products: tuple[str]) -> float:
    """Look up the support of the itemset ``products``.

    Args:
        supports: Mapping from ``frozenset`` itemsets to support values.
        products: Products forming the itemset; converted to a ``frozenset``
            before the lookup, so order and duplicates do not matter.

    Returns:
        The stored support, or 0 when the itemset is not in ``supports``.
    """
    return supports.get(frozenset(products), 0)

def confidence(supports, prior_products: tuple[str], following_products: tuple[str]) -> float:
    """Return the confidence of the rule ``prior_products -> following_products``.

    Confidence is the support of both product groups taken together divided
    by the support of ``prior_products`` alone.

    Args:
        supports: Mapping from ``frozenset`` itemsets to support values.
        prior_products: Antecedent of the rule.
        following_products: Consequent of the rule.

    Returns:
        The confidence, or 0 when the antecedent has support 0.
    """
    joint = support(supports, (*prior_products, *following_products))
    antecedent = support(supports, prior_products)
    return joint / antecedent if antecedent != 0 else 0

def lift(supports, prior_products: tuple[str], following_products: tuple[str]) -> float:
    """Return the lift of the rule ``prior_products -> following_products``.

    Lift is the support of both product groups taken together, divided by
    the support of the antecedent and then by the support of the consequent.

    Args:
        supports: Mapping from ``frozenset`` itemsets to support values.
        prior_products: Antecedent of the rule.
        following_products: Consequent of the rule.

    Returns:
        The lift, or 0 when either side of the rule has support 0.
    """
    joint = support(supports, (*prior_products, *following_products))
    antecedent = support(supports, prior_products)
    consequent = support(supports, following_products)

    if antecedent != 0 and consequent != 0:
        return joint / antecedent / consequent
    return 0

In [16]:
# Try the three metrics on a sample rule: {whole milk, rolls/buns} -> {yogurt}.
print(support(supports, {'whole milk', 'rolls/buns'}))
print(confidence(supports, {'whole milk', 'rolls/buns'}, {'yogurt'}))
print(lift(supports, {'whole milk', 'rolls/buns'}, {'yogurt'}))

0.013967787208447505
0.09569377990430622
1.1142926293448514


## Część 3. - generowanie rekomendacji

In [17]:
# wyznaczamy liste potencjalnych rekomendacji
# rekomendowane artykuly powinny miec lift > 1 i jak najwyzszy confidence

def generate_basic_candidates(basket: tuple[str], products: list[str], supports) -> list[tuple[str, tuple[str], float, float]]:
    """List products worth recommending for ``basket``, one row per rule.

    Every subset of ``basket`` yielded by ``powerset`` is paired with every
    product that is not already in the basket. A pair is kept when the lift
    of the rule ``subset -> product`` is greater than 1.

    Args:
        basket: Products already chosen by the customer.
        products: All known products.
        supports: Mapping from ``frozenset`` itemsets to support values.

    Returns:
        Tuples ``(item, subbasket, confidence, lift)`` ordered by confidence,
        highest first. The same item may appear several times, once per
        qualifying subbasket.
    """
    missing_products = tuple(p for p in products if p not in basket)
    found = []
    for antecedent in powerset(basket):
        for candidate in missing_products:
            consequent = (candidate, )
            candidate_lift = lift(supports, antecedent, consequent)
            if not candidate_lift > 1:
                continue
            candidate_conf = confidence(supports, antecedent, consequent)
            found.append((candidate, antecedent, candidate_conf, candidate_lift))
    found.sort(key=lambda row: row[2], reverse=True)
    return found

In [18]:
# zaproponuj drugi, bardziej zaawansowany algorytm, np.:
# - jesli produkt X wystepuje w liscie kandydatow kilkukrotnie, oblicz srednia lub iloczyn confidence
# - posortuj kandydatow po iloczynie confidence i lift
from collections import defaultdict
from itertools import chain, combinations


def generate_advanced_candidates(basket: tuple[str], products: list[str], supports) -> list[tuple[str, tuple[str], float, float]]:
    """List products worth recommending for ``basket``, one row per product.

    Rules ``subset -> product`` are collected as in the basic variant (every
    subset of ``basket`` from ``powerset``, every product outside the basket,
    lift greater than 1) and then merged per product:

    * confidence is the arithmetic mean over the product's rules,
    * lift is the product of the lifts of those rules,
    * the reported subbasket is the longest one (the first found on ties).

    Args:
        basket: Products already chosen by the customer.
        products: All known products.
        supports: Mapping from ``frozenset`` itemsets to support values.

    Returns:
        Tuples ``(item, subbasket, confidence, lift)`` ordered by
        ``confidence * lift``, highest first.
    """
    missing_products = tuple(p for p in products if p not in basket)
    rules_by_item = defaultdict(list)

    for antecedent in powerset(basket):
        for candidate in missing_products:
            consequent = (candidate, )
            conf = confidence(supports, antecedent, consequent)
            lft = lift(supports, antecedent, consequent)
            if lft > 1:
                rules_by_item[candidate].append((conf, lft, antecedent))

    ranked = []
    for candidate, rules in rules_by_item.items():
        # Split the (conf, lift, subbasket) triples into three parallel tuples.
        confidences, lifts, antecedents = zip(*rules)
        mean_confidence = sum(confidences) / len(confidences)

        combined_lift = 1
        for value in lifts:
            combined_lift *= value

        # max() returns the first of several equally long subbaskets.
        longest = max(antecedents, key=len)
        ranked.append((candidate, longest, mean_confidence, combined_lift))

    ranked.sort(key=lambda row: row[2] * row[3], reverse=True)
    return ranked

In [25]:
from IPython.display import display, HTML

def format_recommendations(recommendations):
    """Render a recommendations list as an HTML table and display it.

    Args:
        recommendations: Iterable of ``(item, subbasket, confidence, lift)``
            tuples; the two numbers are shown with four decimal places.
    """
    header = "<table><thead><tr><th>Produkt</th><th>Subbasket</th><th>Confidence</th><th>Lift</th></tr></thead><tbody>"
    rows = [
        f"<tr><td>{product}</td><td>{antecedent}</td><td>{conf:.4f}</td><td>{lft:.4f}</td></tr>"
        for product, antecedent, conf, lft in recommendations
    ]
    footer = "</tbody></table>"

    display(HTML(header + "".join(rows) + footer))

In [26]:
# Show baskets[1] and the basic recommendations generated for it.
print(baskets[1])
format_recommendations(generate_basic_candidates(baskets[1], products, supports))

{'yogurt', 'semi-finished bread', 'sausage', 'whole milk'}


Produkt,Subbasket,Confidence,Lift
rolls/buns,"('sausage', 'whole milk')",0.1269,1.1533
rolls/buns,"('yogurt', 'whole milk')",0.1198,1.0887
soda,"('sausage', 'whole milk')",0.1194,1.2296
soda,"('sausage',)",0.0986,1.015
bottled beer,"('sausage',)",0.0554,1.222
citrus fruit,"('yogurt',)",0.0537,1.0106
pastry,"('sausage',)",0.0532,1.0276
curd,"('sausage',)",0.0487,1.4466
frozen vegetables,"('sausage',)",0.0343,1.226
beverages,"('sausage',)",0.0255,1.5368


In [27]:
# Advanced (per-product aggregated) recommendations for baskets[1].
format_recommendations(generate_advanced_candidates(baskets[1], products, supports))

Produkt,Subbasket,Confidence,Lift
rolls/buns,"('yogurt', 'whole milk')",0.1233,1.2556
soda,"('sausage', 'whole milk')",0.109,1.248
curd,"('sausage',)",0.0487,1.4466
bottled beer,"('sausage',)",0.0554,1.222
pastry,"('sausage',)",0.0532,1.0276
citrus fruit,"('yogurt',)",0.0537,1.0106
frozen vegetables,"('sausage',)",0.0343,1.226
beverages,"('sausage',)",0.0255,1.5368
frozen meals,"('sausage',)",0.021,1.2543
sliced cheese,"('sausage',)",0.0188,1.3414


In [28]:
# Show baskets[33] and the basic recommendations generated for it.
print(baskets[33])
format_recommendations(generate_basic_candidates(baskets[33], products, supports))

{'photo/film', 'yogurt', 'tropical fruit', 'soda', 'domestic eggs', 'root vegetables', 'white wine'}


Produkt,Subbasket,Confidence,Lift
sausage,"('yogurt',)",0.0669,1.109
sausage,"('soda',)",0.0613,1.015
citrus fruit,"('yogurt',)",0.0537,1.0106
shopping bags,"('root vegetables',)",0.048,1.0094
newspapers,"('domestic eggs',)",0.0414,1.0654
coffee,"('domestic eggs',)",0.0378,1.197
frankfurter,"('domestic eggs',)",0.0378,1.0021
frozen vegetables,"('root vegetables',)",0.0307,1.0978
white bread,"('domestic eggs',)",0.027,1.1265
uht-milk,"('tropical fruit',)",0.0227,1.0606


In [29]:
# Advanced (per-product aggregated) recommendations for baskets[33].
format_recommendations(generate_advanced_candidates(baskets[33], products, supports))

Produkt,Subbasket,Confidence,Lift
sausage,"('yogurt',)",0.0641,1.1256
citrus fruit,"('yogurt',)",0.0537,1.0106
shopping bags,"('root vegetables',)",0.048,1.0094
coffee,"('domestic eggs',)",0.0378,1.197
newspapers,"('domestic eggs',)",0.0414,1.0654
frankfurter,"('domestic eggs',)",0.0378,1.0021
frozen vegetables,"('root vegetables',)",0.0307,1.0978
white bread,"('domestic eggs',)",0.027,1.1265
flour,"('tropical fruit',)",0.0158,1.6171
specialty chocolate,"('tropical fruit',)",0.0197,1.2348
