# Laboratorium 1 - analiza koszykowa

## Przygotowanie

 * pobierz i wypakuj dataset: https://kaggle.com/datasets/rashikrahmanpritom/groceries-dataset-for-market-basket-analysismba?resource=download&select=basket.csv
   * alternatywnie, pobierz plik `basket.csv` z Teamsów
 * [opcjonalnie] utwórz i aktywuj wirtualne środowisko:
 `python3 -m venv ./recsyslab1 && source ./recsyslab1/bin/activate`
 * zainstaluj potrzebne biblioteki:
 `pip install more-itertools`

## Część 1. - przygotowanie danych

In [3]:
# importujemy wszystkie potrzebne pakiety

from more_itertools import powerset
from collections import defaultdict

In [4]:
# Constants shared by the cells below.

PATH = './basket.csv'  # CSV file handed to read_baskets
EPSILON = 0.001  # support threshold passed to get_supports; also the lower bound applied in support()
K = 4  # not referenced in the visible cells; presumably a recommendation count -- TODO confirm

In [5]:
# wczytujemy dane o koszykach

def read_baskets(path: str) -> list[set[str]]:
    """Load shopping baskets from a CSV file.

    The first row is treated as a header and skipped. Every following
    non-blank row becomes one basket: a set of lower-cased product names
    with surrounding whitespace removed and empty cells dropped.

    Args:
        path: Location of the CSV file (read as UTF-8).

    Returns:
        One set of product names per data row, in file order. A row made
        only of empty cells yields an empty set; an empty file yields ``[]``.
    """
    import csv  # local import: keeps this cell independent of the import cell

    # newline="" lets the csv module do its own line-ending/quoting handling.
    with open(path, encoding="utf-8", newline="") as f:
        rows = csv.reader(f)
        next(rows, None)  # skip the header; the default tolerates an empty file
        return [
            {name for name in (cell.strip().lower() for cell in row) if name}
            for row in rows
            if row  # csv.reader yields [] for blank lines
        ]

def unique_products(baskets: list[tuple[str]]) -> list[str]:
    """Return every distinct product occurring in ``baskets``, sorted ascending."""
    distinct = {product for basket in baskets for product in basket}
    return sorted(distinct)

# Load the baskets from PATH, then derive the sorted list of distinct products.
baskets = read_baskets(PATH)
products = unique_products(baskets)

## Część 2. - obliczanie wskaźników

In [59]:
# Helper function to calculate the support of a given product set.
def calc_support(basket_sets: list[set[str]], products: set[str]) -> float:
    """Return the fraction of baskets that contain every item of ``products``.

    Args:
        basket_sets: The baskets to scan. Any iterable works (it is consumed
            exactly once), so a lazy ``map`` object is accepted as well.
        products: Product set whose support is wanted.

    Returns:
        ``matching baskets / all baskets`` as a float in ``[0, 1]``;
        ``0.0`` when there are no baskets at all.
    """
    total = 0
    hits = 0
    # Count while iterating: the argument may be an iterator without len(),
    # and the denominator must come from the argument, not from a global.
    for basket_set in basket_sets:
        total += 1
        if products.issubset(basket_set):
            hits += 1
    return hits / total if total else 0.0

def epsilon_factory():
    """Return the module-level ``EPSILON``; usable as a ``defaultdict`` default factory."""
    return EPSILON

# Build the data structure (a dict here) holding every interesting `support` value.
def get_supports(baskets: list[tuple[str]], all_products: list[str], epsilon: float) -> dict[tuple[str, ...], float]:
    """Compute the support of every 1- and 2-product set that exceeds ``epsilon``.

    Args:
        baskets: Iterable of baskets; each basket is an iterable of product names.
        all_products: Products to build candidate sets from. Keys of the result
            are tuples that follow the order of this list.
        epsilon: Only supports strictly greater than this value are stored. It
            is also the value the returned mapping yields for a missing key.

    Returns:
        A ``defaultdict`` mapping product tuples to their support, ordered by
        set size and then by position in ``all_products``. The mapping is
        always returned, also when ``all_products`` is short or ``baskets``
        is empty (in which case it has no entries).
    """
    from itertools import combinations

    MIN_SET_LEN, MAX_SET_LEN = 1, 2
    sizes = range(MIN_SET_LEN, MAX_SET_LEN + 1)

    supports = defaultdict(lambda: epsilon)
    basket_sets = [set(basket) for basket in baskets]
    if not basket_sets:
        return supports

    # One pass over the baskets: count every small subset that actually occurs,
    # instead of testing every candidate set against every basket.
    known = set(all_products)
    occurrences = defaultdict(int)
    for basket_set in basket_sets:
        relevant = basket_set & known
        for size in sizes:
            for combo in combinations(relevant, size):
                occurrences[frozenset(combo)] += 1

    total = len(basket_sets)
    for size in sizes:
        for itemset in combinations(all_products, size):
            # .get() so that looking up an unseen set does not insert it.
            value = occurrences.get(frozenset(itemset), 0) / total
            if value > epsilon:
                supports[itemset] = value
    return supports
        
    
# Precompute the support table for the loaded baskets, using EPSILON as the threshold.
supports = get_supports(baskets, products, EPSILON)
supports  # bare expression: shown as the cell's output in the notebook

defaultdict(<function __main__.<lambda>()>,
            {('abrasive cleaner',): 0.0014702933903628951,
             ('artif. sweetener',): 0.0019381140145692708,
             ('baking powder',): 0.008086613646995923,
             ('bathroom cleaner',): 0.0011361358016440553,
             ('beef',): 0.03395041101383412,
             ('berries',): 0.021787074784468355,
             ('beverages',): 0.016574216400454454,
             ('bottled beer',): 0.04531176903027468,
             ('bottled water',): 0.06068301811134131,
             ('brandy',): 0.0025395976742631824,
             ('brown bread',): 0.03762614448974136,
             ('butter',): 0.03522020985096572,
             ('butter milk',): 0.017576689166610975,
             ('cake bar',): 0.006148499632426653,
             ('candles',): 0.004410880171088686,
             ('candy',): 0.014368776314910112,
             ('canned beer',): 0.04691572545612511,
             ('canned fish',): 0.007685624540533315,
             ('canne

In [64]:
# definiujemy funkcje obliczajace support, confidence i lift
def to_set(item):
    """Return the elements of ``item`` as a new ``set``."""
    return set(item)
def get_key(products):
    """Return the items of ``products`` as a tuple in ascending order."""
    ordered = sorted(products)
    return tuple(ordered)

def support(supports, products: set[str]) -> float:
    """Return the support of ``products``, caching it in ``supports``.

    Args:
        supports: Mutable mapping from sorted product tuples to support values.
            A missing entry is computed and stored, so this argument is mutated.
        products: Product set to look up.

    Returns:
        The stored support, or for a missing entry the support computed over
        the module-level ``baskets``, raised to at least ``EPSILON``.
    """
    key = get_key(products)
    # Membership test rather than subscripting: a defaultdict would otherwise
    # insert its default value for an unknown key.
    if key in supports:
        return supports[key]

    measured = calc_support(map(to_set, baskets), products)
    supports[key] = max(measured, EPSILON)
    return supports[key]

def confidence(supports, prior_products: set[str], following_products: set[str]) -> float:
    """Return the confidence of the rule ``prior_products -> following_products``.

    Computed as ``support(prior U following) / support(prior)`` using the
    cached lookups of ``support``.
    """
    combined = prior_products.union(following_products)
    joint_support = support(supports, combined)
    prior_support = support(supports, prior_products)
    return joint_support / prior_support
    
def lift(supports, prior_products: set[str], following_products: set[str]) -> float:
    """Return the lift of the rule ``prior_products -> following_products``.

    Computed as ``support(prior U following) / (support(prior) * support(following))``
    using the cached lookups of ``support``.
    """
    combined = prior_products.union(following_products)
    joint_support = support(supports, combined)
    prior_support = support(supports, prior_products)
    following_support = support(supports, following_products)
    return joint_support / (prior_support * following_support)

In [65]:
# Print the three metrics for a sample rule: {whole milk, rolls/buns} -> {yogurt}.
print(support(supports, {'whole milk', 'rolls/buns'}))
print(confidence(supports, {'whole milk', 'rolls/buns'}, {'yogurt'}))
print(lift(supports, {'whole milk', 'rolls/buns'}, {'yogurt'}))

0.013967787208447505
0.09569377990430622
1.1142926293448512


## Część 3. - generowanie rekomendacji

In [69]:
# wyznaczamy liste potencjalnych rekomendacji
# rekomendowane artykuly powinny miec lift > 1 i jak najwyzszy confidence
def to_confidence(item):
    """Return the element at index 2 of ``item`` (the confidence slot of a candidate tuple)."""
    return item[2]

def generate_basic_candidates(basket: tuple[str], products: list[str], supports) -> list[tuple[str, tuple[str], float, float]]:
    """Recommend products for ``basket`` using the rule ``basket -> {product}``.

    A product qualifies when the lift of that rule is greater than 1.
    Products already present in the basket are never proposed: for them the
    rule is trivial (confidence 1), so they would always qualify.

    Args:
        basket: Products currently in the basket (any iterable of names).
        products: All products to consider as recommendations.
        supports: Support cache passed on to ``lift`` and ``confidence``.

    Returns:
        ``(item, basket, confidence, lift)`` tuples sorted by confidence,
        highest first; ties keep the order of ``products``.
    """
    owned = set(basket)  # also lets a tuple basket be used with the set-based metrics
    candidates = []
    for candidate_product in products:
        if candidate_product in owned:
            continue
        _lift = lift(supports, owned, {candidate_product})
        if _lift > 1:
            # Confidence is only needed for candidates that are kept.
            _confidence = confidence(supports, owned, {candidate_product})
            candidates.append((candidate_product, basket, _confidence, _lift))
    return sorted(candidates, key=to_confidence, reverse=True)

In [51]:
# Propose a second, more advanced algorithm, e.g.:
# - if product X appears several times in the candidate list, compute the mean or the product of its confidence values
# - sort the candidates by the product of confidence and lift

def generate_advanced_candidates(basket: tuple[str], products: list[str], supports) -> list[tuple[str, tuple[str], float, float]]:
    """Placeholder for the advanced recommender; not implemented yet.

    Raises:
        NotImplementedError: Always.
    """
    # return [(item, subbasket, confidence, lift)]
    raise NotImplementedError()

In [70]:
# Show the basket at index 1 and the basic recommendations generated for it.
print(baskets[1])
generate_basic_candidates(baskets[1], products, supports)
# generate_advanced_candidates(baskets[1], products, supports)

{'sausage', 'whole milk', 'yogurt', 'semi-finished bread'}


[('abrasive cleaner',
  {'sausage', 'semi-finished bread', 'whole milk', 'yogurt'},
  1.0,
  680.1363636363636),
 ('artif. sweetener',
  {'sausage', 'semi-finished bread', 'whole milk', 'yogurt'},
  1.0,
  515.9655172413793),
 ('baby cosmetics',
  {'sausage', 'semi-finished bread', 'whole milk', 'yogurt'},
  1.0,
  1000.0000000000001),
 ('bags',
  {'sausage', 'semi-finished bread', 'whole milk', 'yogurt'},
  1.0,
  1000.0000000000001),
 ('baking powder',
  {'sausage', 'semi-finished bread', 'whole milk', 'yogurt'},
  1.0,
  123.66115702479338),
 ('bathroom cleaner',
  {'sausage', 'semi-finished bread', 'whole milk', 'yogurt'},
  1.0,
  880.1764705882354),
 ('beef',
  {'sausage', 'semi-finished bread', 'whole milk', 'yogurt'},
  1.0,
  29.45472440944882),
 ('berries',
  {'sausage', 'semi-finished bread', 'whole milk', 'yogurt'},
  1.0,
  45.898773006134974),
 ('beverages',
  {'sausage', 'semi-finished bread', 'whole milk', 'yogurt'},
  1.0,
  60.33467741935484),
 ('bottled beer',
  {'sa

In [None]:
# Show the basket at index 33 and run both recommenders on it.
print(baskets[33])
generate_basic_candidates(baskets[33], products, supports)
# NOTE: raises NotImplementedError for as long as generate_advanced_candidates is a stub.
generate_advanced_candidates(baskets[33], products, supports)