# Laboratorium 1 - analiza koszykowa

## Przygotowanie

 * pobierz i wypakuj dataset: https://kaggle.com/datasets/rashikrahmanpritom/groceries-dataset-for-market-basket-analysismba?resource=download&select=basket.csv
   * alternatywnie, pobierz plik `basket.csv` z Teamsów
 * [opcjonalnie] Utwórz wirtualne środowisko
 `python3 -m venv ./recsyslab1`
 * zainstaluj potrzebne biblioteki:
 `pip install more-itertools`

## Część 1. - przygotowanie danych

In [149]:
# importujemy wszystkie potrzebne pakiety

from more_itertools import powerset
from itertools import combinations

In [161]:
# define constants

PATH = './basket.csv'  # CSV file handed to read_baskets
EPSILON = 0.001  # support threshold for storing an itemset; also the floor applied by support() to uncached itemsets
K = 4  # largest itemset size for which supports are precomputed

In [215]:
# wczytujemy dane o koszykach

def read_baskets(path: str) -> list[set[str]]:
    """Load shopping baskets from a comma-separated text file.

    The first line is treated as a header and skipped. Every following
    non-blank line becomes one basket: its comma-separated fields are stripped
    of surrounding whitespace and lower-cased, empty fields (for example the
    trailing commas that pad short rows) are dropped, and duplicates collapse
    because the basket is a set.

    Args:
        path: Location of the CSV file; it is read as UTF-8.

    Returns:
        One set of product names per non-blank data row, in file order.
    """
    # An explicit encoding keeps the result independent of the platform default.
    with open(path, encoding="utf-8") as f:
        rows = f.read().split('\n')[1:]

    baskets = []
    for row in rows:
        if not row.strip():
            continue
        fields = (field.strip().lower() for field in row.split(','))
        baskets.append({field for field in fields if field})
    return baskets

def unique_products(baskets: list[tuple[str]]) -> list[str]:
    """Return every distinct product found in any basket, sorted ascending."""
    distinct = set().union(*baskets)
    return sorted(distinct)

# Load every basket from PATH, then derive the sorted list of distinct product names.
baskets = read_baskets(PATH)
products = unique_products(baskets)

## Część 2. - obliczanie wskaźników

In [218]:
# Helper functions
def get_support_key(products):
    """Return the canonical key for a collection of products: its items as an ascending-sorted tuple."""
    ordered = sorted(products)
    return tuple(ordered)

def calc_support(basket_sets: list[set[str]], products: set[str]):
    """Return the fraction of baskets that contain every item of ``products``.

    Args:
        basket_sets: Baskets to scan, each a set of product names.
        products: Itemset whose support is wanted.

    Returns:
        Number of baskets that are supersets of ``products`` divided by the
        number of baskets passed in, or 0.0 when ``basket_sets`` is empty.
    """
    if not basket_sets:
        return 0.0
    # Normalise by the argument itself, not by a module-level list, so the
    # result is correct for whatever collection of baskets the caller supplies.
    hits = sum(1 for basket_set in basket_sets if products.issubset(basket_set))
    return hits / len(basket_sets)

def are_all_n_minus_1_subsets_in_supports(supports: dict, products: tuple[str]):
    """Tell whether every subset of ``products`` with one item removed is a key of ``supports``.

    Itemsets with zero or one element pass unconditionally.
    """
    size = len(products)
    if size > 1:
        for subset in combinations(products, size - 1):
            if get_support_key(subset) not in supports:
                return False
    return True

def calc_max_k_subbaskets_supports(_supports: dict, basket_sets: list[set[str]], all_products: list[str]):
    """Fill ``_supports`` with the support of itemsets of 1 to K products.

    Itemsets are processed level by level (size 1, then 2, ... up to the
    module-level ``K``). An itemset is evaluated only when all of its subsets
    with one item removed are already keys of ``_supports``, and it is stored
    only when its support exceeds the module-level ``EPSILON``.

    Args:
        _supports: Dictionary updated in place; keys come from
            ``get_support_key``, values are supports.
        basket_sets: Baskets used to compute supports.
        all_products: Products from which itemsets are built.

    Returns:
        None; results are written into ``_supports``.
    """
    for size in range(1, K + 1):
        if size == 1:
            pool = all_products
        else:
            # An itemset of this size can only pass the subset check if each of
            # its items occurs in some stored itemset one item smaller, so
            # drawing combinations from those items alone yields the same
            # entries while skipping the vast majority of hopeless tuples.
            usable = {item for key in _supports if len(key) == size - 1 for item in key}
            pool = [product for product in all_products if product in usable]

        for products_subset in combinations(pool, size):
            if not are_all_n_minus_1_subsets_in_supports(_supports, products_subset):
                continue

            _support = calc_support(basket_sets, set(products_subset))
            if _support > EPSILON:
                _supports[get_support_key(products_subset)] = _support

# def calc_extended_baskets_supports(_supports: dict, basket_sets: list[set[str]], all_products: list[str]):
#     for basket_set in basket_sets:
#         for extra_product in all_products:
#             if extra_product in basket_set:
#                 continue
#             extended_basket = basket_set.union(set(extra_product))
#             _support = calc_support(basket_sets, extended_basket)
#             if _support > EPSILON:
#                 _supports[get_support_key(extended_basket)] = _support

# obliczamy strukture danych (np. slownik albo graf) przechowujaca wszystkie interesujace wartosci `support`
def get_supports(baskets: list[set[str]], all_products: list[str]) -> dict[tuple[str], float]:
    """Build the table of supports for itemsets of at most K products.

    Progress messages are printed before and after the computation.

    Args:
        baskets: Baskets to analyse; each one is converted to a set.
        all_products: Products from which itemsets are built.

    Returns:
        Dictionary filled by ``calc_max_k_subbaskets_supports``: keys are
        product tuples as produced by ``get_support_key``, values are supports.
    """
    basket_sets = [set(basket) for basket in baskets]
    _supports: dict[tuple[str], float] = {}

    print("Calculating support for subbaskets with max length {}...".format(K))
    calc_max_k_subbaskets_supports(_supports, basket_sets, all_products)

    # print("Calculating support for extended baskets...")
    # calc_extended_baskets_supports(_supports, basket_sets, all_products)

    print("Done!")
    return _supports

# Build the support table for the loaded baskets. The bare name on the last
# line is an expression statement (presumably there so the notebook displays the dict).
supports = get_supports(baskets, products)
supports

Calculating support for subbaskets with with max length 4...
Calculating support for extended baskets...
Done!


{('abrasive cleaner',): 0.0014702933903628951,
 ('artif. sweetener',): 0.0019381140145692708,
 ('baby cosmetics',): 0.00020049455323130388,
 ('bags',): 0.00026732607097507187,
 ('baking powder',): 0.008086613646995923,
 ('bathroom cleaner',): 0.0011361358016440553,
 ('beef',): 0.03395041101383412,
 ('berries',): 0.021787074784468355,
 ('beverages',): 0.016574216400454454,
 ('bottled beer',): 0.04531176903027468,
 ('bottled water',): 0.06068301811134131,
 ('brandy',): 0.0025395976742631824,
 ('brown bread',): 0.03762614448974136,
 ('butter',): 0.03522020985096572,
 ('butter milk',): 0.017576689166610975,
 ('cake bar',): 0.006148499632426653,
 ('candles',): 0.004410880171088686,
 ('candy',): 0.014368776314910112,
 ('canned beer',): 0.04691572545612511,
 ('canned fish',): 0.007685624540533315,
 ('canned fruit',): 0.001403461872619127,
 ('canned vegetables',): 0.005480184454988973,
 ('cat food',): 0.011829178640646929,
 ('cereals',): 0.002806923745238254,
 ('chewing gum',): 0.0120296731938

In [221]:
# define the functions computing support, confidence and lift
# Module-level copy of the baskets as sets; support() scans it for itemsets missing from the table.
basket_sets = [set(basket) for basket in baskets]

def support(supports: dict[tuple[str], float], products: set[str]) -> float:
    """Return the support of ``products``, preferring the precomputed table.

    Args:
        supports: Table of precomputed supports keyed by ``get_support_key``.
        products: Itemset to look up.

    Returns:
        The stored value when the itemset is in ``supports``; otherwise the
        support computed from the module-level ``basket_sets``, raised to
        ``EPSILON`` if it is smaller (so the result is never below EPSILON
        for uncached itemsets).
    """
    key = get_support_key(products)
    # Look the key up first: passing the fallback as a dict.get default would
    # evaluate it, and so rescan every basket, on each call, hit or miss.
    if key in supports:
        return supports[key]
    return max(calc_support(basket_sets, products), EPSILON)

def confidence(supports: dict[tuple[str]], prior_products: set[str], following_products: set[str]) -> float:
    """Return support(prior ∪ following) / support(prior).

    Raises:
        ValueError: If the two itemsets share at least one product.
    """
    if not prior_products.isdisjoint(following_products):
        raise ValueError("[Confidence]: Following products {} are already present in the prior products {}".format(following_products, prior_products))
    joint_support = support(supports, prior_products | following_products)
    prior_support = support(supports, prior_products)
    return joint_support / prior_support
    
def lift(supports: dict[tuple[str]], prior_products: set[str], following_products: set[str]) -> float:
    """Return support(prior ∪ following) / (support(prior) * support(following)).

    Raises:
        ValueError: If the two itemsets share at least one product.
    """
    if not prior_products.isdisjoint(following_products):
        raise ValueError("[Lift]: Following products {} are already present in the prior products {}".format(following_products, prior_products))
    joint_support = support(supports, prior_products | following_products)
    independent_support = support(supports, prior_products) * support(supports, following_products)
    return joint_support / independent_support

In [222]:
# Example: support of {'whole milk', 'rolls/buns'}, then confidence and lift
# of the rule {'whole milk', 'rolls/buns'} -> {'yogurt'}.
print(support(supports, {'whole milk', 'rolls/buns'}))
print(confidence(supports, {'whole milk', 'rolls/buns'}, {'yogurt'}))
print(lift(supports, {'whole milk', 'rolls/buns'}, {'yogurt'}))

0.013967787208447505
0.09569377990430622
1.1142926293448512


## Część 3. - generowanie rekomendacji

In [229]:
# wyznaczamy liste potencjalnych rekomendacji
# rekomendowane artykuly powinny miec lift > 1 i jak najwyzszy confidence
def sorted_desc_by_confidence(recommendations: list) -> list:
    """Return a new list of recommendations ordered by their third field (confidence), highest first."""
    ranked = list(recommendations)
    ranked.sort(key=lambda entry: entry[2], reverse=True)
    return ranked

def generate_basic_candidates(basket: set[str], products: list[str], supports: dict) -> list[tuple[str, set[str], float, float]]:
    """List products worth recommending for ``basket``, one entry per (subset, product) rule.

    Every non-empty subset of the basket is paired with every product not in
    the basket; a pair is kept when its lift is greater than 1. The number of
    subsets doubles with each item in the basket.

    Args:
        basket: Products already in the basket.
        products: All known products.
        supports: Table of precomputed supports passed on to ``lift`` and
            ``confidence``.

    Returns:
        Tuples ``(item, subbasket, confidence, lift)`` sorted by confidence,
        highest first. Empty when the basket is empty.
    """
    if not basket:
        return []

    recommendations: list[tuple] = []
    products_left = set(products) - basket
    for subbasket in powerset(basket):
        # The empty subset is skipped once per subset, not once per candidate.
        if not subbasket:
            continue
        subbasket_set = set(subbasket)
        for candidate in products_left:
            _lift = lift(supports, subbasket_set, {candidate})
            if _lift > 1:
                # Confidence is only needed for rules that are kept.
                _confidence = confidence(supports, subbasket_set, {candidate})
                recommendations.append((candidate, subbasket_set, _confidence, _lift))
    return sorted_desc_by_confidence(recommendations)

In [None]:
# Example: basic recommendations for the second basket of the dataset.
print("Basic recomendations for basket {}:".format(baskets[1]))
generate_basic_candidates(set(baskets[1]), products, supports)

In [236]:
# zaproponuj drugi, bardziej zaawansowany algorytm, np.:
# - jesli produkt X wystepuje w liscie kandydatow kilkukrotnie, oblicz srednia lub iloczyn confidence
# - posortuj kandydatow po iloczynie confidence i lift

def avg(_sum, _count):
    """Return ``_sum`` divided by ``_count``."""
    return _sum / _count

def generate_advanced_candidates(basket: set[str], products: list[str], supports: dict) -> list[tuple[str, tuple[str], float, float]]:
    """List products worth recommending for ``basket``, one entry per product.

    For each product not in the basket, lift and confidence are averaged over
    all non-empty subsets of the basket; the product is kept when its average
    lift is greater than 1. The number of subsets doubles with each item in
    the basket.

    Args:
        basket: Products already in the basket.
        products: All known products.
        supports: Table of precomputed supports passed on to ``lift`` and
            ``confidence``.

    Returns:
        Tuples ``(item, basket_as_tuple, average_confidence, average_lift)``
        sorted by average confidence, highest first. Empty when the basket is
        empty.
    """
    # With no items there is no non-empty subset to average over; returning
    # early avoids dividing by a subset count of zero.
    if not basket:
        return []

    products_left = set(products) - basket
    # The averages describe the basket as a whole, so it is reported explicitly
    # instead of relying on whatever the subset loop variable last held.
    whole_basket = tuple(basket)
    # The subsets do not depend on the candidate, so build them only once.
    subbasket_sets = [set(subbasket) for subbasket in powerset(basket) if subbasket]
    subbaskets_count = len(subbasket_sets)

    recommendations: list[tuple] = []
    for candidate in products_left:
        lift_sum, confidence_sum = 0, 0
        for subbasket_set in subbasket_sets:
            lift_sum += lift(supports, subbasket_set, {candidate})
            confidence_sum += confidence(supports, subbasket_set, {candidate})
        lift_avg = lift_sum / subbaskets_count
        confidence_avg = confidence_sum / subbaskets_count
        if lift_avg > 1:
            recommendations.append((candidate, whole_basket, confidence_avg, lift_avg))
    return sorted_desc_by_confidence(recommendations)

In [237]:
# Example: advanced (averaged) recommendations for the second basket of the dataset.
print("Advanced recomendations for basket {}:".format(baskets[1]))
generate_advanced_candidates(set(baskets[1]), products, supports)

Advanced recomendations for basket {'sausage', 'whole milk', 'yogurt', 'semi-finished bread'}:


[('rolls/buns',
  ('sausage', 'whole milk', 'yogurt', 'semi-finished bread'),
  0.24364020679147946,
  2.214816776561912),
 ('other vegetables',
  ('sausage', 'whole milk', 'yogurt', 'semi-finished bread'),
  0.23919623988613206,
  1.9590001846831937),
 ('soda',
  ('sausage', 'whole milk', 'yogurt', 'semi-finished bread'),
  0.23620872893305914,
  2.43247846595001),
 ('tropical fruit',
  ('sausage', 'whole milk', 'yogurt', 'semi-finished bread'),
  0.2130812614176693,
  3.144314511432532),
 ('pork',
  ('sausage', 'whole milk', 'yogurt', 'semi-finished bread'),
  0.21281278932884085,
  5.7375094895990015),
 ('pastry',
  ('sausage', 'whole milk', 'yogurt', 'semi-finished bread'),
  0.21267171354634934,
  4.11137835890701),
 ('root vegetables',
  ('sausage', 'whole milk', 'yogurt', 'semi-finished bread'),
  0.21041652312904946,
  3.024459592295838),
 ('bottled beer',
  ('sausage', 'whole milk', 'yogurt', 'semi-finished bread'),
  0.20901124704609145,
  4.61273641526647),
 ('shopping bags'

In [202]:
# Example: run both recommenders on basket 33.
print(baskets[33])
# NOTE(review): this return value is neither stored nor printed here.
generate_basic_candidates(baskets[33], products, supports)
generate_advanced_candidates(baskets[33], products, supports)

{'white wine', 'tropical fruit', 'root vegetables', 'soda', 'yogurt', 'photo/film', 'domestic eggs'}


[('preservation products',
  ('white wine',
   'tropical fruit',
   'root vegetables',
   'soda',
   'yogurt',
   'photo/film',
   'domestic eggs'),
  1.0,
  10000.0),
 ('salty snack',
  ('white wine',
   'tropical fruit',
   'root vegetables',
   'soda',
   'yogurt',
   'photo/film',
   'domestic eggs'),
  1.0,
  53.24911032028468),
 ('sauces',
  ('white wine',
   'tropical fruit',
   'root vegetables',
   'soda',
   'yogurt',
   'photo/film',
   'domestic eggs'),
  1.0,
  340.06818181818176),
 ('flower (seeds)',
  ('white wine',
   'tropical fruit',
   'root vegetables',
   'soda',
   'yogurt',
   'photo/film',
   'domestic eggs'),
  1.0,
  216.85507246376804),
 ('ketchup',
  ('white wine',
   'tropical fruit',
   'root vegetables',
   'soda',
   'yogurt',
   'photo/film',
   'domestic eggs'),
  1.0,
  467.59374999999994),
 ('toilet cleaner',
  ('white wine',
   'tropical fruit',
   'root vegetables',
   'soda',
   'yogurt',
   'photo/film',
   'domestic eggs'),
  1.0,
  2992.5999999