# Laboratorium 1 - analiza koszykowa

## Przygotowanie

 * pobierz i wypakuj dataset: https://kaggle.com/datasets/rashikrahmanpritom/groceries-dataset-for-market-basket-analysismba?resource=download&select=basket.csv
   * alternatywnie, pobierz plik `basket.csv` z Teamsów
 * [opcjonalnie] Utwórz wirtualne środowisko
 `python3 -m venv ./recsyslab1`
 * zainstaluj potrzebne biblioteki:
 `pip install more-itertools pandas`

## Część 1. - przygotowanie danych

In [102]:
# importujemy wszystkie potrzebne pakiety

from more_itertools import powerset
from itertools import combinations
import pandas as pd

In [103]:
# Define the constants used throughout the notebook

PATH = './basket.csv'  # basket CSV file loaded by read_baskets()
EPSILON = 0.001  # support threshold: get_supports() keeps only itemsets whose support is strictly greater
K = 4  # largest itemset size for which get_supports() precomputes support

In [104]:
# wczytujemy dane o koszykach

def read_baskets(path: str) -> list[set[str]]:
    """Load shopping baskets from a comma-separated text file.

    The first line of the file is treated as a header and skipped. Every
    following line that is not blank becomes one basket: the line is split
    on commas, each field is stripped of surrounding whitespace and
    lower-cased, and empty fields are dropped.

    Args:
        path: Path of the CSV file to read.

    Returns:
        One set of product names per non-blank data line, in file order.
        A line consisting only of separators yields an empty set.
    """
    baskets: list[set[str]] = []
    # An explicit encoding keeps parsing independent of the platform locale.
    with open(path, encoding="utf-8") as f:
        next(f, None)  # skip the header row (no-op for an empty file)
        for line in f:
            if not line.strip():
                continue
            # Strip each field so that "bread" and " bread " are one product.
            names = (field.strip().lower() for field in line.split(','))
            baskets.append({name for name in names if name})
    return baskets

def unique_products(baskets: list[set[str]]) -> list[str]:
    """Return every distinct product that occurs in ``baskets``, sorted ascending."""
    distinct = {product for basket in baskets for product in basket}
    return sorted(distinct)

# Load every basket from PATH, then derive the sorted list of distinct products.
baskets = read_baskets(PATH)
products = unique_products(baskets)

## Część 2. - obliczanie wskaźników

In [107]:
# Helper functions
def get_support_key(products: set[str] | tuple[str, ...]) -> tuple[str, ...]:
    return tuple(sorted(tuple(products)))

def calc_support(baskets: list[set[str]], products: set[str]) -> float:
    """Compute the fraction of baskets that contain every item of ``products``.

    Args:
        baskets: All known baskets.
        products: The itemset whose support is wanted.

    Returns:
        Number of baskets that are a superset of ``products`` divided by the
        total number of baskets; ``0.0`` when ``baskets`` is empty.
    """
    if not baskets:
        # No data: report zero support instead of dividing by zero.
        return 0.0
    hits = sum(1 for basket in baskets if products.issubset(basket))
    return hits / len(baskets)

def are_all_n_minus_1_subsets_in_supports(supports: dict[tuple[str, ...], float], products: tuple[str, ...]) -> bool:
    """Check that every subset of ``products`` with one item removed is a key of ``supports``.

    Itemsets of size 0 or 1 pass trivially. Subsets are looked up under the
    key produced by ``get_support_key``.
    """
    size = len(products)
    if size <= 1:
        return True
    for subset in combinations(products, size - 1):
        if get_support_key(subset) not in supports:
            return False
    return True

# Build the data structure (here: a dict) holding every interesting `support` value
def get_supports(
    baskets: list[set[str]],
    all_products: list[str],
    epsilon: float,
    max_length: int | None = None,
) -> dict[tuple[str, ...], float]:
    """Compute the support of every frequent itemset up to ``max_length`` items.

    Itemsets are explored level by level (size 1, then size 2, ...). A
    candidate is evaluated only if all of its subsets with one item removed
    are already recorded, and it is recorded only if its support is strictly
    greater than ``epsilon``.

    Args:
        baskets: All known baskets.
        all_products: Products from which candidate itemsets are drawn.
        epsilon: Support threshold; only itemsets strictly above it are kept.
        max_length: Largest itemset size to consider. Defaults to the
            module-level constant ``K`` when omitted.

    Returns:
        Mapping from the ``get_support_key`` of an itemset to its support.
    """
    if max_length is None:
        max_length = K
    _supports: dict[tuple[str, ...], float] = {}

    print("Calculating supports for subbaskets with max length {}...".format(max_length))
    for size in range(1, max_length + 1):
        found_any = False
        for subproducts in combinations(all_products, size):
            if not are_all_n_minus_1_subsets_in_supports(_supports, subproducts):
                continue
            _support = calc_support(baskets, set(subproducts))
            if _support > epsilon:
                _supports[get_support_key(subproducts)] = _support
                found_any = True
        if not found_any:
            # Every larger candidate needs a recorded subset of this size,
            # so no further level can add anything.
            break
    print("Done")
    return _supports


# Precompute the support table once; the bare `supports` expression displays it as the cell output.
supports = get_supports(baskets, products, EPSILON)
supports

Calculating supports for subbaskets with max length 4...
Done


{('abrasive cleaner',): 0.0014702933903628951,
 ('artif. sweetener',): 0.0019381140145692708,
 ('baking powder',): 0.008086613646995923,
 ('bathroom cleaner',): 0.0011361358016440553,
 ('beef',): 0.03395041101383412,
 ('berries',): 0.021787074784468355,
 ('beverages',): 0.016574216400454454,
 ('bottled beer',): 0.04531176903027468,
 ('bottled water',): 0.06068301811134131,
 ('brandy',): 0.0025395976742631824,
 ('brown bread',): 0.03762614448974136,
 ('butter',): 0.03522020985096572,
 ('butter milk',): 0.017576689166610975,
 ('cake bar',): 0.006148499632426653,
 ('candles',): 0.004410880171088686,
 ('candy',): 0.014368776314910112,
 ('canned beer',): 0.04691572545612511,
 ('canned fish',): 0.007685624540533315,
 ('canned fruit',): 0.001403461872619127,
 ('canned vegetables',): 0.005480184454988973,
 ('cat food',): 0.011829178640646929,
 ('cereals',): 0.002806923745238254,
 ('chewing gum',): 0.012029673193878232,
 ('chicken',): 0.027868742899151238,
 ('chocolate',): 0.02359152576355009,


In [108]:
# definiujemy funkcje obliczajace support, confidence i lift

def support(supports: dict[tuple[str, ...], float], products: set[str]) -> float:
    """Return the support of ``products``, preferring the precomputed table.

    Args:
        supports: Precomputed supports keyed by ``get_support_key``.
        products: The itemset whose support is wanted.

    Returns:
        The stored value when the itemset is in ``supports``. Otherwise the
        support is computed from the module-level ``baskets`` and floored at
        the module-level ``EPSILON``.
    """
    key = get_support_key(products)
    if key in supports:
        return supports[key]
    # Scan the baskets only on a miss; a `dict.get` default would be
    # evaluated on every call, including hits.
    return max(calc_support(baskets, products), EPSILON)

def confidence(supports: dict[tuple[str, ...], float], prior_products: set[str], following_products: set[str]) -> float:
    """Return the confidence of the rule ``prior_products -> following_products``.

    The value is support(prior and following together) divided by
    support(prior). A joint support equal to ``EPSILON`` is counted as 0.0.

    Raises:
        ValueError: If the two product sets share any item.
    """
    if not prior_products.isdisjoint(following_products):
        raise ValueError(f"[Confidence]: Following products {following_products} are already present in the prior products {prior_products}")
    joint_support = support(supports, prior_products.union(following_products))
    if joint_support == EPSILON:
        # support() floors uncached values at EPSILON; treat that floor as "never seen together".
        joint_support = 0.0
    prior_support = support(supports, prior_products)
    return joint_support / prior_support
    
def lift(supports: dict[tuple[str, ...], float], prior_products: set[str], following_products: set[str]) -> float:
    """Return the lift of the rule ``prior_products -> following_products``.

    The value is support(prior and following together) divided by the product
    of support(prior) and support(following). A joint support equal to
    ``EPSILON`` is counted as 0.0.

    Raises:
        ValueError: If the two product sets share any item.
    """
    if not prior_products.isdisjoint(following_products):
        raise ValueError(f"[Lift]: Following products {following_products} are already present in the prior products {prior_products}")
    joint_support = support(supports, prior_products.union(following_products))
    if joint_support == EPSILON:
        # support() floors uncached values at EPSILON; treat that floor as "never seen together".
        joint_support = 0.0
    independent_support = support(supports, prior_products) * support(supports, following_products)
    return joint_support / independent_support

In [109]:
# Sanity check: print support of an itemset, then confidence and lift of the rule {whole milk, rolls/buns} -> {yogurt}.
print(support(supports, {'whole milk', 'rolls/buns'}))
print(confidence(supports, {'whole milk', 'rolls/buns'}, {'yogurt'}))
print(lift(supports, {'whole milk', 'rolls/buns'}, {'yogurt'}))

0.013967787208447505
0.09569377990430622
1.1142926293448512


## Część 3. - generowanie rekomendacji

In [110]:
# wyznaczamy liste potencjalnych rekomendacji
# rekomendowane artykuly powinny miec lift > 1 i jak najwyzszy confidence
class BasicRecommendation:
    def __init__(self, candidate: str, subbasket: set[str], confidence: float, lift: float):
        self.candidate = candidate
        self.subbasket = subbasket
        self.confidence = confidence
        self.lift = lift

    def to_dict(self) -> dict[str, str | float | set[str]]:
        return {
            "Candidate": self.candidate,
            "Confidence": self.confidence,
            "Lift": self.lift,
            "Subbasket": self.subbasket
        }

    def __str__(self):
        return f"{{candidate='{self.candidate}', subbasket={self.subbasket}, confidence={self.confidence:.8f}, lift={self.lift:.8f}}}"

def sorted_desc_by_confidence(recommendations: list[BasicRecommendation]) -> list[BasicRecommendation]:
    """Return a new list of the recommendations ordered by confidence, highest first."""
    ranked = list(recommendations)
    ranked.sort(key=lambda item: item.confidence, reverse=True)
    return ranked

def generate_basic_candidates(basket: set[str], products: list[str], supports: dict[tuple[str, ...], float]) -> list[BasicRecommendation]:
    """Propose products to add to ``basket``, one entry per (subbasket, candidate) rule.

    Every non-empty subset of ``basket`` is paired with every product not
    yet in the basket. A pair is kept only when its lift is greater than 1.0.

    Args:
        basket: Products already chosen.
        products: All known products.
        supports: Precomputed supports keyed by ``get_support_key``.

    Returns:
        The kept recommendations ordered by confidence, highest first.
    """
    recommendations: list[BasicRecommendation] = []
    products_left = set(products) - basket
    for subbasket in powerset(basket):
        if not subbasket:
            # The empty subbasket is never used as a rule's left-hand side.
            continue
        subbasket_set = set(subbasket)
        for candidate in products_left:
            following = {candidate}
            _lift = lift(supports, subbasket_set, following)
            if _lift > 1.0:
                # Confidence is only needed for rules that pass the lift filter.
                _confidence = confidence(supports, subbasket_set, following)
                recommendations.append(BasicRecommendation(candidate, subbasket_set, _confidence, _lift))
    return sorted_desc_by_confidence(recommendations)

In [111]:
def print_top_n_basic_recomendations(recomendations: list[BasicRecommendation], n: int) -> None:
    """Print the first ``n`` recommendations as a pandas DataFrame.

    Args:
        recomendations: Recommendations in the order they should be shown.
        n: Number of leading entries to print.
    """
    # Slice before converting so only the rows that are shown get built.
    df = pd.DataFrame([rec.to_dict() for rec in recomendations[:n]])
    print(df)

In [112]:
PRINT_TOP_N = 20  # number of rows passed as `n` to the print_top_n_* helpers

print(f"Basic top {PRINT_TOP_N} recomendations for basket {baskets[1]}:")

# Generate and show basic recommendations for the second basket in the dataset.
basic_candidates = generate_basic_candidates(set(baskets[1]), products, supports)
print_top_n_basic_recomendations(basic_candidates, PRINT_TOP_N)

Basic top 20 recomendations for basket {'semi-finished bread', 'sausage', 'yogurt', 'whole milk'}:
            Candidate  Confidence      Lift              Subbasket
0          rolls/buns    0.126866  1.153275  {sausage, whole milk}
1          rolls/buns    0.119760  1.088685   {yogurt, whole milk}
2                soda    0.119403  1.229612  {sausage, whole milk}
3                soda    0.098560  1.014975              {sausage}
4        bottled beer    0.055371  1.222000              {sausage}
5        citrus fruit    0.053696  1.010642               {yogurt}
6              pastry    0.053156  1.027617              {sausage}
7                curd    0.048726  1.446615              {sausage}
8   frozen vegetables    0.034330  1.225966              {sausage}
9           beverages    0.025471  1.536764              {sausage}
10            dessert    0.024363  1.032711              {sausage}
11       frozen meals    0.021041  1.254327              {sausage}
12        salty snack    0.018

In [113]:
# zaproponuj drugi, bardziej zaawansowany algorytm, np.:
# - jesli produkt X wystepuje w liscie kandydatow kilkukrotnie, oblicz srednia lub iloczyn confidence
# - posortuj kandydatow po iloczynie confidence i lift
class AdvancedRecommendation:
    def __init__(self, candidate: str, confidence: float, lift: float, occurrences: int):
        self.candidate = candidate
        self.confidence = confidence
        self.lift = lift
        self.occurrences = occurrences
        self.cl_product = confidence * lift

    def to_dict(self) -> dict[str, str | int | float]:
        return {
            "Candidate": self.candidate,
            "CL Product": self.cl_product,
            "Confidence": self.confidence,
            "Lift": self.lift,
            "Occurrences": self.occurrences
        }

    def __str__(self):
        return f"{{candidate='{self.candidate}', confidence={self.confidence:.8f}, lift={self.lift:.8f}, cl_product={self.cl_product:.8f}, occurances={self.occurrences}}}"
        
def sorted_desc_by_conf_lift_product(recommendations: list[AdvancedRecommendation]) -> list[AdvancedRecommendation]:
    """Return a new list of the recommendations ordered by ``cl_product``, highest first."""
    ranked = list(recommendations)
    ranked.sort(key=lambda item: item.cl_product, reverse=True)
    return ranked

def avg(sum: float, count: int) -> float:
    """Return the arithmetic mean given a running total ``sum`` and the number of items ``count``."""
    mean = sum / count
    return mean

def generate_advanced_candidates(basket: set[str], products: list[str], supports: dict[tuple[str, ...], float]) -> list[AdvancedRecommendation]:
    """Propose products to add to ``basket``, one aggregated entry per candidate.

    For each product not yet in the basket, every non-empty subset of the
    basket is tried as a rule's left-hand side. Rules with lift greater than
    1.0 are counted, and their lift and confidence values are averaged.
    Candidates with no such rule are omitted.

    Args:
        basket: Products already chosen.
        products: All known products.
        supports: Precomputed supports keyed by ``get_support_key``.

    Returns:
        The recommendations ordered by confidence * lift, highest first.
    """
    recommendations: list[AdvancedRecommendation] = []
    products_left = set(products) - basket
    # The subbaskets do not depend on the candidate, so build them once.
    subbaskets = [set(subbasket) for subbasket in powerset(basket) if subbasket]
    for candidate in products_left:
        following = {candidate}
        _occurrences: int = 0
        _confidence_sum = 0.0
        _lift_sum = 0.0
        for subbasket_set in subbaskets:
            _lift = lift(supports, subbasket_set, following)
            if _lift > 1.0:
                _lift_sum += _lift
                _confidence_sum += confidence(supports, subbasket_set, following)
                _occurrences += 1
        if _occurrences > 0:
            _lift_avg = avg(_lift_sum, _occurrences)
            _confidence_avg = avg(_confidence_sum, _occurrences)
            recommendations.append(AdvancedRecommendation(candidate, _confidence_avg, _lift_avg, _occurrences))
    return sorted_desc_by_conf_lift_product(recommendations)

In [114]:
def print_top_n_advanced_recommendations(recomendations: list[AdvancedRecommendation], n: int) -> None:
    """Print the first ``n`` recommendations as a pandas DataFrame.

    Args:
        recomendations: Recommendations in the order they should be shown.
        n: Number of leading entries to print.
    """
    # Slice before converting so only the rows that are shown get built.
    df = pd.DataFrame([rec.to_dict() for rec in recomendations[:n]])
    print(df)

In [117]:
print(f"Advanced top {PRINT_TOP_N} recommendations for basket {baskets[1]}:")

# Generate and show aggregated recommendations for the second basket in the dataset.
advanced_candidates = generate_advanced_candidates(set(baskets[1]), products, supports)
print_top_n_advanced_recommendations(advanced_candidates, PRINT_TOP_N)

Advanced top 20 recommendations for basket {'semi-finished bread', 'sausage', 'yogurt', 'whole milk'}:
            Candidate  CL Product  Confidence      Lift  Occurrences
0          rolls/buns    0.138232    0.123313  1.120980            2
1                soda    0.122309    0.108982  1.122294            2
2                curd    0.070488    0.048726  1.446615            1
3        bottled beer    0.067663    0.055371  1.222000            1
4              pastry    0.054624    0.053156  1.027617            1
5        citrus fruit    0.054268    0.053696  1.010642            1
6   frozen vegetables    0.042087    0.034330  1.225966            1
7           beverages    0.039142    0.025471  1.536764            1
8        frozen meals    0.026392    0.021041  1.254327            1
9       sliced cheese    0.025254    0.018826  1.341407            1
10            dessert    0.025160    0.024363  1.032711            1
11        chewing gum    0.022201    0.016342  1.358508            1


In [118]:
print(f"Basic top {PRINT_TOP_N} recomendations for basket {baskets[33]}:")

# Repeat the basic recommendation run for the basket at index 33.
basic_candidates = generate_basic_candidates(set(baskets[33]), products, supports)
print_top_n_basic_recomendations(basic_candidates, PRINT_TOP_N)

Basic top 20 recomendations for basket {'root vegetables', 'white wine', 'yogurt', 'soda', 'tropical fruit', 'photo/film', 'domestic eggs'}:
              Candidate  Confidence      Lift          Subbasket
0               sausage    0.066926  1.108986           {yogurt}
1               sausage    0.061253  1.014975             {soda}
2          citrus fruit    0.053696  1.010642           {yogurt}
3         shopping bags    0.048031  1.009388  {root vegetables}
4            newspapers    0.041441  1.065444    {domestic eggs}
5                coffee    0.037838  1.196972    {domestic eggs}
6           frankfurter    0.037838  1.002066    {domestic eggs}
7     frozen vegetables    0.030740  1.097751  {root vegetables}
8           white bread    0.027027  1.126477    {domestic eggs}
9              uht-milk    0.022682  1.060617   {tropical fruit}
10  specialty chocolate    0.019724  1.234846   {tropical fruit}
11            beverages    0.019270  1.162678             {soda}
12            

In [119]:
print(f"Advanced top {PRINT_TOP_N} recommendations for basket {baskets[33]}:")

# Repeat the aggregated recommendation run for the basket at index 33.
advanced_candidates = generate_advanced_candidates(set(baskets[33]), products, supports)
print_top_n_advanced_recommendations(advanced_candidates, PRINT_TOP_N)

Advanced top 20 recommendations for basket {'root vegetables', 'white wine', 'yogurt', 'soda', 'tropical fruit', 'photo/film', 'domestic eggs'}:
              Candidate  CL Product  Confidence      Lift  Occurrences
0               sausage    0.068062    0.064089  1.061981            2
1          citrus fruit    0.054268    0.053696  1.010642            1
2         shopping bags    0.048482    0.048031  1.009388            1
3                coffee    0.045291    0.037838  1.196972            1
4            newspapers    0.044154    0.041441  1.065444            1
5           frankfurter    0.037916    0.037838  1.002066            1
6     frozen vegetables    0.033745    0.030740  1.097751            1
7           white bread    0.030445    0.027027  1.126477            1
8                 flour    0.025517    0.015779  1.617141            1
9   specialty chocolate    0.024356    0.019724  1.234846            1
10             uht-milk    0.024057    0.022682  1.060617            1
11 