<a href="https://colab.research.google.com/github/jmarrietar/mineria-de-datos/blob/main/%5BProyecto%5D%20Asociaci%C3%B3n.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Download the dataset from Google Drive (saved as OnlineRetail.csv).
!gdown https://drive.google.com/uc?id=1_AJd_FiQ6LSWSK4e__ccfLzhu_sWWNUS
# Install/upgrade mlxtend, which provides the frequent-pattern functions imported below.
!pip install mlxtend --upgrade --quiet

Downloading...
From: https://drive.google.com/uc?id=1_AJd_FiQ6LSWSK4e__ccfLzhu_sWWNUS
To: /content/OnlineRetail.csv
45.6MB [00:00, 88.2MB/s]


In [2]:
import pandas as pd
import math 
import matplotlib.pyplot as plt
from matplotlib.ticker import FuncFormatter
import numpy as np
import seaborn as sns
from os import path
from PIL import Image

from mlxtend.frequent_patterns import apriori, association_rules
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import fpgrowth

% matplotlib inline
pd.set_option('display.max_rows', 100)

In [31]:
# Auxiliary functions

def print_rules(rules, item_names=None):
    """Print each association rule as ``(antecedents) -> (consequents)``.

    Item codes are replaced by their descriptions; several items on one side
    of a rule are joined with ``" & "``.

    Parameters
    ----------
    rules : pandas.DataFrame
        Frame with ``antecedents`` and ``consequents`` columns whose cells are
        collections of item codes.
    item_names : mapping, optional
        Maps an item code to its description. When omitted, the notebook-level
        ``items`` dictionary is used, so ``print_rules(rules)`` keeps working.

    Returns
    -------
    None
        One line is printed per rule, in the row order of ``rules``.
    """
    # Resolve the global at call time so the function can be defined before `items`.
    lookup = items if item_names is None else item_names

    def describe(codes):
        # Sorting makes the printed order independent of set iteration order.
        # Unknown codes fall back to the code itself, and str() guards against
        # non-string descriptions (e.g. NaN) breaking the join.
        return " & ".join(
            str(lookup.get(code, code)) for code in sorted(codes, key=str)
        )

    for antecedents, consequents in zip(rules["antecedents"], rules["consequents"]):
        print("(" + describe(antecedents) + ") -> (" + describe(consequents) + ")")


def count_frequent_items(frequent_itemsets):
    """Count frequent itemsets by the number of items they contain.

    Parameters
    ----------
    frequent_itemsets : pandas.DataFrame
        Frame with an ``itemsets`` column whose cells are sized collections.

    Returns
    -------
    dict
        Maps the itemset size (as a string) to how many itemsets have that
        size. Keys ``"1"`` to ``"4"`` are always present, even when zero;
        larger sizes appear only when such itemsets exist.
    """
    # Sizes 1-4 are pre-seeded because callers index them directly.
    size_itemsets = {"1": 0, "2": 0, "3": 0, "4": 0}

    for itemset in frequent_itemsets["itemsets"]:
        size = str(len(itemset))
        # .get() lets sizes beyond four be counted instead of raising KeyError.
        size_itemsets[size] = size_itemsets.get(size, 0) + 1

    return size_itemsets


def get_apriori_statistics(df, MIN_SUPPORT):
    """Run Apriori on ``df`` and print a one-line summary of the result.

    The line reports the total number of frequent itemsets, the support
    threshold used, and the number of itemsets of sizes 1 through 4.
    Nothing is returned.
    """
    found = apriori(df, use_colnames=True, min_support=MIN_SUPPORT)
    per_size = count_frequent_items(found)

    template = "Numero de Itemset Frecuentes: {} - Soporte: {} - Itemsets size - 1: {} - 2: {} - 3: {} - 4: {}"
    counts_1_to_4 = [per_size[key] for key in ("1", "2", "3", "4")]
    print(template.format(len(found), MIN_SUPPORT, *counts_1_to_4))

## 0. Datos

In [4]:
# Read the raw transactions; CustomerID is loaded with the "object" dtype.
online_retail = pd.read_csv(
    "OnlineRetail.csv",
    encoding="unicode_escape",
    dtype={"CustomerID": "object"},
    sep=",",
)

## 1. Tratamiento al conjunto de datos 

In [5]:
# Strip leading/trailing whitespace from the Description and StockCode columns
online_retail['Description'] = online_retail['Description'].str.strip()
online_retail['StockCode'] = online_retail['StockCode'].str.strip()

# Drop rows that have no invoice number. The subset is InvoiceNo, not Description,
# so rows with a missing Description are kept.
online_retail.dropna(axis=0, subset=['InvoiceNo'], inplace=True)

In [6]:
# Preview the rows whose stock code is shorter than five characters.
online_retail.loc[
    online_retail["StockCode"].apply(lambda code: len(code) < 5),
    ["InvoiceNo", "StockCode", "Description"],
]

Unnamed: 0,InvoiceNo,StockCode,Description
45,536370,POST,POSTAGE
141,C536379,D,Discount
386,536403,POST,POSTAGE
1123,536527,POST,POSTAGE
1423,536540,C2,CARRIAGE
...,...,...,...
541540,581498,DOT,DOTCOM POSTAGE
541541,C581499,M,Manual
541730,581570,POST,POSTAGE
541767,581574,POST,POSTAGE


In [7]:
# Keep only rows whose stock code has at least five characters; this drops the
# short codes previewed above (e.g. POSTAGE, Discount, CARRIAGE, Manual).
online_retail = online_retail.loc[
    online_retail["StockCode"].apply(lambda code: len(code) >= 5)
]

In [8]:
# Build a StockCode -> Description lookup, keeping the first description seen
# for each code. drop_duplicates() keeps first occurrences in row order, so the
# result matches a row-by-row "insert if absent" loop without a Python-level
# iteration over every row.
stock_description = online_retail[["StockCode", "Description"]]

items = (
    stock_description.drop_duplicates(subset="StockCode")
    .set_index("StockCode")["Description"]
    .to_dict()
)

In [9]:
# Inspect the stock code -> description mapping.
items

{'85123A': 'WHITE HANGING HEART T-LIGHT HOLDER',
 '71053': 'WHITE METAL LANTERN',
 '84406B': 'CREAM CUPID HEARTS COAT HANGER',
 '84029G': 'KNITTED UNION FLAG HOT WATER BOTTLE',
 '84029E': 'RED WOOLLY HOTTIE WHITE HEART.',
 '22752': 'SET 7 BABUSHKA NESTING BOXES',
 '21730': 'GLASS STAR FROSTED T-LIGHT HOLDER',
 '22633': 'HAND WARMER UNION JACK',
 '22632': 'HAND WARMER RED POLKA DOT',
 '84879': 'ASSORTED COLOUR BIRD ORNAMENT',
 '22745': "POPPY'S PLAYHOUSE BEDROOM",
 '22748': "POPPY'S PLAYHOUSE KITCHEN",
 '22749': 'FELTCRAFT PRINCESS CHARLOTTE DOLL',
 '22310': 'IVORY KNITTED MUG COSY',
 '84969': 'BOX OF 6 ASSORTED COLOUR TEASPOONS',
 '22623': 'BOX OF VINTAGE JIGSAW BLOCKS',
 '22622': 'BOX OF VINTAGE ALPHABET BLOCKS',
 '21754': 'HOME BUILDING BLOCK WORD',
 '21755': 'LOVE BUILDING BLOCK WORD',
 '21777': 'RECIPE BOX WITH METAL HEART',
 '48187': 'DOORMAT NEW ENGLAND',
 '22960': 'JAM MAKING SET WITH JARS',
 '22913': 'RED COAT RACK PARIS FASHION',
 '22912': 'YELLOW COAT RACK PARIS FASHION',
 '2

Transformar de dataframe nivel item-product a Lista de Items (Baskets). 

In [10]:
# Collapse each invoice into one space-separated string of its stock codes.
grouped_df = online_retail[["InvoiceNo", "StockCode"]].groupby("InvoiceNo")
grouped_lists = (
    grouped_df["StockCode"]
    .agg(lambda codes: " ".join(codes))
    .reset_index(name="Basket")
)

# Split every basket string back into a list of codes: one list per invoice.
basket_transaction = [basket.split(" ") for basket in grouped_lists["Basket"]]

## 2. Aplicar algoritmos de asociación


#### Creación de ítems frecuentes

Reportar los itemsets frecuentes con los valores de soporte


In [12]:
# Encode the baskets as a table whose columns are the items learned by the encoder.
te = TransactionEncoder()
te.fit(basket_transaction)
te_ary = te.transform(basket_transaction)
df = pd.DataFrame(data=te_ary, columns=te.columns_)

#### Apriori

Análisis de soporte

In [36]:
# Statistics: report itemset counts for progressively lower support thresholds.
# The loop variable keeps the MIN_SUPPORT name, so it still ends at 0.0125.
for MIN_SUPPORT in (0.1, 0.05, 0.025, 0.02, 0.015, 0.0125):
    get_apriori_statistics(df, MIN_SUPPORT)

Numero de Itemset Frecuentes: 0 - Soporte: 0.1 - Itemsets size - 1: 0 - 2: 0 - 3: 0 - 4: 0
Numero de Itemset Frecuentes: 11 - Soporte: 0.05 - Itemsets size - 1: 11 - 2: 0 - 3: 0 - 4: 0
Numero de Itemset Frecuentes: 121 - Soporte: 0.025 - Itemsets size - 1: 113 - 2: 8 - 3: 0 - 4: 0
Numero de Itemset Frecuentes: 225 - Soporte: 0.02 - Itemsets size - 1: 186 - 2: 38 - 3: 1 - 4: 0
Numero de Itemset Frecuentes: 443 - Soporte: 0.015 - Itemsets size - 1: 332 - 2: 107 - 3: 4 - 4: 0
Numero de Itemset Frecuentes: 661 - Soporte: 0.0125 - Itemsets size - 1: 450 - 2: 185 - 3: 26 - 4: 0


Análisis de confianza

In [37]:
# Frequent itemsets at a minimum support of 0.02, labelled with the column names.
frequent_itemsets = apriori(df, use_colnames=True, min_support=0.02)
frequent_itemsets

Unnamed: 0,support,itemsets
0,0.020616,(15036)
1,0.027750,(20685)
2,0.020970,(20711)
3,0.034373,(20712)
4,0.026568,(20713)
...,...,...
220,0.021641,"(23301, 23300)"
221,0.020143,"(82494L, 82482)"
222,0.023375,"(85099B, 85099C)"
223,0.021483,"(85099B, 85099F)"


In [40]:
# Count the rules produced at each confidence threshold.
# MIN_CONFIDENCE and rules keep the values from the last threshold (0.9) afterwards.
for MIN_CONFIDENCE in (0.4, 0.5, 0.7, 0.9):
    rules = association_rules(
        frequent_itemsets, metric="confidence", min_threshold=MIN_CONFIDENCE
    )[["antecedents", "consequents", "support", "confidence"]]
    print("Numero de reglas: {} - Confianza: {}".format(len(rules), MIN_CONFIDENCE))


Numero de reglas: 63 - Confianza: 0.4
Numero de reglas: 36 - Confianza: 0.5
Numero de reglas: 8 - Confianza: 0.7
Numero de reglas: 0 - Confianza: 0.9


In [41]:
# Rules at the 0.5 confidence threshold, keeping lift for the rankings that follow.
rules = association_rules(
    frequent_itemsets, min_threshold=0.5, metric="confidence"
).loc[:, ["antecedents", "consequents", "support", "confidence", "lift"]]

In [42]:
# Print every rule using product descriptions instead of stock codes.
print_rules(rules)

(JUMBO BAG WOODLAND ANIMALS) -> (JUMBO BAG RED RETROSPOT)
(CHARLOTTE BAG SUKI DESIGN) -> (RED RETROSPOT CHARLOTTE BAG)
(RED RETROSPOT CHARLOTTE BAG) -> (CHARLOTTE BAG PINK POLKADOT)
(CHARLOTTE BAG PINK POLKADOT) -> (RED RETROSPOT CHARLOTTE BAG)
(LUNCH BAG WOODLAND) -> (LUNCH BAG RED RETROSPOT)
(LUNCH BAG  BLACK SKULL.) -> (LUNCH BAG RED RETROSPOT)
(LUNCH BAG SUKI  DESIGN) -> (LUNCH BAG RED RETROSPOT)
(LUNCH BAG PINK POLKADOT) -> (LUNCH BAG RED RETROSPOT)
(JUMBO BAG SCANDINAVIAN PAISLEY) -> (JUMBO BAG RED RETROSPOT)
(JUMBO BAG PINK VINTAGE PAISLEY) -> (JUMBO BAG RED RETROSPOT)
(JUMBO STORAGE BAG SUKI) -> (JUMBO BAG RED RETROSPOT)
(PAPER CHAIN KIT VINTAGE CHRISTMAS) -> (PAPER CHAIN KIT 50'S CHRISTMAS)
(JUMBO BAG PINK POLKADOT) -> (JUMBO BAG RED RETROSPOT)
(JUMBO SHOPPER VINTAGE RED PAISLEY) -> (JUMBO BAG RED RETROSPOT)
(SPACEBOY LUNCH BOX) -> (DOLLY GIRL LUNCH BOX)
(DOLLY GIRL LUNCH BOX) -> (SPACEBOY LUNCH BOX)
(GREEN REGENCY TEACUP AND SAUCER) -> (PINK REGENCY TEACUP AND SAUCER)
(PINK R

#### Ranking Reglas -  Soporte

In [62]:
# Top 20 rules by support, highest first.
rules.sort_values("support", ascending=False).head(20)

Unnamed: 0,antecedents,consequents,support,confidence,lift
12,(22386),(85099B),0.032835,0.676686,8.040673
18,(22697),(22699),0.030904,0.741722,16.800662
19,(22699),(22697),0.030904,0.7,16.800662
10,(21931),(85099B),0.028894,0.610325,7.252144
13,(22411),(85099B),0.026923,0.5754,6.837155
6,(22383),(20725),0.026134,0.507657,8.009173
5,(20727),(20725),0.025543,0.500386,7.894462
23,(22727),(22726),0.025464,0.597595,15.115038
22,(22726),(22727),0.025464,0.644068,15.115038
16,(22697),(22698),0.025385,0.609272,19.27258


In [63]:
# Same support ranking, printed with product descriptions.
print_rules(rules.sort_values("support", ascending=False).head(20))

(JUMBO BAG PINK POLKADOT) -> (JUMBO BAG RED RETROSPOT)
(GREEN REGENCY TEACUP AND SAUCER) -> (ROSES REGENCY TEACUP AND SAUCER)
(ROSES REGENCY TEACUP AND SAUCER) -> (GREEN REGENCY TEACUP AND SAUCER)
(JUMBO STORAGE BAG SUKI) -> (JUMBO BAG RED RETROSPOT)
(JUMBO SHOPPER VINTAGE RED PAISLEY) -> (JUMBO BAG RED RETROSPOT)
(LUNCH BAG SUKI  DESIGN) -> (LUNCH BAG RED RETROSPOT)
(LUNCH BAG  BLACK SKULL.) -> (LUNCH BAG RED RETROSPOT)
(ALARM CLOCK BAKELIKE RED) -> (ALARM CLOCK BAKELIKE GREEN)
(ALARM CLOCK BAKELIKE GREEN) -> (ALARM CLOCK BAKELIKE RED)
(GREEN REGENCY TEACUP AND SAUCER) -> (PINK REGENCY TEACUP AND SAUCER)


#### Ranking Reglas - Confianza

In [64]:
# Top 20 rules by confidence, highest first.
rules.sort_values("confidence", ascending=False).head(20)

Unnamed: 0,antecedents,consequents,support,confidence,lift
33,"(22698, 22699)",(22697),0.021641,0.894137,21.460129
31,"(22697, 22698)",(22699),0.021641,0.852484,19.309534
17,(22698),(22697),0.025385,0.802993,19.27258
20,(22698),(22699),0.024203,0.765586,17.341207
18,(22697),(22699),0.030904,0.741722,16.800662
27,(23300),(23301),0.021641,0.717647,19.70345
32,"(22697, 22699)",(22698),0.021641,0.700255,22.150588
19,(22699),(22697),0.030904,0.7,16.800662
3,(22356),(20724),0.020734,0.692105,16.721922
35,(22698),"(22697, 22699)",0.021641,0.684539,22.150588


In [65]:
# Same confidence ranking, printed with product descriptions.
print_rules(rules.sort_values("confidence", ascending=False).head(20))

(PINK REGENCY TEACUP AND SAUCER & ROSES REGENCY TEACUP AND SAUCER) -> (GREEN REGENCY TEACUP AND SAUCER)
(GREEN REGENCY TEACUP AND SAUCER & PINK REGENCY TEACUP AND SAUCER) -> (ROSES REGENCY TEACUP AND SAUCER)
(PINK REGENCY TEACUP AND SAUCER) -> (GREEN REGENCY TEACUP AND SAUCER)
(PINK REGENCY TEACUP AND SAUCER) -> (ROSES REGENCY TEACUP AND SAUCER)
(GREEN REGENCY TEACUP AND SAUCER) -> (ROSES REGENCY TEACUP AND SAUCER)
(GARDENERS KNEELING PAD CUP OF TEA) -> (GARDENERS KNEELING PAD KEEP CALM)
(GREEN REGENCY TEACUP AND SAUCER & ROSES REGENCY TEACUP AND SAUCER) -> (PINK REGENCY TEACUP AND SAUCER)
(ROSES REGENCY TEACUP AND SAUCER) -> (GREEN REGENCY TEACUP AND SAUCER)
(CHARLOTTE BAG PINK POLKADOT) -> (RED RETROSPOT CHARLOTTE BAG)
(PINK REGENCY TEACUP AND SAUCER) -> (GREEN REGENCY TEACUP AND SAUCER & ROSES REGENCY TEACUP AND SAUCER)


#### Ranking Reglas - Lift

In [66]:
# Top 20 rules by lift, highest first.
rules.sort_values("lift", ascending=False).head(20)

Unnamed: 0,antecedents,consequents,support,confidence,lift
35,(22698),"(22697, 22699)",0.021641,0.684539,22.150588
32,"(22697, 22699)",(22698),0.021641,0.700255,22.150588
33,"(22698, 22699)",(22697),0.021641,0.894137,21.460129
34,(22697),"(22698, 22699)",0.021641,0.519395,21.460129
27,(23300),(23301),0.021641,0.717647,19.70345
26,(23301),(23300),0.021641,0.594156,19.70345
31,"(22697, 22698)",(22699),0.021641,0.852484,19.309534
16,(22697),(22698),0.025385,0.609272,19.27258
17,(22698),(22697),0.025385,0.802993,19.27258
14,(22629),(22630),0.021207,0.592511,17.478386


In [61]:
# Same lift ranking, printed with product descriptions.
print_rules(rules.sort_values("lift", ascending=False).head(20))

(PINK REGENCY TEACUP AND SAUCER) -> (GREEN REGENCY TEACUP AND SAUCER & ROSES REGENCY TEACUP AND SAUCER)
(GREEN REGENCY TEACUP AND SAUCER & ROSES REGENCY TEACUP AND SAUCER) -> (PINK REGENCY TEACUP AND SAUCER)
(PINK REGENCY TEACUP AND SAUCER & ROSES REGENCY TEACUP AND SAUCER) -> (GREEN REGENCY TEACUP AND SAUCER)
(GREEN REGENCY TEACUP AND SAUCER) -> (PINK REGENCY TEACUP AND SAUCER & ROSES REGENCY TEACUP AND SAUCER)
(GARDENERS KNEELING PAD CUP OF TEA) -> (GARDENERS KNEELING PAD KEEP CALM)
(GARDENERS KNEELING PAD KEEP CALM) -> (GARDENERS KNEELING PAD CUP OF TEA)
(GREEN REGENCY TEACUP AND SAUCER & PINK REGENCY TEACUP AND SAUCER) -> (ROSES REGENCY TEACUP AND SAUCER)
(GREEN REGENCY TEACUP AND SAUCER) -> (PINK REGENCY TEACUP AND SAUCER)
(PINK REGENCY TEACUP AND SAUCER) -> (GREEN REGENCY TEACUP AND SAUCER)
(SPACEBOY LUNCH BOX) -> (DOLLY GIRL LUNCH BOX)


#### FP-Growth

In [None]:
# `soporte` was never defined, so this cell raised NameError on a fresh run.
# Use the same minimum support as the Apriori run above (0.02) so that the
# FP-Growth itemsets can be compared with it.
soporte = 0.02
frequent_itemsets = fpgrowth(df, min_support=soporte, use_colnames=True)

Repetir lo hecho arriba pero ahora para FP-Growth  

---