# Market basket Analysis

## Mining simple rules of the form A $\to$ B

In [None]:
def support(item_list, D):
    """Return the fraction of transactions in D containing every item of item_list.

    Args:
        item_list: iterable of items forming the itemset.
        D: sized collection of transactions, each an iterable of items.

    Returns:
        The support as a float in the closed interval [0, 1]. An empty
        database yields 0.0 instead of raising ZeroDivisionError.
    """
    total = len(D)
    if total == 0:
        return 0.0
    # Build the itemset once instead of once per transaction.
    itemset = set(item_list)
    hits = sum(1 for transaction in D if itemset.issubset(transaction))
    return hits / total


# Toy database of four transactions, small enough to check support() by hand.
D = [[1,2,3],
     [2,4],
     [1,4],
     [1,2,4]]
print(support([1], D))      # item 1 occurs in 3 of the 4 transactions
print(support([1,2], D))    # items 1 and 2 occur together in 2 of the 4
print(support([2,3], D))    # items 2 and 3 occur together in 1 of the 4


0.75
0.5
0.25


In [None]:
from itertools import combinations

def simple_rules(D, min_support=0.1, min_confidence=0.2):
    """Mine all rules A -> B between single items of D.

    Every unordered pair of distinct items is examined, and each direction is
    reported when the pair's support and that direction's confidence reach
    the thresholds. The set of distinct items and every accepted rule are
    also printed.

    Args:
        D: sized collection of transactions, each an iterable of hashable items.
        min_support: minimum support of the pair {A, B}.
        min_confidence: minimum confidence support({A, B}) / support({A}).

    Returns:
        List of tuples ((A, B), support, confidence, lift) in the order the
        rules were found.
    """
    elements = set()
    for transaction in D:
        elements = elements.union(set(transaction))
    print(elements)

    # Single-item supports are needed for every pair; compute each one once.
    item_support = {item: support([item], D) for item in elements}

    results = []
    for first, second in combinations(elements, 2):
        # The pair support and the lift are shared by both rule directions.
        supp = support((first, second), D)
        lift = supp / (item_support[first] * item_support[second])

        for antecedent, consequent in ((first, second), (second, first)):
            confidence = supp / item_support[antecedent]
            if supp >= min_support and confidence >= min_confidence:
                print(f'{antecedent} -> {consequent}, support {supp}, confidence {confidence}, lift {lift}')
                results.append(((antecedent, consequent), supp, confidence, lift))

    return results

    
simple_rules(D, min_support=0.5, min_confidence=0.1)

{1, 2, 3, 4}
1 -> 2, support 0.5, confidence 0.6666666666666666, lift 0.8888888888888888
2 -> 1, support 0.5, confidence 0.6666666666666666, lift 0.8888888888888888
1 -> 4, support 0.5, confidence 0.6666666666666666, lift 0.8888888888888888
4 -> 1, support 0.5, confidence 0.6666666666666666, lift 0.8888888888888888
2 -> 4, support 0.5, confidence 0.6666666666666666, lift 0.8888888888888888
4 -> 2, support 0.5, confidence 0.6666666666666666, lift 0.8888888888888888


[((1, 2), 0.5, 0.6666666666666666, 0.8888888888888888),
 ((2, 1), 0.5, 0.6666666666666666, 0.8888888888888888),
 ((1, 4), 0.5, 0.6666666666666666, 0.8888888888888888),
 ((4, 1), 0.5, 0.6666666666666666, 0.8888888888888888),
 ((2, 4), 0.5, 0.6666666666666666, 0.8888888888888888),
 ((4, 2), 0.5, 0.6666666666666666, 0.8888888888888888)]

In [None]:
with open("transactions.txt", "r") as f:
    data = list(f)

# Each line is one transaction: whitespace-separated integer item ids.
transactions = [list(map(int, row.split())) for row in data]

D = transactions
print(D[:5])

[[27, 35, 70, 158, 173, 175, 196, 296, 319, 336, 366, 377, 411, 415, 450], [93, 104, 127, 131, 175, 207, 280, 443, 489], [43, 96, 197, 240, 355, 450, 471], [72, 104, 131, 151, 207, 269, 362, 415, 443, 489], [70, 227, 240, 263, 280, 335, 471, 487]]


In [None]:
simple_rules(D, min_support=0.1, min_confidence=0.1)

{1, 3, 4, 7, 8, 9, 10, 12, 17, 21, 25, 27, 31, 32, 34, 35, 38, 39, 43, 44, 45, 48, 49, 52, 53, 55, 56, 58, 66, 69, 70, 72, 73, 75, 86, 90, 93, 96, 98, 100, 103, 104, 112, 120, 121, 122, 125, 126, 127, 128, 131, 132, 140, 141, 142, 143, 145, 150, 151, 154, 157, 158, 159, 161, 165, 168, 169, 171, 173, 175, 177, 179, 181, 183, 192, 193, 196, 197, 198, 202, 204, 205, 206, 207, 208, 209, 214, 215, 217, 227, 229, 230, 234, 236, 239, 240, 242, 246, 247, 256, 260, 263, 267, 269, 272, 276, 278, 280, 281, 283, 286, 287, 289, 293, 294, 295, 296, 298, 300, 301, 305, 306, 308, 313, 315, 316, 319, 320, 322, 325, 326, 327, 334, 335, 336, 339, 345, 347, 349, 350, 355, 356, 360, 362, 364, 366, 368, 369, 370, 371, 377, 378, 395, 396, 401, 402, 405, 409, 411, 412, 414, 415, 417, 419, 427, 430, 438, 439, 443, 444, 447, 450, 452, 456, 460, 461, 469, 471, 472, 474, 475, 478, 480, 482, 487, 489, 494, 495, 498}
104 -> 131, support 0.184, confidence 0.8325791855203619, lift 4.405180875769111
131 -> 104, suppor

[((104, 131), 0.184, 0.8325791855203619, 4.405180875769111),
 ((131, 104), 0.184, 0.9735449735449735, 4.405180875769111),
 ((104, 207), 0.186, 0.8416289592760181, 3.9513096679625264),
 ((207, 104), 0.186, 0.8732394366197184, 3.9513096679625264),
 ((104, 443), 0.189, 0.8552036199095022, 3.9229523849059738),
 ((443, 104), 0.189, 0.8669724770642202, 3.9229523849059738),
 ((104, 489), 0.182, 0.8235294117647058, 4.380475594493116),
 ((489, 104), 0.182, 0.9680851063829787, 4.380475594493116),
 ((131, 207), 0.18, 0.9523809523809523, 4.47127207690588),
 ((207, 131), 0.18, 0.8450704225352113, 4.47127207690588),
 ((131, 443), 0.179, 0.9470899470899471, 4.3444492985777385),
 ((443, 131), 0.179, 0.8211009174311926, 4.3444492985777385),
 ((131, 489), 0.175, 0.9259259259259258, 4.925137903861308),
 ((489, 131), 0.175, 0.9308510638297872, 4.925137903861308),
 ((207, 443), 0.18, 0.8450704225352113, 3.876469828143171),
 ((443, 207), 0.18, 0.8256880733944953, 3.876469828143171),
 ((207, 489), 0.179, 0.8

## Mining Association Rules Using Module `mlxtend`


In [None]:
# Five toy grocery transactions, each a list of item names.
# Note that the last transaction lists 'Onion' twice.
dataset = [['Milk', 'Onion', 'Nutmeg', 'Kidney Beans', 'Eggs', 'Yogurt'],
           ['Dill', 'Onion', 'Nutmeg', 'Kidney Beans', 'Eggs', 'Yogurt'],
           ['Milk', 'Apple', 'Kidney Beans', 'Eggs'],
           ['Milk', 'Unicorn', 'Corn', 'Kidney Beans', 'Yogurt'],
           ['Corn', 'Onion', 'Onion', 'Kidney Beans', 'Ice cream', 'Eggs']]

In [None]:
import pandas as pd
import numpy as np

In [None]:
from mlxtend.preprocessing import TransactionEncoder

# Turn the list of transactions into a boolean table: one row per
# transaction, one column per distinct item, True where the item is present.
te = TransactionEncoder()
te_ary = te.fit(dataset).transform(dataset)
# te.columns_ supplies the item names used as column labels.
df = pd.DataFrame(te_ary, columns=te.columns_)
df

Unnamed: 0,Apple,Corn,Dill,Eggs,Ice cream,Kidney Beans,Milk,Nutmeg,Onion,Unicorn,Yogurt
0,False,False,False,True,False,True,True,True,True,False,True
1,False,False,True,True,False,True,False,True,True,False,True
2,True,False,False,True,False,True,True,False,False,False,False
3,False,True,False,False,False,True,True,False,False,True,True
4,False,True,False,True,True,True,False,False,True,False,False


In [None]:
from mlxtend.frequent_patterns import apriori

apriori(df, min_support=0.6)

Unnamed: 0,support,itemsets
0,0.8,(3)
1,1.0,(5)
2,0.6,(6)
3,0.6,(8)
4,0.6,(10)
5,0.8,"(3, 5)"
6,0.6,"(8, 3)"
7,0.6,"(5, 6)"
8,0.6,"(8, 5)"
9,0.6,"(10, 5)"


In [None]:
apriori(df, min_support=0.6, use_colnames=True)

Unnamed: 0,support,itemsets
0,0.8,(Eggs)
1,1.0,(Kidney Beans)
2,0.6,(Milk)
3,0.6,(Onion)
4,0.6,(Yogurt)
5,0.8,"(Kidney Beans, Eggs)"
6,0.6,"(Onion, Eggs)"
7,0.6,"(Kidney Beans, Milk)"
8,0.6,"(Kidney Beans, Onion)"
9,0.6,"(Kidney Beans, Yogurt)"


In [None]:
with open("transactions.txt", "r") as f:
    data = list(f)

# Each line is one transaction: whitespace-separated integer item ids.
transactions = [list(map(int, row.split())) for row in data]

D = transactions
print(D[:5])

[[27, 35, 70, 158, 173, 175, 196, 296, 319, 336, 366, 377, 411, 415, 450], [93, 104, 127, 131, 175, 207, 280, 443, 489], [43, 96, 197, 240, 355, 450, 471], [72, 104, 131, 151, 207, 269, 362, 415, 443, 489], [70, 227, 240, 263, 280, 335, 471, 487]]


In [None]:
# Boolean-encode the transactions read from transactions.txt:
# one row per transaction, one column per item id.
te = TransactionEncoder()
te_ary = te.fit(transactions).transform(transactions)
df = pd.DataFrame(te_ary, columns=te.columns_)
df


Unnamed: 0,1,3,4,7,8,9,10,12,17,21,...,474,475,478,480,482,487,489,494,495,498
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,True,False,False,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,True,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,True,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,False,False,False,False,False,False,False,True,False,False,...,False,False,False,False,False,False,False,False,False,False
996,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
997,False,False,True,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
998,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [None]:
apriori(df, min_support=0.1, use_colnames=True)

Unnamed: 0,support,itemsets
0,0.113,(10)
1,0.104,(12)
2,0.140,(38)
3,0.131,(93)
4,0.221,(104)
...,...,...
63,0.163,"(104, 489, 131, 207)"
64,0.163,"(104, 489, 443, 131)"
65,0.166,"(104, 489, 443, 207)"
66,0.160,"(489, 443, 131, 207)"


In [None]:
apriori(df, min_support=0.1, use_colnames=True, max_len=2)

## Mining Association Rules

An excellent introduction to association rules can be found in the documentation for the `mlxtend` library in the notebook http://rasbt.github.io/mlxtend/user_guide/frequent_patterns/association_rules/ by [Sebastian Raschka](https://sebastianraschka.com/). The following description is from the notebook:

> _An association rule is an implication expression of the form $X \rightarrow Y$, where $X$ and $Y$ are disjoint itemsets_. To evaluate the "interest" of such an association rule, different metrics have been developed. The current implementation *(in `mlxtend`; remark by F. Mráz)* make use of the `confidence` and `lift` metrics. 


> ### Metrics

>The currently supported metrics for evaluating association rules and setting selection thresholds are listed below. Given a rule $A\rightarrow C$, $A$ stands for *antecedent* and $C$ stands for *consequent*.


> #### 'support':

> $$\text{support}(A\rightarrow C) = \text{support}(A \cup C), \;\;\; \text{range: } [0, 1]$$


> The support metric is defined for itemsets, not association rules. The table produced by the association rule mining algorithm contains three different support metrics: 'antecedent support', 'consequent support', and 'support'. Here, 'antecedent support' computes the proportion of transactions that contain the antecedent $A$, and 'consequent support' computes the support for the itemset of the consequent $C$. The 'support' metric then computes the support of the combined itemset $A \cup C$ -- note that 'support' depends on 'antecedent support' and 'consequent support' via min('antecedent support', 'consequent support').

Here note that min('antecedent support', 'consequent support') is only an upper bound for the support of the combined itemset $A \cup C$.


> Typically, support is used to measure the abundance or frequency (often interpreted as significance or importance) of an itemset in a database. We refer to an itemset as a "frequent itemset" if its support is larger than a specified minimum-support threshold. Note that in general, due to the *downward closure* property, all subsets of a frequent itemset are also frequent.


> #### 'confidence':  

> $$\text{confidence}(A\rightarrow C) = \frac{\text{support}(A\rightarrow C)}{\text{support}(A)}, \;\;\; \text{range: } [0, 1]$$


> The confidence of a rule $A\rightarrow C$ is the probability of seeing the consequent in a transaction given that it also contains the antecedent. Note that the metric is not symmetric, i.e., it is directed; for instance, the confidence for $A\rightarrow C$ is in general different from the confidence for $C\rightarrow A$. The confidence is 1 (maximal) for a rule $A\rightarrow C$ if the consequent and antecedent always occur together.


> #### 'lift':

> $$\text{lift}(A\rightarrow C) = \frac{\text{confidence}(A\rightarrow C)}{\text{support}(C)}, \;\;\; \text{range: } [0, \infty]$$



> The lift metric is commonly used to measure how much more often the antecedent and consequent of a rule $A\rightarrow C$ occur together than we would expect if they were statistically independent. If $A$ and $C$ are independent, the Lift score will be exactly 1.


> #### 'leverage' *(CZ: vliv)*:

> $$\text{leverage}(A\rightarrow C) = \text{support}(A\rightarrow C) - \text{support}(A) \times \text{support}(C), \;\;\; \text{range: } [-1, 1]$$


> Leverage computes the difference between the observed frequency of $A$ and $C$ appearing together and the frequency that would be expected if $A$ and $C$ were independent. A leverage value of 0 indicates independence.

> #### 'conviction' *(CZ: přesvědčivost, jistota)*:

> $$\text{conviction}(A\rightarrow C) = \frac{1 - \text{support}(C)}{1 - \text{confidence}(A\rightarrow C)}, \;\;\; \text{range: } [0, \infty]$$

> A high conviction value means that the consequent is highly depending on the antecedent. For instance, in the case of a perfect confidence score, the denominator becomes 0 (due to 1 - 1) for which the conviction score is defined as 'inf'. Similar to lift, if items are independent, the conviction is 1.

## A Classic Application on Retail Data

Let us apply MBA on [Online Retail Data Set](http://archive.ics.uci.edu/ml/machine-learning-databases/00352/Online%20Retail.xlsx) from [UCI Machine Learning Repository](https://archive.ics.uci.edu/ml) used in the paper 
> Daqing Chen, Sai Liang Sain, and Kun Guo, Data mining for the online retail industry: A case study of RFM model-based customer segmentation using data mining, Journal of Database Marketing and Customer Strategy Management, Vol. 19, No. 3, pp. 197-208, 2012 (Published online before print: 27 August 2012. doi: 10.1057/dbm.2012.17).

Further information on the dataset:

* *Data Set Information:*

  This is a transnational data set which contains all the transactions occurring between 01/12/2010 and 09/12/2011 for a UK-based and registered non-store online retail. The company mainly sells unique all-occasion gifts. Many customers of the company are wholesalers.


* *Attribute Information:*

  1. **InvoiceNo:** Invoice number. Nominal, a 6-digit integral number uniquely assigned to each transaction. If this code starts with letter 'c', it indicates a cancellation.
  2. **StockCode:** Product (item) code. Nominal, a 5-digit integral number uniquely assigned to each distinct product.
  3. **Description:** Product (item) name. Nominal.
  4. **Quantity:** The quantities of each product (item) per transaction. Numeric.
  5. **InvoiceDate:** Invoice date and time. Numeric, the day and time when each transaction was generated.
  6. **UnitPrice:** Unit price. Numeric, Product price per unit in sterling.
  7. **CustomerID:** Customer number. Nominal, a 5-digit integral number uniquely assigned to each customer.
  8. **Country:** Country name. Nominal, the name of the country where each customer resides.

What follows is based on 
> ["Introduction to Market Basket Analysis in Python"](https://pbpython.com/market-basket-analysis.html) by Chris Moffitt

In [None]:
import pandas as pd
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules

Next we will read an excel file using `pandas` function `read_excel`. If you obtain an error `XLRDError: Excel xlsx file; not supported`, you need to install module `openpyxl` and add parameter `engine='openpyxl'` as seen below.

In [None]:
# Import the submodule explicitly: a bare `import urllib` does not guarantee
# that `urllib.request` has been loaded.
import urllib.request

try:
    df = pd.read_excel('Online Retail.xlsx', engine='openpyxl')
except FileNotFoundError:
    # Only a missing local copy triggers the download; any other failure
    # (corrupt file, missing openpyxl, ...) is raised instead of being hidden.
    print('File not found; wait a moment we will download it.')
    urllib.request.urlretrieve(
        'http://archive.ics.uci.edu/ml/machine-learning-databases/00352/Online%20Retail.xlsx',
        'Online Retail.xlsx')
    df = pd.read_excel('Online Retail.xlsx', engine='openpyxl')

df.head(10)

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
5,536365,22752,SET 7 BABUSHKA NESTING BOXES,2,2010-12-01 08:26:00,7.65,17850.0,United Kingdom
6,536365,21730,GLASS STAR FROSTED T-LIGHT HOLDER,6,2010-12-01 08:26:00,4.25,17850.0,United Kingdom
7,536366,22633,HAND WARMER UNION JACK,6,2010-12-01 08:28:00,1.85,17850.0,United Kingdom
8,536366,22632,HAND WARMER RED POLKA DOT,6,2010-12-01 08:28:00,1.85,17850.0,United Kingdom
9,536367,84879,ASSORTED COLOUR BIRD ORNAMENT,32,2010-12-01 08:34:00,1.69,13047.0,United Kingdom


In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 541909 entries, 0 to 541908
Data columns (total 8 columns):
 #   Column       Non-Null Count   Dtype         
---  ------       --------------   -----         
 0   InvoiceNo    541909 non-null  object        
 1   StockCode    541909 non-null  object        
 2   Description  540455 non-null  object        
 3   Quantity     541909 non-null  int64         
 4   InvoiceDate  541909 non-null  datetime64[ns]
 5   UnitPrice    541909 non-null  float64       
 6   CustomerID   406829 non-null  float64       
 7   Country      541909 non-null  object        
dtypes: datetime64[ns](1), float64(2), int64(1), object(4)
memory usage: 33.1+ MB


There are some missing values. Should we remove them?

In [None]:
df['Description']

0          WHITE HANGING HEART T-LIGHT HOLDER
1                         WHITE METAL LANTERN
2              CREAM CUPID HEARTS COAT HANGER
3         KNITTED UNION FLAG HOT WATER BOTTLE
4              RED WOOLLY HOTTIE WHITE HEART.
                         ...                 
541904            PACK OF 20 SPACEBOY NAPKINS
541905           CHILDREN'S APRON DOLLY GIRL 
541906          CHILDRENS CUTLERY DOLLY GIRL 
541907        CHILDRENS CUTLERY CIRCUS PARADE
541908          BAKING SET 9 PIECE RETROSPOT 
Name: Description, Length: 541909, dtype: object

A little cleanup is required:
* remove spaces surrounding some descriptions.

In [None]:
df['Description'] = df['Description'].str.strip()

In [None]:
df.describe()

Unnamed: 0,Quantity,InvoiceDate,UnitPrice,CustomerID
count,541909.0,541909,541909.0,406829.0
mean,9.55225,2011-07-04 13:34:57.156386048,4.611114,15287.69057
min,-80995.0,2010-12-01 08:26:00,-11062.06,12346.0
25%,1.0,2011-03-28 11:34:00,1.25,13953.0
50%,3.0,2011-07-19 17:17:00,2.08,15152.0
75%,10.0,2011-10-19 11:27:00,4.13,16791.0
max,80995.0,2011-12-09 12:50:00,38970.0,18287.0
std,218.081158,,96.759853,1713.600303


Some transactions are with negative quantities! Can we distinguish them?

In [None]:
# select the rows with negative quantities
# how many are there?
# YOUR CODE HERE
len(df[df['Quantity'] < 0])

10624

Most of them are connected with credit transactions - marked with the letter `C` in the invoice number field `InvoiceNo`. Let us remove all transactions that have a letter in `InvoiceNo`.

In [None]:
df['InvoiceNo'].str.contains('[a-zA-Z]',regex=True).sum()

9291

In [None]:
print(df.shape)
# Cast to str first so that .str.contains is evaluated on string values only.
df['InvoiceNo'] = df['InvoiceNo'].astype('str')
has_letter = df['InvoiceNo'].str.contains('[a-zA-Z]', regex=True)
# Keep only the invoices whose number contains no letter.
df = df.loc[~has_letter]
print(df.shape)


(541909, 8)
(532618, 8)


If any transactions with negative quantities are left, remove them, too.

In [None]:
# YOUR CODE HERE
df = df.loc[df['Quantity'] >= 0, :]
df

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
...,...,...,...,...,...,...,...,...
541904,581587,22613,PACK OF 20 SPACEBOY NAPKINS,12,2011-12-09 12:50:00,0.85,12680.0,France
541905,581587,22899,CHILDREN'S APRON DOLLY GIRL,6,2011-12-09 12:50:00,2.10,12680.0,France
541906,581587,23254,CHILDRENS CUTLERY DOLLY GIRL,4,2011-12-09 12:50:00,4.15,12680.0,France
541907,581587,23255,CHILDRENS CUTLERY CIRCUS PARADE,4,2011-12-09 12:50:00,4.15,12680.0,France


In [None]:
df.describe()

Unnamed: 0,Quantity,InvoiceDate,UnitPrice,CustomerID
count,531282.0,531282,531282.0,397924.0
mean,10.655317,2011-07-04 18:15:26.858729728,3.87814,15294.315171
min,1.0,2010-12-01 08:26:00,0.0,12346.0
25%,1.0,2011-03-28 11:59:00,1.25,13969.0
50%,3.0,2011-07-20 12:01:00,2.08,15159.0
75%,10.0,2011-10-19 12:35:00,4.13,16795.0
max,80995.0,2011-12-09 12:50:00,13541.33,18287.0
std,156.830764,,32.510663,1713.169877


Now we should build transactions. Let us do that for customers from France.

In [None]:
# For French customers only: total quantity of each product (by description)
# on each invoice. The result is a Series indexed by (InvoiceNo, Description).
basket_fr1 = df[df['Country'] =="France"].groupby(['InvoiceNo', 'Description'])['Quantity'].sum()
print(basket_fr1.shape)
basket_fr1.head(30)

(8372,)


InvoiceNo  Description                       
536370     ALARM CLOCK BAKELIKE GREEN            12
           ALARM CLOCK BAKELIKE PINK             24
           ALARM CLOCK BAKELIKE RED              24
           CHARLOTTE BAG DOLLY GIRL DESIGN       20
           CIRCUS PARADE LUNCH BOX               24
           INFLATABLE POLITICAL GLOBE            48
           LUNCH BOX I LOVE LONDON               24
           MINI JIGSAW CIRCUS PARADE             24
           MINI JIGSAW SPACEBOY                  24
           MINI PAINT SET VINTAGE                36
           PANDA AND BUNNIES STICKER SHEET       12
           POSTAGE                                3
           RED TOADSTOOL LED NIGHT LIGHT         24
           ROUND SNACK BOXES SET OF4 WOODLAND    24
           SET 2 TEA TOWELS I LOVE LONDON        24
           SET/2 RED RETROSPOT TEA TOWELS        18
           SPACEBOY LUNCH BOX                    24
           STARS GIFT TAPE                       24
           VINTAGE

In [None]:
basket_fr2 = basket_fr1.unstack()
print(basket_fr2.shape)
basket_fr2.head(30)

In [None]:
basket_fr3 = basket_fr2.fillna(0)
print(basket_fr3.shape)
basket_fr3.head(20)

Now we will convert all information on non-zero quantity into 1.

In [None]:
def nonzero2one(x):
    """Map a quantity to a presence flag: 0 stays 0, any other value becomes 1."""
    return 1 if x != 0 else 0
    
# Vectorised equivalent of applying nonzero2one to every cell. It avoids
# DataFrame.applymap, which pandas deprecated in version 2.1.
basket_fr4 = (basket_fr3 != 0).astype(int)

Are there any columns with many ones? Find them and possibly remove them.

In [None]:
sum(basket_fr4.sum(axis=0) > 100)

Drop the column with mean greater than 0.5.

In [None]:
print(basket_fr4.shape)

# YOUR CODE HERE
# In the 0/1 table a column mean is the share of baskets containing the item;
# drop every item that appears in more than half of the baskets.
too_common = basket_fr4.columns[basket_fr4.mean(axis=0) > 0.5]
basket_fr4 = basket_fr4.drop(columns=too_common)
print(basket_fr4.shape)


Now the data is structured properly. We can generate frequent item sets that have a support of at least 7% (this number was chosen so that we get a small number of examples):

In [None]:
frequent_itemsets = apriori(basket_fr4, min_support=0.07, use_colnames=True)
frequent_itemsets

The final step is to generate the rules with their corresponding support, confidence and lift. Use `association_rules` with the parameters `metric="lift", min_threshold=1`:

In [None]:
# YOUR CODE HERE
# Derive rules from the frequent itemsets, keeping those with lift >= 1.
rules = association_rules(frequent_itemsets, metric="lift", min_threshold=1)
rules

We have built a list of frequent itemsets using apriori algorithm and then we constructed association rules.

Now, the tricky part is to interpret the obtained results. For instance, we can see that there are quite a few rules with a high lift value which means that it occurs more frequently than would be expected given the number of transaction and product combinations. We can also see several where the confidence is high as well. This part of the analysis is where the domain knowledge will come in handy. Since we do not have that, we will just look for a couple of illustrative examples.

We can filter the dataframe using standard pandas code. In this case, look for a large lift (6) and high confidence (.8):

In [None]:
rules[ (rules['lift'] >= 6) &
       (rules['confidence'] >= 0.8) ]

In looking at the rules, it seems that the green and red alarm clocks are purchased together and the red paper cups, napkins and plates are purchased together in a manner that is higher than the overall probability would suggest.

At this point, you may want to look at how much opportunity there is to use the popularity of one product to drive sales of another. E.g., let us check the amount of sold green and red alarm clocks.

In [None]:
# YOUR CODE HERE
# basket_fr3 still holds the per-invoice quantities (basket_fr4 only has 0/1
# flags), so its column sums are the numbers of units sold.
print(basket_fr3['ALARM CLOCK BAKELIKE GREEN'].sum())
print(basket_fr3['ALARM CLOCK BAKELIKE RED'].sum())


What recommendation concerning the alarm clocks could increase sales?

In [None]:
df['Country'].unique()

In [None]:
# Sum only the numeric columns: on pandas 2.x a plain .sum() also tries to
# aggregate the datetime and text columns and raises a TypeError.
df.groupby('Country').sum(numeric_only=True)

Some questions can be answered using histograms:
1. What time do people often purchase online?
2. What is the number of transactions in the whole dataset? What is the distribution of the number of transactions with respect to countries?
3. What is the distribution of the number of items in one transaction?
4. Find top $N$ best-sellers.

Further tasks:
* Compare the association rules extracted for transactions from different countries.

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=11482918-e5bc-4678-83a7-ead1e761858b' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>