## Market Basket Analysis and Recommendation Systems
#### Perform Market Basket Analysis using Associate Rule Minning (Apriori Algorithm)
#### Use support = 0.1 and confidence=0.4 for threshold.
#### Output the following:
#### a) Support score for all the different products.
#### b) Confidence for each pair of products.

## importing libraries

In [2]:
pip install mlxtend

Collecting mlxtend
  Downloading mlxtend-0.18.0-py2.py3-none-any.whl (1.3 MB)
[K     |████████████████████████████████| 1.3 MB 2.6 MB/s eta 0:00:01     |███████▌                        | 317 kB 2.6 MB/s eta 0:00:01
Installing collected packages: mlxtend
Successfully installed mlxtend-0.18.0
Note: you may need to restart the kernel to use updated packages.


In [11]:
# python modules used for handling data 
import pandas as pd
import numpy as np

# python module for Apriori Algorithm and Associate Rule 

from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, association_rules

### loading the data in python DataFrame object

In [12]:
data = pd.read_csv("data/GroceryStoreDataSet.csv",header=None)
data

Unnamed: 0,0
0,"MILK,BREAD,BISCUIT"
1,"BREAD,MILK,BISCUIT,CORNFLAKES"
2,"BREAD,TEA,BOURNVITA"
3,"JAM,MAGGI,BREAD,MILK"
4,"MAGGI,TEA,BISCUIT"
5,"BREAD,TEA,BOURNVITA"
6,"MAGGI,TEA,CORNFLAKES"
7,"MAGGI,BREAD,TEA,BISCUIT"
8,"JAM,MAGGI,BREAD,TEA"
9,"BREAD,MILK"


## pre-processing the data

In [13]:
data = list(data[0].apply(lambda x:x.split(',')))
data

[['MILK', 'BREAD', 'BISCUIT'],
 ['BREAD', 'MILK', 'BISCUIT', 'CORNFLAKES'],
 ['BREAD', 'TEA', 'BOURNVITA'],
 ['JAM', 'MAGGI', 'BREAD', 'MILK'],
 ['MAGGI', 'TEA', 'BISCUIT'],
 ['BREAD', 'TEA', 'BOURNVITA'],
 ['MAGGI', 'TEA', 'CORNFLAKES'],
 ['MAGGI', 'BREAD', 'TEA', 'BISCUIT'],
 ['JAM', 'MAGGI', 'BREAD', 'TEA'],
 ['BREAD', 'MILK'],
 ['COFFEE', 'COCK', 'BISCUIT', 'CORNFLAKES'],
 ['COFFEE', 'COCK', 'BISCUIT', 'CORNFLAKES'],
 ['COFFEE', 'SUGER', 'BOURNVITA'],
 ['BREAD', 'COFFEE', 'COCK'],
 ['BREAD', 'SUGER', 'BISCUIT'],
 ['COFFEE', 'SUGER', 'CORNFLAKES'],
 ['BREAD', 'SUGER', 'BOURNVITA'],
 ['BREAD', 'COFFEE', 'SUGER'],
 ['BREAD', 'COFFEE', 'SUGER'],
 ['TEA', 'MILK', 'COFFEE', 'CORNFLAKES']]

### processing the data into algorithm understandable input format

In [14]:
# performing one-hot encoding on the data 
trans = TransactionEncoder()
trans.fit(data)
trans_data = trans.transform(data)
data = pd.DataFrame(trans_data,columns=trans.columns_)
data

Unnamed: 0,BISCUIT,BOURNVITA,BREAD,COCK,COFFEE,CORNFLAKES,JAM,MAGGI,MILK,SUGER,TEA
0,True,False,True,False,False,False,False,False,True,False,False
1,True,False,True,False,False,True,False,False,True,False,False
2,False,True,True,False,False,False,False,False,False,False,True
3,False,False,True,False,False,False,True,True,True,False,False
4,True,False,False,False,False,False,False,True,False,False,True
5,False,True,True,False,False,False,False,False,False,False,True
6,False,False,False,False,False,True,False,True,False,False,True
7,True,False,True,False,False,False,False,True,False,False,True
8,False,False,True,False,False,False,True,True,False,False,True
9,False,False,True,False,False,False,False,False,True,False,False


In [15]:
# reforming the crude data into integer type
data = data.replace(True,1).astype(int)
data

Unnamed: 0,BISCUIT,BOURNVITA,BREAD,COCK,COFFEE,CORNFLAKES,JAM,MAGGI,MILK,SUGER,TEA
0,1,0,1,0,0,0,0,0,1,0,0
1,1,0,1,0,0,1,0,0,1,0,0
2,0,1,1,0,0,0,0,0,0,0,1
3,0,0,1,0,0,0,1,1,1,0,0
4,1,0,0,0,0,0,0,1,0,0,1
5,0,1,1,0,0,0,0,0,0,0,1
6,0,0,0,0,0,1,0,1,0,0,1
7,1,0,1,0,0,0,0,1,0,0,1
8,0,0,1,0,0,0,1,1,0,0,1
9,0,0,1,0,0,0,0,0,1,0,0


### Applying Apriori Algorithm on the processed data

In [16]:
# providing the support values i.e. 0.1 to the Apriori Algorithm
freq_items = apriori(data,min_support=0.1,use_colnames=True)
freq_items

Unnamed: 0,support,itemsets
0,0.35,(BISCUIT)
1,0.2,(BOURNVITA)
2,0.65,(BREAD)
3,0.15,(COCK)
4,0.4,(COFFEE)
5,0.3,(CORNFLAKES)
6,0.1,(JAM)
7,0.25,(MAGGI)
8,0.25,(MILK)
9,0.3,(SUGER)


### result 1
#### a) Support score for all the different products.

In [17]:
freq_data = freq_items.sort_values(by = "support" , ascending = False)
res1 = freq_data[freq_data["itemsets"].apply(lambda x:len(x)==1)]
res1

Unnamed: 0,support,itemsets
2,0.65,(BREAD)
4,0.4,(COFFEE)
0,0.35,(BISCUIT)
10,0.35,(TEA)
5,0.3,(CORNFLAKES)
9,0.3,(SUGER)
7,0.25,(MAGGI)
8,0.25,(MILK)
1,0.2,(BOURNVITA)
3,0.15,(COCK)


#### Using the frequent items data(Apriori Algorithm) to get the Association Rules

In [18]:
# providing the confidence value i.e. 0.4 to the Association Rules 
rules = association_rules(freq_items, metric = "confidence", min_threshold = 0.4)
rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(BISCUIT),(BREAD),0.35,0.65,0.20,0.571429,0.879121,-0.0275,0.816667
1,(COCK),(BISCUIT),0.15,0.35,0.10,0.666667,1.904762,0.0475,1.950000
2,(BISCUIT),(CORNFLAKES),0.35,0.30,0.15,0.428571,1.428571,0.0450,1.225000
3,(CORNFLAKES),(BISCUIT),0.30,0.35,0.15,0.500000,1.428571,0.0450,1.300000
4,(MAGGI),(BISCUIT),0.25,0.35,0.10,0.400000,1.142857,0.0125,1.083333
...,...,...,...,...,...,...,...,...,...
70,"(COCK, COFFEE)","(BISCUIT, CORNFLAKES)",0.15,0.15,0.10,0.666667,4.444444,0.0775,2.550000
71,"(BISCUIT, CORNFLAKES)","(COCK, COFFEE)",0.15,0.15,0.10,0.666667,4.444444,0.0775,2.550000
72,"(BISCUIT, COFFEE)","(COCK, CORNFLAKES)",0.10,0.10,0.10,1.000000,10.000000,0.0900,inf
73,"(CORNFLAKES, COFFEE)","(COCK, BISCUIT)",0.20,0.10,0.10,0.500000,5.000000,0.0800,1.800000


### result 2
#### b) Confidence for each pair of products.

In [19]:
res = rules[rules["antecedents"].apply(lambda x:len(x)==2)]
res2 = res[["antecedents","confidence"]]
res2

Unnamed: 0,antecedents,confidence
25,"(BISCUIT, BREAD)",0.5
26,"(BISCUIT, MILK)",1.0
27,"(BREAD, MILK)",0.5
29,"(COCK, BISCUIT)",1.0
30,"(COCK, COFFEE)",0.666667
31,"(BISCUIT, COFFEE)",1.0
33,"(COCK, BISCUIT)",1.0
34,"(COCK, CORNFLAKES)",1.0
35,"(BISCUIT, CORNFLAKES)",0.666667
37,"(BISCUIT, CORNFLAKES)",0.666667
