## Name: David Geng

# Market Basket Analysis: Unsupervised Learning

## Loading Libraries

In [1]:
# libraries
import pandas as pd
import numpy as np
from mlxtend.frequent_patterns import apriori, association_rules

## Loading and Processing Data

In [2]:
# Optional, Google Colab only: uncomment the two lines below to upload the
# data file (presumably books.csv) into the Colab session before reading it.
# from google.colab import files
# uploaded = files.upload()

In [3]:
# Load the raw data from books.csv (path is relative to the working directory)
df = pd.read_csv(filepath_or_buffer='books.csv')

In [4]:
# Summarise the raw frame: row count, column names, non-null counts and dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4000 entries, 0 to 3999
Data columns (total 24 columns):
 #   Column            Non-Null Count  Dtype
---  ------            --------------  -----
 0   Seq#              4000 non-null   int64
 1   ID#               4000 non-null   int64
 2   Gender            4000 non-null   int64
 3   M                 4000 non-null   int64
 4   R                 4000 non-null   int64
 5   F                 4000 non-null   int64
 6   FirstPurch        4000 non-null   int64
 7   ChildBks          4000 non-null   int64
 8   YouthBks          4000 non-null   int64
 9   CookBks           4000 non-null   int64
 10  DoItYBks          4000 non-null   int64
 11  RefBks            4000 non-null   int64
 12  ArtBks            4000 non-null   int64
 13  GeogBks           4000 non-null   int64
 14  ItalCook          4000 non-null   int64
 15  ItalAtlas         4000 non-null   int64
 16  ItalArt           4000 non-null   int64
 17  Florence          4000 non-null  

In [5]:
# Preview the first five rows of the raw frame
df.head()

Unnamed: 0,Seq#,ID#,Gender,M,R,F,FirstPurch,ChildBks,YouthBks,CookBks,...,ItalCook,ItalAtlas,ItalArt,Florence,Related Purchase,Mcode,Rcode,Fcode,Yes_Florence,No_Florence
0,1,25,1,297,14,2,22,0,1,1,...,0,0,0,0,0,5,4,2,0,1
1,2,29,0,128,8,2,10,0,0,0,...,0,0,0,0,0,4,3,2,0,1
2,3,46,1,138,22,7,56,2,1,2,...,1,0,0,0,2,4,4,3,0,1
3,4,47,1,228,2,1,2,0,0,0,...,0,0,0,0,0,5,1,1,0,1
4,5,51,1,257,10,1,10,0,0,0,...,0,0,0,0,0,5,3,1,0,1


In [6]:
# Drop the seven leading customer fields (Seq#, ID#, Gender, M, R, F,
# FirstPurch) and keep everything from ChildBks onwards.
# The columns are selected by label rather than by position so the cell is
# idempotent: re-running it returns the same frame, whereas a positional
# `iloc[:, 7:]` would strip a further seven columns on every re-run.
df = df.loc[:, 'ChildBks':]
df.head()

Unnamed: 0,ChildBks,YouthBks,CookBks,DoItYBks,RefBks,ArtBks,GeogBks,ItalCook,ItalAtlas,ItalArt,Florence,Related Purchase,Mcode,Rcode,Fcode,Yes_Florence,No_Florence
0,0,1,1,0,0,0,0,0,0,0,0,0,5,4,2,0,1
1,0,0,0,0,0,0,0,0,0,0,0,0,4,3,2,0,1
2,2,1,2,0,1,0,1,1,0,0,0,2,4,4,3,0,1
3,0,0,0,0,0,0,0,0,0,0,0,0,5,1,1,0,1
4,0,0,0,0,0,0,0,0,0,0,0,0,5,3,1,0,1


In [7]:
# Check which columns (and dtypes) remain after dropping the leading fields
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4000 entries, 0 to 3999
Data columns (total 17 columns):
 #   Column            Non-Null Count  Dtype
---  ------            --------------  -----
 0   ChildBks          4000 non-null   int64
 1   YouthBks          4000 non-null   int64
 2   CookBks           4000 non-null   int64
 3   DoItYBks          4000 non-null   int64
 4   RefBks            4000 non-null   int64
 5   ArtBks            4000 non-null   int64
 6   GeogBks           4000 non-null   int64
 7   ItalCook          4000 non-null   int64
 8   ItalAtlas         4000 non-null   int64
 9   ItalArt           4000 non-null   int64
 10  Florence          4000 non-null   int64
 11  Related Purchase  4000 non-null   int64
 12  Mcode             4000 non-null   int64
 13  Rcode             4000 non-null   int64
 14  Fcode             4000 non-null   int64
 15  Yes_Florence      4000 non-null   int64
 16  No_Florence       4000 non-null   int64
dtypes: int64(17)
memory usage: 531.4 

In [8]:
# Keep only the eleven book-category columns, ChildBks through Florence, and
# drop everything after them (Related Purchase, Mcode, Rcode, Fcode,
# Yes_Florence, No_Florence).
# A label slice (inclusive on both ends) states the intent directly and raises
# a KeyError if the expected columns are missing, instead of silently picking
# whatever happens to sit in positions 0-10.
df = df.loc[:, 'ChildBks':'Florence']
df.head()

Unnamed: 0,ChildBks,YouthBks,CookBks,DoItYBks,RefBks,ArtBks,GeogBks,ItalCook,ItalAtlas,ItalArt,Florence
0,0,1,1,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0
2,2,1,2,0,1,0,1,1,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0


In [9]:
# Convert purchase counts to binary flags: 1 when at least one book of the
# category was bought, 0 otherwise.
# A new frame is built instead of assigning through `df[:]`: writing in place
# into a frame that came from positional slicing can raise pandas'
# SettingWithCopyWarning. The result is the same int64 0/1 table.
df = df.ge(1).astype('int64')
df.head()

Unnamed: 0,ChildBks,YouthBks,CookBks,DoItYBks,RefBks,ArtBks,GeogBks,ItalCook,ItalAtlas,ItalArt,Florence
0,0,1,1,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0
2,1,1,1,0,1,0,1,1,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0


## Association Analysis 

In [12]:
# Create frequent itemsets.
# Show every row of the resulting tables instead of pandas' truncated view.
pd.options.display.max_rows = None
# min_support=0.1 keeps itemsets present in at least 10% of the rows;
# use_colnames=True reports items by column name rather than column index.
# apriori is given a boolean frame: mlxtend warns about (and is slower on)
# non-bool input, and the mined itemsets are the same as for the 0/1 integers.
itemsets = apriori(df.astype(bool), min_support = 0.1, use_colnames= True)
itemsets



Unnamed: 0,support,itemsets
0,0.394,(ChildBks)
1,0.23825,(YouthBks)
2,0.4155,(CookBks)
3,0.25475,(DoItYBks)
4,0.20475,(RefBks)
5,0.223,(ArtBks)
6,0.26675,(GeogBks)
7,0.1075,(ItalCook)
8,0.1475,"(YouthBks, ChildBks)"
9,0.242,"(CookBks, ChildBks)"


In [17]:
# Derive association rules from the frequent itemsets, keeping only rules
# whose confidence is at least 0.5
rules = association_rules(
    itemsets,
    metric='confidence',
    min_threshold=0.5,
)
rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(YouthBks),(ChildBks),0.23825,0.394,0.1475,0.619098,1.571314,0.053629,1.590959
1,(CookBks),(ChildBks),0.4155,0.394,0.242,0.582431,1.478251,0.078293,1.451256
2,(ChildBks),(CookBks),0.394,0.4155,0.242,0.614213,1.478251,0.078293,1.515086
3,(DoItYBks),(ChildBks),0.25475,0.394,0.1615,0.633955,1.609022,0.061129,1.655534
4,(RefBks),(ChildBks),0.20475,0.394,0.12825,0.626374,1.589781,0.047579,1.621941
5,(GeogBks),(ChildBks),0.26675,0.394,0.14625,0.548266,1.391538,0.04115,1.341498
6,(YouthBks),(CookBks),0.23825,0.4155,0.161,0.675761,1.62638,0.062007,1.802681
7,(DoItYBks),(CookBks),0.25475,0.4155,0.16875,0.662414,1.594258,0.062901,1.731411
8,(RefBks),(CookBks),0.20475,0.4155,0.13975,0.68254,1.642695,0.054676,1.841175
9,(ArtBks),(CookBks),0.223,0.4155,0.113,0.506726,1.219558,0.020344,1.184941


In [19]:
# Rank the rules from highest to lowest lift and show the first ten
rules.sort_values('lift', ascending=False).iloc[:10]

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
20,(RefBks),"(CookBks, ChildBks)",0.20475,0.242,0.1035,0.505495,2.08882,0.05395,1.532844
13,(YouthBks),"(CookBks, ChildBks)",0.23825,0.242,0.12,0.503673,2.081292,0.062344,1.527218
15,"(CookBks, ChildBks)",(DoItYBks),0.242,0.25475,0.12775,0.527893,2.072198,0.066101,1.57856
17,(DoItYBks),"(CookBks, ChildBks)",0.25475,0.242,0.12775,0.501472,2.072198,0.066101,1.520476
12,"(YouthBks, ChildBks)",(CookBks),0.1475,0.4155,0.12,0.813559,1.958025,0.058714,3.135045
19,"(ChildBks, RefBks)",(CookBks),0.12825,0.4155,0.1035,0.807018,1.94228,0.050212,3.028773
14,"(CookBks, DoItYBks)",(ChildBks),0.16875,0.394,0.12775,0.757037,1.921414,0.061262,2.494207
16,"(DoItYBks, ChildBks)",(CookBks),0.1615,0.4155,0.12775,0.791022,1.903783,0.060647,2.796941
11,"(YouthBks, CookBks)",(ChildBks),0.161,0.394,0.12,0.745342,1.89173,0.056566,2.379659
18,"(CookBks, RefBks)",(ChildBks),0.13975,0.394,0.1035,0.740608,1.879716,0.048438,2.336234


In [23]:
# Extract Rules Based on Lift (keep rules with lift >= 2).
# NOTE: this rebinds `rules`, replacing the confidence-based rules above.
rules = association_rules(itemsets, metric= 'lift', min_threshold= 2)
rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,"(CookBks, ChildBks)",(YouthBks),0.242,0.23825,0.12,0.495868,2.081292,0.062344,1.511012
1,(YouthBks),"(CookBks, ChildBks)",0.23825,0.242,0.12,0.503673,2.081292,0.062344,1.527218
2,"(CookBks, ChildBks)",(DoItYBks),0.242,0.25475,0.12775,0.527893,2.072198,0.066101,1.57856
3,(DoItYBks),"(CookBks, ChildBks)",0.25475,0.242,0.12775,0.501472,2.072198,0.066101,1.520476
4,"(CookBks, ChildBks)",(RefBks),0.242,0.20475,0.1035,0.427686,2.08882,0.05395,1.389534
5,(RefBks),"(CookBks, ChildBks)",0.20475,0.242,0.1035,0.505495,2.08882,0.05395,1.532844


In [24]:
# Order the lift-filtered rules by descending lift; shows at most ten rows
rules.sort_values('lift', ascending=False).iloc[:10]

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
5,(RefBks),"(CookBks, ChildBks)",0.20475,0.242,0.1035,0.505495,2.08882,0.05395,1.532844
4,"(CookBks, ChildBks)",(RefBks),0.242,0.20475,0.1035,0.427686,2.08882,0.05395,1.389534
0,"(CookBks, ChildBks)",(YouthBks),0.242,0.23825,0.12,0.495868,2.081292,0.062344,1.511012
1,(YouthBks),"(CookBks, ChildBks)",0.23825,0.242,0.12,0.503673,2.081292,0.062344,1.527218
2,"(CookBks, ChildBks)",(DoItYBks),0.242,0.25475,0.12775,0.527893,2.072198,0.066101,1.57856
3,(DoItYBks),"(CookBks, ChildBks)",0.25475,0.242,0.12775,0.501472,2.072198,0.066101,1.520476
