# Instacart Market Basket Analysis

In [1]:
#Importing required packages
import pandas as pd
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules
from mlxtend.preprocessing import TransactionEncoder
from scipy import sparse
from mlxtend.frequent_patterns import fpgrowth
from scipy.sparse import csr_matrix

In [19]:
# Load the three CSV inputs from the working directory, in this order:
# order line items, product details, and the orders table.
order_df, product_df, date_df = (
    pd.read_csv(csv_path)
    for csv_path in ('all_order_products_2.csv', 'products.csv', 'orders.csv')
)

In [20]:
# Join the order line items to the product table on product_id, then join
# the result to the orders table on order_id (default inner joins).
df = (
    order_df
    .merge(product_df, on='product_id')
    .merge(date_df, on='order_id')
)

In [21]:
# Display the merged dataframe as the cell output
df

Unnamed: 0,order_id,product_id,add_to_cart_order,product_name,aisle_id,department_id,user_id,timestamp
0,2,33120,1,Organic Egg Whites,86,16,202279,2018-02-03 09:00:00
1,2,28985,2,Michigan Organic Kale,83,4,202279,2018-02-03 09:00:00
2,2,9327,3,Garlic Powder,104,13,202279,2018-02-03 09:00:00
3,2,45918,4,Coconut Butter,19,13,202279,2018-02-03 09:00:00
4,2,30035,5,Natural Sweetener,17,13,202279,2018-02-03 09:00:00
...,...,...,...,...,...,...,...,...
33819101,2839453,28717,1,Sport Deluxe Adjustable Black Ankle Stabilizer,133,11,170084,2018-03-03 16:00:00
33819102,1362475,12841,1,King Crab Legs,39,12,184060,2018-03-09 06:00:00
33819103,2666238,31530,1,Disinfecting Toilet Bowl Cleaner,114,17,51836,2018-01-29 18:00:00
33819104,2770035,39759,1,"Indoor & Outdoor Allergies, Allergy & Congesti...",11,11,37237,2018-12-16 16:00:00


In [145]:
# Report, for every column of the dataframe, how many entries are missing
print(df.isna().sum())

order_id             0
product_id           0
add_to_cart_order    0
product_name         0
aisle_id             0
department_id        0
user_id              0
timestamp            0
dtype: int64


In [88]:
# Build one basket per order: collect the product names that share an
# order_id into a single Python list.
transactions = df['product_name'].groupby(df['order_id']).apply(list)
transactions

order_id
1          [Bag of Organic Bananas, Organic Hass Avocado,...
2          [Organic Egg Whites, Michigan Organic Kale, Ga...
3          [Total 2% with Strawberry Lowfat Greek Straine...
4          [Plain Pre-Sliced Bagels, Honey/Lemon Cough Dr...
5          [Bag of Organic Bananas, Just Crisp, Parmesan,...
                                 ...                        
3421079                                      [Moisture Soap]
3421080    [Organic Cilantro, Organic Whole Milk, Organic...
3421081    [Lime Sparkling Water, Classic Wheat Bread, Di...
3421082    [Raspberries, Strawberries, Toasted Coconut Ch...
3421083    [Banana, Organic Mixed Berry Yogurt & Fruit Sn...
Name: product_name, Length: 3346083, dtype: object

# Association Rules Algorithms

In [89]:
# One-hot encode the baskets: learn the set of distinct items first, then
# encode every transaction into a sparse item-indicator matrix.
te = TransactionEncoder()
te.fit(transactions)
onehot_sparse = te.transform(transactions, sparse=True)

In [123]:
# The frequent-pattern algorithms expect boolean input, so cast the encoded
# matrix to bool, keep it in CSR form, and wrap it in a sparse DataFrame whose
# columns carry the item names learned by the encoder.
basket_sets_sparse = csr_matrix(onehot_sparse.astype(dtype=bool))
basket_sets_df = pd.DataFrame.sparse.from_spmatrix(
    data=basket_sets_sparse,
    columns=te.columns_,
)

## Apriori Algorithm

In [28]:
# Run Apriori on the one-hot basket frame, keeping itemsets that occur in at
# least 1% of the orders and labelling them by column name.
frequent_itemsets = apriori(
    basket_sets_df,
    min_support=0.01,
    use_colnames=True,
)

  return self.todense() == other
  return self.todense() == other


In [29]:
# Display the frequent itemsets produced by the Apriori cell
frequent_itemsets

Unnamed: 0,support,itemsets
0,0.011736,(399)
1,0.018862,(459)
2,0.011545,(661)
3,0.026082,(2629)
4,0.021264,(3013)
...,...,...
97,0.011088,(46653)
98,0.015462,(46775)
99,0.010773,(46818)
100,0.011048,(48752)


### The Apriori algorithm raises an error because it requires a dense matrix to run its element-wise comparison. However, a dense one-hot encoded matrix cannot be built for this dataset without raising a memory error. Therefore, an alternative algorithm will be used.

## FP-Growth Algorithm

In [134]:
# Mine frequent itemsets with FP-Growth, keeping those present in at least
# 1% of the orders (min_support=0.01) and labelling them by product name.
frequent_itemsets = fpgrowth(
    basket_sets_df,
    min_support=0.01,
    use_colnames=True,
)

In [135]:
# Display the frequent itemsets found by FP-Growth at min_support=0.01
frequent_itemsets

Unnamed: 0,support,itemsets
0,0.118028,(Bag of Organic Bananas)
1,0.066011,(Organic Hass Avocado)
2,0.029804,(Cucumber Kirby)
3,0.01843,(Organic Whole String Cheese)
4,0.022484,(Carrots)
5,0.020943,(Michigan Organic Kale)
6,0.075224,(Organic Baby Spinach)
7,0.016166,(Organic Ginger Root)
8,0.015462,(Unsweetened Almondmilk)
9,0.042618,(Organic Raspberries)


In [137]:
# Derive association rules with confidence >= 0.1, keep only the rules whose
# lift is at least 2, and order them from highest to lowest confidence.
rules = (
    association_rules(frequent_itemsets, metric='confidence', min_threshold=0.1)
    .loc[lambda candidates: candidates['lift'] >= 2]
    .sort_values(by='confidence', ascending=False)
)
rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
25,(Organic Fuji Apple),(Banana),0.027761,0.146826,0.010506,0.378441,2.577484,0.00643,1.372636
21,(Organic Avocado),(Banana),0.055057,0.146826,0.01662,0.301866,2.055949,0.008536,1.222078
12,(Organic Raspberries),(Bag of Organic Bananas),0.042618,0.118028,0.012637,0.296508,2.512197,0.007606,1.253707
0,(Organic Hass Avocado),(Bag of Organic Bananas),0.066011,0.118028,0.019354,0.293199,2.48416,0.011563,1.247838
14,(Organic Raspberries),(Organic Strawberries),0.042618,0.082358,0.010619,0.249174,3.025499,0.007109,1.222177
4,(Organic Hass Avocado),(Organic Strawberries),0.066011,0.082358,0.012652,0.191659,2.327139,0.007215,1.135216
1,(Bag of Organic Bananas),(Organic Hass Avocado),0.118028,0.066011,0.019354,0.163981,2.48416,0.011563,1.117187
3,(Organic Hass Avocado),(Organic Baby Spinach),0.066011,0.075224,0.010805,0.163679,2.175899,0.005839,1.105768
5,(Organic Strawberries),(Organic Hass Avocado),0.082358,0.066011,0.012652,0.153616,2.327139,0.007215,1.103505
2,(Organic Baby Spinach),(Organic Hass Avocado),0.075224,0.066011,0.010805,0.143632,2.175899,0.005839,1.090641


### Different support value 0.005

In [138]:
# Mine frequent itemsets with FP-Growth again, this time with the support
# threshold lowered to 0.5% of the orders (min_support=0.005).
frequent_itemsets = fpgrowth(
    basket_sets_df,
    min_support=0.005,
    use_colnames=True,
)

In [139]:
# Display the frequent itemsets found by FP-Growth at min_support=0.005
frequent_itemsets

Unnamed: 0,support,itemsets
0,0.118028,(Bag of Organic Bananas)
1,0.066011,(Organic Hass Avocado)
2,0.029804,(Cucumber Kirby)
3,0.01843,(Organic Whole String Cheese)
4,0.007438,(Organic Celery Hearts)
5,0.022484,(Carrots)
6,0.020943,(Michigan Organic Kale)
7,0.006047,(Organic Egg Whites)
8,0.075224,(Organic Baby Spinach)
9,0.016166,(Organic Ginger Root)


In [140]:
# Derive association rules with confidence >= 0.1, keep only the rules whose
# lift is at least 2, and order them from highest to lowest confidence.
rules = (
    association_rules(frequent_itemsets, metric='confidence', min_threshold=0.1)
    .loc[lambda candidates: candidates['lift'] >= 2]
    .sort_values(by='confidence', ascending=False)
)
rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
53,(Organic Fuji Apple),(Banana),0.027761,0.146826,0.010506,0.378441,2.577484,0.00643,1.372636
39,(Honeycrisp Apple),(Banana),0.024901,0.146826,0.008858,0.355725,2.42277,0.005202,1.324239
7,(Cucumber Kirby),(Banana),0.029804,0.146826,0.009814,0.329296,2.242766,0.005438,1.272057
77,(Organic Large Extra Fancy Fuji Apple),(Bag of Organic Bananas),0.023328,0.118028,0.007273,0.311789,2.641663,0.00452,1.281544
27,(Organic Avocado),(Banana),0.055057,0.146826,0.01662,0.301866,2.055949,0.008536,1.222078
52,(Seedless Red Grapes),(Banana),0.025925,0.146826,0.007697,0.296906,2.022166,0.003891,1.213457
14,(Organic Raspberries),(Bag of Organic Bananas),0.042618,0.118028,0.012637,0.296508,2.512197,0.007606,1.253707
0,(Organic Hass Avocado),(Bag of Organic Bananas),0.066011,0.118028,0.019354,0.293199,2.48416,0.011563,1.247838
88,(Apple Honeycrisp Organic),(Bag of Organic Bananas),0.026082,0.118028,0.007287,0.279391,2.367166,0.004209,1.223926
66,(Organic Cucumber),(Bag of Organic Bananas),0.025404,0.118028,0.006815,0.268243,2.272715,0.003816,1.205281


### Different support value 0.001

In [141]:
# Mine frequent itemsets with FP-Growth once more, with the support
# threshold lowered to 0.1% of the orders (min_support=0.001).
frequent_itemsets = fpgrowth(
    basket_sets_df,
    min_support=0.001,
    use_colnames=True,
)

In [142]:
# Display the frequent itemsets found by FP-Growth at min_support=0.001
frequent_itemsets

Unnamed: 0,support,itemsets
0,0.118028,(Bag of Organic Bananas)
1,0.066011,(Organic Hass Avocado)
2,0.029804,(Cucumber Kirby)
3,0.01843,(Organic Whole String Cheese)
4,0.007438,(Organic Celery Hearts)
5,0.00138,(Organic 4% Milk Fat Whole Milk Cottage Cheese)
6,0.022484,(Carrots)
7,0.020943,(Michigan Organic Kale)
8,0.006047,(Organic Egg Whites)
9,0.001963,(Garlic Powder)


In [144]:
# Show full cell contents so long product names in the itemsets are not cut off
pd.set_option('display.max_colwidth', None)

# Derive association rules with confidence >= 0.1, keep only the rules whose
# lift is at least 2, and order them from highest to lowest confidence.
rules = (
    association_rules(frequent_itemsets, metric='confidence', min_threshold=0.1)
    .loc[lambda candidates: candidates['lift'] >= 2]
    .sort_values(by='confidence', ascending=False)
)
rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
1770,"(Total 2% Lowfat Greek Strained Yogurt with Peach, Total 2% Lowfat Greek Strained Yogurt With Blueberry)",(Total 2% with Strawberry Lowfat Greek Strained Yogurt),0.001931,0.009225,0.001143,0.591859,64.161499,0.001125,2.427531
1676,"(Sparkling Lemon Water, Lime Sparkling Water)",(Sparkling Water Grapefruit),0.002652,0.023683,0.001369,0.51634,21.802209,0.001307,2.018602
348,"(Strawberries, Honeycrisp Apple)",(Banana),0.002366,0.146826,0.00116,0.490399,3.340009,0.000813,1.674201
153,"(Organic Raspberries, Organic Hass Avocado, Organic Strawberries)",(Bag of Organic Bananas),0.002472,0.118028,0.001212,0.49027,4.153857,0.00092,1.730272
675,"(Strawberries, Organic Fuji Apple)",(Banana),0.002325,0.146826,0.00114,0.490231,3.338866,0.000798,1.67365
1675,"(Sparkling Water Grapefruit, Sparkling Lemon Water)",(Lime Sparkling Water),0.002855,0.014498,0.001369,0.47964,33.082844,0.001328,1.893884
1620,"(Organic Hass Avocado, Organic Navel Orange)",(Bag of Organic Bananas),0.002547,0.118028,0.001188,0.466385,3.951493,0.000887,1.652826
1771,"(Total 2% Lowfat Greek Strained Yogurt with Peach, Total 2% with Strawberry Lowfat Greek Strained Yogurt)",(Total 2% Lowfat Greek Strained Yogurt With Blueberry),0.002463,0.006397,0.001143,0.464021,72.536976,0.001127,1.853811
677,"(Cucumber Kirby, Organic Fuji Apple)",(Banana),0.002177,0.146826,0.001007,0.462859,3.152436,0.000688,1.588361
1841,(Zero Calorie Cola),(Soda),0.002661,0.011147,0.001227,0.461089,41.365301,0.001197,1.834911
