In [1]:
import pandas as pd 

%matplotlib inline
import matplotlib.pyplot as plt

import seaborn as sns
color = sns.color_palette()

pd.set_option('display.float_format', lambda x: '%.3f' % x) 

from mlxtend.frequent_patterns import apriori, association_rules

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
order_products_prior = pd.read_csv('Instacart market basket dataset/order_products__prior.csv')
products = pd.read_csv('Instacart market basket dataset/products.csv')

### Merging the order_products_prior and products dataset into a single dataframe

In [4]:
data = pd.merge(order_products_prior, products, how='inner', on='product_id')
print(data.shape)
data.head()

(32434489, 7)


Unnamed: 0,order_id,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id
0,2,33120,1,1,Organic Egg Whites,86,16
1,26,33120,5,0,Organic Egg Whites,86,16
2,120,33120,13,0,Organic Egg Whites,86,16
3,327,33120,5,1,Organic Egg Whites,86,16
4,390,33120,28,1,Organic Egg Whites,86,16


Selecting products which have an "order_id" less than 1000

The reason behind doing this that the dataset contains a lot of columns

In [5]:
data = data[data['order_id']<1000]

In [6]:
data

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id
0,2,33120,1,1,Organic Egg Whites,86,16
1,26,33120,5,0,Organic Egg Whites,86,16
2,120,33120,13,0,Organic Egg Whites,86,16
3,327,33120,5,1,Organic Egg Whites,86,16
4,390,33120,28,1,Organic Egg Whites,86,16
...,...,...,...,...,...,...,...
22517391,998,5451,4,1,Organic Zucchini Spirals,83,4
22522344,998,42847,8,0,100% Whole Wheat Sandwich Rolls,43,3
22522513,998,17758,10,0,Strawberry Rhubarb Yoghurt,120,16
22532474,999,5445,3,0,Protein Zone Protein Juice Smoothie,31,7


In [7]:
df_item = data[['order_id','product_name']].copy()
df_item.rename(columns={'order_id':'order','product_name':'items'}, inplace=True)
df_item['temp']=1

In [8]:
df_item

Unnamed: 0,order,items,temp
0,2,Organic Egg Whites,1
1,26,Organic Egg Whites,1
2,120,Organic Egg Whites,1
3,327,Organic Egg Whites,1
4,390,Organic Egg Whites,1
...,...,...,...
22517391,998,Organic Zucchini Spirals,1
22522344,998,100% Whole Wheat Sandwich Rolls,1
22522513,998,Strawberry Rhubarb Yoghurt,1
22532474,999,Protein Zone Protein Juice Smoothie,1


In [9]:
df = df_item.groupby(['order','items'])['temp'].sum().unstack().fillna(0)

In [10]:
def myencoder(i):
    if i <= 0:
        return 0
    elif i>=1:
        return 1

In [11]:
df.applymap(myencoder)

items,0% Fat Blueberry Greek Yogurt,0% Fat Free Organic Milk,0% Fat Organic Greek Vanilla Yogurt,0% Greek Strained Yogurt,0% Milkfat Greek Plain Yogurt,1 % Lowfat Milk,1 Apple + 1 Pear Fruit Bar,1 Ply Paper Towels,1% Low Fat Milk,1% Lowfat Milk,...,Zero Strawberry Lemon,Zero Vitamin Water,Zinfandel,Zucchini Noodles,from Concentrate Mango Nectar,gel hand wash sea minerals,smartwater® Electrolyte Enhanced Water,with Crispy Almonds Cereal,with Olive Oil Mayonnaise,with Olive Oil Mayonnaise Dressing
order,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
996,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
997,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
998,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Frequent Itemsets generated

In [12]:
freq_itemsets = apriori(df, min_support=0.015, use_colnames=True)
freq_itemsets

Unnamed: 0,support,itemsets
0,0.017,(100% Whole Wheat Bread)
1,0.018,(Apple Honeycrisp Organic)
2,0.025,(Asparagus)
3,0.123,(Bag of Organic Bananas)
4,0.159,(Banana)
...,...,...
64,0.016,"(Organic Raspberries, Bag of Organic Bananas)"
65,0.020,"(Bag of Organic Bananas, Organic Strawberries)"
66,0.017,"(Banana, Organic Avocado)"
67,0.025,"(Organic Baby Spinach, Banana)"


## Association rules generated

In [13]:
rules = association_rules(freq_itemsets, metric='lift', min_threshold=1)
rules.sort_values(by='confidence',ascending=False)
rules.drop(rules.columns[[7, 8]], axis=1)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift
0,(Bag of Organic Bananas),(Organic Hass Avocado),0.123,0.067,0.023,0.188,2.806
1,(Organic Hass Avocado),(Bag of Organic Bananas),0.067,0.123,0.023,0.344,2.806
2,(Organic Raspberries),(Bag of Organic Bananas),0.04,0.123,0.016,0.395,3.222
3,(Bag of Organic Bananas),(Organic Raspberries),0.123,0.04,0.016,0.128,3.222
4,(Bag of Organic Bananas),(Organic Strawberries),0.123,0.074,0.02,0.162,2.184
5,(Organic Strawberries),(Bag of Organic Bananas),0.074,0.123,0.02,0.268,2.184
6,(Banana),(Organic Avocado),0.159,0.054,0.017,0.105,1.933
7,(Organic Avocado),(Banana),0.054,0.159,0.017,0.308,1.933
8,(Organic Baby Spinach),(Banana),0.069,0.159,0.025,0.364,2.285
9,(Banana),(Organic Baby Spinach),0.159,0.069,0.025,0.158,2.285


## Association Rule Mining for best selling products

In [14]:
data2 = pd.merge(order_products_prior,products,how='inner',on='product_id')
print(data2.shape)
data2.head()

data2 = data2[0:10000000]

(32434489, 7)


In [15]:
data = pd.merge(order_products_prior,products,how='inner',on='product_id')
print(data.shape)
data.head()

(32434489, 7)


Unnamed: 0,order_id,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id
0,2,33120,1,1,Organic Egg Whites,86,16
1,26,33120,5,0,Organic Egg Whites,86,16
2,120,33120,13,0,Organic Egg Whites,86,16
3,327,33120,5,1,Organic Egg Whites,86,16
4,390,33120,28,1,Organic Egg Whites,86,16


In [16]:
grouped = data2.groupby("order_id")["reordered"].aggregate(Total_reorders='count').reset_index()
grouped = pd.merge(grouped, data2[['order_id', 'product_name']], how='left', on=['order_id'])
grouped = grouped.sort_values(by='Total_reorders', ascending=False)
grouped

Unnamed: 0,order_id,Total_reorders,product_name
4570503,1564244,34,Bag of Organic Bananas
4570529,1564244,34,Organic Baby Arugula
4570506,1564244,34,Organic Bread with 21 Whole Grains
4570505,1564244,34,"Organic Red Radish, Bunch"
4570525,1564244,34,Organic Large Extra Fancy Fuji Apple
...,...,...,...
8801426,3010059,1,Sparkling Natural Mineral Water
3410805,1167942,1,Organic Baby Carrots
3410776,1167930,1,Organic Reduced Fat Milk
3410775,1167929,1,Organic Mayonnaise


In [17]:
final_data = grouped[0:1000]

In [18]:
final_data

Unnamed: 0,order_id,Total_reorders,product_name
4570503,1564244,34,Bag of Organic Bananas
4570529,1564244,34,Organic Baby Arugula
4570506,1564244,34,Organic Bread with 21 Whole Grains
4570505,1564244,34,"Organic Red Radish, Bunch"
4570525,1564244,34,Organic Large Extra Fancy Fuji Apple
...,...,...,...
2881578,986435,27,Organic Low Sodium Vegetable Broth
2881579,986435,27,Green Bell Pepper
2881580,986435,27,Seedless Red Grapes
2881581,986435,27,"Clementines, Bag"


In [19]:
df_item2 = final_data[['order_id','product_name']].copy()
df_item2.rename(columns={'order_id':'order','product_name':'items'},inplace=True)
df_item2['temp']=1

In [20]:
df_item2

Unnamed: 0,order,items,temp
4570503,1564244,Bag of Organic Bananas,1
4570529,1564244,Organic Baby Arugula,1
4570506,1564244,Organic Bread with 21 Whole Grains,1
4570505,1564244,"Organic Red Radish, Bunch",1
4570525,1564244,Organic Large Extra Fancy Fuji Apple,1
...,...,...,...
2881578,986435,Organic Low Sodium Vegetable Broth,1
2881579,986435,Green Bell Pepper,1
2881580,986435,Seedless Red Grapes,1
2881581,986435,"Clementines, Bag",1


In [21]:
df2 = df_item2.groupby(['order','items'])['temp'].sum().unstack().fillna(0)

In [22]:
df2.applymap(myencoder)

items,100% Recycled Paper Towels,100% Whole Wheat Bread,3 lb Clementines,Air Chilled Organic Boneless Skinless Chicken Breasts,All Natural Marinara Pasta Sauce,All Natural No Stir Creamy Almond Butter,Ancient Grain Blueberry Hemp Granola,Apple Cider Vinegar,Apple Honeycrisp Organic,Apricots,...,Vanilla Almond Breeze Almond Milk,Vegan Buttery Sticks,Walnut Halves & Pieces,Watermelon Chunks,White Corn,Whole Organic Omega 3 Milk,Yellow Bell Pepper,Yellow Onions,Yellow Straightneck Squash,"YoKids Squeezers Organic Low-Fat Yogurt, Strawberry"
order,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
18,1,0,0,0,0,0,0,0,0,1,...,0,0,0,1,0,0,0,0,0,0
129627,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
214174,0,0,0,1,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
325855,0,1,0,1,0,0,0,0,0,0,...,0,0,0,1,0,0,0,1,0,0
382008,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
420892,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
598905,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
711170,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
737251,0,1,0,0,0,1,0,0,1,0,...,0,0,0,0,0,0,0,0,0,1
746136,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


## Frequent Itemsets generated

In [26]:
freq_itemsets2 = apriori(df2, min_support=0.3, use_colnames=True)
freq_itemsets2

Unnamed: 0,support,itemsets
0,0.316,(Asparagus)
1,0.368,(Bag of Organic Bananas)
2,0.421,(Banana)
3,0.316,(Large Lemon)
4,0.579,(Limes)
5,0.316,(Organic Avocado)
6,0.342,(Organic Baby Arugula)
7,0.605,(Organic Baby Spinach)
8,0.316,(Organic D'Anjou Pears)
9,0.395,(Organic Garlic)


In [28]:
freq_itemsets2.sort_values('support', ascending=False)

Unnamed: 0,support,itemsets
7,0.605,(Organic Baby Spinach)
4,0.579,(Limes)
13,0.5,(Organic Raspberries)
14,0.5,(Organic Red Onion)
19,0.421,"(Organic Baby Spinach, Limes)"
2,0.421,(Banana)
16,0.421,(Seedless Red Grapes)
23,0.421,"(Organic Red Onion, Organic Baby Spinach)"
15,0.421,(Organic Strawberries)
9,0.395,(Organic Garlic)


## Association Rules Generated 

In [24]:
rules2 = association_rules(freq_itemsets2, metric='lift', min_threshold=1)
rules2.sort_values(by='confidence',ascending=False)
rules2 = rules2.drop(rules2.columns[[7, 8]], axis=1)
rules2.sort_values(by='confidence',ascending=False)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift
17,"(Organic Red Onion, Limes)",(Organic Baby Spinach),0.368,0.605,0.316,0.857,1.416
12,(Organic Red Onion),(Organic Baby Spinach),0.5,0.605,0.421,0.842,1.391
0,(Banana),(Limes),0.421,0.579,0.316,0.75,1.295
16,"(Organic Red Onion, Organic Baby Spinach)",(Limes),0.421,0.579,0.316,0.75,1.295
3,(Banana),(Organic Baby Spinach),0.421,0.605,0.316,0.75,1.239
18,"(Organic Baby Spinach, Limes)",(Organic Red Onion),0.421,0.5,0.316,0.75,1.5
8,(Organic Red Onion),(Limes),0.5,0.579,0.368,0.737,1.273
10,(Organic Raspberries),(Organic Baby Spinach),0.5,0.605,0.368,0.737,1.217
5,(Limes),(Organic Baby Spinach),0.579,0.605,0.421,0.727,1.202
4,(Organic Baby Spinach),(Limes),0.605,0.579,0.421,0.696,1.202
