In [2]:
import pandas as pd
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules
from mlxtend.preprocessing import TransactionEncoder

In [73]:
order_products_train_df = pd.read_csv("order_products__train.csv")
order_products_prior_df = pd.read_csv("order_products__prior.csv")
orders_df = pd.read_csv("orders.csv")
products_df = pd.read_csv("products.csv")
aisles_df = pd.read_csv("aisles.csv")
departments_df = pd.read_csv("departments.csv")

In [4]:
orders_df.head()

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
0,2539329,1,prior,1,2,8,
1,2398795,1,prior,2,3,7,15.0
2,473747,1,prior,3,3,12,21.0
3,2254736,1,prior,4,4,7,29.0
4,431534,1,prior,5,4,15,28.0


In [5]:
products_df.head()

Unnamed: 0,product_id,product_name,aisle_id,department_id
0,1,Chocolate Sandwich Cookies,61,19
1,2,All-Seasons Salt,104,13
2,3,Robust Golden Unsweetened Oolong Tea,94,7
3,4,Smart Ones Classic Favorites Mini Rigatoni Wit...,38,1
4,5,Green Chile Anytime Sauce,5,13


In [6]:
aisles_df.head()

Unnamed: 0,aisle_id,aisle
0,1,prepared soups salads
1,2,specialty cheeses
2,3,energy granola bars
3,4,instant foods
4,5,marinades meat preparation


In [7]:
departments_df.head()

Unnamed: 0,department_id,department
0,1,frozen
1,2,other
2,3,bakery
3,4,produce
4,5,alcohol


In [74]:
order_products_train_df.head()

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered
0,1,49302,1,1
1,1,11109,2,1
2,1,10246,3,0
3,1,49683,4,0
4,1,43633,5,1


In [9]:
order_products_prior_df.head()

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered
0,2,33120,1,1
1,2,28985,2,1
2,2,9327,3,0
3,2,45918,4,1
4,2,30035,5,0


In [76]:
order_products_prior_df.shape, order_products_train_df.shape

((32434489, 4), (1384617, 4))

In [77]:
order_products_prior_df.shape[0]+order_products_train_df.shape[0]

33819106

In [75]:
order_products_df = pd.concat([order_products_train_df, order_products_prior_df], ignore_index=True)
order_products_df.shape

(33819106, 4)

In [41]:
merged_df = pd.merge(order_products_df, products_df[['product_id', 'product_name']], on='product_id', how='inner')
merged_df.head()

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered,product_name
0,1,49302,1,1,Bulgarian Yogurt
1,816049,49302,7,1,Bulgarian Yogurt
2,1242203,49302,1,1,Bulgarian Yogurt
3,1383349,49302,11,1,Bulgarian Yogurt
4,1787378,49302,8,0,Bulgarian Yogurt


In [42]:
merged_df.shape

(33819106, 5)

In [43]:
merged_df[merged_df['product_id']==24852]

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered,product_name
3324854,226,24852,2,0,Banana
3324855,473,24852,2,0,Banana
3324856,878,24852,2,1,Banana
3324857,1042,24852,1,1,Banana
3324858,1139,24852,1,1,Banana
...,...,...,...,...,...
3816140,3421027,24852,3,1,Banana
3816141,3421030,24852,9,1,Banana
3816142,3421038,24852,2,0,Banana
3816143,3421078,24852,2,1,Banana


In [44]:
merged_df.shape

(33819106, 5)

In [46]:
product_counts = merged_df.groupby('product_id')['order_id'].count().reset_index().rename(columns = {'order_id':'frequency'})
product_counts = product_counts.sort_values('frequency', ascending=False)[0:10000].reset_index(drop=True)
product_counts.head(10)

Unnamed: 0,product_id,frequency
0,24852,491291
1,13176,394930
2,21137,275577
3,21903,251705
4,47209,220877
5,47766,184224
6,47626,160792
7,16797,149445
8,26209,146660
9,27845,142813


In [47]:
product_counts.shape

(10000, 2)

In [48]:
freq_products = list(product_counts.product_id)
freq_products[1:10]

[13176, 21137, 21903, 47209, 47766, 47626, 16797, 26209, 27845]

In [52]:
order_products = merged_df[merged_df.product_id.isin(freq_products)]
order_products.head()

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered,product_name
171,1,11109,2,1,Organic 4% Milk Fat Whole Milk Cottage Cheese
172,68474,11109,8,1,Organic 4% Milk Fat Whole Milk Cottage Cheese
173,78197,11109,3,1,Organic 4% Milk Fat Whole Milk Cottage Cheese
174,120619,11109,1,0,Organic 4% Milk Fat Whole Milk Cottage Cheese
175,128666,11109,12,1,Organic 4% Milk Fat Whole Milk Cottage Cheese


In [53]:
order_products.shape

(30774315, 5)

In [55]:
basket = (
    order_products.groupby('order_id')['product_name']
    .apply(list)
    .reset_index(name='products')
)
basket

Unnamed: 0,order_id,products
0,1,[Organic 4% Milk Fat Whole Milk Cottage Cheese...
1,2,"[Carrots, Michigan Organic Kale, Organic Egg W..."
2,3,"[Organic Baby Spinach, Organic Ginger Root, To..."
3,4,"[Chewy 25% Low Sugar Chocolate Chip Granola, K..."
4,5,"[Bag of Organic Bananas, Organic Hass Avocado,..."
...,...,...
3321331,3421078,"[Guacamole, Banana, Honey Nut Cheerios, Organi..."
3321332,3421080,"[Organic Cilantro, Organic Whole Milk, Organic..."
3321333,3421081,"[Lime Sparkling Water, Hint of Lime Flavored T..."
3321334,3421082,"[Strawberries, Original Spray, Raspberries, To..."


In [61]:
sample_basket = basket[:50000]

In [64]:
basket_encoded = sample_basket['products'].str.join('|').str.get_dummies()
basket_encoded

Unnamed: 0,#2 Coffee Filters,0% Fat Blueberry Greek Yogurt,0% Fat Free Organic Milk,0% Fat Organic Greek Vanilla Yogurt,0% Fat Superfruits Greek Yogurt,0% Greek Strained Yogurt,0% Greek Yogurt Black Cherry on the Bottom,"0% Greek, Blueberry on the Bottom Yogurt",1 % Lowfat Milk,1 Apple + 1 Mango Fruit Bar,...,Zucchini Noodles,Zucchini Soufflé,Zucchini Squash,for Tots Apple Juice,gel hand wash sea minerals,of Hanover 100 Calorie Pretzels Mini,smartwater® Electrolyte Enhanced Water,vitaminwater® XXX Acai Blueberry Pomegranate,with Crispy Almonds Cereal,with Olive Oil Mayonnaise
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49995,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
49996,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
49997,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
49998,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [78]:
frequent_itemsets = apriori(basket_encoded, min_support=0.01, use_colnames=True)
frequent_itemsets.head(10)



Unnamed: 0,support,itemsets
0,0.01190,(100% Raw Coconut Water)
1,0.01898,(100% Whole Wheat Bread)
2,0.01172,(2% Reduced Fat Milk)
3,0.02704,(Apple Honeycrisp Organic)
4,0.02034,(Asparagus)
...,...,...
104,0.01632,"(Organic Baby Spinach, Bag of Organic Bananas)"
105,0.01940,"(Bag of Organic Bananas, Organic Hass Avocado)"
106,0.01366,"(Organic Raspberries, Bag of Organic Bananas)"
107,0.01910,"(Bag of Organic Bananas, Organic Strawberries)"


In [66]:
rules['support_A'] = rules['antecedents'].apply(lambda x: frequent_itemsets[frequent_itemsets['itemsets'] == frozenset(x)]['support'].values[0])
rules['support_B'] = rules['consequents'].apply(lambda x: frequent_itemsets[frequent_itemsets['itemsets'] == frozenset(x)]['support'].values[0])
rules['support_A_and_B'] = rules['support']

In [67]:
rules['frequency_A'] = rules['support_A'] * len(basket)
rules['frequency_B'] = rules['support_B'] * len(basket)
rules['frequency_A_and_B'] = rules['support_A_and_B'] * len(basket)

In [68]:
rules['lift'] = (rules['support_A_and_B'] / (rules['support_A'] * rules['support_B']))

In [79]:
result_df = rules[['antecedents', 'consequents', 'support_A', 'support_B', 'support_A_and_B','frequency_A', 'frequency_B', 'frequency_A_and_B','confidence', 'lift']]
result_df

Unnamed: 0,antecedents,consequents,support_A,support_B,support_A_and_B,frequency_A,frequency_B,frequency_A_and_B,confidence,lift
0,(Large Lemon),(Bag of Organic Bananas),0.04846,0.12008,0.011263,160951.94256,398826.02688,37407.055429,0.129932,1.935471
1,(Organic Avocado),(Bag of Organic Bananas),0.05506,0.12008,0.011401,182872.76016,398826.02688,37867.123282,0.144419,1.724418
2,(Organic Baby Spinach),(Bag of Organic Bananas),0.07690,0.12008,0.023825,255410.73840,398826.02688,79131.670709,0.228536,2.580124
3,(Bag of Organic Bananas),(Organic Baby Spinach),0.12008,0.07690,0.023825,398826.02688,255410.73840,79131.670709,0.144444,2.580124
4,(Organic Blueberries),(Bag of Organic Bananas),0.03156,0.12008,0.012115,104821.36416,398826.02688,40238.242216,0.228957,3.196822
...,...,...,...,...,...,...,...,...,...,...
60,(Organic Strawberries),(Organic Hass Avocado),0.08236,0.06698,0.016399,273545.23296,222463.08528,54464.955823,0.141270,2.972644
61,(Organic Hass Avocado),(Organic Strawberries),0.06698,0.08236,0.016399,222463.08528,273545.23296,54464.955823,0.211024,2.972644
62,(Organic Raspberries),(Organic Strawberries),0.04426,0.08236,0.017794,147002.33136,273545.23296,59101.024188,0.301118,4.881513
63,(Organic Strawberries),(Organic Raspberries),0.08236,0.04426,0.017794,273545.23296,147002.33136,59101.024188,0.153295,4.881513


In [72]:
input_product_name = 'Bag of Organic Bananas'
recommendations = result_df[(result_df['antecedents'].apply(lambda x: input_product_name in x)) | (result_df['consequents'].apply(lambda x: input_product_name in x))]
recommendations

Unnamed: 0,antecedents,consequents,support_A,support_B,support_A_and_B,frequency_A,frequency_B,frequency_A_and_B,confidence,lift
0,(Large Lemon),(Bag of Organic Bananas),0.04846,0.12008,0.011263,160951.94256,398826.02688,37407.055429,0.129932,1.935471
1,(Organic Avocado),(Bag of Organic Bananas),0.05506,0.12008,0.011401,182872.76016,398826.02688,37867.123282,0.144419,1.724418
2,(Organic Baby Spinach),(Bag of Organic Bananas),0.0769,0.12008,0.023825,255410.7384,398826.02688,79131.670709,0.228536,2.580124
3,(Bag of Organic Bananas),(Organic Baby Spinach),0.12008,0.0769,0.023825,398826.02688,255410.7384,79131.670709,0.144444,2.580124
4,(Organic Blueberries),(Bag of Organic Bananas),0.03156,0.12008,0.012115,104821.36416,398826.02688,40238.242216,0.228957,3.196822
5,(Organic Cucumber),(Bag of Organic Bananas),0.02496,0.12008,0.013511,82900.54656,398826.02688,44874.310581,0.274875,4.507853
6,(Bag of Organic Bananas),(Organic Hass Avocado),0.12008,0.06698,0.025786,398826.02688,222463.08528,85643.40032,0.156331,3.206013
7,(Organic Hass Avocado),(Bag of Organic Bananas),0.06698,0.12008,0.025786,222463.08528,398826.02688,85643.40032,0.331825,3.206013
8,(Organic Large Extra Fancy Fuji Apple),(Bag of Organic Bananas),0.02336,0.12008,0.010368,77586.40896,398826.02688,34434.309302,0.336562,3.696026
9,(Organic Lemon),(Bag of Organic Bananas),0.02962,0.12008,0.011369,98377.97232,398826.02688,37760.953777,0.304422,3.196498


In [19]:
basket = order_products.pivot_table(columns='product_name', values='reordered', index='order_id').reset_index().fillna(0).set_index('order_id')

In [20]:
def encode_units(x):
    if x <= 0:
        return 0
    if x >= 1:
        return 1
    
basket = basket.applymap(encode_units)
basket.head()

product_name,100% Whole Wheat Bread,2% Reduced Fat Milk,Apple Honeycrisp Organic,Asparagus,Bag of Organic Bananas,Banana,Blueberries,Boneless Skinless Chicken Breasts,Broccoli Crown,Bunched Cilantro,...,Sparkling Lemon Water,Sparkling Natural Mineral Water,Sparkling Water Grapefruit,Spring Water,Strawberries,Uncured Genoa Salami,Unsalted Butter,Unsweetened Almondmilk,Unsweetened Original Almond Breeze Almond Milk,Yellow Onions
order_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
36,0,0,0,1,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
38,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
96,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
98,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0


In [21]:
frequent_items = apriori(basket[:100000], min_support=0.01, use_colnames=True)
frequent_items.head()



Unnamed: 0,support,itemsets
0,0.018668,(100% Whole Wheat Bread)
1,0.013298,(2% Reduced Fat Milk)
2,0.017304,(Apple Honeycrisp Organic)
3,0.02618,(Asparagus)
4,0.142376,(Bag of Organic Bananas)


In [23]:
rules = association_rules(frequent_items, metric='lift', min_threshold=1)
rules.sort_values('lift', ascending=False)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
19,(Limes),(Large Lemon),0.045115,0.063111,0.010325,0.228862,3.626315,0.007478,1.214942,0.758455
18,(Large Lemon),(Limes),0.063111,0.045115,0.010325,0.1636,3.626315,0.007478,1.141661,0.773025
31,(Organic Strawberries),(Organic Raspberries),0.091668,0.045594,0.013074,0.142625,3.128143,0.008895,1.113172,0.748979
30,(Organic Raspberries),(Organic Strawberries),0.045594,0.091668,0.013074,0.286749,3.128143,0.008895,1.273511,0.712822
20,(Organic Avocado),(Large Lemon),0.06634,0.063111,0.010293,0.155156,2.458445,0.006106,1.108948,0.635391
21,(Large Lemon),(Organic Avocado),0.063111,0.06634,0.010293,0.163093,2.458445,0.006106,1.115608,0.633201
3,(Organic Hass Avocado),(Bag of Organic Bananas),0.064379,0.142376,0.021449,0.333168,2.340054,0.012283,1.286117,0.612064
2,(Bag of Organic Bananas),(Organic Hass Avocado),0.142376,0.064379,0.021449,0.150651,2.340054,0.012283,1.101574,0.667728
4,(Organic Raspberries),(Bag of Organic Bananas),0.045594,0.142376,0.014811,0.324842,2.281578,0.008319,1.270257,0.588541
5,(Bag of Organic Bananas),(Organic Raspberries),0.142376,0.045594,0.014811,0.104026,2.281578,0.008319,1.065217,0.654957


In [24]:
df_ar = association_rules(frequent_items, metric = 'confidence', min_threshold = 0.2)
df_ar

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
0,(Organic Baby Spinach),(Bag of Organic Bananas),0.085828,0.142376,0.018498,0.215518,1.513725,0.006278,1.093236,0.371241
1,(Organic Hass Avocado),(Bag of Organic Bananas),0.064379,0.142376,0.021449,0.333168,2.340054,0.012283,1.286117,0.612064
2,(Organic Raspberries),(Bag of Organic Bananas),0.045594,0.142376,0.014811,0.324842,2.281578,0.008319,1.270257,0.588541
3,(Organic Strawberries),(Bag of Organic Bananas),0.091668,0.142376,0.025456,0.277694,1.950424,0.012404,1.187341,0.536468
4,(Large Lemon),(Banana),0.063111,0.17642,0.016537,0.262029,1.48526,0.005403,1.116007,0.348726
5,(Organic Avocado),(Banana),0.06634,0.17642,0.019318,0.291198,1.650598,0.007614,1.161933,0.422165
6,(Strawberries),(Banana),0.050996,0.17642,0.015248,0.298997,1.694804,0.006251,1.17486,0.431991
7,(Limes),(Large Lemon),0.045115,0.063111,0.010325,0.228862,3.626315,0.007478,1.214942,0.758455
8,(Organic Raspberries),(Organic Strawberries),0.045594,0.091668,0.013074,0.286749,3.128143,0.008895,1.273511,0.712822
