In [1]:
# import packages
import random
import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics.pairwise import cosine_similarity 

import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline 
plt.style.use('seaborn')

from IPython.core.display import display, HTML
display(HTML("<style>.container { width:95% !important; }</style>"))

In [2]:
# Read the six CSV tables from the notebook's working directory:
# the small lookup tables first, then the order-level tables.
aisles = pd.read_csv('aisles.csv')
departments = pd.read_csv('departments.csv')
products = pd.read_csv('products.csv')
orders = pd.read_csv('orders.csv')
order_products_prior = pd.read_csv('order_products__prior.csv')
order_products_train = pd.read_csv('order_products__train.csv')

In [3]:
# Print the (rows, columns) shape of every loaded table, one per line.
loaded_tables = (order_products_prior, order_products_train, orders,
                 aisles, departments, products)
for table in loaded_tables:
    print(table.shape)

(32434489, 4)
(1384617, 4)
(3421083, 7)
(134, 2)
(21, 2)
(49688, 4)


In [4]:
# Stack the prior and train order lines into one table of every ordered product.
order_line_frames = [order_products_prior, order_products_train]
order_products_total = pd.concat(order_line_frames)

total_order_lines = len(order_products_total)
print('there are', total_order_lines, 'number of products have been ordered')

there are 33819106 number of products have been ordered


In [5]:
# Drop the cart position column and attach each product's name via a left join on product_id.
product_names = products[['product_id', 'product_name']]
order_products_total = (
    order_products_total
    .drop(columns='add_to_cart_order')
    .merge(product_names, how='left', on='product_id')
)
order_products_total.head()

Unnamed: 0,order_id,product_id,reordered,product_name
0,2,33120,1,Organic Egg Whites
1,2,28985,1,Michigan Organic Kale
2,2,9327,0,Garlic Powder
3,2,45918,1,Coconut Butter
4,2,30035,0,Natural Sweetener


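- To build my recommender, I decided to focus on products that users have reordered before.
- Pipeline:
    - Find products that have been reordered before.
    - Attach each reordered product to the user who ordered it.
    - Build a user × product count matrix and compute the cosine similarity between users.
    - Score products for a user by weighting every user's purchase counts by their similarity to that user.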

In [6]:
# keep only the order lines flagged as reorders (reordered == 1)
was_reordered = order_products_total['reordered'].eq(1)
reorders = order_products_total.loc[was_reordered]
reorders.shape

(19955360, 4)

In [7]:
# preview the first rows of the reorder-only table
reorders.head()

Unnamed: 0,order_id,product_id,reordered,product_name
0,2,33120,1,Organic Egg Whites
1,2,28985,1,Michigan Organic Kale
3,2,45918,1,Coconut Butter
5,2,17794,1,Carrots
6,2,40141,1,Original Unflavored Gelatine Mix


In [12]:
# order -> user lookup: only the two columns needed to attach a user_id to each order line
orders2 = orders.loc[:, ['order_id', 'user_id']]

In [13]:
# attach the ordering user's id to every reordered line (inner join on order_id)
user_orders = pd.merge(reorders, orders2, on='order_id')

In [14]:
# flag "high volume" rows (product reordered more than once) and keep only those rows.
# NOTE(review): value_counts() returns a Series indexed by product_id, and assigning a
# Series to a DataFrame column aligns on index labels. The flag therefore appears to be
# matched against user_orders' row labels rather than each row's product_id, leaving
# unmatched rows NaN (which the `== True` filter then drops). The counts are also per
# product across all users, not per (user, product). If a per-row lookup was intended,
# something like user_orders['product_id'].map(counts > 1) would be needed -- confirm
# the intent before changing, since every downstream result depends on this subset.
user_orders['high_volume'] = (user_orders['product_id'].value_counts().sort_values(ascending=False)>1)
high_volume = user_orders[user_orders['high_volume'] == True]

In [15]:
# user x product matrix: each cell counts the high-volume rows for that (user, product) pair; 0 when there are none
user_product_counts = high_volume.groupby(['user_id', 'product_name']).size()
user_product_counts = user_product_counts.sort_values(ascending=False)
high_volume_users = user_product_counts.unstack().fillna(0)

In [16]:
# pairwise cosine similarity between users' rows of the count matrix
# (despite the variable name, these are similarities, not distances)
user_similarity_values = cosine_similarity(high_volume_users)
user_index = high_volume_users.index
cosine_dists = pd.DataFrame(user_similarity_values, index=user_index, columns=user_index)

In [17]:
# preview the user x user similarity matrix (the diagonal is each user against themselves)
cosine_dists.head()

user_id,27,66,90,150,155,206,208,214,222,382,...,205908,205943,205970,205990,206043,206082,206105,206158,206162,206206
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
27,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.176777,0.0,0.0,0.0,0.0,0.0,0.0,0.0
66,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
90,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
150,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
155,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [18]:
def Recommender_System(user_id, n=5, user_item=None, user_similarity=None):
    '''
    Score products for one user from the purchase counts of similar users.

    Every user's row of the user x product count matrix is weighted by that
    user's similarity to `user_id`; the weighted rows are summed to give one
    score per product. Products the user has already bought are not excluded.

    Parameters
    ----------
    user_id
        Label of the user to recommend for. Must be a column of the
        similarity matrix.
    n : int, default 5
        Number of top-scoring products to return.
    user_item : pandas.DataFrame, optional
        User x product count matrix (users on the index, products on the
        columns). Defaults to the notebook-level `high_volume_users`.
    user_similarity : pandas.DataFrame, optional
        User x user similarity matrix. Defaults to the notebook-level
        `cosine_dists`.

    Returns
    -------
    pandas.Series
        Scores indexed by product, highest first, at most `n` entries.

    Raises
    ------
    KeyError
        If `user_id` is not a column of the similarity matrix.
    '''
    # Reuse the matrices built once at notebook level instead of re-pivoting
    # the purchases and recomputing the full similarity matrix on every call
    # (the recomputed similarity matrix was never used).
    if user_item is None:
        user_item = high_volume_users
    if user_similarity is None:
        user_similarity = cosine_dists

    if user_id not in user_similarity.columns:
        raise KeyError('user_id {!r} is not in the similarity matrix'.format(user_id))

    # DataFrame.dot aligns the product x user matrix with the similarity
    # column on user labels, so the result does not depend on both frames
    # happening to share the same row order.
    scores = user_item.T.dot(user_similarity[user_id])
    return scores.sort_values(ascending=False).head(n)

In [19]:
# draw one random user id (returned as a single-element list) to use as recommender input
candidate_user_ids = high_volume['user_id'].tolist()
random.sample(candidate_user_ids, 1)

[12325]

In [15]:
# top-scoring products for user 91397
Recommender_System(91397)

product_name
Bag of Organic Bananas                  132.579424
Organic Whole Milk                       44.788580
Limes                                    36.012075
Organic Large Extra Fancy Fuji Apple     23.582827
Organic Hass Avocado                     20.269398
dtype: float64

In [19]:
# left-join every order line (reordered or not) to its order's metadata, including user_id.
# NOTE(review): `_` is also the name IPython uses for the last cell output, so this
# assignment shadows it; later cells read this frame through `_`. A descriptive name
# would be safer, but it has to be renamed in every cell that uses it.
_ = order_products_total.merge(orders, on='order_id', how='left')

In [20]:
# preview the joined order-line / order table
_.head()

Unnamed: 0,order_id,product_id,reordered,product_name,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
0,2,33120,1,Organic Egg Whites,202279,prior,3,5,9,8.0
1,2,28985,1,Michigan Organic Kale,202279,prior,3,5,9,8.0
2,2,9327,0,Garlic Powder,202279,prior,3,5,9,8.0
3,2,45918,1,Coconut Butter,202279,prior,3,5,9,8.0
4,2,30035,0,Natural Sweetener,202279,prior,3,5,9,8.0


In [18]:
# the 20 products user 91397 ordered most often, to compare against the recommendations above
_[_.user_id == 91397].product_name.value_counts().head(20)

Organic Whole Milk                        19
Organic Chicken Strips                    13
Limes                                      9
Maple & Pecan Granola Gluten Free          7
Soft Pretzel Mini Buns                     7
Shredded Mild Cheddar Cheese               7
Organic Large Extra Fancy Fuji Apple       6
Smoked Turkey Breast Slices                6
Organic Traditional Flour Tortillas        6
Red Vine Tomato                            6
Organic Ginger Root                        5
Organic Decorticated Cardamom              5
Ginger Brew Caffeine Free                  5
Organic Vanilla Whole Milk Yogurt          5
Natural Classic Pork Breakfast Sausage     4
Organic Cream Cheese Bar                   4
Uncured Slow Cooked Ham                    4
Bag of Organic Bananas                     4
Organic Whole String Cheese                4
Pico De Gallo Chunky Salsa                 4
Name: product_name, dtype: int64

In [25]:
# draw another random user id (single-element list); unseeded, so the result differs per run
random.sample(high_volume['user_id'].tolist(),1)

[141736]

In [26]:
# top-scoring products for user 175965
Recommender_System(175965)

product_name
Organic Blueberries                16.798603
Unsweetened Vanilla Almond Milk     6.828189
Organic Large Green Asparagus       4.162072
Bag of Organic Bananas              3.947690
Organic Strawberries                3.527694
dtype: float64

In [27]:
# the 20 products user 175965 ordered most often, to compare against the recommendations above
_[_.user_id == 175965].product_name.value_counts().head(20)

Organic Granny Smith Apple                                       9
Unsweetened Vanilla Almond Milk                                  8
Organic Avocado                                                  7
Organic Raspberries                                              6
Organic Bagged Mini Dark Peanut Butter                           6
Organic Strawberries                                             6
Organic Blackberries                                             6
Holler Mountain Organic Coffee                                   5
Organic Carrot Bunch                                             5
Organic D'Anjou Pears                                            5
Organic Cauliflower                                              5
Sweet Onion                                                      4
Organic SprouTofu Silken Tofu                                    4
Organic Peeled Whole Baby Carrots                                4
Organic Hot Italian Chicken Sausage                           

## Metric

In [21]:
# distinct user ids present in the high-volume subset, as a plain Python list
users = high_volume['user_id'].unique().tolist()

In [33]:
def how_match(user_ids=None, top_n=20, order_history=None, recommender=None):
    '''
    Average share of each user's recommendations found among their own
    most-ordered products.

    For every user, the recommended product names are compared with the
    user's `top_n` most frequently ordered product names; the per-user hit
    fraction is then averaged over all evaluated users.

    Parameters
    ----------
    user_ids : iterable, optional
        Users to evaluate. Defaults to `sorted(users)[1000:2000]` from the
        notebook-level `users` list.
    top_n : int, default 20
        How many of each user's most-ordered products count as a hit.
    order_history : pandas.DataFrame, optional
        Order lines with `user_id` and `product_name` columns. Defaults to
        the notebook-level joined frame stored in `_`.
    recommender : callable, optional
        Maps a user id to a pandas object whose index holds the recommended
        product names. Defaults to `Recommender_System`.

    Returns
    -------
    float
        Mean hit fraction, or NaN when there are no users to evaluate.
    '''
    if user_ids is None:
        user_ids = sorted(users)[1000:2000]
    if order_history is None:
        order_history = _
    if recommender is None:
        recommender = Recommender_System

    user_ids = list(user_ids)
    if not user_ids:
        # np.mean([]) would emit a RuntimeWarning before returning NaN
        return float('nan')

    # Filter the (large) order history once and group it by user, instead of
    # rescanning the whole table inside the loop for every single user.
    relevant = order_history[order_history['user_id'].isin(user_ids)]
    top_items_by_user = {
        uid: set(names.value_counts().head(top_n).index)
        for uid, names in relevant.groupby('user_id')['product_name']
    }

    res = []
    for user in user_ids:
        recommended = set(recommender(user).index)
        if not recommended:
            res.append(0.0)
            continue
        # users without any order history simply score no hits
        hits = recommended & top_items_by_user.get(user, set())
        # divide by the number of recommendations actually returned rather
        # than a hard-coded 5
        res.append(len(hits) / len(recommended))
    return np.mean(res)

In [35]:
# average share of each user's recommendations that appear in their own top-20 products,
# evaluated over users 1000:2000 of the sorted high-volume user list
how_match()

0.522