# Similarity Recommendation

* Collaborative Filtering
  * Similarity score is merchant similarity rank
  * Products list is most sold products in recent X weeks
    * The most valuable products from the `product_values` table were not used because they largely overlap with the top products of each merchant.
  * Avg daily purchase frequency is the count of each product in the list

In [1]:
import pandas as pd
import numpy as np
import datetime
import Levenshtein

import warnings
warnings.filterwarnings("ignore")

In [2]:
import ray

# Tear down any Ray runtime still attached to this process (e.g. left over from
# an earlier run of the notebook) before a fresh one is initialised.
ray.shutdown()

In [None]:
# Start Ray with its default arguments; the @ray.remote tasks defined in this
# notebook are executed on this runtime.
ray.init()

In [4]:
# Merchant the recommendations are built for. Its rows are split out of the
# order data, and every remaining merchant is compared against it.
target_merchant = '49th Parallel Grocery'

In [5]:
# Load the pre-split train/test order frames from the parent directory.
# NOTE(review): unpickling can execute arbitrary code embedded in the file, so
# these .pkl files must come from a trusted source.
all_order_train = pd.read_pickle('../all_order_train.pkl')
all_order_test = pd.read_pickle('../all_order_test.pkl')
print(all_order_train.shape, all_order_test.shape)

# Preview the first rows of the training frame.
all_order_train.head()

(33720820, 12) (98286, 12)


Unnamed: 0,order_id,user_id,order_number,order_hour_of_day,product_id,purchase_date,merchant,product_name,price,aisle,department,week_number
0,2539329,1,1,8,196,2019-02-13,L&M Markets (Hometown Grocers Co-op),Soda,3.08,soft drinks,beverages,7
1,2231262,31,17,11,196,2019-03-10,L&M Markets (Hometown Grocers Co-op),Soda,3.08,soft drinks,beverages,10
2,3058369,195,34,10,196,2019-02-25,L&M Markets (Hometown Grocers Co-op),Soda,3.08,soft drinks,beverages,9
3,2257155,951,3,20,196,2019-03-09,L&M Markets (Hometown Grocers Co-op),Soda,3.08,soft drinks,beverages,10
4,1121647,992,7,10,196,2019-03-08,L&M Markets (Hometown Grocers Co-op),Soda,3.08,soft drinks,beverages,10


In [6]:
# Carve the target merchant's rows out of both splits.
target_train = all_order_train[all_order_train['merchant'].eq(target_merchant)]
target_test = all_order_test[all_order_test['merchant'].eq(target_merchant)]

print(target_train.shape, target_test.shape)
target_train.head()

(1365312, 12) (3850, 12)


Unnamed: 0,order_id,user_id,order_number,order_hour_of_day,product_id,purchase_date,merchant,product_name,price,aisle,department,week_number
705698,2452257,7,20,9,47272,2019-04-04,49th Parallel Grocery,Antioxidant Infusions Beverage Malawi Mango,2.03,refrigerated,beverages,14
705699,1867977,2850,19,10,47272,2019-03-14,49th Parallel Grocery,Antioxidant Infusions Beverage Malawi Mango,2.03,refrigerated,beverages,11
705700,2879669,18719,1,23,47272,2019-02-23,49th Parallel Grocery,Antioxidant Infusions Beverage Malawi Mango,2.03,refrigerated,beverages,8
705701,252427,38924,5,13,47272,2019-03-31,49th Parallel Grocery,Antioxidant Infusions Beverage Malawi Mango,2.03,refrigerated,beverages,13
705702,1252985,55521,6,16,47272,2019-02-23,49th Parallel Grocery,Antioxidant Infusions Beverage Malawi Mango,2.03,refrigerated,beverages,8


In [7]:
# From here on the "all_order_*" frames hold only the other merchants, i.e. the
# candidate pool that is compared against the target merchant.
all_order_train = all_order_train[all_order_train['merchant'].ne(target_merchant)]
all_order_test = all_order_test[all_order_test['merchant'].ne(target_merchant)]

print(all_order_train.shape, all_order_test.shape)
all_order_train.head()

(32355508, 12) (94436, 12)


Unnamed: 0,order_id,user_id,order_number,order_hour_of_day,product_id,purchase_date,merchant,product_name,price,aisle,department,week_number
0,2539329,1,1,8,196,2019-02-13,L&M Markets (Hometown Grocers Co-op),Soda,3.08,soft drinks,beverages,7
1,2231262,31,17,11,196,2019-03-10,L&M Markets (Hometown Grocers Co-op),Soda,3.08,soft drinks,beverages,10
2,3058369,195,34,10,196,2019-02-25,L&M Markets (Hometown Grocers Co-op),Soda,3.08,soft drinks,beverages,9
3,2257155,951,3,20,196,2019-03-09,L&M Markets (Hometown Grocers Co-op),Soda,3.08,soft drinks,beverages,10
4,1121647,992,7,10,196,2019-03-08,L&M Markets (Hometown Grocers Co-op),Soda,3.08,soft drinks,beverages,10


## Merchant Similarity Score

* Here, I converted the 3 similarity factors (top products, size, name) into 1 score; a higher score represents higher similarity.
* Compared with sorting by the 3 factors, the single similarity score gives slightly different results.

In [8]:
@ray.remote
def get_merchant_data(merchant_df, top=10):
    """Summarise one merchant's catalogue size and best-selling products.

    Args:
        merchant_df: DataFrame with ``merchant``, ``product_id`` and
            ``order_id`` columns. It is expected to hold a single merchant.
            If several are present, the name and product count of the first
            merchant in sorted order are reported, while the top-product list
            is computed over all rows.
        top: Number of best-selling products to keep.

    Returns:
        dict with keys ``merchant`` (merchant name), ``product_ct`` (number of
        distinct products) and ``top_prod_lst`` (product ids as strings,
        ordered by the number of distinct orders containing them, descending).

    Raises:
        ValueError: If ``merchant_df`` has no rows.
    """
    if merchant_df.empty:
        # Fail with a clear message instead of an opaque lookup error further down.
        raise ValueError('merchant_df is empty; cannot derive merchant data')

    # Distinct (merchant, product) pairs -> catalogue size per merchant.
    # Values are cast to str first, so ids are compared by their text form.
    merchant_products = merchant_df[['merchant', 'product_id']].astype('str').drop_duplicates()
    product_counts = merchant_products.groupby('merchant')['product_id'].count()

    merchant_data = {
        'merchant': product_counts.index[0],
        'product_ct': int(product_counts.iloc[0]),
    }

    # Distinct (product, order) pairs -> number of orders each product appears in.
    product_orders = merchant_df[['product_id', 'order_id']].astype('str').drop_duplicates()
    order_counts = product_orders.groupby('product_id')['order_id'].count()

    # A stable sort keeps tied products in product_id order, so which products
    # make the `top` cut-off does not depend on the sort algorithm's internals.
    top_products = order_counts.sort_values(ascending=False, kind='mergesort').head(n=top)
    merchant_data['top_prod_lst'] = top_products.index.tolist()

    return merchant_data


@ray.remote
def get_merchant_similarity(target_merchant_dct, merchant_dct):
    """Compare one candidate merchant against the target merchant.

    Both arguments are dicts with the keys ``merchant``, ``product_ct`` and
    ``top_prod_lst``.

    Returns a dict holding the candidate's name and three raw factors:
    ``prod_sim`` (number of top products shared with the target, higher is
    closer), ``size_sim`` (absolute gap between the two product counts, lower
    is closer) and ``name_sim`` (``Levenshtein.ratio`` of the two merchant
    names).
    """
    shared_top_products = set(target_merchant_dct['top_prod_lst']) & set(merchant_dct['top_prod_lst'])
    product_ct_gap = abs(target_merchant_dct['product_ct'] - merchant_dct['product_ct'])
    name_ratio = Levenshtein.ratio(target_merchant_dct['merchant'], merchant_dct['merchant'])

    return {
        'merchant': merchant_dct['merchant'],
        'prod_sim': len(shared_top_products),
        'size_sim': product_ct_gap,
        'name_sim': name_ratio,
    }

In [9]:
# Submit the summary task for the target merchant. `.remote(...)` returns a Ray
# object reference (kept in target_merchant_train), not the result itself.
target_merchant_train = get_merchant_data.remote(target_train[['merchant', 'product_id', 'order_id']], top=10)
# Block until the task has finished and fetch the resulting dict.
target_merchant_dct = ray.get(target_merchant_train)

print(target_merchant_dct)

{'merchant': '49th Parallel Grocery', 'product_ct': 37655, 'top_prod_lst': ['24852', '13176', '21137', '21903', '47209', '47766', '47626', '16797', '26209', '27845']}


In [10]:
merchant_lst = all_order_train['merchant'].unique()

# Split the frame by merchant in a single groupby pass instead of re-scanning
# every row once per merchant with a boolean mask. sort=False yields the groups
# in order of first appearance, i.e. the same order as merchant_lst.
# Rows without a merchant are skipped by groupby, so no empty frame is ever
# submitted to a task.
results = [get_merchant_data.remote(merchant_rows)
           for _, merchant_rows in all_order_train[['merchant', 'product_id', 'order_id']]
                                       .groupby('merchant', sort=False, observed=True)]
merchant_data_lst = ray.get(results)

print(len(merchant_data_lst))
merchant_data_lst[7:9]

48


[{'merchant': 'North Central Co-op',
  'product_ct': 35920,
  'top_prod_lst': ['24852',
   '13176',
   '21137',
   '21903',
   '47209',
   '47766',
   '47626',
   '16797',
   '26209',
   '27966']},
 {'merchant': 'Nations Fresh Food',
  'product_ct': 24493,
  'top_prod_lst': ['24852',
   '13176',
   '21137',
   '21903',
   '47209',
   '47766',
   '47626',
   '27966',
   '16797',
   '27845']}]

In [11]:
# One similarity task per candidate merchant. The first argument is the Ray
# object reference of the target merchant's summary (not the fetched dict);
# Ray resolves it to the underlying dict before the task body runs.
results = [
    get_merchant_similarity.remote(target_merchant_train, candidate)
    for candidate in merchant_data_lst
]
merchant_similarity_lst = ray.get(results)

merchant_similarity_df = pd.DataFrame(merchant_similarity_lst)
print(merchant_similarity_df.shape)

# Rank by the raw factors: most shared top products first, then the smallest
# size gap, then the closest name.
merchant_similarity_df = merchant_similarity_df.sort_values(
    by=['prod_sim', 'size_sim', 'name_sim'],
    ascending=[False, True, False],
)
merchant_similarity_df.head()

(48, 4)


Unnamed: 0,merchant,prod_sim,size_sim,name_sim
5,Avril (Health Supermarket),10,20,0.297872
15,Askew's Foods,10,80,0.176471
2,Coleman's,10,150,0.133333
4,Asian Food Centre,10,186,0.263158
10,Valu Foods,10,557,0.258065


In [12]:
# Observed range of the two integer factors; these bounds are read by
# get_similarity_score for min-max scaling.
prod_sim_min = int(merchant_similarity_df['prod_sim'].min())
prod_sim_max = int(merchant_similarity_df['prod_sim'].max())

size_sim_min = int(merchant_similarity_df['size_sim'].min())
size_sim_max = int(merchant_similarity_df['size_sim'].max())

print(prod_sim_min, prod_sim_max, size_sim_min, size_sim_max)

9 10 20 23859


In [13]:
def get_similarity_score(r):
    """Collapse a merchant's three similarity factors into one score.

    ``r`` is a row (or any mapping) with ``prod_sim``, ``size_sim`` and
    ``name_sim``. ``prod_sim`` and ``size_sim`` are min-max scaled with the
    module-level ``prod_sim_min``/``prod_sim_max`` and
    ``size_sim_min``/``size_sim_max``. ``size_sim`` is inverted, because a
    smaller size gap means a closer match. The scaled factors are multiplied
    with ``name_sim`` and the product is rounded to 4 decimals; a higher score
    means a more similar merchant.

    A factor whose max equals its min is identical for every merchant and
    cannot discriminate between them, so it is treated as a neutral 1.0
    instead of dividing by zero.
    """
    prod_range = prod_sim_max - prod_sim_min
    size_range = size_sim_max - size_sim_min

    # Operations are applied in the same left-to-right order as the plain
    # product formula, so non-degenerate inputs give the same float result.
    similarity = (r['prod_sim'] - prod_sim_min) / prod_range if prod_range else 1.0
    if size_range:
        similarity = similarity * (size_sim_max - r['size_sim']) / size_range
    similarity = similarity * r['name_sim']

    return round(similarity, 4)

In [16]:
# Attach the combined score to every merchant and rank best match first.
merchant_similarity_df = (
    merchant_similarity_df
    .assign(similarity_score=lambda df: df.apply(get_similarity_score, axis=1))
    .sort_values(by='similarity_score', ascending=False)
)

merchant_similarity_df.head()

Unnamed: 0,merchant,prod_sim,size_sim,name_sim,similarity_score
1,Fairway Markets,10,1177,0.333333,0.3172
5,Avril (Health Supermarket),10,20,0.297872,0.2979
4,Asian Food Centre,10,186,0.263158,0.2613
10,Valu Foods,10,557,0.258065,0.2523
23,Fresh City Market,10,3535,0.263158,0.2244


## Recent Popular Products

In [17]:
# Preview the training orders of the candidate pool (target merchant already removed).
all_order_train.head()

Unnamed: 0,order_id,user_id,order_number,order_hour_of_day,product_id,purchase_date,merchant,product_name,price,aisle,department,week_number
0,2539329,1,1,8,196,2019-02-13,L&M Markets (Hometown Grocers Co-op),Soda,3.08,soft drinks,beverages,7
1,2231262,31,17,11,196,2019-03-10,L&M Markets (Hometown Grocers Co-op),Soda,3.08,soft drinks,beverages,10
2,3058369,195,34,10,196,2019-02-25,L&M Markets (Hometown Grocers Co-op),Soda,3.08,soft drinks,beverages,9
3,2257155,951,3,20,196,2019-03-09,L&M Markets (Hometown Grocers Co-op),Soda,3.08,soft drinks,beverages,10
4,1121647,992,7,10,196,2019-03-08,L&M Markets (Hometown Grocers Co-op),Soda,3.08,soft drinks,beverages,10


In [21]:
latest_period = 2  # in weeks
# Keep the last `latest_period` distinct week numbers found in the training data.
# NOTE(review): latest_period = 0 would select every week, because [-0:] is [0:].
# NOTE(review): sorting raw week numbers presumably assumes the data stays within
# one calendar year — confirm.
week_lst = sorted(all_order_train['week_number'].unique())[-latest_period:]
week_lst

[20, 21]

In [30]:
# For the most recent weeks, count in how many distinct orders each product
# appears across the candidate merchants, most frequently ordered first.
# Values are cast to str, so product ids are comparable with the string ids
# stored in target_merchant_dct['top_prod_lst'].
prod_ct_df = (
    all_order_train.loc[all_order_train['week_number'].isin(week_lst),
                        ['product_id', 'product_name', 'order_id']]
    .astype('str')
    .drop_duplicates()
    .groupby(['product_id', 'product_name'], as_index=False)['order_id']
    .agg('count')
    .reset_index(drop=True)
    .sort_values(by='order_id', ascending=False)
)

# Drop products that are already among the target merchant's top sellers.
prod_ct_df = prod_ct_df[~prod_ct_df['product_id'].isin(target_merchant_dct['top_prod_lst'])]
prod_ct_df.head()

Unnamed: 0,product_id,product_name,order_id
18505,49683,Cucumber Kirby,715
7009,24964,Organic Garlic,693
8423,27966,Organic Raspberries,688
6098,22935,Organic Yellow Onion,642
13740,39275,Organic Blueberries,595


In [33]:
n = 20  # how many of the recent popular products to keep
product_lst = prod_ct_df['product_id'].head(n).values
print(product_lst, end='\n\n')
print(prod_ct_df['product_name'].head(n).values)

['49683' '24964' '27966' '22935' '39275' '45007' '28204' '4605' '42265'
 '44632' '5876' '4920' '40706' '30391' '30489' '8518' '27104' '45066'
 '5077' '17794']

['Cucumber Kirby' 'Organic Garlic' 'Organic Raspberries'
 'Organic Yellow Onion' 'Organic Blueberries' 'Organic Zucchini'
 'Organic Fuji Apple' 'Yellow Onions' 'Organic Baby Carrots'
 'Sparkling Water Grapefruit' 'Organic Lemon' 'Seedless Red Grapes'
 'Organic Grape Tomatoes' 'Organic Cucumber' 'Original Hummus'
 'Organic Red Onion' 'Fresh Cauliflower' 'Honeycrisp Apple'
 '100% Whole Wheat Bread' 'Carrots']


## Collaborative Filtering