# Calculate Merchant Similarity

Find the top N merchants most similar to a target merchant.

In [63]:
import pandas as pd
import numpy as np
import datetime
import Levenshtein

import warnings
warnings.filterwarnings("ignore")

In [57]:
import ray

# Shut down any Ray runtime that may still be attached to this process
# (e.g. from an earlier run of the notebook) before initialising a new one.
ray.shutdown()

In [None]:
# Start Ray with its default settings; the @ray.remote functions defined
# further down are submitted to this runtime.
ray.init()

In [44]:
# Merchant that every other merchant is compared against in this notebook.
target_merchant = '49th Parallel Grocery'

In [51]:
# Load the pre-built train/test order tables (one row per ordered product).
# NOTE: read_pickle executes whatever the pickle contains on load, so these
# files must come from a trusted source.
all_order_train = pd.read_pickle('../all_order_train.pkl')
all_order_test = pd.read_pickle('../all_order_test.pkl')
print(all_order_train.shape, all_order_test.shape)

all_order_train.head()

(33720820, 12) (98286, 12)


Unnamed: 0,order_id,user_id,order_number,order_hour_of_day,product_id,purchase_date,merchant,product_name,price,aisle,department,week_number
0,2539329,1,1,8,196,2019-02-13,L&M Markets (Hometown Grocers Co-op),Soda,3.08,soft drinks,beverages,7
1,2231262,31,17,11,196,2019-03-10,L&M Markets (Hometown Grocers Co-op),Soda,3.08,soft drinks,beverages,10
2,3058369,195,34,10,196,2019-02-25,L&M Markets (Hometown Grocers Co-op),Soda,3.08,soft drinks,beverages,9
3,2257155,951,3,20,196,2019-03-09,L&M Markets (Hometown Grocers Co-op),Soda,3.08,soft drinks,beverages,10
4,1121647,992,7,10,196,2019-03-08,L&M Markets (Hometown Grocers Co-op),Soda,3.08,soft drinks,beverages,10


In [52]:
# Keep only the rows that belong to the target merchant, for both splits.
target_train = all_order_train[all_order_train['merchant'].eq(target_merchant)]
target_test = all_order_test[all_order_test['merchant'].eq(target_merchant)]

print(target_train.shape, target_test.shape)
target_train.head()

(1365312, 12) (3850, 12)


Unnamed: 0,order_id,user_id,order_number,order_hour_of_day,product_id,purchase_date,merchant,product_name,price,aisle,department,week_number
705698,2452257,7,20,9,47272,2019-04-04,49th Parallel Grocery,Antioxidant Infusions Beverage Malawi Mango,2.03,refrigerated,beverages,14
705699,1867977,2850,19,10,47272,2019-03-14,49th Parallel Grocery,Antioxidant Infusions Beverage Malawi Mango,2.03,refrigerated,beverages,11
705700,2879669,18719,1,23,47272,2019-02-23,49th Parallel Grocery,Antioxidant Infusions Beverage Malawi Mango,2.03,refrigerated,beverages,8
705701,252427,38924,5,13,47272,2019-03-31,49th Parallel Grocery,Antioxidant Infusions Beverage Malawi Mango,2.03,refrigerated,beverages,13
705702,1252985,55521,6,16,47272,2019-02-23,49th Parallel Grocery,Antioxidant Infusions Beverage Malawi Mango,2.03,refrigerated,beverages,8


In [53]:
# Remove the target merchant's rows so the remaining tables hold only the
# candidate merchants it will be compared against.
all_order_train = all_order_train[all_order_train['merchant'].ne(target_merchant)]
all_order_test = all_order_test[all_order_test['merchant'].ne(target_merchant)]

print(all_order_train.shape, all_order_test.shape)
all_order_train.head()

(32355508, 12) (94436, 12)


Unnamed: 0,order_id,user_id,order_number,order_hour_of_day,product_id,purchase_date,merchant,product_name,price,aisle,department,week_number
0,2539329,1,1,8,196,2019-02-13,L&M Markets (Hometown Grocers Co-op),Soda,3.08,soft drinks,beverages,7
1,2231262,31,17,11,196,2019-03-10,L&M Markets (Hometown Grocers Co-op),Soda,3.08,soft drinks,beverages,10
2,3058369,195,34,10,196,2019-02-25,L&M Markets (Hometown Grocers Co-op),Soda,3.08,soft drinks,beverages,9
3,2257155,951,3,20,196,2019-03-09,L&M Markets (Hometown Grocers Co-op),Soda,3.08,soft drinks,beverages,10
4,1121647,992,7,10,196,2019-03-08,L&M Markets (Hometown Grocers Co-op),Soda,3.08,soft drinks,beverages,10


In [54]:
@ray.remote
def get_merchant_data(merchant_df, top=10):
    """Summarise one merchant's orders as a small dict (Ray remote task).

    Args:
        merchant_df: DataFrame with at least the columns 'merchant',
            'product_id' and 'order_id'. Only the first merchant group
            (in sorted order) is reported, so callers pass a single
            merchant's rows.
        top: Number of best-selling products to keep.

    Returns:
        A dict with keys
        'merchant'     - merchant name as a string,
        'product_ct'   - number of distinct product ids sold by the merchant,
        'top_prod_lst' - up to `top` product ids (as strings), ordered by the
                         number of distinct orders containing them, descending.
    """
    # Distinct (merchant, product) pairs, compared as strings.
    merchant_products = merchant_df[['merchant', 'product_id']].astype('str').drop_duplicates()
    products_per_merchant = (
        merchant_products.groupby(['merchant'], as_index=False)['product_id']
        .agg('count')
        .reset_index(drop=True)
    )
    # After the transpose, column label 0 is the first merchant's row.
    first_merchant = products_per_merchant.T.to_dict()[0]

    # Distinct (product, order) pairs, so a product is counted once per order.
    product_orders = merchant_df[['product_id', 'order_id']].astype('str').drop_duplicates()
    orders_per_product = (
        product_orders.groupby(['product_id'], as_index=False)['order_id']
        .agg('count')
        .reset_index(drop=True)
    )
    best_sellers = orders_per_product.sort_values(by='order_id', ascending=False).head(n=top)

    return {
        'merchant': first_merchant['merchant'],
        'product_ct': first_merchant['product_id'],
        'top_prod_lst': list(best_sellers['product_id'].values),
    }

In [84]:
# Build the target merchant's profile. .remote() submits the task and hands
# back a Ray object reference; ray.get() waits for it and returns the dict.
target_merchant_train = get_merchant_data.remote(target_train[['merchant', 'product_id', 'order_id']], top=10)
target_merchant_dct = ray.get(target_merchant_train)

print(target_merchant_dct)

{'merchant': '49th Parallel Grocery', 'product_ct': 37655, 'top_prod_lst': ['24852', '13176', '21137', '21903', '47209', '47766', '47626', '16797', '26209', '27845']}


In [59]:
merchant_lst = all_order_train['merchant'].unique()

# Split the table by merchant in one groupby pass rather than re-scanning all
# rows with a boolean mask once per merchant. sort=False yields the groups in
# order of first appearance, i.e. the same order as merchant_lst above;
# observed=True skips unused categories if 'merchant' is categorical.
merchant_cols = ['merchant', 'product_id', 'order_id']
results = [get_merchant_data.remote(merchant_df)
           for _, merchant_df in all_order_train[merchant_cols].groupby('merchant', sort=False, observed=True)]
# Block until every per-merchant profile has been computed.
merchant_data_lst = ray.get(results)

print(len(merchant_data_lst))
merchant_data_lst[7:9]

48


[{'merchant': 'North Central Co-op',
  'product_ct': 35920,
  'top_prod_lst': ['24852',
   '13176',
   '21137',
   '21903',
   '47209',
   '47766',
   '47626',
   '16797',
   '26209',
   '27966']},
 {'merchant': 'Nations Fresh Food',
  'product_ct': 24493,
  'top_prod_lst': ['24852',
   '13176',
   '21137',
   '21903',
   '47209',
   '47766',
   '47626',
   '27966',
   '16797',
   '27845']}]

### Calculate Merchant Similarity

In [87]:
@ray.remote
def get_merchant_similarity(target_merchant_dct, merchant_dct):
    """Compare a candidate merchant profile with the target's (Ray remote task).

    Both arguments are dicts with the keys 'merchant', 'product_ct' and
    'top_prod_lst'.

    Returns:
        A dict with keys
        'merchant' - the candidate merchant's name,
        'prod_sim' - how many top products the two merchants share,
        'size_sim' - absolute difference of their product counts
                     (smaller means closer),
        'name_sim' - Levenshtein.ratio of the two merchant names.
    """
    target_top = set(target_merchant_dct['top_prod_lst'])
    candidate_top = set(merchant_dct['top_prod_lst'])
    shared_top_products = len(target_top & candidate_top)

    product_ct_gap = abs(target_merchant_dct['product_ct'] - merchant_dct['product_ct'])

    name_ratio = Levenshtein.ratio(target_merchant_dct['merchant'], merchant_dct['merchant'])

    return {
        'merchant': merchant_dct['merchant'],
        'prod_sim': shared_top_products,
        'size_sim': product_ct_gap,
        'name_sim': name_ratio,
    }

In [91]:
# Score every candidate merchant against the target, one Ray task each.
# NOTE(review): the first argument is the object reference returned by the
# earlier .remote() call, not target_merchant_dct; Ray is expected to resolve
# it to the dict before the task body runs — confirm against the Ray docs.
results = [get_merchant_similarity.remote(target_merchant_train, merchant_dct) for merchant_dct in merchant_data_lst]
merchant_similarity_lst = ray.get(results)

merchant_similarity_df = pd.DataFrame(merchant_similarity_lst)
print(merchant_similarity_df.shape)

# Rank: most shared top products first, then smallest product-count gap,
# then highest name ratio.
merchant_similarity_df = merchant_similarity_df.sort_values(by=['prod_sim', 'size_sim', 'name_sim'], ascending=[False, True, False])
merchant_similarity_df.head()

(48, 4)


Unnamed: 0,merchant,prod_sim,size_sim,name_sim
5,Avril (Health Supermarket),10,20,0.297872
15,Askew's Foods,10,80,0.176471
2,Coleman's,10,150,0.133333
4,Asian Food Centre,10,186,0.263158
10,Valu Foods,10,557,0.258065


In [96]:
# Number of closest merchants to report from the ranked table.
similarity_count = 5
similar_merchants = merchant_similarity_df['merchant'].head(similarity_count).values
similar_merchants

array(['Avril (Health Supermarket)', "Askew's Foods", "Coleman's",
       'Asian Food Centre', 'Valu Foods'], dtype=object)

## Summary

* The key signals I use to calculate merchant similarity are:
  * Top-selling products
  * Store size (number of distinct products)
  * Merchant name
* In the real world, geographic distance can also be very helpful.
* In the code, the main idea is to use Ray for parallel processing, since the DataFrame groupby operations here can be slow.