# Popularity Recommendation

In [1]:
import pandas as pd
import numpy as np
import datetime

import warnings
warnings.filterwarnings("ignore")

In [2]:
all_order_train = pd.read_pickle('../all_order_train.pkl')
all_order_test = pd.read_pickle('../all_order_test.pkl')
print(all_order_train.shape, all_order_test.shape)

all_order_train.head()

(33720820, 12) (98286, 12)


Unnamed: 0,order_id,user_id,order_number,order_hour_of_day,product_id,purchase_date,merchant,product_name,price,aisle,department,week_number
0,2539329,1,1,8,196,2019-02-13,L&M Markets (Hometown Grocers Co-op),Soda,3.08,soft drinks,beverages,7
1,2231262,31,17,11,196,2019-03-10,L&M Markets (Hometown Grocers Co-op),Soda,3.08,soft drinks,beverages,10
2,3058369,195,34,10,196,2019-02-25,L&M Markets (Hometown Grocers Co-op),Soda,3.08,soft drinks,beverages,9
3,2257155,951,3,20,196,2019-03-09,L&M Markets (Hometown Grocers Co-op),Soda,3.08,soft drinks,beverages,10
4,1121647,992,7,10,196,2019-03-08,L&M Markets (Hometown Grocers Co-op),Soda,3.08,soft drinks,beverages,10


## Products Popularity & Value

The purpose of this step is to generate the popularity and value for each product.

* For each product there are 3 values:
  * `popularity = total purchase amount in a specific period`
    * Purchase frequency is 1 for both daily and weekly frequency. So for this date, I'm using total amount instead of frequency to calculate the popularity.
  * `total_popularity = latest_popularity * weight_latest + earlier_popularity * weight_earlier`
    * `weight_latest` is the weight for the latest period, such as latest 2 weeks
    * `weight_earlier` is the weight for the rest of the earlier period
    * weight_latest should be higher than weight_earlier, since the assumption is the forecast is closer to its latest previous period.
  * `product_value = unit_price * latest_period_purchase / latest_period`
    * `latest_period` is latest N weeks, such as latest 2 weeks. Same value as latest period used in `total_popularity`
* Sort all the products by total_popularity, product_value descending order
* For the target merchant who do not have the top N products, recommend these

### Get Product Popularity

In [3]:
popularity_df = all_order_train[['product_id', 'order_id', 'week_number']].drop_duplicates()\
                                                  .groupby(['product_id', 'week_number'], as_index=False)['order_id']\
                                                  .agg('count').reset_index()\
                                                  .sort_values(by=['product_id', 'week_number'])
popularity_df.head()

Unnamed: 0,index,product_id,week_number,order_id
0,0,1,7,178
1,1,1,8,332
2,2,1,9,185
3,3,1,10,246
4,4,1,11,219


In [4]:
popularity_df.drop('index', inplace=True, axis=1)
popularity_df = popularity_df.rename(index=str, columns={'order_id': 'weekly_popularity'})

popularity_df.head()

Unnamed: 0,product_id,week_number,weekly_popularity
0,1,7,178
1,1,8,332
2,1,9,185
3,1,10,246
4,1,11,219


In [10]:
def get_total_popularity(latest_weeks, latest_weight, earlier_weight, popularity_df):
    latest_popularity_df = popularity_df.groupby('product_id')\
                                    .apply(lambda x: x.nlargest(latest_weeks, 'week_number'))\
                                    .reset_index(drop=True)  

    earlier_popularity_df = popularity_df.groupby('product_id')\
                                    .apply(lambda x: x.nsmallest(x['week_number'].nunique()-latest_weeks, 'week_number'))\
                                    .reset_index(drop=True) 
    
    latest_popularity_agg = latest_popularity_df.groupby('product_id', 
                                                     as_index=False)['weekly_popularity'].agg(['sum']).reset_index()
    latest_popularity_agg = latest_popularity_agg.rename(index=str, columns={'sum': 'latest_popularity'})
    earlier_popularity_agg = earlier_popularity_df.groupby('product_id', 
                                                         as_index=False)['weekly_popularity'].agg(['sum']).reset_index()
    earlier_popularity_agg = earlier_popularity_agg.rename(index=str, columns={'sum': 'earlier_popularity'})
    
    popularity_agg = latest_popularity_agg.merge(earlier_popularity_agg, on='product_id')
    popularity_agg['total_popularity'] = popularity_agg['latest_popularity']*latest_weight + popularity_agg['earlier_popularity']*earlier_weight

    return popularity_agg

In [11]:
popularity_agg = get_total_popularity(latest_weeks=2, latest_weight=0.7, earlier_weight=0.01, popularity_df=popularity_df)
print(popularity_agg.shape)

popularity_agg.head()

(49019, 4)


Unnamed: 0,product_id,latest_popularity,earlier_popularity,total_popularity
0,1,8,1909,24.69
1,2,3,91,3.01
2,3,5,278,6.28
3,4,8,342,9.02
4,5,2,14,1.54


In [17]:
popularity_agg['product_id'].nunique()

49019

### Get Product Value

The value here is calculated for latets_period.
Latest period should be the same as forecasting moving window.

In [7]:
def get_product_value(latest_weeks, df):
    latest_df = df[['product_id', 'week_number']].drop_duplicates().groupby('product_id')\
                                    .apply(lambda x: x.nlargest(latest_weeks, 'week_number'))\
                                    .reset_index(drop=True) 
    
    value_df = pd.merge(df[['product_id', 'price', 'order_id', 'week_number']], latest_df, how='inner',
                                                        left_on=['product_id', 'week_number'],
                                                        right_on=['product_id', 'week_number'])
    
    latest_value_df = value_df.groupby(['product_id', 'price'], as_index=False)['order_id']\
                              .agg({'order_id': 'count'})

    latest_value_df['product_value'] = latest_value_df['price'] * latest_value_df['order_id']/latest_weeks
    
    return latest_value_df

In [15]:
latest_value_df = get_product_value(latest_weeks=2, df=all_order_train)
print(latest_value_df.shape)

latest_value_df.head()

(49685, 4)


Unnamed: 0,product_id,price,order_id,product_value
0,1,13.33,8,53.32
1,2,6.57,3,9.855
2,3,7.76,5,19.4
3,4,13.76,8,55.04
4,5,6.21,2,6.21


In [16]:
latest_value_df['product_id'].nunique()

49685

### Get Product Average Daily Sales

Based on the special situation in this data, the average daily sales might be the same as product price.

In [12]:
purchase_ct_df = all_order_train[['product_id', 'order_id', 'purchase_date']].astype('str').drop_duplicates()\
                                                  .groupby(['product_id'], as_index=False)[['order_id', 'purchase_date']]\
                                                  .agg({'order_id': 'count', 'purchase_date': 'count'}).reset_index()

purchase_ct_df.head()

Unnamed: 0,index,product_id,order_id,purchase_date
0,0,1,1917,1917
1,1,10,2686,2686
2,2,100,457,457
3,3,1000,2688,2688
4,4,10000,13,13


In [14]:
print(max(purchase_ct_df['purchase_date']/purchase_ct_df['order_id']), min(purchase_ct_df['purchase_date']/purchase_ct_df['order_id']))

1.0 1.0


In [20]:
product_values = popularity_agg[['product_id', 'total_popularity']].merge(latest_value_df[['product_id', 'product_value', 'price']],
                                                                          on='product_id')
print(product_values.shape, product_values['product_id'].nunique())
product_values = product_values.rename(index=str, columns={'price': 'avg_daily_sales'})

product_values.head()

(49019, 4) 49019


Unnamed: 0,product_id,total_popularity,product_value,avg_daily_sales
0,1,24.69,53.32,13.33
1,2,3.01,9.855,6.57
2,3,6.28,19.4,7.76
3,4,9.02,55.04,13.76
4,5,1.54,6.21,6.21


In [24]:
product_values = product_values.sort_values(by=['total_popularity', 'product_value'], ascending=[False, False])

product_values.head()

Unnamed: 0,product_id,total_popularity,product_value,avg_daily_sales
24517,24852,6935.67,31742.0,21.52
12984,13176,5385.83,12622.375,12.05
20846,21137,3708.33,6519.1,9.38
21602,21903,3537.39,15351.59,20.62
46574,47209,3003.98,6164.91,10.62


In [23]:
product_values.to_pickle('product_values.pkl')

## Sample Merchants Recommendation

* Recommend top 10 products cannot be found in a merchant's transactions.
* Forecast the sales in the next 1 week after adding each recommended product.

* Notes
  * Product value is the latest value, there should be `latest_period = forecasting moving window size`

In [9]:
sample_merchant_df1 = all_order_train.loc[all_order_train['merchant']=='49th Parallel Grocery']
print(sample_merchant_df1.shape)

sample_merchant_df1.head()

(1365312, 12)


Unnamed: 0,order_id,user_id,order_number,order_hour_of_day,product_id,purchase_date,merchant,product_name,price,aisle,department,week_number
705698,2452257,7,20,9,47272,2019-04-04,49th Parallel Grocery,Antioxidant Infusions Beverage Malawi Mango,2.03,refrigerated,beverages,14
705699,1867977,2850,19,10,47272,2019-03-14,49th Parallel Grocery,Antioxidant Infusions Beverage Malawi Mango,2.03,refrigerated,beverages,11
705700,2879669,18719,1,23,47272,2019-02-23,49th Parallel Grocery,Antioxidant Infusions Beverage Malawi Mango,2.03,refrigerated,beverages,8
705701,252427,38924,5,13,47272,2019-03-31,49th Parallel Grocery,Antioxidant Infusions Beverage Malawi Mango,2.03,refrigerated,beverages,13
705702,1252985,55521,6,16,47272,2019-02-23,49th Parallel Grocery,Antioxidant Infusions Beverage Malawi Mango,2.03,refrigerated,beverages,8


In [47]:
sample_products_set1 = sample_merchant_df1['product_id'].unique()
print(len(sample_products_set1), len(sample_merchant_df1['department'].unique()))

recommended_products = product_values.loc[~product_values['product_id'].isin(sample_products_set1)]\
                                     .sort_values(by=['total_popularity', 'product_value'], ascending=[False, False]).head(n=10)
recommended_products

37655 21


Unnamed: 0,product_id,total_popularity,product_value,avg_daily_sales
4519,4585,21.11,83.25,5.55
14401,14609,17.6,190.0,15.2
30991,31418,16.49,42.32,3.68
35604,36088,14.9,101.955,9.71
42351,42926,14.81,74.13,7.06
2439,2465,13.44,81.89,8.62
36184,36674,13.07,363.69,40.41
7840,7957,11.61,24.32,3.04
7950,8067,11.3,26.96,3.37
37039,37539,11.27,99.36,12.42
