In [1]:
import pandas as pd
import numpy as np
import datetime

import warnings
warnings.filterwarnings("ignore")

In [2]:
# Load the prepared train/test order tables from the parent directory.
# NOTE: pickle files execute code on load -- only read pickles you produced yourself.
all_order_train, all_order_test = (
    pd.read_pickle(pkl_path)
    for pkl_path in ('../all_order_train.pkl', '../all_order_test.pkl')
)
print(all_order_train.shape, all_order_test.shape)

all_order_train.head()

(33720820, 12) (98286, 12)


Unnamed: 0,order_id,user_id,order_number,order_hour_of_day,product_id,purchase_date,merchant,product_name,price,aisle,department,week_number
0,2539329,1,1,8,196,2019-02-13,L&M Markets (Hometown Grocers Co-op),Soda,3.08,soft drinks,beverages,7
1,2231262,31,17,11,196,2019-03-10,L&M Markets (Hometown Grocers Co-op),Soda,3.08,soft drinks,beverages,10
2,3058369,195,34,10,196,2019-02-25,L&M Markets (Hometown Grocers Co-op),Soda,3.08,soft drinks,beverages,9
3,2257155,951,3,20,196,2019-03-09,L&M Markets (Hometown Grocers Co-op),Soda,3.08,soft drinks,beverages,10
4,1121647,992,7,10,196,2019-03-08,L&M Markets (Hometown Grocers Co-op),Soda,3.08,soft drinks,beverages,10


## Popularity Recommendation

* For each product there are 3 values:
  * `popularity = total purchase amount in a specific period`
    * Purchase frequency is 1 at both daily and weekly granularity, so for this data I'm using the total purchase amount instead of frequency to calculate popularity.
  * `total_popularity = latest_popularity * weight_latest + earlier_popularity * weight_earlier`
    * `weight_latest` is the weight for the latest period, such as latest 2 weeks
    * `weight_earlier` is the weight for the rest of the earlier period
    * `weight_latest` should be higher than `weight_earlier`, on the assumption that the forecast period will look more like the most recent period than like earlier ones.
  * `product_value = unit_price * latest_period_purchase / latest_period`
    * `latest_period` is the latest N weeks, e.g. the latest 2 weeks. It is the same latest period used in `total_popularity`.
* Sort all products by `total_popularity`, then `product_value`, in descending order.
* For a target merchant that does not carry some of the top N products, recommend those products.

In [34]:
# Weekly popularity: number of distinct orders containing each product in each week.
# Duplicate (product, order, week) rows are removed first so an order is counted once.
# The trailing reset_index() adds a positional 'index' column to the result.
popularity_df = (
    all_order_train[['product_id', 'order_id', 'week_number']]
    .drop_duplicates()
    .groupby(['product_id', 'week_number'], as_index=False)['order_id']
    .agg('count')
    .reset_index()
    .sort_values(by=['product_id', 'week_number'])
)
popularity_df.head()

Unnamed: 0,index,product_id,week_number,order_id
0,0,1,7,178
1,1,1,8,332
2,2,1,9,185
3,3,1,10,246
4,4,1,11,219


In [35]:
# Remove the leftover positional 'index' column and give the per-week order count a
# descriptive name.
# - No inplace mutation, and errors='ignore' on the drop: the cell can be re-run
#   without raising KeyError once 'index' is already gone.
# - No `index=str` in rename: that argument cast every row label to a string,
#   which nothing here needs.
popularity_df = popularity_df.drop(columns='index', errors='ignore')\
                             .rename(columns={'order_id': 'weekly_popularity'})

popularity_df.head()

Unnamed: 0,product_id,week_number,weekly_popularity
0,1,7,178
1,1,8,332
2,1,9,185
3,1,10,246
4,1,11,219


In [46]:
def get_total_popularity(latest_weeks, latest_weight, earlier_weight, popularity_df):
    """Blend recent and historical weekly popularity into one score per product.

    For every product, the rows of ``popularity_df`` are split into the
    ``latest_weeks`` most recent weeks and all remaining (earlier) weeks.
    ``weekly_popularity`` is summed within each part and the two sums are
    combined as ``latest * latest_weight + earlier * earlier_weight``.

    Products that have ``latest_weeks`` or fewer weeks of history are kept with
    ``earlier_popularity == 0``. (They used to be silently dropped by an inner
    merge because they had no "earlier" rows.)

    Parameters
    ----------
    latest_weeks : int
        Number of most recent weeks (per product) treated as the latest period.
    latest_weight : float
        Weight applied to the latest-period sum.
    earlier_weight : float
        Weight applied to the earlier-period sum.
    popularity_df : pandas.DataFrame
        Must contain ``product_id``, ``week_number`` and ``weekly_popularity``.
        Expected to hold one row per (product_id, week_number).

    Returns
    -------
    pandas.DataFrame
        One row per product, sorted by ``product_id``, with columns
        ``product_id``, ``latest_popularity``, ``earlier_popularity`` and
        ``total_populatiry`` (spelling kept as-is so existing consumers of the
        column keep working).
    """
    # Rank each product's weeks newest-first (1 = most recent week). This replaces
    # the per-group apply(nlargest/nsmallest), which relied on groupby.apply passing
    # the grouping column back and was slow on tens of thousands of groups.
    recency_rank = popularity_df.groupby('product_id')['week_number']\
                                .rank(method='first', ascending=False)
    is_latest = recency_rank <= latest_weeks

    weekly = popularity_df['weekly_popularity']
    product = popularity_df['product_id']

    # Zero out the rows belonging to the other period before summing, so both
    # results contain every product (sorted by product_id) and stay aligned.
    latest_sum = weekly.where(is_latest, 0).groupby(product).sum()
    earlier_sum = weekly.where(~is_latest, 0).groupby(product).sum()

    popularity_agg = pd.DataFrame({
        'product_id': latest_sum.index,
        'latest_popularity': latest_sum.to_numpy(),
        'earlier_popularity': earlier_sum.to_numpy(),
    })
    popularity_agg['total_populatiry'] = (popularity_agg['latest_popularity'] * latest_weight
                                          + popularity_agg['earlier_popularity'] * earlier_weight)

    return popularity_agg

In [48]:
# Score products: the 2 most recent weeks are weighted 0.7, all earlier weeks 0.01.
popularity_agg = get_total_popularity(
    latest_weeks=2,
    latest_weight=0.7,
    earlier_weight=0.01,
    popularity_df=popularity_df,
)
print(popularity_agg.shape)

popularity_agg.head()

(49019, 4)


Unnamed: 0,product_id,latest_popularity,earlier_popularity,total_populatiry
0,1,8,1909,24.69
1,2,3,91,3.01
2,3,5,278,6.28
3,4,8,342,9.02
4,5,2,14,1.54


In [71]:
def get_product_value(latest_weeks, df):
    """Compute each product's average weekly purchase value over its latest weeks.

    For every product, only the rows of ``df`` that fall in that product's
    ``latest_weeks`` most recent distinct ``week_number`` values are kept. Rows
    are then counted per (product_id, price) and
    ``product_value = price * row_count / latest_weeks``.

    Parameters
    ----------
    latest_weeks : int
        Number of most recent distinct weeks (per product) to include. Also the
        divisor, even for products observed in fewer weeks.
    df : pandas.DataFrame
        Order lines with at least ``product_id``, ``price``, ``order_id`` and
        ``week_number`` columns.

    Returns
    -------
    pandas.DataFrame
        Columns ``product_id``, ``price``, ``order_id`` (row count in the latest
        period) and ``product_value``; one row per (product_id, price).
    """
    # Dense-rank weeks within each product, newest = 1, so all rows of the same
    # week share a rank. Filtering on the rank selects the same rows as the old
    # "distinct weeks -> per-group nlargest -> inner merge" pipeline, without the
    # per-group Python apply or the merge against the full order table.
    week_recency = df.groupby('product_id')['week_number']\
                     .rank(method='dense', ascending=False)
    value_df = df.loc[week_recency <= latest_weeks, ['product_id', 'price', 'order_id']]

    # .count() replaces .agg({'order_id': 'count'}): renaming via a dict on a single
    # selected column is deprecated in pandas.
    latest_value_df = value_df.groupby(['product_id', 'price'], as_index=False)['order_id']\
                              .count()

    latest_value_df['product_value'] = latest_value_df['price'] * latest_value_df['order_id'] / latest_weeks

    return latest_value_df

In [73]:
# Average weekly purchase value per product over each product's 2 most recent weeks.
latest_value_df = get_product_value(
    df=all_order_train,
    latest_weeks=2,
)

latest_value_df.head()

Unnamed: 0,product_id,price,order_id,product_value
0,1,13.33,8,53.32
1,2,6.57,3,9.855
2,3,7.76,5,19.4
3,4,13.76,8,55.04
4,5,6.21,2,6.21
