# Frequent Items Recommendations

Find the frequent itemsets in the latest N weeks, then recommend the missing frequent items to merchants.

In [18]:
from pyspark.ml.fpm import PrefixSpan
from pyspark.context import SparkContext
from pyspark.sql.session import SparkSession
from pyspark.sql.functions import col
from pyspark.sql import Row
import pandas as pd
import numpy as np
import datetime

import warnings
warnings.filterwarnings("ignore")

# Start (or reuse) a local Spark session. getOrCreate() makes this cell safe to
# re-run: constructing SparkContext(...) directly raises
# "Cannot run multiple SparkContexts at once" the second time it is executed
# in the same kernel.
spark = (
    SparkSession.builder
    .master('local[2]')  # local[n] runs Spark locally with n worker threads
    .appName('local')
    .getOrCreate()
)
# Later cells call sc.parallelize(...), so keep exposing the underlying context.
sc = spark.sparkContext

In [25]:
# Load the pickled order-line training data (one row per product in an order),
# print its (rows, columns) shape and preview the first rows.
# NOTE(review): unpickling can execute arbitrary code -- only load pickle files
# that come from a trusted source.
all_order_train = pd.read_pickle('../all_order_train.pkl')
print(all_order_train.shape)

all_order_train.head()

(33720820, 12)


Unnamed: 0,order_id,user_id,order_number,order_hour_of_day,product_id,purchase_date,merchant,product_name,price,aisle,department,week_number
0,2539329,1,1,8,196,2019-02-13,L&M Markets (Hometown Grocers Co-op),Soda,3.08,soft drinks,beverages,7
1,2231262,31,17,11,196,2019-03-10,L&M Markets (Hometown Grocers Co-op),Soda,3.08,soft drinks,beverages,10
2,3058369,195,34,10,196,2019-02-25,L&M Markets (Hometown Grocers Co-op),Soda,3.08,soft drinks,beverages,9
3,2257155,951,3,20,196,2019-03-09,L&M Markets (Hometown Grocers Co-op),Soda,3.08,soft drinks,beverages,10
4,1121647,992,7,10,196,2019-03-08,L&M Markets (Hometown Grocers Co-op),Soda,3.08,soft drinks,beverages,10


In [26]:
# Report how many distinct orders, products and aisles the data contains
# (one count per line, in that order).
for column in ('order_id', 'product_id', 'aisle'):
    print(all_order_train[column].nunique())

3336195
49685
134


In [11]:
# Build one "transaction" per order: the list of product ids bought in it.
products_by_order = all_order_train.groupby('order_id')['product_id']
order_prodlst_df = products_by_order.apply(list).reset_index(name='prod_lst')
print(order_prodlst_df.shape)

order_prodlst_df.head()

(3336195, 2)


Unnamed: 0,order_id,prod_lst
0,1,"[13176, 47209, 49683, 10246, 22035, 11109, 436..."
1,2,"[28985, 45918, 17794, 33120, 30035, 40141, 181..."
2,3,"[21903, 33754, 17461, 24838, 46667, 32665, 176..."
3,4,"[25146, 32645, 41276, 27761, 46842, 22598, 402..."
4,5,"[13176, 47209, 27966, 38693, 23909, 6184, 6348..."


In [15]:
# Product lookup table: one row per distinct (product_id, product_name) pair.
prod_df = all_order_train.loc[:, ['product_id', 'product_name']].drop_duplicates()
print(prod_df.shape)

prod_df.head()

(49685, 2)


Unnamed: 0,product_id,product_name
0,196,Soda
662,14084,Organic Unsweetened Vanilla Almond Milk
956,12427,Original Beef Jerky
1093,26088,Aged White Cheddar Popcorn
1140,26405,XL Pick-A-Size Paper Towel Rolls


In [31]:
# Collapse every order into the list of distinct aisles it touches.
# dict.fromkeys() de-duplicates like set() but keeps first-appearance order.
# Going through set() instead makes the order of the aisle names depend on
# Python's per-process string-hash randomisation, so the lists shown below
# (and pickled later) would differ between kernel restarts.
order_aisleset_df = (
    all_order_train
    .groupby('order_id')['aisle']
    .apply(lambda aisles: list(dict.fromkeys(aisles)))
    .reset_index(name='aisle_set')
)
print(order_aisleset_df.shape)

order_aisleset_df.head()

(3336195, 2)


Unnamed: 0,order_id,aisle_set
0,1,"[other creams cheeses, yogurt, packaged cheese..."
1,2,"[spices seasonings, baking ingredients, packag..."
2,3,"[yogurt, packaged vegetables fruits, poultry c..."
3,4,"[cold flu allergy, breakfast bars pastries, en..."
4,5,"[paper goods, instant foods, packaged poultry,..."


In [32]:
# Persist the per-order aisle lists to the working directory.
# The two product-level frames are not written here; their to_pickle calls are
# left commented out:
# order_prodlst_df.to_pickle('order_prodlst_df.pkl')
# prod_df.to_pickle('prod_df.pkl')
order_aisleset_df.to_pickle('order_aisleset_df.pkl')

### FP-Growth

#### Frequent Pattern with Product ID

In [3]:
from pyspark.ml.fpm import FPGrowth

# Convert the pandas frame of per-order product lists into a Spark DataFrame,
# the input format FPGrowth.fit() expects.
fp_df = spark.createDataFrame(order_prodlst_df)

# Preview the first rows (long lists are truncated in the display).
fp_df.show()

+--------+--------------------+
|order_id|            prod_lst|
+--------+--------------------+
|       1|[13176, 47209, 49...|
|       2|[28985, 45918, 17...|
|       3|[21903, 33754, 17...|
|       4|[25146, 32645, 41...|
|       5|[13176, 47209, 27...|
|       6|[40462, 15873, 41...|
|       7|      [46802, 34050]|
|       8|             [23423]|
|       9|[23288, 2014, 183...|
|      10|[47766, 24852, 21...|
|      11|[31506, 5994, 131...|
|      12|[43511, 37215, 34...|
|      13|[196, 33198, 2578...|
|      14|[27845, 45066, 24...|
|      15|[19660, 21195, 74...|
|      16|[45437, 25466, 9755]|
|      18|[34969, 47766, 52...|
|      19|[42265, 24838, 41...|
|      20|[24852, 35430, 33...|
|      21|[25718, 4149, 314...|
+--------+--------------------+
only showing top 20 rows



In [11]:
# Mine frequent product itemsets with FP-Growth.
#   minSupport    - minimum fraction of orders an itemset must appear in
#   minConfidence - minimum confidence for a generated association rule
fpGrowth = FPGrowth(
    itemsCol="prod_lst",
    minSupport=0.1,
    minConfidence=0.2,
)
model = fpGrowth.fit(fp_df)

# Show the itemsets that cleared the support threshold.
frequent_itemsets = model.freqItemsets
frequent_itemsets.show()

+-------+------+
|  items|  freq|
+-------+------+
|[24852]|490017|
|[13176]|394028|
+-------+------+



In [12]:
# Show the association rules (antecedent -> consequent) derived from the
# frequent product itemsets.
model.associationRules.show()

+----------+----------+----------+----+
|antecedent|consequent|confidence|lift|
+----------+----------+----------+----+
+----------+----------+----------+----+



In [21]:
# transform() checks each order's items against the antecedents of every
# association rule and collects the consequents of the matching rules in the
# `prediction` column.
order_predictions = model.transform(fp_df)
# Sort descending on `prediction` and show the top rows.
order_predictions.orderBy(col('prediction').desc()).show()

+--------+--------------------+----------+
|order_id|            prod_lst|prediction|
+--------+--------------------+----------+
|       1|[13176, 47209, 49...|        []|
|       2|[28985, 45918, 17...|        []|
|       3|[21903, 33754, 17...|        []|
|       4|[25146, 32645, 41...|        []|
|       5|[13176, 47209, 27...|        []|
|       6|[40462, 15873, 41...|        []|
|       7|      [46802, 34050]|        []|
|       8|             [23423]|        []|
|       9|[23288, 2014, 183...|        []|
|      10|[47766, 24852, 21...|        []|
|      11|[31506, 5994, 131...|        []|
|      12|[43511, 37215, 34...|        []|
|      13|[196, 33198, 2578...|        []|
|      14|[27845, 45066, 24...|        []|
|      15|[19660, 21195, 74...|        []|
|      16|[45437, 25466, 9755]|        []|
|      18|[34969, 47766, 52...|        []|
|      19|[42265, 24838, 41...|        []|
|      20|[24852, 35430, 33...|        []|
|      21|[25718, 4149, 314...|        []|
+--------+-

Whether the results are sorted in ascending or descending order, there are no predictions, because no association rules were generated. The support and confidence thresholds are already set low; with anything higher, there would be no frequent itemsets at all.

#### Frequent Pattern with Product Aisle

In [34]:
# Rebuild the FP-Growth input, this time from the per-order aisle lists.
# NOTE: this rebinds `fp_df`, replacing the product-level frame created earlier.
fp_df = spark.createDataFrame(order_aisleset_df)

# Preview the first rows (long lists are truncated in the display).
fp_df.show()

+--------+--------------------+
|order_id|           aisle_set|
+--------+--------------------+
|       1|[other creams che...|
|       2|[spices seasoning...|
|       3|[yogurt, packaged...|
|       4|[cold flu allergy...|
|       5|[paper goods, ins...|
|       6|[laundry, air fre...|
|       7|[frozen produce, ...|
|       8|        [buns rolls]|
|       9|[yogurt, packaged...|
|      10|[canned meals bea...|
|      11|[canned meals bea...|
|      12|[milk, frozen app...|
|      13|[other creams che...|
|      14|[milk, frozen bre...|
|      15|[oils vinegars, s...|
|      16|[popcorn jerky, c...|
|      18|[milk, paper good...|
|      19|[packaged vegetab...|
|      20|[frozen breakfast...|
|      21|[fresh fruits, to...|
+--------+--------------------+
only showing top 20 rows



In [37]:
# Mine frequent aisle itemsets with FP-Growth.
#   minSupport    - minimum fraction of orders an itemset must appear in
#   minConfidence - minimum confidence for a generated association rule
fpGrowth = FPGrowth(
    itemsCol="aisle_set",
    minSupport=0.3,
    minConfidence=0.3,
)
model = fpGrowth.fit(fp_df)

# Show the itemsets that cleared the support threshold.
frequent_itemsets = model.freqItemsets
frequent_itemsets.show()

+--------------------+-------+
|               items|   freq|
+--------------------+-------+
|[packaged vegetab...|1226107|
|      [fresh fruits]|1857917|
|  [fresh vegetables]|1482470|
|[fresh vegetables...|1061493|
+--------------------+-------+



In [38]:
# Show the association rules (antecedent -> consequent) derived from the
# frequent aisle itemsets.
model.associationRules.show()

+------------------+------------------+------------------+------------------+
|        antecedent|        consequent|        confidence|              lift|
+------------------+------------------+------------------+------------------+
|    [fresh fruits]|[fresh vegetables]|0.5713349950509091|1.2857494275188488|
|[fresh vegetables]|    [fresh fruits]|0.7160300039798444|1.2857494275188488|
+------------------+------------------+------------------+------------------+



In [42]:
# Apply the aisle-level rules to every order, sort descending on `prediction`,
# and print the rows without truncating the columns.
order_predictions = model.transform(fp_df)
order_predictions.orderBy(col('prediction').desc()).show(truncate=False)

+--------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+------------------+
|order_id|aisle_set                                                                                                                                                                                                                                                           |prediction        |
+--------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+------------------+
|78      |[milk, fresh dips tapenades, packaged cheese, crackers, fresh fruits]                                                

Notes

* With FP-growth, even at the aisle level, the most it tells us is that fresh vegetables should be placed together with fresh fruits. This makes sense, but almost every store already does it.

### PrefixSPAN

#### Frequent Pattern with Product ID

In [46]:
# Wrap each order's product list as a PrefixSpan "sequence" made of a single
# itemset (sequence = [[item, item, ...]]).
all_prodlst = []
for basket in order_prodlst_df['prod_lst'].values:
    all_prodlst.append(Row(sequence=[basket]))
print(len(all_prodlst))

all_prodlst[7:9]

3336195


[Row(sequence=[[23423]]),
 Row(sequence=[[23288, 2014, 18362, 21405, 432, 31506, 14992, 44533, 3990, 14183, 47890, 11182, 34203, 29193, 27366]])]

In [47]:
# Distribute the product-level Row objects and turn them into a Spark DataFrame
# with a single `sequence` column, the input format PrefixSpan expects.
prefixspan_df = sc.parallelize(all_prodlst).toDF()

# Preview the first rows.
prefixspan_df.show()

+--------------------+
|            sequence|
+--------------------+
|[[13176, 47209, 4...|
|[[28985, 45918, 1...|
|[[21903, 33754, 1...|
|[[25146, 32645, 4...|
|[[13176, 47209, 2...|
|[[40462, 15873, 4...|
|    [[46802, 34050]]|
|           [[23423]]|
|[[23288, 2014, 18...|
|[[47766, 24852, 2...|
|[[31506, 5994, 13...|
|[[43511, 37215, 3...|
|[[196, 33198, 257...|
|[[27845, 45066, 2...|
|[[19660, 21195, 7...|
|[[45437, 25466, 9...|
|[[34969, 47766, 5...|
|[[42265, 24838, 4...|
|[[24852, 35430, 3...|
|[[25718, 4149, 31...|
+--------------------+
only showing top 20 rows



In [52]:
# Configure PrefixSpan for the product-level sequences: a pattern must appear
# in at least 10% of the sequences and may contain at most 7 items.
prefixSpan = PrefixSpan(minSupport=0.1, maxPatternLength=7,
                        maxLocalProjDBSize=32000000)

# Find frequent sequential patterns.
# NOTE: the misspelled name `frequent_aptterns` is referenced by later cells,
# so it is kept as is.
frequent_aptterns = prefixSpan.findFrequentSequentialPatterns(prefixspan_df)

In [53]:
# Show the frequent product patterns and their frequencies.
frequent_aptterns.show()

+---------+------+
| sequence|  freq|
+---------+------+
|[[24852]]|490017|
|[[13176]]|394028|
+---------+------+



#### Frequent Pattern with Product Aisle 

In [54]:
# Wrap each order's aisle list as a PrefixSpan "sequence" made of a single
# itemset (sequence = [[aisle, aisle, ...]]).
all_aisleset = []
for aisles in order_aisleset_df['aisle_set'].values:
    all_aisleset.append(Row(sequence=[aisles]))
print(len(all_aisleset))

all_aisleset[7:9]

3336195


[Row(sequence=[['buns rolls']]),
 Row(sequence=[['yogurt', 'packaged vegetables fruits', 'water seltzer sparkling water', 'oils vinegars', 'packaged cheese', 'bread', 'juice nectars', 'cookies cakes', 'soy lactosefree', 'granola', 'fresh vegetables', 'canned fruit applesauce']])]

In [55]:
# Distribute the aisle-level Row objects and turn them into a Spark DataFrame
# with a single `sequence` column.
# NOTE: this rebinds `prefixspan_df`, replacing the product-level frame.
prefixspan_df = sc.parallelize(all_aisleset).toDF()

# Preview the first rows.
prefixspan_df.show()

+--------------------+
|            sequence|
+--------------------+
|[[other creams ch...|
|[[spices seasonin...|
|[[yogurt, package...|
|[[cold flu allerg...|
|[[paper goods, in...|
|[[laundry, air fr...|
|[[frozen produce,...|
|      [[buns rolls]]|
|[[yogurt, package...|
|[[canned meals be...|
|[[canned meals be...|
|[[milk, frozen ap...|
|[[other creams ch...|
|[[milk, frozen br...|
|[[oils vinegars, ...|
|[[popcorn jerky, ...|
|[[milk, paper goo...|
|[[packaged vegeta...|
|[[frozen breakfas...|
|[[fresh fruits, t...|
+--------------------+
only showing top 20 rows



In [None]:
# Configure PrefixSpan for the aisle-level sequences: a pattern must appear in
# at least 30% of the sequences and may contain at most 7 items.
prefixSpan = PrefixSpan(minSupport=0.3, maxPatternLength=7,
                        maxLocalProjDBSize=32000000)

# Find frequent sequential patterns.
# NOTE: the misspelled name `frequent_aptterns` is referenced by the next cell,
# so it is kept as is.
frequent_aptterns = prefixSpan.findFrequentSequentialPatterns(prefixspan_df)
frequent_aptterns.show()

In [58]:
# Show the frequent aisle patterns without truncating the aisle names.
frequent_aptterns.show(truncate=False)

+----------------------------------+-------+
|sequence                          |freq   |
+----------------------------------+-------+
|[[fresh fruits]]                  |1857917|
|[[packaged vegetables fruits]]    |1226107|
|[[fresh vegetables]]              |1482470|
|[[fresh fruits, fresh vegetables]]|1061493|
+----------------------------------+-------+



## Summary

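After trying both FP-growth and PrefixSpan, the conclusions are the same (as expected here: every PrefixSpan sequence was built from a single itemset per order, so both methods end up mining the same frequent itemsets):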

* The most frequent itemset is fresh vegetables together with fresh fruits, a pairing that grocery stores already make. The frequent-pattern mining results are therefore not very helpful here.
* The frequent itemsets were meant to be used for calculating merchants' similarity scores; with so few patterns found, they are clearly not a good basis for that.