#### Requirement:
* Read dataset
* Pre-processing data
* Apply FPGrowth algorithm to find association rules from this dataset. Find the most popular items in a basket

In [1]:
from IPython.core.display import HTML
display(HTML("<style>pre { white-space: pre !important; }</style>"))

In [2]:
import findspark
findspark.init()

In [3]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, collect_list, collect_set, count
from pyspark.sql.types import StringType

from pyspark.ml.feature import StringIndexer
from pyspark.ml.fpm import FPGrowth
from pyspark.ml.evaluation import RegressionEvaluator

In [4]:
spark = SparkSession.builder.appName('ex_demo').getOrCreate()

In [5]:
# Load data
data = spark.read.csv("../../Data/instacart_2017_05_01/order_products__train.csv", header=True, inferSchema=True)

In [6]:
data.count()

1384617

In [7]:
data.show()

+--------+----------+-----------------+---------+
|order_id|product_id|add_to_cart_order|reordered|
+--------+----------+-----------------+---------+
|       1|     49302|                1|        1|
|       1|     11109|                2|        1|
|       1|     10246|                3|        0|
|       1|     49683|                4|        0|
|       1|     43633|                5|        1|
|       1|     13176|                6|        0|
|       1|     47209|                7|        0|
|       1|     22035|                8|        1|
|      36|     39612|                1|        0|
|      36|     19660|                2|        1|
|      36|     49235|                3|        0|
|      36|     43086|                4|        1|
|      36|     46620|                5|        1|
|      36|     34497|                6|        1|
|      36|     48679|                7|        1|
|      36|     46979|                8|        1|
|      38|     11913|                1|        0|


### Pre-processing data

In [8]:
data.createOrReplaceTempView("order_products_train")

In [9]:
products = spark.sql("SELECT DISTINCT product_id FROM order_products_train")
products.count()

39123

In [10]:
rawData = spark.sql("SELECT * FROM order_products_train")
baskets = rawData.groupBy('order_id').agg(collect_set('product_id').alias('items'))
baskets.createOrReplaceTempView("baskets")

In [11]:
baskets.show(5, False)

+--------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|order_id|items                                                                                                                                                                                                               |
+--------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|1342    |[30827, 3798, 14966, 21137, 46129, 33081, 13176, 7862]                                                                                                                                                              |
|1591    |[48246, 44116, 24852, 5194, 9130, 48823, 46473, 40310, 32520, 22105, 16900, 27681, 4103, 44008

In [12]:
type(baskets)

pyspark.sql.dataframe.DataFrame

In [13]:
fpGrowth = FPGrowth(itemsCol="items", minSupport=0.003, minConfidence=0.003)
model = fpGrowth.fit(baskets)

In [14]:
# Display frequent itemsets
model.freqItemsets.show()

+--------------------+-----+
|               items| freq|
+--------------------+-----+
|             [13629]|  772|
|              [5194]|  475|
|             [24852]|18726|
|             [13176]|15480|
|             [35921]|  769|
|             [20345]|  473|
|             [21137]|10894|
|      [21137, 13176]| 3074|
|      [21137, 24852]| 2174|
|             [23165]|  764|
|             [13380]|  473|
|              [7969]|  472|
|             [21903]| 9784|
|      [21903, 21137]| 1639|
|[21903, 21137, 13...|  587|
|      [21903, 13176]| 2236|
|      [21903, 24852]| 2000|
|             [32478]|  763|
|             [47626]| 8135|
|      [47626, 21137]| 1017|
+--------------------+-----+
only showing top 20 rows



In [15]:
# Transform examines the input items agaist all the association rules and summary consquents as predict
mostPopularItemInABasket = model.transform(baskets)

In [16]:
mostPopularItemInABasket.show()

+--------+--------------------+--------------------+
|order_id|               items|          prediction|
+--------+--------------------+--------------------+
|    1342|[30827, 3798, 149...|[21903, 47626, 47...|
|    1591|[48246, 44116, 24...|[21137, 21903, 47...|
|    4519|             [29270]|                  []|
|    4935|             [45190]|                  []|
|    6357|[33731, 14669, 43...|[21137, 21903, 47...|
|   10362|[28522, 43789, 12...|[21137, 47626, 47...|
|   19204|[45255, 37285, 48...|                  []|
|   29601|[2716, 48057, 219...|[21137, 21903, 47...|
|   31035|[40723, 8174, 131...|[21137, 21903, 47...|
|   40011|[27292, 35213, 21...|[21137, 13176, 24...|
|   46266|[38558, 48642, 13...|[47626, 47766, 47...|
|   51607|[41390, 42752, 17...|                  []|
|   58797|[30827, 8803, 326...|[21137, 21903, 47...|
|   61793|[26348, 6184, 433...|[21137, 16797, 39...|
|   67089|[47766, 29388, 21...|[47626, 21137, 47...|
|   70863|[34791, 2618, 173...|      [13176, 2

### Use product_name instead of product_id


In [17]:
product_data = spark.read.csv('../../Data/instacart_2017_05_01/products.csv', header=True, inferSchema=True)

In [18]:
product_data.show(5, truncate=False)

+----------+-----------------------------------------------------------------+--------+-------------+
|product_id|product_name                                                     |aisle_id|department_id|
+----------+-----------------------------------------------------------------+--------+-------------+
|1         |Chocolate Sandwich Cookies                                       |61      |19           |
|2         |All-Seasons Salt                                                 |104     |13           |
|3         |Robust Golden Unsweetened Oolong Tea                             |94      |7            |
|4         |Smart Ones Classic Favorites Mini Rigatoni With Vodka Cream Sauce|38      |1            |
|5         |Green Chile Anytime Sauce                                        |5       |13           |
+----------+-----------------------------------------------------------------+--------+-------------+
only showing top 5 rows



In [19]:
product_data.createOrReplaceTempView("products")

In [20]:
rawData_1 = spark.sql('''SELECT p.product_name, o.order_id 
                         FROM products p 
                         INNER JOIN order_products_train o
                         WHERE o.product_id = p.product_id''')
baskets_1 = rawData.groupBy('order_id').agg(collect_set('product_id').alias('items'))
baskets_1.createOrReplaceTempView("baskets_1")

In [21]:
baskets_1.head(3)

[Row(order_id=1342, items=[30827, 3798, 14966, 21137, 46129, 33081, 13176, 7862]),
 Row(order_id=1591, items=[48246, 44116, 24852, 5194, 9130, 48823, 46473, 40310, 32520, 22105, 16900, 27681, 4103, 44008, 17758, 41671, 25316, 45061, 38805, 48205, 25237, 19604, 5384, 27344, 17203, 18792, 12986, 39758, 34358, 31215, 9387]),
 Row(order_id=4519, items=[29270])]

In [22]:
fpGrowth_1 = FPGrowth(itemsCol="items", minSupport=0.003, minConfidence=0.003)
model_1 = fpGrowth.fit(baskets_1)

In [23]:
# Display frequent itemsets
model_1.freqItemsets.show()

+--------------------+-----+
|               items| freq|
+--------------------+-----+
|             [13629]|  772|
|              [5194]|  475|
|             [24852]|18726|
|             [13176]|15480|
|             [35921]|  769|
|             [20345]|  473|
|             [21137]|10894|
|      [21137, 13176]| 3074|
|      [21137, 24852]| 2174|
|             [23165]|  764|
|             [13380]|  473|
|              [7969]|  472|
|             [21903]| 9784|
|      [21903, 21137]| 1639|
|[21903, 21137, 13...|  587|
|      [21903, 13176]| 2236|
|      [21903, 24852]| 2000|
|             [32478]|  763|
|             [47626]| 8135|
|      [47626, 21137]| 1017|
+--------------------+-----+
only showing top 20 rows



In [24]:
mostPopularItemInABasket_1 = model.transform(baskets_1)

In [25]:
mostPopularItemInABasket_1.head(3)

[Row(order_id=1342, items=[30827, 3798, 14966, 21137, 46129, 33081, 13176, 7862], prediction=[21903, 47626, 47766, 47209, 16797, 26209, 27966, 39275, 27845, 30391, 45007, 22935, 24964, 4920, 46979, 40706, 8518, 42265, 45066, 31717, 5876, 44632, 43352, 28204, 5450, 21616, 19057, 30489, 26604, 37646, 27104, 49235, 28985, 44359, 48679, 41950, 17794, 43961, 34126, 10749, 39877, 12341, 8277, 19660, 35951, 24838, 46667, 22035, 8174, 11520, 22825, 39928, 18465, 27521, 16759, 9839, 27156, 24852, 8424, 37067]),
 Row(order_id=1591, items=[48246, 44116, 24852, 5194, 9130, 48823, 46473, 40310, 32520, 22105, 16900, 27681, 4103, 44008, 17758, 41671, 25316, 45061, 38805, 48205, 25237, 19604, 5384, 27344, 17203, 18792, 12986, 39758, 34358, 31215, 9387], prediction=[21137, 21903, 47626, 47766, 47209, 16797, 26209, 27966, 39275, 27845, 30391, 45007, 22935, 24964, 4920, 46979, 40706, 8518, 4605, 42265, 45066, 31717, 44632, 43352, 28204, 5450, 8424, 21616, 24184, 19057, 30489, 26604, 37646, 27104, 49235, 

In [26]:
type(mostPopularItemInABasket_1)

pyspark.sql.dataframe.DataFrame

In [27]:
mostPopularItemInABasket_1.printSchema()

root
 |-- order_id: integer (nullable = true)
 |-- items: array (nullable = false)
 |    |-- element: integer (containsNull = false)
 |-- prediction: array (nullable = true)
 |    |-- element: integer (containsNull = false)



In [28]:
mostPopularItemInABasket_1.createOrReplaceTempView("popular_items")

In [29]:
DF_cast = mostPopularItemInABasket_1.select('order_id', 
                                            mostPopularItemInABasket_1.items.cast(StringType()), 
                                            mostPopularItemInABasket_1.prediction.cast(StringType()))

In [30]:
DF_cast.printSchema()

root
 |-- order_id: integer (nullable = true)
 |-- items: string (nullable = false)
 |-- prediction: string (nullable = true)



In [31]:
DF_cast.head(3)

[Row(order_id=1342, items='[30827, 3798, 14966, 21137, 46129, 33081, 13176, 7862]', prediction='[21903, 47626, 47766, 47209, 16797, 26209, 27966, 39275, 27845, 30391, 45007, 22935, 24964, 4920, 46979, 40706, 8518, 42265, 45066, 31717, 5876, 44632, 43352, 28204, 5450, 21616, 19057, 30489, 26604, 37646, 27104, 49235, 28985, 44359, 48679, 41950, 17794, 43961, 34126, 10749, 39877, 12341, 8277, 19660, 35951, 24838, 46667, 22035, 8174, 11520, 22825, 39928, 18465, 27521, 16759, 9839, 27156, 24852, 8424, 37067]'),
 Row(order_id=1591, items='[48246, 44116, 24852, 5194, 9130, 48823, 46473, 40310, 32520, 22105, 16900, 27681, 4103, 44008, 17758, 41671, 25316, 45061, 38805, 48205, 25237, 19604, 5384, 27344, 17203, 18792, 12986, 39758, 34358, 31215, 9387]', prediction='[21137, 21903, 47626, 47766, 47209, 16797, 26209, 27966, 39275, 27845, 30391, 45007, 22935, 24964, 4920, 46979, 40706, 8518, 4605, 42265, 45066, 31717, 44632, 43352, 28204, 5450, 8424, 21616, 24184, 19057, 30489, 26604, 37646, 27104, 