#### Requirement:
* Read dataset
* Pre-processing data
* Apply the FP-Growth algorithm to find association rules in this dataset, and find the most popular items in a basket.

In [None]:
# findspark.init() is called before the pyspark imports in the next cell;
# presumably it makes the local Spark installation importable (confirm
# against the findspark documentation for this environment).
import findspark
findspark.init()

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, collect_list, collect_set, count
from pyspark.sql.types import StringType

from pyspark.ml.feature import StringIndexer
from pyspark.ml.fpm import FPGrowth
from pyspark.ml.evaluation import RegressionEvaluator

In [None]:
# Create (or reuse) the Spark session for this notebook, named 'ex_demo'.
spark = (
    SparkSession.builder
    .appName('ex_demo')
    .getOrCreate()
)

In [None]:
# Read the order/product training CSV. The first row is used as the header
# and column types are inferred from the data.
data = spark.read.csv(
    "../../Data/order_products_train.csv",
    header=True,
    inferSchema=True,
)

In [None]:
# Number of rows loaded from the CSV.
data.count()

In [None]:
# Print a preview of the loaded rows.
data.show()

### Pre-processing data

In [None]:
# Register the DataFrame as the temp view "order_products_train" so it can be
# queried through spark.sql().
data.createOrReplaceTempView("order_products_train")

In [None]:
# Count the distinct products that appear in the orders.
# The table name must match the temp view registered from `data`
# ("order_products_train"); the previous spelling "order_product_train"
# refers to a view that is never created.
products = spark.sql("SELECT DISTINCT product_id FROM order_products_train")
products.count()

In [None]:
# Build one basket per order: group the order lines by order_id and collect
# the set of product ids in each order (collect_set keeps each id once).
# The query reads the "order_products_train" temp view registered from `data`;
# the previous spelling "order_product_train" refers to a view that is never
# created.
rawData = spark.sql("SELECT * FROM order_products_train")
baskets = rawData.groupBy('order_id').agg(collect_set('product_id').alias('items'))
# Expose the baskets to Spark SQL as the temp view "baskets".
baskets.createOrReplaceTempView("baskets")

In [None]:
# Show the first 5 baskets without truncating the item lists.
baskets.show(n=5, truncate=False)

In [None]:
# Inspect the Python type of the grouped result.
type(baskets)

In [None]:
# Configure FP-Growth over the "items" column with minimum support and
# minimum confidence both set to 0.003, then fit it on the product-id baskets.
fpGrowth = FPGrowth(
    itemsCol="items",
    minSupport=0.003,
    minConfidence=0.003,
)
model = fpGrowth.fit(baskets)

In [None]:
# Display the frequent itemsets found by the fitted model
# (these are sets of product ids, since `model` was fitted on `baskets`).
model.freqItemsets.show()

In [None]:
# transform() examines the input items against all the association rules and
# summarizes the consequents as the prediction.
mostPopularItemInABasket = model.transform(baskets)

In [None]:
# Preview the baskets together with the output of model.transform().
mostPopularItemInABasket.show()

### Use product_name instead of product_id


In [None]:
# Read the product table (joined on product_id later to obtain product_name).
# The first row is used as the header and column types are inferred.
product_data = spark.read.csv(
    '../../Data/product.csv',
    header=True,
    inferSchema=True,
)

In [None]:
# Show the first 5 product rows without truncating long values.
product_data.show(5, truncate=False)

In [None]:
# Register the product table as the temp view "products" for use in spark.sql().
product_data.createOrReplaceTempView("products")

In [None]:
# Attach the product name to every order line by joining the order lines with
# the product table on product_id.
# Fixes: the select list needs a comma between the two columns, and the join
# condition belongs in an ON clause rather than a WHERE filter.
rawData_1 = spark.sql('''SELECT p.product_name, o.order_id
                         FROM products p
                         INNER JOIN order_products_train o
                         ON o.product_id = p.product_id''')
# Build one basket per order from the joined rows, collecting product names
# (not ids) so that the itemsets and rules are readable. Previously this
# grouped `rawData` on product_id, which just rebuilt the id-based baskets.
baskets_1 = rawData_1.groupBy('order_id').agg(collect_set('product_name').alias('items'))
# Expose the name-based baskets to Spark SQL as the temp view "baskets_1".
baskets_1.createOrReplaceTempView("baskets_1")

In [None]:
# Fetch the first 3 baskets as Row objects for inspection.
baskets_1.head(3)

In [None]:
# Configure a second FP-Growth estimator (same thresholds: minimum support and
# minimum confidence of 0.003) and fit it on baskets_1.
fpGrowth_1 = FPGrowth(itemsCol="items", minSupport=0.003, minConfidence=0.003)
# Fit with fpGrowth_1, the estimator defined for this section; the previous
# code called fpGrowth.fit and left fpGrowth_1 unused.
model_1 = fpGrowth_1.fit(baskets_1)

In [None]:
# Display the frequent itemsets found by model_1 (fitted on baskets_1).
model_1.freqItemsets.show()

In [None]:
# Apply the association rules of model_1 (the model fitted on baskets_1) to
# the same baskets. The previous code used `model`, which was fitted on the
# separate `baskets` DataFrame rather than on baskets_1.
mostPopularItemInABasket_1 = model_1.transform(baskets_1)

In [None]:
# Fetch the first 3 transformed baskets as Row objects for inspection.
mostPopularItemInABasket_1.head(3)

In [None]:
# Inspect the Python type of the transform() result.
type(mostPopularItemInABasket_1)

In [None]:
# Print the column names and types of the transform() result.
mostPopularItemInABasket_1.printSchema()

In [None]:
# Register the transform() result as the temp view "popular_items" for use in
# spark.sql().
mostPopularItemInABasket_1.createOrReplaceTempView("popular_items")

In [None]:
# Keep order_id as-is and cast the `items` and `prediction` columns to strings.
string_type = StringType()
items_as_text = mostPopularItemInABasket_1['items'].cast(string_type)
prediction_as_text = mostPopularItemInABasket_1['prediction'].cast(string_type)
DF_cast = mostPopularItemInABasket_1.select(
    'order_id',
    items_as_text,
    prediction_as_text,
)

In [None]:
# Print the schema to confirm `items` and `prediction` are now string columns.
DF_cast.printSchema()

In [None]:
# Fetch the first 3 rows of the string-cast result for inspection.
DF_cast.head(3)