# Midterm Exam Big Data
- Name : Ingwer Ludwig
- NRP : 5025201259

# Install Libraries

In [None]:
# Install Apache Spark
!pip install pyspark



In [None]:
# Import required libraries

from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.ml.fpm import FPGrowth


# Initialize Spark Session

In [None]:
# Create Spark Session/Context
spark = SparkSession.builder \
    .master("local") \
    .appName("Frequent Itemset") \
    .config("spark.some.config.option", "some-value") \
    .getOrCreate()

# Import Dataset

In [None]:
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [None]:
# Import the data from CSV
data = spark.read.option("header", True).option("delimiter", ";").csv("/content/gdrive/My Drive/market-basket.csv")

data.show()

+------+--------------------+--------+----------------+-----+----------+--------------+
|BillNo|            Itemname|Quantity|            Date|Price|CustomerID|       Country|
+------+--------------------+--------+----------------+-----+----------+--------------+
|536365|WHITE HANGING HEA...|       6|01.12.2010 08:26| 2,55|     17850|United Kingdom|
|536365| WHITE METAL LANTERN|       6|01.12.2010 08:26| 3,39|     17850|United Kingdom|
|536365|CREAM CUPID HEART...|       8|01.12.2010 08:26| 2,75|     17850|United Kingdom|
|536365|KNITTED UNION FLA...|       6|01.12.2010 08:26| 3,39|     17850|United Kingdom|
|536365|RED WOOLLY HOTTIE...|       6|01.12.2010 08:26| 3,39|     17850|United Kingdom|
|536365|SET 7 BABUSHKA NE...|       2|01.12.2010 08:26| 7,65|     17850|United Kingdom|
|536365|GLASS STAR FROSTE...|       6|01.12.2010 08:26| 4,25|     17850|United Kingdom|
|536366|HAND WARMER UNION...|       6|01.12.2010 08:28| 1,85|     17850|United Kingdom|
|536366|HAND WARMER RED P...|   

In [None]:
# Select only the first two column
itemlist = data.select("BillNo", "Itemname")
itemlist.show()

+------+--------------------+
|BillNo|            Itemname|
+------+--------------------+
|536365|WHITE HANGING HEA...|
|536365| WHITE METAL LANTERN|
|536365|CREAM CUPID HEART...|
|536365|KNITTED UNION FLA...|
|536365|RED WOOLLY HOTTIE...|
|536365|SET 7 BABUSHKA NE...|
|536365|GLASS STAR FROSTE...|
|536366|HAND WARMER UNION...|
|536366|HAND WARMER RED P...|
|536367|ASSORTED COLOUR B...|
|536367|POPPY'S PLAYHOUSE...|
|536367|POPPY'S PLAYHOUSE...|
|536367|FELTCRAFT PRINCES...|
|536367|IVORY KNITTED MUG...|
|536367|BOX OF 6 ASSORTED...|
|536367|BOX OF VINTAGE JI...|
|536367|BOX OF VINTAGE AL...|
|536367|HOME BUILDING BLO...|
|536367|LOVE BUILDING BLO...|
|536367|RECIPE BOX WITH M...|
+------+--------------------+
only showing top 20 rows



In [None]:
# Before dropping the duplicates
itemlist.count()

522064

In [None]:
# Remove the duplicates
item_raw = itemlist.dropDuplicates(["BillNo", "Itemname"])
item_raw.count()

511280

There is no duplicate data

In [None]:
# Convert into this format: ["id", "items"] = [1, [a, b, c]]
# You can use groupBy, agg, and collect_list methods

item_input = item_raw.groupBy("BillNo").agg(collect_list('Itemname').alias('item'))
item_input.show()

+------+--------------------+
|BillNo|                item|
+------+--------------------+
|536365|[KNITTED UNION FL...|
|536366|[HAND WARMER UNIO...|
|536367|[BOX OF VINTAGE J...|
|536368|[YELLOW COAT RACK...|
|536369|[BATH BUILDING BL...|
|536370|[SPACEBOY LUNCH B...|
|536371|[PAPER CHAIN KIT ...|
|536372|[HAND WARMER UNIO...|
|536373|[GLASS STAR FROST...|
|536374|[VICTORIAN SEWING...|
|536375|[SAVE THE PLANET ...|
|536376|[RED HANGING HEAR...|
|536377|[HAND WARMER RED ...|
|536378|[PACK OF 60 PINK ...|
|536380|[JAM MAKING SET P...|
|536381|[ZINC WILLIE WINK...|
|536382|[VINTAGE SNAKES &...|
|536384|[ENAMEL BREAD BIN...|
|536385|[TRADITIONAL CHRI...|
|536386|[JUMBO BAG RED RE...|
+------+--------------------+
only showing top 20 rows



In [None]:
dh_new = item_input.withColumn("item", array(item_input["item"]))
dh_new.show()

+------+--------------------+
|BillNo|                item|
+------+--------------------+
|536365|[[KNITTED UNION F...|
|536366|[[HAND WARMER UNI...|
|536367|[[BOX OF VINTAGE ...|
|536368|[[YELLOW COAT RAC...|
|536369|[[BATH BUILDING B...|
|536370|[[SPACEBOY LUNCH ...|
|536371|[[PAPER CHAIN KIT...|
|536372|[[HAND WARMER UNI...|
|536373|[[GLASS STAR FROS...|
|536374|[[VICTORIAN SEWIN...|
|536375|[[SAVE THE PLANET...|
|536376|[[RED HANGING HEA...|
|536377|[[HAND WARMER RED...|
|536378|[[PACK OF 60 PINK...|
|536380|[[JAM MAKING SET ...|
|536381|[[ZINC WILLIE WIN...|
|536382|[[VINTAGE SNAKES ...|
|536384|[[ENAMEL BREAD BI...|
|536385|[[TRADITIONAL CHR...|
|536386|[[JUMBO BAG RED R...|
+------+--------------------+
only showing top 20 rows



# Experiment with different minSupport and minConfidence

In [None]:
min_support= [0.1, 0.25, 0.5]
min_confidence = [0.25, 0.5]

In [None]:
def fpGrowthModel(ms, mc):
  for x in min_support:
    for y in min_confidence:
      # 1. Create the instance
      fpGrowth = FPGrowth(itemsCol="item", minSupport=x, minConfidence=y)

      # 2. Train/fit the data training to become a model
      model = fpGrowth.fit(dh_new)
      print(f"minSupport: {x}, minConfidence: {y}")


      # Display frequent itemsets.
      model.freqItemsets.show()

      # Display generated association rules.
      model.associationRules.show()

      # transform examines the input items against all the association rules and summarize the
      # consequents as prediction
      model.transform(dh_new).show()

In [None]:
fpGrowthModel(min_support,min_confidence)

minSupport: 0.1, minConfidence: 0.25
+-----+----+
|items|freq|
+-----+----+
+-----+----+

+----------+----------+----------+----+-------+
|antecedent|consequent|confidence|lift|support|
+----------+----------+----------+----+-------+
+----------+----------+----------+----+-------+

+------+--------------------+----------+
|BillNo|                item|prediction|
+------+--------------------+----------+
|536365|[[KNITTED UNION F...|        []|
|536366|[[HAND WARMER UNI...|        []|
|536367|[[BOX OF VINTAGE ...|        []|
|536368|[[YELLOW COAT RAC...|        []|
|536369|[[BATH BUILDING B...|        []|
|536370|[[SPACEBOY LUNCH ...|        []|
|536371|[[PAPER CHAIN KIT...|        []|
|536372|[[HAND WARMER UNI...|        []|
|536373|[[GLASS STAR FROS...|        []|
|536374|[[VICTORIAN SEWIN...|        []|
|536375|[[SAVE THE PLANET...|        []|
|536376|[[RED HANGING HEA...|        []|
|536377|[[HAND WARMER RED...|        []|
|536378|[[PACK OF 60 PINK...|        []|
|536380|[[JAM MAKING

We take a look into several metrics like min_support, confidence, lift, antencedence and consequent

### To sum up, the FP-Growth algorithm has not been successful so far. This could be due to the way the Itemname column is structured as a string of sentences or the absence of item IDs. If the CSV file included item IDs as a separate column, it would likely be easier to resolve these issues.