# Spark Initialization

In [1]:
# Import findspark, which reads SPARK_HOME and HADOOP_HOME.
# NOTE(review): presumably init() has to run before the pyspark import in the
# next cell so that pyspark can be located — confirm against the findspark docs.
import findspark
findspark.init()

In [2]:
# Import required library
from pyspark.sql import SparkSession

# Build the notebook's SparkSession (getOrCreate reuses an active session if
# there is one) under the application name "Tugas FP Growth".
spark = (
    SparkSession.builder
    .appName("Tugas FP Growth")
    .getOrCreate()
)

In [3]:
# Print the SparkSession's default repr (class name and memory address) as a
# quick check that the session object exists.
print(spark)

<pyspark.sql.session.SparkSession object at 0x0000029F41829780>


# Loading Dataset

In [5]:
# Source: E-Commerce Data from https://www.kaggle.com/carrie1/ecommerce-data
# The first CSV row is treated as the header and column types are inferred.
# NOTE(review): the path below is absolute and specific to one machine.
df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("F:/Semester 6/BigData/Tugas FPGrowth/data.csv")
)

In [6]:
# Preview the loaded DataFrame as a text table (long strings are shown
# truncated with "...").
df.show()

+---------+---------+--------------------+--------+--------------+---------+----------+--------------+
|InvoiceNo|StockCode|         Description|Quantity|   InvoiceDate|UnitPrice|CustomerID|       Country|
+---------+---------+--------------------+--------+--------------+---------+----------+--------------+
|   536365|   85123A|WHITE HANGING HEA...|       6|12/1/2010 8:26|     2.55|     17850|United Kingdom|
|   536365|    71053| WHITE METAL LANTERN|       6|12/1/2010 8:26|     3.39|     17850|United Kingdom|
|   536365|   84406B|CREAM CUPID HEART...|       8|12/1/2010 8:26|     2.75|     17850|United Kingdom|
|   536365|   84029G|KNITTED UNION FLA...|       6|12/1/2010 8:26|     3.39|     17850|United Kingdom|
|   536365|   84029E|RED WOOLLY HOTTIE...|       6|12/1/2010 8:26|     3.39|     17850|United Kingdom|
|   536365|    22752|SET 7 BABUSHKA NE...|       2|12/1/2010 8:26|     7.65|     17850|United Kingdom|
|   536365|    21730|GLASS STAR FROSTE...|       6|12/1/2010 8:26|     4.

In [7]:
# Inspect the column names and the types produced by inferSchema=True.
df.schema

StructType(List(StructField(InvoiceNo,StringType,true),StructField(StockCode,StringType,true),StructField(Description,StringType,true),StructField(Quantity,IntegerType,true),StructField(InvoiceDate,StringType,true),StructField(UnitPrice,DoubleType,true),StructField(CustomerID,IntegerType,true),StructField(Country,StringType,true)))

In [8]:
# Register df as the temp view "ecommerce" so it can be queried via spark.sql.
df.createOrReplaceTempView("ecommerce")

In [9]:
# Keep only the invoice number and stock code, renamed to ID and Items.
query1 = spark.sql(
    "SELECT InvoiceNo as ID, StockCode as Items FROM ecommerce"
)
query1.show()

+------+------+
|    ID| Items|
+------+------+
|536365|85123A|
|536365| 71053|
|536365|84406B|
|536365|84029G|
|536365|84029E|
|536365| 22752|
|536365| 21730|
|536366| 22633|
|536366| 22632|
|536367| 84879|
|536367| 22745|
|536367| 22748|
|536367| 22749|
|536367| 22310|
|536367| 84969|
|536367| 22623|
|536367| 22622|
|536367| 21754|
|536367| 21755|
|536367| 21777|
+------+------+
only showing top 20 rows



In [11]:
# Same projection as query1, plus the Description column for later lookups.
query2 = spark.sql(
    "SELECT InvoiceNo as ID, StockCode as Items, Description FROM ecommerce"
)
query2.show()

+------+------+--------------------+
|    ID| Items|         Description|
+------+------+--------------------+
|536365|85123A|WHITE HANGING HEA...|
|536365| 71053| WHITE METAL LANTERN|
|536365|84406B|CREAM CUPID HEART...|
|536365|84029G|KNITTED UNION FLA...|
|536365|84029E|RED WOOLLY HOTTIE...|
|536365| 22752|SET 7 BABUSHKA NE...|
|536365| 21730|GLASS STAR FROSTE...|
|536366| 22633|HAND WARMER UNION...|
|536366| 22632|HAND WARMER RED P...|
|536367| 84879|ASSORTED COLOUR B...|
|536367| 22745|POPPY'S PLAYHOUSE...|
|536367| 22748|POPPY'S PLAYHOUSE...|
|536367| 22749|FELTCRAFT PRINCES...|
|536367| 22310|IVORY KNITTED MUG...|
|536367| 84969|BOX OF 6 ASSORTED...|
|536367| 22623|BOX OF VINTAGE JI...|
|536367| 22622|BOX OF VINTAGE AL...|
|536367| 21754|HOME BUILDING BLO...|
|536367| 21755|LOVE BUILDING BLO...|
|536367| 21777|RECIPE BOX WITH M...|
+------+------+--------------------+
only showing top 20 rows



# Data Grouping

In [12]:
# Group the data by InvoiceNo (column "ID") and collect each invoice's stock
# codes into a single list column named "Items".

from pyspark.sql.functions import collect_list
query1_group = (
    query1
    .groupBy("ID")
    .agg(collect_list("Items").alias("Items"))
)

In [13]:
# Preview the grouped result: one row per ID with its list of Items.
query1_group.show()

+-------+--------------------+
|     ID|               Items|
+-------+--------------------+
| 536596|[21624, 22900, 22...|
| 536938|[22386, 85099C, 2...|
| 537252|             [22197]|
| 537691|[22791, 22171, 82...|
| 538041|             [22145]|
| 538184|[22585, 21481, 22...|
| 538517|[22491, 21232, 21...|
| 538879|[84819, 22150, 21...|
| 539275|[22909, 22423, 22...|
| 539630|[21484, 85099B, 2...|
| 540499|[21868, 22697, 22...|
| 540540|[21877, 21868, 21...|
| 540976|[22394, 21890, 22...|
| 541432|[21485, 22457, 84...|
| 541518|[21880, 21881, 21...|
| 541783|[22423, 22854, 22...|
| 542026|[21754, 82600, 22...|
| 542375|[21731, 22367, 22...|
|C540850|             [21231]|
| 543641|[85123A, 21833, 2...|
+-------+--------------------+
only showing top 20 rows



# Remove Duplicate Data

In [15]:
# Remove duplicate data: each invoice's item list is reduced to its distinct
# values and stored in a new "Distinct Items" column.

from pyspark.sql.functions import udf
from pyspark.sql.types import ArrayType, StringType


def _unique_items(items):
    """Return the distinct elements of ``items`` as a list (order is not kept)."""
    return list(set(items))


distinct_items = udf(_unique_items, ArrayType(StringType()))
query1_group = query1_group.withColumn("Distinct Items", distinct_items("Items"))

In [16]:
# Preview the grouped data again, now including the "Distinct Items" column.
query1_group.show()

+-------+--------------------+--------------------+
|     ID|               Items|      Distinct Items|
+-------+--------------------+--------------------+
| 536596|[21624, 22900, 22...|[84926A, 21624, 2...|
| 536938|[22386, 85099C, 2...|[21479, 84997B, 2...|
| 537252|             [22197]|             [22197]|
| 537691|[22791, 22171, 82...|[20975, 22149, 21...|
| 538041|             [22145]|             [22145]|
| 538184|[22585, 21481, 22...|[22492, 22561, 48...|
| 538517|[22491, 21232, 21...|[22197, 22844, 22...|
| 538879|[84819, 22150, 21...|[22593, 22983, 22...|
| 539275|[22909, 22423, 22...|[22423, 21914, 22...|
| 539630|[21484, 85099B, 2...|[22988, 84347, 22...|
| 540499|[21868, 22697, 22...|[21755, 84978, 22...|
| 540540|[21877, 21868, 21...|[22555, 22551, 22...|
| 540976|[22394, 21890, 22...|[22207, 21110, 84...|
| 541432|[21485, 22457, 84...|[22113, 22457, 21...|
| 541518|[21880, 21881, 21...|[20724, 21982, 20...|
| 541783|[22423, 22854, 22...|[22197, 84978, 22...|
| 542026|[21

# FP Growth Algorithm

In [17]:
from pyspark.ml.fpm import FPGrowth

In [18]:
# First attempt: minimum support 0.5 and minimum confidence 0.5, fitted on the
# de-duplicated item lists.
fpGrowth = FPGrowth(
    itemsCol="Distinct Items",
    minSupport=0.5,
    minConfidence=0.5,
)
model = fpGrowth.fit(query1_group)

In [19]:
# Frequent itemsets of the first model (empty in the saved output).
model.freqItemsets.show()

+-----+----+
|items|freq|
+-----+----+
+-----+----+



In [20]:
# Association rules of the first model (empty in the saved output).
model.associationRules.show()

+----------+----------+----------+
|antecedent|consequent|confidence|
+----------+----------+----------+
+----------+----------+----------+



In [21]:
# Second attempt: thresholds lowered to minimum support 0.05 and minimum
# confidence 0.1.
fpGrowth1 = FPGrowth(
    itemsCol="Distinct Items",
    minSupport=0.05,
    minConfidence=0.1,
)
model1 = fpGrowth1.fit(query1_group)

In [22]:
# Frequent itemsets of the second model (the saved output lists only
# single-item sets).
model1.freqItemsets.show()

+--------+----+
|   items|freq|
+--------+----+
|[85123A]|2246|
| [22423]|2172|
|[85099B]|2135|
| [47566]|1706|
| [20725]|1608|
| [84879]|1468|
| [22720]|1462|
| [22197]|1442|
| [21212]|1334|
| [22383]|1306|
| [20727]|1295|
+--------+----+



In [23]:
# Association rules of the second model (still empty in the saved output).
model1.associationRules.show()

+----------+----------+----------+
|antecedent|consequent|confidence|
+----------+----------+----------+
+----------+----------+----------+



In [24]:
# Third attempt: minimum support lowered further to 0.01, minimum confidence
# kept at 0.1.
fpGrowth2 = FPGrowth(
    itemsCol="Distinct Items",
    minSupport=0.01,
    minConfidence=0.1,
)
model2 = fpGrowth2.fit(query1_group)

In [25]:
# Frequent itemsets of the third model (the saved output now includes
# two-item sets).
model2.freqItemsets.show()

+----------------+----+
|           items|freq|
+----------------+----+
|         [22633]| 487|
|         [23236]| 344|
|        [85123A]|2246|
|         [22423]|2172|
| [22423, 85123A]| 355|
|         [22667]| 486|
|         [22579]| 343|
|  [22579, 22578]| 282|
|        [85099B]|2135|
| [85099B, 22423]| 288|
|[85099B, 85123A]| 404|
|         [22620]| 486|
|        [84536A]| 342|
|         [71053]| 342|
|         [47566]|1706|
| [47566, 85099B]| 332|
|  [47566, 22423]| 398|
| [47566, 85123A]| 391|
|         [85150]| 483|
|         [20725]|1608|
+----------------+----+
only showing top 20 rows



In [26]:
# Association rules of the third model: antecedent, consequent, confidence.
model2.associationRules.show()

+--------------+----------+-------------------+
|    antecedent|consequent|         confidence|
+--------------+----------+-------------------+
|       [22554]|   [22551]| 0.4823695345557123|
|       [22554]|   [22556]| 0.3991537376586742|
|       [22960]|   [21212]|0.21885245901639344|
|       [22960]|  [85099B]|0.23688524590163934|
|       [22960]|   [22423]|0.23852459016393443|
|       [22960]|   [22720]| 0.3155737704918033|
|       [22960]|   [22961]|0.38934426229508196|
|       [22960]|   [22666]|0.28032786885245903|
|       [22960]|   [22993]| 0.2540983606557377|
|       [22960]|   [22697]|0.21475409836065573|
|       [22960]|   [22722]|0.22131147540983606|
|[20726, 22382]|   [20728]|  0.546583850931677|
|[20726, 22382]|   [20725]| 0.6356107660455487|
|[20726, 22382]|   [20727]| 0.5445134575569358|
|[20726, 22382]|   [22383]| 0.5403726708074534|
|       [21977]|   [21212]| 0.4948571428571429|
|       [21977]|   [84991]| 0.4045714285714286|
|       [22699]|   [22423]|0.47946428571

In [27]:
# Apply the third model to every invoice; the result gains a "prediction"
# column next to ID, Items and Distinct Items.
model2.transform(query1_group).show()

+-------+--------------------+--------------------+--------------------+
|     ID|               Items|      Distinct Items|          prediction|
+-------+--------------------+--------------------+--------------------+
| 536596|[21624, 22900, 22...|[84926A, 21624, 2...|[23355, 22112, 22...|
| 536938|[22386, 85099C, 2...|[21479, 84997B, 2...|[85099B, 20725, 2...|
| 537252|             [22197]|             [22197]|[85099B, 20725, 2...|
| 537691|[22791, 22171, 82...|[20975, 22149, 21...|[21212, 85099B, 2...|
| 538041|             [22145]|             [22145]|                  []|
| 538184|[22585, 21481, 22...|[22492, 22561, 48...|[85099B, 85123A, ...|
| 538517|[22491, 21232, 21...|[22197, 22844, 22...|[85099B, 20725, 2...|
| 538879|[84819, 22150, 21...|[22593, 22983, 22...|[84991, 85099B, 2...|
| 539275|[22909, 22423, 22...|[22423, 21914, 22...|[85123A, 85099B, ...|
| 539630|[21484, 85099B, 2...|[22988, 84347, 22...|[85123A, 47566, 2...|
| 540499|[21868, 22697, 22...|[21755, 84978, 22...|

In [44]:
# One-row test basket: ID '0' whose "Distinct Items" holds only stock code 21231.
dataframe = spark.createDataFrame(
    [("0", ["21231"])],
    ["ID", "Distinct Items"],
)

In [45]:
# Display the single-row test basket.
dataframe.show()

+---+--------------+
| ID|Distinct Items|
+---+--------------+
|  0|       [21231]|
+---+--------------+



In [46]:
# Run the third model on the test basket (the saved output predicts [21232]).
model2.transform(dataframe).show()

+---+--------------+----------+
| ID|Distinct Items|prediction|
+---+--------------+----------+
|  0|       [21231]|   [21232]|
+---+--------------+----------+



# Conclusion

In [37]:
# Register query2 (ID, Items, Description) as the temp view "conclusion" so
# stock codes can be mapped back to their descriptions with SQL.
query2.createOrReplaceTempView("conclusion")

In [49]:
# Look up the distinct description(s) recorded for stock code 21231.
conc = spark.sql(
    "SELECT DISTINCT Description FROM conclusion WHERE Items='21231'"
)

In [50]:
# Show the description(s) found for stock code 21231.
conc.show()

+--------------------+
|         Description|
+--------------------+
|SWEETHEART CERAMI...|
+--------------------+



In [52]:
# Look up the distinct description(s) recorded for stock code 21232.
conc1 = spark.sql(
    "SELECT DISTINCT Description FROM conclusion WHERE Items='21232'"
)

In [53]:
# Show the description(s) found for stock code 21232 (two distinct rows in the
# saved output).
conc1.show()

+--------------------+
|         Description|
+--------------------+
|STRAWBERRY CERAMI...|
|STRAWBERRY CERAMI...|
+--------------------+



##### Dapat disimpulkan bahwa ketika orang membeli Sweetheart Ceramic, mereka juga cenderung membeli Strawberry Ceramic.

# Reference

1. FP Growth Tutorial https://spark.apache.org/docs/2.3.0/ml-frequent-pattern-mining.html
2. Menghilangkan multiple data https://stackoverflow.com/questions/54185710/remove-duplicates-from-pyspark-array-column