<a href="https://colab.research.google.com/github/julioger/FP_Growth_Apache/blob/main/FP_Growth_Apache.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Mount Google Drive into the Colab filesystem at /content/gdrive.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [3]:
import pandas as pd
import numpy as np

# Sales workbook stored on the mounted Google Drive.
excel_path = '/content/gdrive/MyDrive/DDDDFolder/DDDD.xlsx'

# Load the workbook into a pandas DataFrame and preview the first rows.
data = pd.read_excel(excel_path)
data.head()

Unnamed: 0,SalesDate,SalesValue,SalesAmount,Customer,SalesTransactionID,SalesItem
0,2018-09-28,8280.0,10,0,0,0
1,2018-09-28,7452.0,10,0,0,0
2,2019-04-23,21114.0,30,0,1,0
3,2019-04-23,7038.0,10,0,1,1
4,2019-04-23,7000.0,2,0,1,2


In [4]:
%%capture

!sudo apt-get update --fix-missing

!apt-get install openjdk-8-jdk-headless -qq > /dev/null

!wget -q https://archive.apache.org/dist/spark/spark-3.0.0/spark-3.0.0-bin-hadoop3.2.tgz

#!wget -q https://downloads.apache.org/spark/spark-3.0.0/spark-3.0.0-bin-hadoop3.2.tgz
!mv spark-3.0.0-bin-hadoop3.2.tgz sparkkk

!tar xf sparkkk

!pip install -q findspark

In [5]:
import os

# Point the Spark tooling at the Java and Spark installations. These are set
# before findspark.init() and before the session starts so they are in place
# when Spark is located and launched.
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.0.0-bin-hadoop3.2"

import findspark
findspark.init()

from pyspark.sql import SparkSession

# Configure master and application name on a single builder. getOrCreate()
# returns any session that is already running, so settings supplied through a
# second builder call would not reconfigure the existing SparkContext.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName('fpgrowth')
    .getOrCreate()
)
spark

In [6]:
# Spark column functions (F) and the FP-Growth estimator used by later cells.
from pyspark.sql import functions as F
from pyspark.ml.fpm import FPGrowth
import pandas
# Convert the pandas DataFrame `data` into a Spark DataFrame.
sparkdata = spark.createDataFrame(data)

In [7]:
# Build one basket (list of items) per transaction.
# - dropDuplicates keeps a single row per (transaction, item) pair; FPGrowth
#   requires the items within a transaction to be unique.
# - No sort is done before the groupBy: the aggregation does not preserve row
#   order, so only the final sort on the grouped result is needed.
# - The aggregated column keeps Spark's auto-generated name
#   "collect_list(SalesItem)".
basketdata = (
    sparkdata
    .dropDuplicates(['SalesTransactionID', 'SalesItem'])
    .groupBy("SalesTransactionID")
    .agg(F.collect_list("SalesItem"))
    .sort('SalesTransactionID')
)

In [8]:
# Frequent Pattern Growth (FP-Growth) mines frequent itemsets using a minimum
# support threshold. Association rules are then generated from those itemsets
# and filtered by a minimum confidence; each rule is reported with its
# confidence and lift.
MIN_SUPPORT = 0.006     # minimum support for an itemset to count as frequent
MIN_CONFIDENCE = 0.006  # minimum confidence for a rule to be generated

fpGrowth = FPGrowth(
    itemsCol="collect_list(SalesItem)",
    minSupport=MIN_SUPPORT,
    minConfidence=MIN_CONFIDENCE,
)
model = fpGrowth.fit(basketdata)

# Frequent itemsets.
items = model.freqItemsets
items.show()

# Generated association rules.
rules = model.associationRules
rules.show()

# transform checks each basket against all association rules and summarizes
# the matching consequents in a `prediction` column.
transformed = model.transform(basketdata)
transformed.show()

+-------------------+----+
|              items|freq|
+-------------------+----+
|              [257]| 432|
|               [20]|2837|
|              [104]|2417|
|          [104, 20]| 981|
|             [1491]| 432|
|              [110]|2172|
|         [110, 104]| 745|
|     [110, 104, 20]| 476|
|          [110, 20]| 765|
|             [1495]| 431|
|              [103]|2123|
|         [103, 110]| 671|
|    [103, 110, 104]| 445|
|[103, 110, 104, 20]| 348|
|     [103, 110, 20]| 444|
|         [103, 104]| 885|
|     [103, 104, 20]| 572|
|          [103, 20]| 861|
|              [179]| 431|
|               [67]|1975|
+-------------------+----+
only showing top 20 rows

+------------+----------+-------------------+------------------+
|  antecedent|consequent|         confidence|              lift|
+------------+----------+-------------------+------------------+
|       [128]|      [67]| 0.3379978471474704|  8.28753607390552|
|       [128]|      [91]|0.34230355220667386|10.666918802548512|
|

In [9]:
# Bring the frequent itemsets back from Spark as a pandas DataFrame and
# preview the first rows.
result_pdf = items.toPandas()
result_pdf.head()

Unnamed: 0,items,freq
0,[257],432
1,[20],2837
2,[104],2417
3,"[104, 20]",981
4,[1491],432


In [10]:
# Save the frequent itemsets to an Excel workbook (path is relative to the
# current working directory).
result_pdf.to_excel('result_pdfItemsFreq.xlsx')

In [11]:
# Bring the association rules back from Spark as a pandas DataFrame and
# preview the first rows.
rules_pdf = rules.toPandas()
rules_pdf.head()

Unnamed: 0,antecedent,consequent,confidence,lift
0,[128],[67],0.337998,8.287536
1,[128],[91],0.342304,10.666919
2,[128],[104],0.387513,7.764057
3,[128],[92],0.315393,9.648273
4,[128],[103],0.355221,8.102645


In [12]:
# Save the association rules to an Excel workbook.
rules_pdf.to_excel('rules_pdfAnteConseConfLift.xlsx')

# Bring the per-transaction predictions back from Spark as a pandas DataFrame
# and preview the first rows.
transformed_pdf = transformed.toPandas()
transformed_pdf.head()

Unnamed: 0,SalesTransactionID,collect_list(SalesItem),prediction
0,0,[0],[]
1,1,"[0, 1, 2]",[]
2,2,[1],[]
3,3,[0],[]
4,4,[0],[]


In [13]:
# Save the per-transaction baskets and predictions to an Excel workbook (path
# is relative to the current working directory).
transformed_pdf.to_excel('transformed_pdfSalesTransactionIDCollectListPred.xlsx')