In [0]:
%run "./transform"

In [0]:
%run "./extractor"

In [0]:
%run "./loader"

In [0]:
class FirstWorkFlow:
    """
    ETL pipeline that produces the data for customers who bought AirPods
    just after buying an iPhone.

    The extractor, transformer and loader classes used here are not defined
    in this notebook; presumably they come from the notebooks included with
    %run -- verify there.
    """

    def __init__(self):
        # Nothing to configure: every collaborator is created inside runner().
        pass

    def runner(self):
        """Run the extract -> transform -> load sequence once; returns nothing."""
        # Extract: fetch the input data for this pipeline.
        sourceData = AirpodsAfterIPhoneExtractor().extract()

        # Transform: customers who bought AirPods after an iPhone.
        airpodsAfterIphoneDF = AirpodsAfterIphone().transform(sourceData)

        # Load: hand the transformed result to its loader and call sink().
        loader = AirpodsLoader(airpodsAfterIphoneDF)
        loader.sink()
    

In [0]:
class SecondWorkFlow:
    """
    ETL pipeline that produces the data for customers who bought only an
    iPhone and AirPods.

    Three stages run in order: extract, transform, load. The classes that
    implement each stage are defined outside this notebook cell.
    """

    def __init__(self):
        # Stateless: runner() builds everything it needs on each call.
        pass

    def runner(self):
        """Execute the pipeline end to end; no value is returned."""
        # Stage 1 (extract): pull the input data.
        extractedData = AirpodsAfterIPhoneExtractor().extract()

        # Stage 2 (transform): keep customers who bought AirPods and iPhone only.
        transformer = onlyAirPodsAndIphone()
        airpodsAndIphoneOnlyDF = transformer.transform(extractedData)

        # Stage 3 (load): pass the result to the loader and call sink().
        AirpodsAndIphoneLoader(airpodsAndIphoneOnlyDF).sink()


In [0]:
class ThirdWorkFlow:
    """
    ETL pipeline that finds, for each customer, the average time delay
    between buying an iPhone and buying AirPods.
    """

    def __init__(self):
        # No instance state is needed.
        pass

    def runner(self):
        """Extract the input, compute the average delay, and sink the result."""
        # Extract
        extractedData = AirpodsAfterIPhoneExtractor().extract()

        # Transform
        delayTransformer = AverageTimeDelay()
        averageDelayDF = delayTransformer.transform(extractedData)

        # Load
        delayLoader = AverageTimeDelayLoader(averageDelayDF)
        delayLoader.sink()


In [0]:
class FourthWorkFlow:
    """
    ETL pipeline that finds the top 3 selling products in each category,
    ranked by total revenue.
    """

    def __init__(self):
        # Intentionally empty; the pipeline keeps no state between runs.
        pass

    def runner(self):
        """Run extract, transform and load in sequence; returns nothing."""
        # Step 1: extract the input data.
        # NOTE(review): this reuses AirpodsAfterIPhoneExtractor, the same
        # extractor as the other workflows -- confirm it supplies what
        # TopSellingProducts needs.
        extractedData = AirpodsAfterIPhoneExtractor().extract()

        # Step 2: compute the top-selling products.
        topProductsDF = TopSellingProducts().transform(extractedData)

        # Step 3: pass the result to its loader and call sink().
        productsLoader = TopSellingProductsLoader(topProductsDF)
        productsLoader.sink()


In [0]:
class WorkFlowRunner:
    """
    Dispatch to one of the workflow classes by name.

    Parameters
    ----------
    name : str
        Which workflow to run. Must be one of 'firstWorkFlow',
        'secondWorkFlow', 'thirdWorkFlow' or 'fourthWorkFlow'.
    """

    # Names accepted by runner(); used only to build the error message.
    VALID_NAMES = ('firstWorkFlow', 'secondWorkFlow', 'thirdWorkFlow', 'fourthWorkFlow')

    def __init__(self,name):
        self.name = name


    def runner(self):
        """
        Instantiate the workflow selected by ``self.name`` and run it.

        Returns
        -------
        Whatever the selected workflow's ``runner()`` returns.

        Raises
        ------
        ValueError
            If ``self.name`` is not one of the known workflow names.
            Previously an unknown name was silently ignored and nothing ran.
        """
        # An if/elif chain is kept on purpose: only the selected workflow class
        # is looked up, so the others need not be defined in the session.
        if self.name == 'firstWorkFlow':
            return FirstWorkFlow().runner()
        elif self.name == 'secondWorkFlow':
            return SecondWorkFlow().runner()
        elif self.name == 'thirdWorkFlow':
            return ThirdWorkFlow().runner()
        elif self.name == 'fourthWorkFlow':
            return FourthWorkFlow().runner()
        else:
            # Fail loudly so a misspelled name does not look like a successful run.
            raise ValueError(
                f"Unknown workflow name {self.name!r}; "
                f"expected one of: {', '.join(self.VALID_NAMES)}"
            )

# Pick the pipeline to execute; it must be one of the names that
# WorkFlowRunner.runner() checks for.
name = "fourthWorkFlow"

# Build the dispatcher and run the selected workflow immediately.
# `workFlowrunner` receives runner()'s return value, not the WorkFlowRunner
# instance itself.
workFlowrunner = WorkFlowRunner(name=name).runner()

+--------------+-----------+------------+----------------+
|transaction_id|customer_id|product_name|transaction_date|
+--------------+-----------+------------+----------------+
|            11|        105|      iPhone|      2022-02-01|
|            12|        106|      iPhone|      2022-02-02|
|            13|        107|     AirPods|      2022-02-03|
|            14|        105|     AirPods|      2022-02-04|
|            15|        108|      iPhone|      2022-02-05|
|            16|        106|     MacBook|      2022-02-06|
|            17|        107|      iPhone|      2022-02-07|
|            18|        105|     MacBook|      2022-02-08|
|            19|        108|     AirPods|      2022-02-09|
|            20|        106|     AirPods|      2022-02-10|
+--------------+-----------+------------+----------------+

+----------+------------+----------+-----+
|product_id|product_name|  category|price|
+----------+------------+----------+-----+
|         5|   iPhone SE|Smartphone|  450|
|

In [0]:
from pyspark.sql import SparkSession

# Exploratory cell: read the three source CSV files and preview each one.
spark = SparkSession.builder.appName("test").getOrCreate()


def _load_and_preview(path):
    """Read the CSV at `path` (first row used as header), show it, and return the DataFrame."""
    frame = spark.read.format("csv").option("header",True).load(path)
    frame.show()
    return frame


# Each file is read and shown before the next one is touched.
input_trans_df = _load_and_preview("dbfs:/FileStore/tables/Transaction_Updated.csv")
input_prod_df = _load_and_preview("dbfs:/FileStore/tables/Products_Updated.csv")
input_cust_df = _load_and_preview("dbfs:/FileStore/tables/Customer_Updated.csv")

+--------------+-----------+------------+----------------+
|transaction_id|customer_id|product_name|transaction_date|
+--------------+-----------+------------+----------------+
|            11|        105|      iPhone|      2022-02-01|
|            12|        106|      iPhone|      2022-02-02|
|            13|        107|     AirPods|      2022-02-03|
|            14|        105|     AirPods|      2022-02-04|
|            15|        108|      iPhone|      2022-02-05|
|            16|        106|     MacBook|      2022-02-06|
|            17|        107|      iPhone|      2022-02-07|
|            18|        105|     MacBook|      2022-02-08|
|            19|        108|     AirPods|      2022-02-09|
|            20|        106|     AirPods|      2022-02-10|
+--------------+-----------+------------+----------------+

+----------+------------+----------+-----+
|product_id|product_name|  category|price|
+----------+------------+----------+-----+
|         5|   iPhone SE|Smartphone|  450|
|