In [25]:
from pyspark.sql import SparkSession, functions as f
import os

In [26]:
# Make sure the output directory for the aggregated insight data exists.
# exist_ok=True makes this a single idempotent call: no separate existence
# check (which can race with another process creating the directory) and
# no repeated path literal.
os.makedirs("../../../data/insights/agg_insight_data", exist_ok=True)

In [27]:
# Build (or reuse) the Spark session used by the rest of the notebook.
# Each option is set exactly once; the session time zone was previously
# configured twice with the same value.
spark = (
    SparkSession.builder.appName("Preprocessing_Yellow")
    # Render DataFrames eagerly when they are the last expression of a cell.
    .config("spark.sql.repl.eagerEval.enabled", True)
    .config("spark.sql.parquet.cacheMetadata", "true")
    # Pin the SQL session time zone to UTC.
    .config("spark.sql.session.timeZone", "Etc/UTC")
    # Resource settings for the driver and executors.
    .config('spark.driver.memory', '3g')
    .config('spark.executor.memory', '4g')
    .config('spark.executor.instances', '2')
    .config('spark.executor.cores', '2')
    .getOrCreate()
)

In [28]:
# Read the merchant table from the pre_insights parquet directory.
merchant = spark.read.parquet(
    "../../../data/insights/pre_insights/merchant.parquet/"
)

In [29]:
# Preview the first 20 rows without truncating long cell values.
merchant.show(n=20, truncate=False)

+------------+------------------------------------+----------------------+--------------------+----------------------------+
|merchant_abn|merchant_name                       |merchant_revenue_level|merchant_take_rate_%|merchant_fraud_probability_%|
+------------+------------------------------------+----------------------+--------------------+----------------------------+
|92980848589 |Maecenas Industries                 |a                     |6.86                |0.0                         |
|58454491168 |Diam At Foundation                  |a                     |6.01                |0.0                         |
|24852446429 |Erat Vitae LLP                      |c                     |2.94                |0.0                         |
|45559085309 |A Auctor Non Corporation            |a                     |5.58                |0.0                         |
|58392414752 |Mattis Ornare Lectus Inc.           |c                     |1.47                |0.0                         |


In [30]:
# Inspect the (column name, Spark type) pairs of the merchant table.
merchant.dtypes

[('merchant_abn', 'bigint'),
 ('merchant_name', 'string'),
 ('merchant_revenue_level', 'string'),
 ('merchant_take_rate_%', 'float'),
 ('merchant_fraud_probability_%', 'double')]

In [31]:
# Number of distinct (merchant_name, merchant_revenue_level) combinations.
(
    merchant
    .select("merchant_name", "merchant_revenue_level")
    .distinct()
    .count()
)

                                                                                

4026

In [32]:
# For every merchant ABN, count how many distinct revenue levels it has.
merchant_revenue_level_count = (
    merchant
    .groupBy("merchant_abn")
    .agg(
        f.countDistinct("merchant_revenue_level").alias("revenue_level_count")
    )
)

# Same check for the take rate: distinct take rates per merchant ABN.
merchant_take_rate_count = (
    merchant
    .groupBy("merchant_abn")
    .agg(
        f.countDistinct("merchant_take_rate_%").alias("take_rate_count")
    )
)

In [33]:
# Number of ABNs that carry more than one distinct revenue level.
print(
    merchant_revenue_level_count
    .where(f.col("revenue_level_count") > 1)
    .count()
)
# Number of ABNs that carry more than one distinct take rate.
print(
    merchant_take_rate_count
    .where(f.col("take_rate_count") > 1)
    .count()
)

0




0


                                                                                

In [34]:
# Collapse the merchant table to one row per ABN.
# Revenue level and take rate are taken with first(); the distinct-count
# cells above compute how many values each ABN has for these two columns.
# NOTE(review): merchant_name is also taken with first() but has no such
# per-ABN check in this notebook - presumably one name per ABN; verify.
# The fraud probability is averaged over all rows of the ABN.
merchant_aggregated = (
    merchant
    .groupBy("merchant_abn")
    .agg(
        f.first("merchant_name").alias("name"),
        f.first("merchant_revenue_level").alias("revenue_level"),
        f.first("merchant_take_rate_%").alias("take_rate"),
        f.avg("merchant_fraud_probability_%").alias(
            "average_merchant_fraud_probability"
        ),
    )
)

In [35]:
# Preview the aggregated table (first 20 rows, long strings truncated).
merchant_aggregated.show(n=20, truncate=True)



+------------+--------------------+-------------+---------+----------------------------------+
|merchant_abn|                name|revenue_level|take_rate|average_merchant_fraud_probability|
+------------+--------------------+-------------+---------+----------------------------------+
| 10023283211|       Felis Limited|            e|     0.18|                               0.0|
| 10323485998|           Nunc Inc.|            a|     6.61|                               0.0|
| 10342410215|Facilisis Facilis...|            a|     6.34|                               0.0|
| 10346855916|      Odio Institute|            b|     3.57|                               0.0|
| 10385011947|   Tellus Foundation|            b|     3.17|                               0.0|
| 10385163239|      Sed Et Company|            a|     6.61|                               0.0|
| 10487253336|Laoreet Lectus As...|            b|     3.92|                               0.0|
| 10553813474|Commodo Hendrerit...|            b| 

                                                                                

In [36]:
# Persist the per-ABN aggregate as parquet, replacing any previous output.
(
    merchant_aggregated.write
    .mode("overwrite")
    .parquet("../../../data/insights/agg_insight_data/merchant_agg.parquet")
)

                                                                                