In [15]:
from pyspark.sql import SparkSession, functions as f
import os

In [16]:

# Ensure the output directories for the insight tables exist.
# `exist_ok=True` makes this cell safe to re-run and removes the
# check-then-create race of `os.path.exists` + `os.makedirs`.
# (The previous version created a misspelled "insigths" directory instead of
# "insights/pre_insights".)
for output_dir in (
    "../../../data/insights",
    "../../../data/insights/pre_insights",
):
    os.makedirs(output_dir, exist_ok=True)

In [17]:
# Build (or reuse) the Spark session used by every cell below.
# Each setting is applied once; the session time zone was previously set twice.
spark = (
    SparkSession.builder.appName("Preprocessing_Yellow")
    # Render DataFrames eagerly when they are the last expression of a cell.
    .config("spark.sql.repl.eagerEval.enabled", True)
    .config("spark.sql.parquet.cacheMetadata", "true")
    # Interpret and display timestamps in UTC.
    .config("spark.sql.session.timeZone", "Etc/UTC")
    # Resource sizing for the driver and executors.
    .config("spark.driver.memory", "3g")
    .config("spark.executor.memory", "4g")
    .config("spark.executor.instances", "2")
    .config("spark.executor.cores", "2")
    .getOrCreate()
)

In [18]:
# Load the curated combined dataset that the tables below are derived from.
curated_path = "../../../data/curated/all_data_combined.parquet"
all_data_combined = spark.read.parquet(curated_path)

In [19]:
# Preview the first rows of the combined dataset.
all_data_combined.show()

+-----------------+------------+--------------+-------+-----------+-------------+--------------+---------------+-----------------------------+----------------------------+--------------------+--------------------+----------------------+--------------------+----------------------------+--------------------------------------+----------------------------+-----------------------------------------------+-----------------------------------------------+------------------------------------+-------------------------------------------+------------------------------------------+-------------------------------------------+------------------------------------+
|consumer_postcode|merchant_abn|order_datetime|user_id|consumer_id|consumer_name|consumer_state|consumer_gender|transaction_dollar_value_$AUD|consumer_fraud_probability_%|       merchant_name|merchant_description|merchant_revenue_level|merchant_take_rate_%|merchant_fraud_probability_%|consumer_postcode_estimated_population|consumer_postcode_m

In [20]:
# List the available column names before deciding what to drop or rename.
all_data_combined.columns

['consumer_postcode',
 'merchant_abn',
 'order_datetime',
 'user_id',
 'consumer_id',
 'consumer_name',
 'consumer_state',
 'consumer_gender',
 'transaction_dollar_value_$AUD',
 'consumer_fraud_probability_%',
 'merchant_name',
 'merchant_description',
 'merchant_revenue_level',
 'merchant_take_rate_%',
 'merchant_fraud_probability_%',
 'consumer_postcode_estimated_population',
 'consumer_postcode_median_age',
 'consumer_postcode_median_mortgage_repay_monthly',
 'consumer_postcode_median_totl_prsnal_inc_weekly',
 'consumer_postcode_median_rent_weekly',
 'consumer_postcode_median_tot_fam_inc_weekly',
 'consumer_postcode_avg_num_psns_per_bedroom',
 'consumer_postcode_median_tot_hhd_inc_weekly',
 'consumer_postcode_avg_household_size']

In [21]:
# Working copy of the combined data:
#   * drop the columns not carried into any of the tables written below
#     (each listed once; `consumer_gender` was previously dropped twice),
#   * rename the transaction amount to a name without `$`, so it can be
#     referenced with plain attribute access (`all_df.dollar_value`).
all_df = (
    all_data_combined
    .drop("consumer_gender", "user_id")
    .withColumnRenamed("transaction_dollar_value_$AUD", "dollar_value")
)
    

In [22]:
# Confirm the drop/rename above produced the expected column set.
all_df.columns

['consumer_postcode',
 'merchant_abn',
 'order_datetime',
 'consumer_id',
 'consumer_name',
 'consumer_state',
 'dollar_value',
 'consumer_fraud_probability_%',
 'merchant_name',
 'merchant_description',
 'merchant_revenue_level',
 'merchant_take_rate_%',
 'merchant_fraud_probability_%',
 'consumer_postcode_estimated_population',
 'consumer_postcode_median_age',
 'consumer_postcode_median_mortgage_repay_monthly',
 'consumer_postcode_median_totl_prsnal_inc_weekly',
 'consumer_postcode_median_rent_weekly',
 'consumer_postcode_median_tot_fam_inc_weekly',
 'consumer_postcode_avg_num_psns_per_bedroom',
 'consumer_postcode_median_tot_hhd_inc_weekly',
 'consumer_postcode_avg_household_size']

In [23]:
# Merchant attributes table.
# NOTE(review): no de-duplication is applied here, so the output has one row
# per row of `all_df` — confirm that is intended.
merchant_columns = [
    "merchant_abn",
    "merchant_name",
    "merchant_revenue_level",
    "merchant_take_rate_%",
    "merchant_fraud_probability_%",
]
merchant = all_df.select(*merchant_columns)

merchant.write.parquet(
    "../../../data/insights/pre_insights/merchant.parquet",
    mode="overwrite",
)

                                                                                

In [24]:
# Consumer table: each consumer alongside the merchant they appear with and
# the consumer's fraud probability.
consumer_columns = [
    "merchant_abn",
    "merchant_name",
    "consumer_id",
    "consumer_name",
    "consumer_fraud_probability_%",
]
consumers = all_df.select(*consumer_columns)

consumers.write.parquet(
    "../../../data/insights/pre_insights/consumers.parquet",
    mode="overwrite",
)

                                                                                

In [25]:
# Orders table: who bought from whom, when, and for how much.
order_columns = [
    "merchant_abn",
    "merchant_name",
    "consumer_id",
    "order_datetime",
    "dollar_value",
]
orders = all_df.select(*order_columns)

orders.write.parquet(
    "../../../data/insights/pre_insights/orders.parquet",
    mode="overwrite",
)

                                                                                

In [26]:
# Merchant description table: identifier, name and description text.
description_columns = ["merchant_abn", "merchant_name", "merchant_description"]
descriptions = all_df.select(*description_columns)

descriptions.write.parquet(
    "../../../data/insights/pre_insights/descriptions.parquet",
    mode="overwrite",
)

                                                                                

In [27]:
# Postcode statistics table.
# Identifier columns keep their names; the `consumer_postcode_*` statistics
# are renamed to shorter aliases. The mapping's insertion order defines the
# output column order.
postcode_key_columns = [
    "merchant_abn",
    "consumer_postcode",
    "consumer_id",
    "merchant_name",
]
postcode_stat_aliases = {
    "consumer_postcode_estimated_population": "estimated_population",
    "consumer_postcode_median_age": "median_age",
    "consumer_postcode_median_mortgage_repay_monthly": "median_mortgage_monthly",
    "consumer_postcode_median_totl_prsnal_inc_weekly": "total_weekly_personal_income",
    "consumer_postcode_median_rent_weekly": "median_weekly_rent",
    "consumer_postcode_median_tot_fam_inc_weekly": "total_weekly_fam_income",
    "consumer_postcode_avg_num_psns_per_bedroom": "avg_num_persons_per_bedroom",
    "consumer_postcode_median_tot_hhd_inc_weekly": "total_hhd_income_weekly",
    "consumer_postcode_avg_household_size": "avg_household_size",
}
postcode_stat_columns = [
    f.col(source_name).alias(short_name)
    for source_name, short_name in postcode_stat_aliases.items()
]
postcode = all_df.select(*postcode_key_columns, *postcode_stat_columns)

postcode.write.parquet(
    "../../../data/insights/pre_insights/postcode.parquet",
    mode="overwrite",
)


                                                                                

In [28]:
# Stop the Spark session created earlier in this notebook.
spark.stop()