# Null & Missing Analysis

This notebook conducts analysis of null and missing values in the data, then checks again after joining.

In [4]:
from pyspark.sql import SparkSession, functions as F
from pyspark.sql.functions import col, isnan, when, count
from pyspark.sql.types import DateType
from pyspark.sql import DataFrame

In [5]:
# Build (or reuse) the Spark session that runs every job in this notebook.
# All session options are collected in one mapping and applied in order.
spark_conf = {
    "spark.sql.repl.eagerEval.enabled": True,
    "spark.sql.parquet.cacheMetadata": "true",
    "spark.sql.session.timeZone": "Etc/UTC",
    'spark.driver.memory': '4g',
    'spark.executor.memory': '2g',
}

session_builder = SparkSession.builder.appName("null analysis")
for conf_key, conf_value in spark_conf.items():
    session_builder = session_builder.config(conf_key, conf_value)

spark = session_builder.getOrCreate()

Functions to check null or missing values and size of dataframe

In [6]:
def count_null_values(df):
    """
    Count the null/None or NaN values in every column of a Spark DataFrame and show the result.

    Input:
        df: the Spark DataFrame to inspect.
    Output:
        None. A one-row table holding one count per column is printed with `show()`.
    """
    # NaN is only meaningful for floating-point data, so the isnan test is
    # applied to float/double columns only; every other column is checked
    # for nulls alone.
    float_columns = {name for name, dtype in df.dtypes if dtype in ("float", "double")}

    def is_null_or_nan(name):
        condition = col(name).isNull()
        if name in float_columns:
            condition = condition | isnan(col(name))
        return condition

    # `when` yields a non-null marker only for flagged rows, so `count`
    # (which skips nulls) returns the number of flagged rows per column
    count_expr = [count(when(is_null_or_nan(c), c)).alias(c) for c in df.columns]

    # Apply the count expressions and show the result
    result = df.select(count_expr)
    result.show()

In [7]:
def count_missing_data(df):
    """
    Count the missing data entries (empty strings) in every column of a Spark DataFrame and show the result.

    Input:
        df: the Spark DataFrame to inspect.
    Output:
        None. A one-row table holding one count per column is printed with `show()`.
    """
    # Only string-typed columns can hold an empty string. Comparing numeric or
    # date columns with '' relies on an implicit cast of '' to that type, which
    # may raise a cast error when ANSI SQL mode is enabled.
    string_columns = {
        name for name, dtype in df.dtypes
        if dtype.startswith(("string", "varchar", "char"))
    }

    def is_empty_string(name):
        if name in string_columns:
            return col(name) == ''
        # Non-string columns never match, so their count is always 0;
        # they stay in the output so the table keeps one entry per column
        return F.lit(False)

    count_expr = [F.count(when(is_empty_string(c), c)).alias(c) for c in df.columns]

    # Apply the count expressions and show the result
    result = df.select(count_expr)
    result.show()


In [8]:
def count_rows_and_print(df: DataFrame) -> int:
    """
    Report the size of a Spark DataFrame.

    Input:
        df: the Spark DataFrame whose rows are counted.
    Output:
        The row count, which is also printed to stdout.
    """
    n_rows = df.count()
    message = f"Number of rows in the DataFrame: {n_rows}"
    print(message)
    return n_rows


# Census Data

- sa2_census.parquet
- sa2_pops.parquet
- sa2_to_postcode.parquet

In [9]:
# Load the census (SA2) inputs: each parquet path is defined right next to its read

# SA2 census statistics
census_file_path = '../../../data/nulls&missing_analysis/sa2/sa2_census.parquet'
sa2_census = spark.read.parquet(census_file_path)

# SA2 population figures
pop_file_path = '../../../data/nulls&missing_analysis/sa2/sa2_pops.parquet'
sa2_pops = spark.read.parquet(pop_file_path)

# SA2 -> postcode mapping
post_file_path = '../../../data/nulls&missing_analysis/sa2/sa2_to_postcode.parquet'
sa2_post = spark.read.parquet(post_file_path)

Checking size of dataframes.

In [10]:
# Print a label, then the row count, for each raw SA2 table.
print("sa2_census:")
count_rows_and_print(sa2_census)

print("sa2_pops:")
count_rows_and_print(sa2_pops)

print("sa2_post:")
# Last expression in the cell: its return value is also echoed as the cell result
count_rows_and_print(sa2_post)

sa2_census:
Number of rows in the DataFrame: 2472
sa2_pops:
Number of rows in the DataFrame: 2454
sa2_post:
Number of rows in the DataFrame: 35040


35040

Checking for null values in sa2 data

In [11]:
# Show the per-column null counts for each raw SA2 table
print("Checking sa2_census:")
count_null_values(sa2_census)

print("Checking sa2_pops:")
count_null_values(sa2_pops)

print("Checking sa2_post:")
count_null_values(sa2_post)

Checking sa2_census:
+--------+--------------+---------------------------------+-------------------------------+----------------------+-----------------------------+--------------------------------+-----------------------------+--------------------------+
|sa2_code|sa2_median_age|sa2_median_mortgage_repay_monthly|sa2_median_tot_prsnl_inc_weekly|sa2_median_rent_weekly|sa2_median_tot_fam_inc_weekly|sa2_average_num_psns_per_bedroom|sa2_median_tot_hhd_inc_weekly|sa2_average_household_size|
+--------+--------------+---------------------------------+-------------------------------+----------------------+-----------------------------+--------------------------------+-----------------------------+--------------------------+
|       0|             0|                                0|                              0|                     0|                            0|                               0|                            0|                         0|
+--------+--------------+--------------

Checking for missing entries in sa2 data

In [12]:
# Show the per-column empty-string counts for each raw SA2 table
print("Checking sa2_census:")
count_missing_data(sa2_census)

print("Checking sa2_pops:")
count_missing_data(sa2_pops)

print("Checking sa2_post:")
count_missing_data(sa2_post)

Checking sa2_census:
+--------+--------------+---------------------------------+-------------------------------+----------------------+-----------------------------+--------------------------------+-----------------------------+--------------------------+
|sa2_code|sa2_median_age|sa2_median_mortgage_repay_monthly|sa2_median_tot_prsnl_inc_weekly|sa2_median_rent_weekly|sa2_median_tot_fam_inc_weekly|sa2_average_num_psns_per_bedroom|sa2_median_tot_hhd_inc_weekly|sa2_average_household_size|
+--------+--------------+---------------------------------+-------------------------------+----------------------+-----------------------------+--------------------------------+-----------------------------+--------------------------+
|       0|             0|                                0|                              0|                     0|                            0|                               0|                            0|                         0|
+--------+--------------+--------------

There are 363 null postcodes in the sa2_to_postcode dataframe out of the 35,040 rows. There are no missing data entries in any of the initial sa2 datasets.

# Consumer Data
- consumer_fraud_probability.parquet
- consumer_tbl.parquet
- consumer_user_details.parquet

In [13]:
# Load the consumer inputs: each parquet path is defined right next to its read

# Consumer fraud probabilities
fp_file_path = '../../../data/nulls&missing_analysis/consumer/consumer_fraud_probability.parquet'
cons_fp = spark.read.parquet(fp_file_path)

# Consumer table
cons_file_path = '../../../data/nulls&missing_analysis/consumer/consumer_tbl.parquet'
cons_tbl = spark.read.parquet(cons_file_path)

# user_id <-> consumer_id lookup
user_det_file_path = '../../../data/nulls&missing_analysis/consumer/consumer_user_details.parquet'
cons_user_det = spark.read.parquet(user_det_file_path)

Checking size of dataframes.

In [14]:
# Print a label, then the row count, for each raw consumer table.
print("cons_fp:")
count_rows_and_print(cons_fp)

print("cons_tbl:")
count_rows_and_print(cons_tbl)

print("cons_user_det:")
# Last expression in the cell: its return value is also echoed as the cell result
count_rows_and_print(cons_user_det)

cons_fp:
Number of rows in the DataFrame: 34864
cons_tbl:
Number of rows in the DataFrame: 499999
cons_user_det:
Number of rows in the DataFrame: 499999


499999

Checking for null values in the consumer data.

In [15]:
# Show the per-column null counts for each raw consumer table
print("Checking cons_tbl:")
count_null_values(cons_tbl)

print("Checking cons_user_det:")
count_null_values(cons_user_det)

print("Checking cons_fp:")
count_null_values(cons_fp)

Checking cons_tbl:


+-------------+--------------+-----------------+---------------+-----------+
|consumer_name|consumer_state|consumer_postcode|consumer_gender|consumer_id|
+-------------+--------------+-----------------+---------------+-----------+
|            0|             0|                0|              0|          0|
+-------------+--------------+-----------------+---------------+-----------+

Checking cons_user_det:
+-------+-----------+
|user_id|consumer_id|
+-------+-----------+
|      0|          0|
+-------+-----------+

Checking cons_fp:
+-------+--------------+----------------------------+
|user_id|order_datetime|consumer_fraud_probability_%|
+-------+--------------+----------------------------+
|      0|             0|                           0|
+-------+--------------+----------------------------+



Checking for missing data entries in the consumer data.

In [16]:
# Show the per-column empty-string counts for each raw consumer table
print("Checking cons_tbl:")
count_missing_data(cons_tbl)

print("Checking cons_user_det:")
count_missing_data(cons_user_det)

print("Checking cons_fp:")
count_missing_data(cons_fp)

Checking cons_tbl:


+-------------+--------------+-----------------+---------------+-----------+
|consumer_name|consumer_state|consumer_postcode|consumer_gender|consumer_id|
+-------------+--------------+-----------------+---------------+-----------+
|            0|             0|                0|              0|          0|
+-------------+--------------+-----------------+---------------+-----------+

Checking cons_user_det:
+-------+-----------+
|user_id|consumer_id|
+-------+-----------+
|      0|          0|
+-------+-----------+

Checking cons_fp:
+-------+--------------+----------------------------+
|user_id|order_datetime|consumer_fraud_probability_%|
+-------+--------------+----------------------------+
|      0|             0|                           0|
+-------+--------------+----------------------------+



There are no null values or missing entries in the consumer dataframes.

# Merchant Data
- merchant_fraud_probability.parquet
- merchant_tbl.parquet

In [17]:
# Load the merchant inputs: each parquet path is defined right next to its read

# Merchant fraud probabilities
fp_file_path = '../../../data/nulls&missing_analysis/merchant/merchant_fraud_probability.parquet'
merch_fp = spark.read.parquet(fp_file_path)

# Merchant table
merch_file_path = '../../../data/nulls&missing_analysis/merchant/merchant_tbl.parquet'
merch_tbl = spark.read.parquet(merch_file_path)

Checking size of dataframes.

In [18]:
# Print a label, then the row count, for each raw merchant table.
print("merch_fp:")
count_rows_and_print(merch_fp)

print("merch_tbl:")
# Last expression in the cell: its return value is also echoed as the cell result
count_rows_and_print(merch_tbl)

merch_fp:
Number of rows in the DataFrame: 114
merch_tbl:
Number of rows in the DataFrame: 4026


4026

Checking for null values in the merchant data

In [19]:
# Show the per-column null counts for each raw merchant table
print("Checking merch_tbl:")
count_null_values(merch_tbl)

print("Checking merch_fp:")
count_null_values(merch_fp)

Checking merch_tbl:
+-------------+------------+-----+-------------+-----------+
|merchant_name|merchant_abn|words|revenue_level|take_rate_%|
+-------------+------------+-----+-------------+-----------+
|            0|           0|    0|            0|          0|
+-------------+------------+-----+-------------+-----------+

Checking merch_fp:
+------------+--------------+----------------------------+
|merchant_abn|order_datetime|merchant_fraud_probability_%|
+------------+--------------+----------------------------+
|           0|             0|                           0|
+------------+--------------+----------------------------+



Checking for missing entries in the merchant data

In [20]:
# Show the per-column empty-string counts for each raw merchant table.
# The first label names merch_tbl, the table that is actually inspected.
print("Checking merch_tbl:")
count_missing_data(merch_tbl)

print("Checking merch_fp:")
count_missing_data(merch_fp)

Checking cons_tbl:


+-------------+------------+-----+-------------+-----------+
|merchant_name|merchant_abn|words|revenue_level|take_rate_%|
+-------------+------------+-----+-------------+-----------+
|            0|           0|    0|            0|          0|
+-------------+------------+-----+-------------+-----------+

Checking merch_fp:
+------------+--------------+----------------------------+
|merchant_abn|order_datetime|merchant_fraud_probability_%|
+------------+--------------+----------------------------+
|           0|             0|                           0|
+------------+--------------+----------------------------+



There are no null values or missing entries in the merchant dataframes.

# Transaction Data
- transactions_all.parquet

In [21]:
# Load in transaction data
# Path is relative to this notebook's working directory
transactions_file_path = '../../../data/nulls&missing_analysis/transaction/transactions_all.parquet'

# Read the transactions parquet into a Spark DataFrame
transactions_all = spark.read.parquet(transactions_file_path)

Checking size of dataframe

In [22]:
# Print a label, then the row count of the raw transaction table
print("transactions_all:")
# Last expression in the cell: its return value is also echoed as the cell result
count_rows_and_print(transactions_all)

transactions_all:
Number of rows in the DataFrame: 14195505


14195505

Checking for null values in the transaction data

In [23]:
# Show the per-column null counts for the raw transaction table
print("transactions_all:")
count_null_values(transactions_all)

transactions_all:


[Stage 84:>                                                       (0 + 12) / 14]

+-------+------------+------------+--------+--------------+
|user_id|merchant_abn|dollar_value|order_id|order_datetime|
+-------+------------+------------+--------+--------------+
|      0|           0|           0|       0|             0|
+-------+------------+------------+--------+--------------+



                                                                                

Checking for missing values in the transaction data

In [24]:
# Show the per-column empty-string counts for the raw transaction table
print("transactions_all:")
count_missing_data(transactions_all)

transactions_all:
+-------+------------+------------+--------+--------------+
|user_id|merchant_abn|dollar_value|order_id|order_datetime|
+-------+------------+------------+--------+--------------+
|      0|           0|           0|       0|             0|
+-------+------------+------------+--------+--------------+



                                                                                

There are no null values or missing entries in the transaction dataframe.

# Joining Datasets

Joining the datasets together, checking for null values and missing rows after each join.

# Join cons_tbl data with cons_user_det

In [25]:
# Attach user_id to each consumer record. The inner join keeps only
# consumer_ids that appear in both cons_tbl and cons_user_det.
cons_join = cons_tbl.join(cons_user_det, on='consumer_id', how='inner')

Check for null values

In [26]:
# Show the per-column null counts after joining the consumer tables
print("Check cons_join for null values:")
count_null_values(cons_join)

Check cons_join for null values:
+-----------+-------------+--------------+-----------------+---------------+-------+
|consumer_id|consumer_name|consumer_state|consumer_postcode|consumer_gender|user_id|
+-----------+-------------+--------------+-----------------+---------------+-------+
|          0|            0|             0|                0|              0|      0|
+-----------+-------------+--------------+-----------------+---------------+-------+



Check for missing rows after joining cons_tbl data with cons_user_det

In [27]:
# Row count after the join, to compare against the pre-join table sizes
print("cons_join:")
count_rows_and_print(cons_join)

cons_join:
Number of rows in the DataFrame: 499999


499999

There are no null values after joining the consumer data.
There are also no missing rows after this first join: there are 499,999 rows before the join and still 499,999 rows after it.

# Join cons_join with transaction_all

In [28]:
# Attach consumer details to every transaction. The inner join keeps only
# transactions whose user_id also appears in cons_join.
cons_transaction = cons_join.join(transactions_all, on='user_id', how ='inner')

Check for nulls after joining cons_join and transaction_all

In [29]:
# Show the per-column null counts for the consumer + transaction join
count_null_values(cons_transaction)



+-------+-----------+-------------+--------------+-----------------+---------------+------------+------------+--------+--------------+
|user_id|consumer_id|consumer_name|consumer_state|consumer_postcode|consumer_gender|merchant_abn|dollar_value|order_id|order_datetime|
+-------+-----------+-------------+--------------+-----------------+---------------+------------+------------+--------+--------------+
|      0|          0|            0|             0|                0|              0|           0|           0|       0|             0|
+-------+-----------+-------------+--------------+-----------------+---------------+------------+------------+--------+--------------+



                                                                                

Now check for missing rows after joining cons_join and transactions_all.

In [30]:
# Row count after the join, to compare against the size of transactions_all
print("cons_transaction:")
count_rows_and_print(cons_transaction)

cons_transaction:


                                                                                

Number of rows in the DataFrame: 14195505


14195505

There are no null values after joining in the transaction data.
There are also no missing rows after this join: the transaction data has 14,195,505 rows before the join and there are still 14,195,505 rows after it.

# Join cons_transaction with cons_fp

In [31]:
# Left join: every transaction is kept, and transactions with no matching
# (user_id, order_datetime) in cons_fp get a null consumer fraud probability.
# NOTE(review): a transaction matching several cons_fp rows is emitted once per
# match, which presumably explains the row-count increase seen below; confirm.
cons_transaction_with_fraud = cons_transaction.join(cons_fp, on=['user_id', 'order_datetime'], how='left')

Check for nulls after joining cons_transaction and cons_fp

In [32]:
# Show the per-column null counts after adding consumer fraud probabilities
count_null_values(cons_transaction_with_fraud)



+-------+--------------+-----------+-------------+--------------+-----------------+---------------+------------+------------+--------+----------------------------+
|user_id|order_datetime|consumer_id|consumer_name|consumer_state|consumer_postcode|consumer_gender|merchant_abn|dollar_value|order_id|consumer_fraud_probability_%|
+-------+--------------+-----------+-------------+--------------+-----------------+---------------+------------+------------+--------+----------------------------+
|      0|             0|          0|            0|             0|                0|              0|           0|           0|       0|                    14115157|
+-------+--------------+-----------+-------------+--------------+-----------------+---------------+------------+------------+--------+----------------------------+



                                                                                

Now check for missing rows after joining cons_transaction and cons_fp.

In [33]:
# Row count after the left join, to compare against cons_transaction
print("cons_transaction_with_fraud:")
count_rows_and_print(cons_transaction_with_fraud)

cons_transaction_with_fraud:
Number of rows in the DataFrame: 14195717


14195717

There are 14,115,157 null values in the consumer fraud probability column out of the 14,195,717 rows.

There is also an increase in the size of the data after this join. The number of rows before the join is 14,195,505 and there are 14,195,717 rows after the join. So 212 rows are gained in this join, most likely as a result of there being multiple matches in the right table for a single row in the left table.

# Join cons_transaction_with_fraud with merch_tbl

In [34]:
# Attach merchant details. The inner join drops every transaction whose
# merchant_abn has no match in merch_tbl.
cons_transaction_fraud_merchant = cons_transaction_with_fraud.join(merch_tbl, on='merchant_abn', how='inner')

Check for nulls after joining cons_transaction_with_fraud and merch_tbl

In [35]:
# Show the per-column null counts after adding merchant details
count_null_values(cons_transaction_fraud_merchant)



+------------+-------+--------------+-----------+-------------+--------------+-----------------+---------------+------------+--------+----------------------------+-------------+-----+-------------+-----------+
|merchant_abn|user_id|order_datetime|consumer_id|consumer_name|consumer_state|consumer_postcode|consumer_gender|dollar_value|order_id|consumer_fraud_probability_%|merchant_name|words|revenue_level|take_rate_%|
+------------+-------+--------------+-----------+-------------+--------------+-----------------+---------------+------------+--------+----------------------------+-------------+-----+-------------+-----------+
|           0|      0|             0|          0|            0|             0|                0|              0|           0|       0|                    13543038|            0|    0|            0|          0|
+------------+-------+--------------+-----------+-------------+--------------+-----------------+---------------+------------+--------+--------------------------

                                                                                

Checking for missing rows after joining cons_transaction_with_fraud and merch_tbl

In [36]:
# Row count after the inner join, to compare against cons_transaction_with_fraud
print("cons_transaction_fraud_merchant:")
count_rows_and_print(cons_transaction_fraud_merchant)

cons_transaction_fraud_merchant:


                                                                                

Number of rows in the DataFrame: 13614854


13614854

Still the only column with null values is consumer fraud probabilities with 13,543,038 missing out of 13,614,854 rows.

There is a decrease in the size of the data after this join. The number of rows before the join is 14,195,717, whereas after the join there are 13,614,854 rows. This means that 580,863 rows are dropped by this join, i.e. transactions whose merchant_abn has no match in merch_tbl.

# Join cons_transaction_fraud_merchant with merch_fp

In [37]:
# Left join: every row is kept, and rows with no matching
# (merchant_abn, order_datetime) in merch_fp get a null merchant fraud probability.
combined_data_all = cons_transaction_fraud_merchant.join(merch_fp, on=['merchant_abn', 'order_datetime'], how='left')

Check for nulls after joining cons_transaction_fraud_merchant and merch_fp

In [38]:
# Show the per-column null counts after adding merchant fraud probabilities
count_null_values(combined_data_all)



+------------+--------------+-------+-----------+-------------+--------------+-----------------+---------------+------------+--------+----------------------------+-------------+-----+-------------+-----------+----------------------------+
|merchant_abn|order_datetime|user_id|consumer_id|consumer_name|consumer_state|consumer_postcode|consumer_gender|dollar_value|order_id|consumer_fraud_probability_%|merchant_name|words|revenue_level|take_rate_%|merchant_fraud_probability_%|
+------------+--------------+-------+-----------+-------------+--------------+-----------------+---------------+------------+--------+----------------------------+-------------+-----+-------------+-----------+----------------------------+
|           0|             0|      0|          0|            0|             0|                0|              0|           0|       0|                    13543038|            0|    0|            0|          0|                    13610826|
+------------+--------------+-------+-------

                                                                                

Checking for missing rows after joining cons_transaction_fraud_merchant and merch_fp

In [39]:
# Row count after the left join, to compare against cons_transaction_fraud_merchant
print("combined_data_all:")
count_rows_and_print(combined_data_all)

combined_data_all:
Number of rows in the DataFrame: 13614854


                                                                                

13614854

After joining the merchant fraud probability, there are 2 columns that have null values. These are consumer_fraud_probability_% and merchant_fraud_probability_%. consumer_fraud_probability_% has 13,543,038 null values and merchant_fraud_probability_% has 13,610,826 null values.

There is also no change in the size of this data after this join. The number of rows before the join is 13,614,854 and remains the same after the join.

# Joining SA2/Census Data

# Join the sa2_to_post to sa2_pops

In [40]:
# Attach population figures to the SA2 -> postcode mapping (inner join on sa2_code),
# then rename population_2021 to sa2_population.
sa2_postcode_and_pops = sa2_post.join(sa2_pops, on='sa2_code', how='inner').withColumnRenamed('population_2021','sa2_population')

Check for nulls after joining sa2_post and sa2_pops

In [41]:
# Show the per-column null counts for the postcode + population join
count_null_values(sa2_postcode_and_pops)

+--------+--------+--------+--------+--------+--------------+
|sa2_code|postcode|sa4_name|sa3_name|sa2_name|sa2_population|
+--------+--------+--------+--------+--------+--------------+
|       0|     363|       0|       0|       0|             0|
+--------+--------+--------+--------+--------+--------------+



Checking for missing rows after joining sa2_post and sa2_pops

In [42]:
# Row count after the join, to compare against the size of sa2_post
print("sa2_postcode_and_pops:")
count_rows_and_print(sa2_postcode_and_pops)

sa2_postcode_and_pops:
Number of rows in the DataFrame: 35040


35040

This join introduces no new null values: the only nulls are the 363 null postcodes already present in sa2_to_postcode. No rows are missing either, with 35,040 rows still remaining after the join.

# Join the sa2_postcode_and_pops with sa2_census

In [43]:
# Join the SA2 data: add the census statistics to the postcode + population
# table (inner join on sa2_code)
sa2_combined = sa2_postcode_and_pops.join(sa2_census, on='sa2_code', how='inner')

Check for nulls after joining sa2_postcode_and_pops and sa2_census

In [44]:
# Show the per-column null counts for the fully joined SA2 data
count_null_values(sa2_combined)

+--------+--------+--------+--------+--------+--------------+--------------+---------------------------------+-------------------------------+----------------------+-----------------------------+--------------------------------+-----------------------------+--------------------------+
|sa2_code|postcode|sa4_name|sa3_name|sa2_name|sa2_population|sa2_median_age|sa2_median_mortgage_repay_monthly|sa2_median_tot_prsnl_inc_weekly|sa2_median_rent_weekly|sa2_median_tot_fam_inc_weekly|sa2_average_num_psns_per_bedroom|sa2_median_tot_hhd_inc_weekly|sa2_average_household_size|
+--------+--------+--------+--------+--------+--------------+--------------+---------------------------------+-------------------------------+----------------------+-----------------------------+--------------------------------+-----------------------------+--------------------------+
|       0|     363|       0|       0|       0|             0|             0|                                0|                              0|

Checking for missing rows after joining sa2_postcode_and_pops and sa2_census

In [45]:
# Row count after the join, to compare against sa2_postcode_and_pops
print("sa2_combined:")
count_rows_and_print(sa2_combined)

sa2_combined:
Number of rows in the DataFrame: 35040


35040

The null values found in the earlier checks are still present (363 postcode values are null), and no new null values are introduced.

Also after the joining, there are still 35,040 rows in the dataframe. No rows have been lost joining the data together.

# Now to join all tables together
First group together SA2 stats (copied from the ETL)

In [46]:
# Collapse the SA2-level statistics to one row per postcode: averages for the
# population, persons-per-bedroom and household-size figures, medians for the rest.
postcode_combined = (
    sa2_combined
    .groupBy("postcode")
    .agg(
        F.avg("sa2_population").alias("consumer_postcode_estimated_population"),
        F.median("sa2_median_age").alias("consumer_postcode_median_age"),
        F.median("sa2_median_mortgage_repay_monthly").alias("consumer_postcode_median_mortgage_repay_monthly"),
        F.median("sa2_median_tot_prsnl_inc_weekly").alias("consumer_postcode_median_totl_prsnal_inc_weekly"),
        F.median("sa2_median_rent_weekly").alias("consumer_postcode_median_rent_weekly"),
        F.median("sa2_median_tot_fam_inc_weekly").alias("consumer_postcode_median_tot_fam_inc_weekly"),
        F.avg("sa2_average_num_psns_per_bedroom").alias("consumer_postcode_avg_num_psns_per_bedroom"),
        F.median("sa2_median_tot_hhd_inc_weekly").alias("consumer_postcode_median_tot_hhd_inc_weekly"),
        F.avg("sa2_average_household_size").alias("consumer_postcode_avg_household_size"),
    )
    .withColumnRenamed('postcode', 'consumer_postcode')
)

# Round every column to 2 decimal places.
# NOTE(review): this also passes the consumer_postcode key through F.round;
# confirm that is intended (kept as-is to mirror the ETL).
postcode_combined = postcode_combined.select(
    *(F.round(name, 2).alias(name) for name in postcode_combined.columns)
)

# The estimated population is then rounded again, to a whole number
postcode_combined = postcode_combined.withColumn(
    "consumer_postcode_estimated_population",
    F.round(F.col("consumer_postcode_estimated_population")),
)

Now combine with rest of data

In [47]:
# Combine everything together: attach the per-postcode statistics to every
# transaction row. postcode_combined already exposes its key as
# consumer_postcode, so no further rename is needed before joining.
# The inner join drops rows whose consumer_postcode has no match in postcode_combined.
all_combined = combined_data_all.join(postcode_combined, on='consumer_postcode', how='inner')

Checking for null values after joining all data

In [48]:
# Show the per-column null counts for the fully combined dataset
count_null_values(all_combined)



+-----------------+------------+--------------+-------+-----------+-------------+--------------+---------------+------------+--------+----------------------------+-------------+-----+-------------+-----------+----------------------------+--------------------------------------+----------------------------+-----------------------------------------------+-----------------------------------------------+------------------------------------+-------------------------------------------+------------------------------------------+-------------------------------------------+------------------------------------+
|consumer_postcode|merchant_abn|order_datetime|user_id|consumer_id|consumer_name|consumer_state|consumer_gender|dollar_value|order_id|consumer_fraud_probability_%|merchant_name|words|revenue_level|take_rate_%|merchant_fraud_probability_%|consumer_postcode_estimated_population|consumer_postcode_median_age|consumer_postcode_median_mortgage_repay_monthly|consumer_postcode_median_totl_prsnal_i

                                                                                

Checking for missing rows after joining all data

In [49]:
# Row count after the final join, to compare against combined_data_all
print("all_combined:")
count_rows_and_print(all_combined)

all_combined:


[Stage 244:>                                                      (0 + 12) / 13]

Number of rows in the DataFrame: 11372905


                                                                                

11372905

No new columns contain null values. The only columns with nulls are still consumer_fraud_probability_% and merchant_fraud_probability_%: consumer_fraud_probability_% has 11,312,983 null values and merchant_fraud_probability_% has 11,369,563 null values.

There is a decrease in the size of the data after this join. The number of rows before the join is 13,614,854, whereas after the join there are 11,372,905 rows. This means that 2,241,949 rows are dropped by this join.