In [205]:
import pandas as pd
import matplotlib.pyplot as plt
from pyspark.sql import SparkSession, functions as F
from collections import Counter


# Spark options for this notebook's session, applied in the order listed
spark_options = {
    "spark.sql.repl.eagerEval.enabled": True,
    "spark.sql.parquet.cacheMetadata": "true",
    "spark.sql.session.timeZone": "Etc/UTC",
    'spark.driver.memory': '4g',
    'spark.executor.memory': '2g',
}

# Build (or reuse) the "ETL" spark session, which will run the spark jobs
session_builder = SparkSession.builder.appName("ETL")
for option_name, option_value in spark_options.items():
    session_builder = session_builder.config(option_name, option_value)

spark = session_builder.getOrCreate()

## Load in datasets

In [206]:
# 'transactions.parquet' snapshot folders, listed from the earliest date range
# to the latest
transaction_paths = (
    '../data/tables/transactions_20210228_20210827_snapshot',
    '../data/tables/transactions_20210828_20220227_snapshot',
    '../data/tables/transactions_20220228_20220828_snapshot',
)

# Read each snapshot and print its schema straight after loading it
transaction_frames = []
for path in transaction_paths:
    snapshot = spark.read.parquet(path)
    snapshot.printSchema()
    transaction_frames.append(snapshot)

# Unpack in the same order as transaction_paths
(
    transactions_21_02_21_08,
    transactions_21_08_22_02,
    transactions_22_02_22_08,
) = transaction_frames

# Preview the first snapshot
transactions_21_02_21_08.limit(10)


root
 |-- user_id: long (nullable = true)
 |-- merchant_abn: long (nullable = true)
 |-- dollar_value: double (nullable = true)
 |-- order_id: string (nullable = true)
 |-- order_datetime: date (nullable = true)

root
 |-- user_id: long (nullable = true)
 |-- merchant_abn: long (nullable = true)
 |-- dollar_value: double (nullable = true)
 |-- order_id: string (nullable = true)
 |-- order_datetime: date (nullable = true)

root
 |-- user_id: long (nullable = true)
 |-- merchant_abn: long (nullable = true)
 |-- dollar_value: double (nullable = true)
 |-- order_id: string (nullable = true)
 |-- order_datetime: date (nullable = true)



user_id,merchant_abn,dollar_value,order_id,order_datetime
18478,62191208634,63.255848959735246,949a63c8-29f7-4ab...,2021-08-20
2,15549624934,130.3505283105634,6a84c3cf-612a-457...,2021-08-20
18479,64403598239,120.15860593212784,b10dcc33-e53f-425...,2021-08-20
3,60956456424,136.6785200286976,0f09c5a5-784e-447...,2021-08-20
18479,94493496784,72.96316578355305,f6c78c1a-4600-4c5...,2021-08-20
3,76819856970,448.529684285612,5ace6a24-cdf0-4aa...,2021-08-20
18479,67609108741,86.4040605836911,d0e180f0-cb06-42a...,2021-08-20
3,34096466752,301.5793450525113,6fb1ff48-24bb-4f9...,2021-08-20
18482,70501974849,68.75486276223054,8505fb33-b69a-412...,2021-08-20
4,49891706470,48.89796461900801,ed11e477-b09f-4ae...,2021-08-20


In [207]:
# 'consumer_fraud_probability.csv' file; the first row supplies the column names
path = '../data/tables/consumer_fraud_probability.csv'
cust_fp = spark.read.options(header=True).csv(path)

cust_fp.printSchema()
# Other quick checks, kept for reference:
# cust_fp.dtypes
# cust_fp.count()
cust_fp.limit(10)

root
 |-- user_id: string (nullable = true)
 |-- order_datetime: string (nullable = true)
 |-- fraud_probability: string (nullable = true)



user_id,order_datetime,fraud_probability
6228,2021-12-19,97.6298077657765
21419,2021-12-10,99.24738020302328
5606,2021-10-17,84.05825045251777
3101,2021-04-17,91.42192091901347
22239,2021-10-19,94.70342477508036
16556,2022-02-20,89.65663294494827
10278,2021-09-28,83.59136689427714
15790,2021-12-30,71.77065889280253
5233,2021-08-29,85.87123303878818
230,2021-08-28,86.28328808934151


In [208]:
# 'consumer_user_details.parquet' file (user_id <-> consumer_id lookup)
path = '../data/tables/consumer_user_details.parquet'
cust_user_det = spark.read.format('parquet').load(path)

cust_user_det.printSchema()
# Other quick check, kept for reference:
# cust_user_det.count()
cust_user_det.limit(10)

root
 |-- user_id: long (nullable = true)
 |-- consumer_id: long (nullable = true)



user_id,consumer_id
1,1195503
2,179208
3,1194530
4,154128
5,712975
6,407340
7,511685
8,448088
9,650435
10,1058499


In [209]:
# 'merchant_fraud_probability.csv' file; the first row supplies the column names
path = '../data/tables/merchant_fraud_probability.csv'
merch_fp = spark.read.options(header=True).csv(path)

merch_fp.printSchema()
# Other quick checks, kept for reference:
# merch_fp.dtypes
# merch_fp.count()
merch_fp.limit(10)

root
 |-- merchant_abn: string (nullable = true)
 |-- order_datetime: string (nullable = true)
 |-- fraud_probability: string (nullable = true)



merchant_abn,order_datetime,fraud_probability
19492220327,2021-11-28,44.40365864749536
31334588839,2021-10-02,42.75530083865367
19492220327,2021-12-22,38.867790051131095
82999039227,2021-12-19,94.1347004808891
90918180829,2021-09-02,43.32551731714902
31334588839,2021-12-26,38.36165958070444
23686790459,2021-12-10,79.4543441508535
14827550074,2021-11-26,46.45775596795885
31334588839,2021-11-26,36.20971272078342
19492220327,2021-12-18,33.819672154331755


In [210]:
# 'tbl_consumer.csv' consumer data file: pipe-delimited, with a header row
path = '../data/tables/tbl_consumer.csv'
cust_tbl = spark.read.options(sep='|', header=True).csv(path)

cust_tbl.printSchema()
# Other quick checks, kept for reference:
# cust_tbl.dtypes
# cust_tbl.count()
cust_tbl.limit(10)

root
 |-- name: string (nullable = true)
 |-- address: string (nullable = true)
 |-- state: string (nullable = true)
 |-- postcode: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- consumer_id: string (nullable = true)



name,address,state,postcode,gender,consumer_id
Yolanda Williams,413 Haney Gardens...,WA,6935,Female,1195503
Mary Smith,3764 Amber Oval,NSW,2782,Female,179208
Jill Jones MD,40693 Henry Greens,NT,862,Female,1194530
Lindsay Jimenez,00653 Davenport C...,NSW,2780,Female,154128
Rebecca Blanchard,9271 Michael Mano...,WA,6355,Female,712975
Karen Chapman,2706 Stewart Oval...,NSW,2033,Female,407340
Andrea Jones,122 Brandon Cliff,QLD,4606,Female,511685
Stephen Williams,6804 Wright Crest...,WA,6056,Male,448088
Stephanie Reyes,5813 Denise Land ...,NSW,2482,Female,650435
Jillian Gonzales,461 Ryan Common S...,VIC,3220,Female,1058499


In [211]:
# 'tbl_merchants.parquet' merchants data file
path = '../data/tables/tbl_merchants.parquet'
merch_tbl = spark.read.format('parquet').load(path)

merch_tbl.printSchema()
merch_tbl.limit(10)

root
 |-- name: string (nullable = true)
 |-- tags: string (nullable = true)
 |-- merchant_abn: long (nullable = true)



name,tags,merchant_abn
Felis Limited,"((furniture, home...",10023283211
Arcu Ac Orci Corp...,"([cable, satellit...",10142254217
Nunc Sed Company,"([jewelry, watch,...",10165489824
Ultricies Digniss...,"([wAtch, clock, a...",10187291046
Enim Condimentum PC,([music shops - m...,10192359162
Fusce Company,"[(gift, card, nov...",10206519221
Aliquam Enim Inco...,"[(computers, comP...",10255988167
Ipsum Primis Ltd,"[[watch, clock, a...",10264435225
Pede Ultrices Ind...,([computer progra...,10279061213
Nunc Inc.,"[(furniture, home...",10323485998


## 1st Step: Remove useless columns, data type conversions, rename columns where necessary

### Consumer Data

In [212]:
# The CSV reader loaded every column as a string: cast each one to its proper
# type and give the probability column a consumer-specific name.
cust_fp = cust_fp.select(
    F.col('user_id').cast('long').alias('user_id'),
    F.col('order_datetime').cast('date').alias('order_datetime'),
    F.col('fraud_probability').cast('double').alias('consumer_fraud_probability_%'),
)

cust_fp.printSchema()


root
 |-- user_id: long (nullable = true)
 |-- order_datetime: date (nullable = true)
 |-- consumer_fraud_probability_%: double (nullable = true)



In [213]:
# No casts or renames are applied to the user details table in this step;
# its schema is printed for reference only.
cust_user_det.printSchema()

root
 |-- user_id: long (nullable = true)
 |-- consumer_id: long (nullable = true)



In [214]:
# May have to make postcodes long depending on the external dataset(s)

# Prefix the descriptive columns with 'consumer_'
consumer_renames = {
    'name': 'consumer_name',
    'state': 'consumer_state',
    'postcode': 'consumer_postcode',
    'gender': 'consumer_gender',
}
for old_name, new_name in consumer_renames.items():
    cust_tbl = cust_tbl.withColumnRenamed(old_name, new_name)

# consumer_id was read from the CSV as a string; the address column is not kept
cust_tbl = cust_tbl.withColumn('consumer_id', F.col('consumer_id').cast('long')).drop('address')

cust_tbl.printSchema()

root
 |-- consumer_name: string (nullable = true)
 |-- consumer_state: string (nullable = true)
 |-- consumer_postcode: string (nullable = true)
 |-- consumer_gender: string (nullable = true)
 |-- consumer_id: long (nullable = true)



### Merchant Data

In [215]:
# The CSV reader loaded every column as a string: cast each one to its proper
# type and give the probability column a merchant-specific name.
merch_fp = merch_fp.select(
    F.col('merchant_abn').cast('long').alias('merchant_abn'),
    F.col('order_datetime').cast('date').alias('order_datetime'),
    F.col('fraud_probability').cast('double').alias('merchant_fraud_probability_%'),
)

merch_fp.printSchema()

root
 |-- merchant_abn: long (nullable = true)
 |-- order_datetime: date (nullable = true)
 |-- merchant_fraud_probability_%: double (nullable = true)



In [216]:
# Rename the generic 'name' column to 'merchant_name'
merch_tbl = merch_tbl.withColumnRenamed('name', 'merchant_name')

merch_tbl.printSchema()

root
 |-- merchant_name: string (nullable = true)
 |-- tags: string (nullable = true)
 |-- merchant_abn: long (nullable = true)



### Transactions

In [217]:
# order_id should only be needed for checking for duplicate orders, so it is
# removed from every transactions snapshot
transactions_21_02_21_08, transactions_21_08_22_02, transactions_22_02_22_08 = (
    snapshot.drop('order_id')
    for snapshot in (
        transactions_21_02_21_08,
        transactions_21_08_22_02,
        transactions_22_02_22_08,
    )
)

transactions_21_02_21_08.printSchema()

root
 |-- user_id: long (nullable = true)
 |-- merchant_abn: long (nullable = true)
 |-- dollar_value: double (nullable = true)
 |-- order_datetime: date (nullable = true)



### External Data

## 2nd Step: Cleaning, filtering, feature engineering, aggregation

### Consumer Data

In [218]:
# Attach a consumer_id to every consumer fraud record, then bring in the
# consumer details for that consumer_id
cust_fp_with_ids = cust_fp.join(cust_user_det, on='user_id', how='inner')
cust_combined = cust_tbl.join(cust_fp_with_ids, on='consumer_id', how='inner')

In [219]:
# May have to add 0s to the start of 3-digit postcodes depending on the external dataset(s)

# check for any null values, duplicates, or invalid entries

### Merchant Data

In [220]:
# Pair each merchant fraud record with that merchant's details.
# NOTE(review): being an inner join, merchants with no row in merch_fp are
# dropped from merch_combined - confirm this is intended.
merch_combined = merch_tbl.join(
    merch_fp,
    on='merchant_abn',
    how='inner',
)

In [222]:
import re
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType, FloatType, ArrayType

# Have to extract features from tags (words, revenue band, take rate)
# https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html#sklearn.feature_extraction.text.TfidfVectorizer
# https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html 

# Define a UDF to extract the sections within square, normal and curly brackets
def split_tags(input_tags):
    """Split a raw merchant ``tags`` string into its innermost bracketed sections.

    Parameters
    ----------
    input_tags : str or None
        Raw tag text, e.g. ``"((a, b), (c), (take rate: 1.0))"``. Any of
        ``[ ( {`` opens a section and any of ``] ) }`` closes it; the opening
        and closing bracket types are not required to match.

    Returns
    -------
    list of str or None
        The text inside each innermost bracket pair, in order of appearance and
        without the brackets (an empty list when nothing is bracketed).
        ``None`` is returned for a ``None`` input so a null tag stays null
        instead of raising a ``TypeError`` inside the UDF.
    """
    if input_tags is None:
        return None

    # The capture group excludes every bracket character, so the matches are
    # already bracket-free and need no further stripping.
    return re.findall(r'[\[\(\{]([^\[\]\(\)\{\}]*)[\]\)\}]', input_tags)

# Define a UDF to clean "take rate" numbers
def clean_take_rate(input_rate):
    """Extract the numeric take rate from text such as ``"take rate: 4.84"``.

    Parameters
    ----------
    input_rate : str or None
        The tag section expected to hold the take rate. It is ``None`` when
        the column it comes from is null (for example when fewer than three
        tag sections were found).

    Returns
    -------
    float or None
        The parsed rate, or ``None`` when the input is ``None``, contains no
        ``take rate: <number>`` text, or the number cannot be parsed.
    """
    if input_rate is None:
        return None

    # Case-insensitive because the letter casing inside the raw tags is
    # inconsistent (e.g. "wAtch", "comPuters").
    match = re.search(r'take rate: ([\d.]+)', input_rate, flags=re.IGNORECASE)
    if match is None:
        return None

    try:
        return float(match.group(1))
    except ValueError:
        # [\d.]+ also accepts malformed digits such as "." or "1.2.3"
        return None

# Wrap the two Python helpers as Spark UDFs with their return types
split_tags_udf = udf(split_tags, ArrayType(StringType()))
clean_take_rate_udf = udf(clean_take_rate, FloatType())

# Split the raw tags into sections, then spread the first three sections over
# their own columns; the take rate section is finally reduced to its number.
merch_combined = (
    merch_combined
    .withColumn("sep_tags", split_tags_udf(F.col("tags")))
    .withColumn("words", F.col("sep_tags").getItem(0))
    .withColumn("revenue_level", F.col("sep_tags").getItem(1))
    .withColumn("take_rate", F.col("sep_tags").getItem(2))
    .withColumn("take_rate", clean_take_rate_udf(F.col("take_rate")))
)



# check for any null values, duplicates, or invalid entries



In [225]:
merch_combined

merchant_abn,merchant_name,tags,order_datetime,merchant_fraud_probability_%,sep_tags,words,revenue_level,take_rate
11149063370,Et Arcu Limited,([art dealers and...,2022-02-25,51.01538421455241,[art dealers and ...,art dealers and g...,b,4.84
11149063370,Et Arcu Limited,([art dealers and...,2021-11-14,52.40780332276477,[art dealers and ...,art dealers and g...,b,4.84
11149063370,Et Arcu Limited,([art dealers and...,2021-08-28,56.43761254995139,[art dealers and ...,art dealers and g...,b,4.84
11470993597,Sed Associates,"((watch, clock, a...",2021-09-28,63.37734364737917,"[watch, clock, an...","watch, clock, and...",d,1.35
11590404675,Arcu Sed PC,((antique shops -...,2021-12-21,29.607818240092094,[antique shops - ...,antique shops - s...,b,4.19
14530561097,Duis At Inc.,"[[jewelry, watch,...",2021-09-15,80.80054474543395,"[jewelry, watch, ...","jewelry, watch, c...",c,1.69
15043504837,Odio Incorporated,"([jewelry, watch,...",2021-10-08,25.054391991473924,"[jewelry, watch, ...","jewelry, watch, c...",b,4.62
15043504837,Odio Incorporated,"([jewelry, watch,...",2021-12-14,26.12523097610844,"[jewelry, watch, ...","jewelry, watch, c...",b,4.62
15043504837,Odio Incorporated,"([jewelry, watch,...",2021-08-29,59.77648897297805,"[jewelry, watch, ...","jewelry, watch, c...",b,4.62
15157368385,Tempus Non Lacini...,[(artist supply a...,2021-12-13,64.2774131928303,[artist supply an...,artist supply and...,b,3.98


### Transactions

In [None]:


# NOTE(review): order_id is already dropped from all three snapshots in the
# 1st-step Transactions cell, so these drops leave the frames unchanged; this
# cell looks like a leftover copy of that one - confirm and remove.
transactions_21_02_21_08, transactions_21_08_22_02, transactions_22_02_22_08 = (
    snapshot.drop('order_id')
    for snapshot in (
        transactions_21_02_21_08,
        transactions_21_08_22_02,
        transactions_22_02_22_08,
    )
)

transactions_21_02_21_08.printSchema()

root
 |-- user_id: long (nullable = true)
 |-- merchant_abn: long (nullable = true)
 |-- dollar_value: double (nullable = true)
 |-- order_datetime: date (nullable = true)



### External Data