In [1]:
#packages
from pyspark.sql import SparkSession
import pyspark.sql.functions as f
from pyspark.sql.functions import col
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler, Imputer
import math as m
from pyspark.ml.stat import Correlation
import numpy as np
import pandas as pd

In [2]:
# Build (or reuse) the Spark session that runs every Spark job in this notebook.
_spark_options = {
    # render DataFrames directly when they are the last expression of a cell
    "spark.sql.repl.eagerEval.enabled": True,
    "spark.sql.parquet.cacheMetadata": "true",
    "spark.sql.session.timeZone": "Etc/UTC",
}

_builder = SparkSession.builder.appName("Project 1")
for _option, _setting in _spark_options.items():
    _builder = _builder.config(_option, _setting)

spark = _builder.getOrCreate()

Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
25/09/08 12:39:37 WARN Utils: Your hostname, Lachys-Laptop, resolves to a loopback address: 127.0.1.1; using 10.255.255.254 instead (on interface lo)
25/09/08 12:39:37 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/09/08 12:39:39 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
# Merchant / consumer lookup tables stored as parquet.
tbl_merchants = spark.read.parquet('data/tables/merchant_data/tbl_merchants.parquet')
consumer_user_details = spark.read.parquet('data/tables/merchant_data/consumer_user_details.parquet')

# CSV tables, read with Spark's default CSV options.
# NOTE(review): with the defaults no header row is used and columns are named
# `_c0`, `_c1`, ...; confirm whether `header=True` and an explicit separator
# are needed for these files.
con_fraud_prob = spark.read.csv('data/tables/merchant_data/consumer_fraud_probability.csv')
merch_fraud_prob = spark.read.csv('data/tables/merchant_data/merchant_fraud_probability.csv')
tbl_con = spark.read.csv('data/tables/merchant_data/tbl_consumer.csv')

# Three consecutive transaction snapshots.
transactions21 = spark.read.parquet('data/tables/transaction_data/transactions_20210228_20210827_snapshot/')
transactions2122 = spark.read.parquet('data/tables/transaction_data/transactions_20210828_20220227_snapshot/')
transactions22 = spark.read.parquet('data/tables/transaction_data/transactions_20220228_20220828_snapshot/')

# Stack the snapshots into one table, matching columns by name.
transactions = transactions21.unionByName(transactions2122).unionByName(transactions22)

                                                                                

In [9]:
#Functions
def OHE_variables(data, cat_nom_columns, cat_ord_columns):
    """Index and encode categorical features of a Spark DataFrame.

    Nominal columns are string-indexed and then one-hot encoded. The encoded
    vector is stored in ``<column>_OHE``; the source column and the
    intermediate ``<column>_index`` column are dropped.

    Ordinal columns are string-indexed into ``<column>_index`` and the source
    column is dropped.

    Args:
        data: input Spark DataFrame.
        cat_nom_columns: iterable of nominal categorical column names.
        cat_ord_columns: iterable of ordinal categorical column names.

    Returns:
        A new DataFrame carrying the encoded columns. The input DataFrame is
        not modified.
    """
    for c in cat_nom_columns:
        index_col = str(c) + '_index'
        ohe_col = str(c) + '_OHE'

        indexer = StringIndexer(inputCol=c, outputCol=index_col)
        indexed_df = indexer.fit(data).transform(data)

        encoder = OneHotEncoder(inputCol=index_col, outputCol=ohe_col)
        encoded_df = encoder.fit(indexed_df).transform(indexed_df)

        # drop() returns a new DataFrame, so the result has to be rebound to
        # `data`; otherwise the encoded columns are lost and the input is
        # returned unchanged.
        data = encoded_df.drop(c, index_col)

    for c in cat_ord_columns:
        indexer = StringIndexer(inputCol=c, outputCol=str(c) + '_index')
        # NOTE(review): StringIndexer assigns indices by descending label
        # frequency by default, not by any domain ordering - confirm this is
        # acceptable for the ordinal features passed in.
        data = indexer.fit(data).transform(data).drop(c)

    return data

def find_NULL(dfs):
    """Show, for each DataFrame in ``dfs``, the rows that contain a NULL.

    A row is shown when at least one of its columns is NULL. The matching
    rows are printed with ``show()``; nothing is collected or returned
    beyond the constant ``0``.

    Args:
        dfs: iterable of Spark DataFrames to inspect.

    Returns:
        Always ``0``.
    """
    for frame in dfs:
        # Start from a literal False and OR in one isNull() test per column.
        has_null = f.lit(False)
        for name in frame.columns:
            has_null = has_null | f.col(name).isNull()

        null_rows = frame.filter(has_null)
        null_rows.show()

    return 0

def filter_outliers(data, variables):
    """Filter outliers of continuous columns using a scaled IQR fence.

    For every column in ``variables`` the approximate first and third
    quartiles are computed (relative error 0.01) and only rows inside
    ``[max(q1 - scale * iqr, 0), q3 + scale * iqr]`` are kept. The filters are
    applied one after another, so the quartiles of a later column are computed
    on the rows that survived the earlier columns.

    ``scale`` is ``sqrt(ln(n)) - 0.5`` with a floor of 3, where ``n`` is the
    row count of the input taken once before any filtering.

    Rows whose value is NULL in a filtered column are removed as well, because
    the range comparison does not evaluate to true for NULL.

    Args:
        data: input Spark DataFrame.
        variables: iterable of numeric column names to filter on.

    Returns:
        The filtered DataFrame. An empty input is returned unchanged, and a
        column with no non-NULL values is skipped.
    """
    n = data.count()
    if n == 0:
        # log(0) is undefined and there is nothing to filter anyway.
        return data

    # from ADS lecture slides, n>>100; never use a fence tighter than 3 * IQR
    scale = max(m.sqrt(m.log(n)) - 0.5, 3)

    for feature in variables:
        # Calculate Q1 and Q3
        quantiles = data.approxQuantile(feature, [0.25, 0.75], 0.01)
        if len(quantiles) < 2:
            # approxQuantile returns an empty list when the column holds only
            # NULLs; no fence can be computed, so leave the data as is.
            continue

        q1, q3 = quantiles
        iqr = q3 - q1

        # The lower fence is clamped at zero: negative bounds are replaced by 0.
        lower_bound = max(q1 - scale * iqr, 0)
        upper_bound = q3 + scale * iqr

        data = data.filter((col(feature) >= lower_bound) & (col(feature) <= upper_bound))

    return data

def corr_func(data, CORR_COLS):
    """Return the correlation matrix between the given columns.

    The columns are assembled into a single vector column and passed to
    ``Correlation.corr`` with its default method.

    Args:
        data: input Spark DataFrame.
        CORR_COLS: list of column names to correlate, in matrix order.

    Returns:
        The correlation matrix as a nested Python list (list of rows).
    """
    features = "correlation_features"

    assembler = VectorAssembler(
        inputCols=CORR_COLS,
        outputCol=features,
    )
    feature_vector = assembler.transform(data).select(features)

    # Correlation.corr yields a one-row DataFrame whose only cell is the
    # matrix; collect it once (the earlier extra collect() was discarded).
    corr_rows = Correlation.corr(feature_vector, features).collect()
    corr_matrix = corr_rows[0][0].toArray().tolist()

    return corr_matrix

In [5]:
# Print the raw `tags` strings untruncated. The sample output shows mixed
# bracket styles ("((...))", "([...])", "[(...)]") and inconsistent letter case.
tbl_merchants.select('tags').show(truncate=False)

+-----------------------------------------------------------------------------------------------------------------+
|tags                                                                                                             |
+-----------------------------------------------------------------------------------------------------------------+
|((furniture, home furnishings and equipment shops, and manufacturers, except appliances), (e), (take rate: 0.18))|
|([cable, satellite, and otHer pay television and radio services], [b], [take rate: 4.22])                        |
|([jewelry, watch, clock, and silverware shops], [b], [take rate: 4.40])                                          |
|([wAtch, clock, and jewelry repair shops], [b], [take rate: 3.29])                                               |
|([music shops - musical instruments, pianos, and sheet music], [a], [take rate: 6.33])                           |
|[(gift, card, novelty, and souvenir shops), (a), (take rate: 6.34)]    

In [7]:
# Preview the merchant table (rendered through the eager-evaluation setting
# enabled on the Spark session).
tbl_merchants

name,tags,merchant_abn
Felis Limited,"((furniture, home...",10023283211
Arcu Ac Orci Corp...,"([cable, satellit...",10142254217
Nunc Sed Company,"([jewelry, watch,...",10165489824
Ultricies Digniss...,"([wAtch, clock, a...",10187291046
Enim Condimentum PC,([music shops - m...,10192359162
Fusce Company,"[(gift, card, nov...",10206519221
Aliquam Enim Inco...,"[(computers, comP...",10255988167
Ipsum Primis Ltd,"[[watch, clock, a...",10264435225
Pede Ultrices Ind...,([computer progra...,10279061213
Nunc Inc.,"[(furniture, home...",10323485998


In [6]:
# Preview the combined transactions table (union of the three snapshots).
transactions

user_id,merchant_abn,dollar_value,order_id,order_datetime
18478,62191208634,63.255848959735246,949a63c8-29f7-4ab...,2021-08-20
2,15549624934,130.3505283105634,6a84c3cf-612a-457...,2021-08-20
18479,64403598239,120.15860593212784,b10dcc33-e53f-425...,2021-08-20
3,60956456424,136.6785200286976,0f09c5a5-784e-447...,2021-08-20
18479,94493496784,72.96316578355305,f6c78c1a-4600-4c5...,2021-08-20
3,76819856970,448.529684285612,5ace6a24-cdf0-4aa...,2021-08-20
18479,67609108741,86.4040605836911,d0e180f0-cb06-42a...,2021-08-20
3,34096466752,301.5793450525113,6fb1ff48-24bb-4f9...,2021-08-20
18482,70501974849,68.75486276223054,8505fb33-b69a-412...,2021-08-20
4,49891706470,48.89796461900801,ed11e477-b09f-4ae...,2021-08-20


In [11]:
# Show rows containing a NULL in any column for each table; an empty table in
# the output means no such rows were found.
# NOTE(review): tbl_con is displayed with a single `_c0` column, so the CSV
# does not appear to have been split into named columns - confirm the reader
# options used when loading it.
find_NULL([transactions, tbl_merchants, tbl_con, consumer_user_details])

                                                                                

+-------+------------+------------+--------+--------------+
|user_id|merchant_abn|dollar_value|order_id|order_datetime|
+-------+------------+------------+--------+--------------+
+-------+------------+------------+--------+--------------+

+----+----+------------+
|name|tags|merchant_abn|
+----+----+------------+
+----+----+------------+

+---+
|_c0|
+---+
+---+

+-------+-----------+
|user_id|consumer_id|
+-------+-----------+
+-------+-----------+



0

In [13]:
# Preview the snapshot loaded from transactions_20210828_20220227_snapshot.
transactions2122

user_id,merchant_abn,dollar_value,order_id,order_datetime
14935,79417999332,136.06570809815838,23acbb7b-cf98-458...,2021-11-26
1,46451548968,72.61581642788431,76bab304-fa2d-400...,2021-11-26
14936,89518629617,3.0783487174439297,a2ae446a-2959-41c...,2021-11-26
1,49167531725,51.58228625503599,7080c274-17f7-4cc...,2021-11-26
14936,31101120643,25.2281149424178,8e301c0f-06ab-45c...,2021-11-26
2,67978471888,691.5028234458998,0380e9ad-b0e8-420...,2021-11-26
14936,60956456424,102.13952056640888,5ac3da9c-5147-452...,2021-11-26
2,47644196714,644.5220654863093,4e368e44-86f8-4de...,2021-11-26
14938,39649557865,209.12780951421405,4d78cd01-4bab-494...,2021-11-26
3,88402174457,141.0387993699113,c50c957d-ecfc-430...,2021-11-26


In [14]:
# Preview the snapshot loaded from transactions_20220228_20220828_snapshot.
transactions22

user_id,merchant_abn,dollar_value,order_id,order_datetime
11139,96152467973,16.213590228273233,785b0080-9e4b-471...,2022-08-20
1,98973094975,86.97955945703498,2560f7b0-ee5d-4b3...,2022-08-20
11139,56762458844,31.513502323509197,0311717b-8b5b-410...,2022-08-20
1,89502033586,124.18468694868491,f8891626-f098-45b...,2022-08-20
11139,96161808980,61.620445567668966,d90a421f-f1da-4bf...,2022-08-20
2,72472909171,32.26524985312485,523e0403-b677-450...,2022-08-20
11139,91923722701,11.331586767322223,f45a842b-0366-41d...,2022-08-20
3,46380096952,119.80011239189334,58d0f423-037c-43f...,2022-08-20
11140,79283124876,198.13027742225435,60b12d41-41d6-4c1...,2022-08-20
4,67202032418,206.20865323560025,64a05a23-a078-481...,2022-08-20
