In [1]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
import pyspark.sql.types as T

In [2]:
# Reuse an already-running Spark session if there is one, otherwise start one.
session_builder = SparkSession.builder
spark = session_builder.getOrCreate()

In [3]:
# Show the session object as this cell's output.
spark

In [4]:
# Declare the schema explicitly instead of passing inferSchema=True: schema
# inference makes Spark scan the whole CSV an extra time just to guess types.
# Column names, types and nullability mirror what printSchema() reports for
# this file, so downstream cells see the same DataFrame layout.
financial_schema = T.StructType(
    [
        T.StructField("step", T.IntegerType(), True),
        T.StructField("type", T.StringType(), True),
        T.StructField("amount", T.DoubleType(), True),
        T.StructField("nameOrig", T.StringType(), True),
        T.StructField("oldbalanceOrg", T.DoubleType(), True),
        T.StructField("newbalanceOrig", T.DoubleType(), True),
        T.StructField("nameDest", T.StringType(), True),
        T.StructField("oldbalanceDest", T.DoubleType(), True),
        T.StructField("newbalanceDest", T.DoubleType(), True),
        T.StructField("isFraud", T.IntegerType(), True),
        T.StructField("isFlaggedFraud", T.IntegerType(), True),
    ]
)
df = spark.read.csv("data/financial_data.csv", schema=financial_schema, header=True)

In [5]:
# Print column names, types and nullability of the loaded data.
df.printSchema()

root
 |-- step: integer (nullable = true)
 |-- type: string (nullable = true)
 |-- amount: double (nullable = true)
 |-- nameOrig: string (nullable = true)
 |-- oldbalanceOrg: double (nullable = true)
 |-- newbalanceOrig: double (nullable = true)
 |-- nameDest: string (nullable = true)
 |-- oldbalanceDest: double (nullable = true)
 |-- newbalanceDest: double (nullable = true)
 |-- isFraud: integer (nullable = true)
 |-- isFlaggedFraud: integer (nullable = true)



In [6]:
# Peek at the first two rows of the raw data.
df.show(n=2)

+----+-------+-------+-----------+-------------+--------------+-----------+--------------+--------------+-------+--------------+
|step|   type| amount|   nameOrig|oldbalanceOrg|newbalanceOrig|   nameDest|oldbalanceDest|newbalanceDest|isFraud|isFlaggedFraud|
+----+-------+-------+-----------+-------------+--------------+-----------+--------------+--------------+-------+--------------+
|   1|PAYMENT|9839.64|C1231006815|     170136.0|     160296.36|M1979787155|           0.0|           0.0|      0|             0|
|   1|PAYMENT|1864.28|C1666544295|      21249.0|      19384.72|M2044282225|           0.0|           0.0|      0|             0|
+----+-------+-------+-----------+-------------+--------------+-----------+--------------+--------------+-------+--------------+
only showing top 2 rows



In [7]:
# Keep only the transaction type, the three numeric columns and the label.
kept_columns = ["type", "amount", "oldbalanceOrg", "newbalanceOrig", "isFraud"]
df = df.select(*kept_columns)

In [8]:
# Confirm the projection: first two rows of the reduced frame.
df.show(n=2)

+-------+-------+-------------+--------------+-------+
|   type| amount|oldbalanceOrg|newbalanceOrig|isFraud|
+-------+-------+-------------+--------------+-------+
|PAYMENT|9839.64|     170136.0|     160296.36|      0|
|PAYMENT|1864.28|      21249.0|      19384.72|      0|
+-------+-------+-------------+--------------+-------+
only showing top 2 rows



## Train/test split

In [9]:
# 70/30 random split; the fixed seed keeps the split repeatable.
split_weights = [0.7, 0.3]
train, test = df.randomSplit(split_weights, seed=7)

In [10]:
# Each count() runs a Spark job over its split, so compute each one once and
# then report both sizes.
split_sizes = {"Train": train.count(), "Test": test.count()}
for split_name, n_records in split_sizes.items():
    print(f"{split_name} set length: {n_records} records")

Train set length: 4451877 records
Test set length: 1910743 records


In [11]:
# First two rows of the training split.
train.show(n=2)

+-------+------+-------------+--------------+-------+
|   type|amount|oldbalanceOrg|newbalanceOrig|isFraud|
+-------+------+-------------+--------------+-------+
|CASH_IN|  1.42|   1270713.41|    1270714.83|      0|
|CASH_IN|  4.35|   4136277.22|    4136281.57|      0|
+-------+------+-------------+--------------+-------+
only showing top 2 rows



## Column data types

In [12]:
# List of (column name, type name) pairs; used below to pick out the
# categorical and numeric feature columns.
train.dtypes

[('type', 'string'),
 ('amount', 'double'),
 ('oldbalanceOrg', 'double'),
 ('newbalanceOrig', 'double'),
 ('isFraud', 'int')]

In [14]:
# Categorical features are the string-typed columns of the training split.
cat_cols = [name for name, dtype in train.dtypes if dtype == "string"]
cat_cols

['type']

In [15]:
# Numeric features are the double-typed columns, never the label column.
# Use the logical `and` here: `&` is the bitwise operator and only gave the
# right answer because both operands happened to be plain bools.
num_cols = [
    name
    for name, data_type in train.dtypes
    if data_type == "double" and name != "isFraud"
]
num_cols

['amount', 'oldbalanceOrg', 'newbalanceOrig']

## One-hot encoding

In [17]:
# How many distinct transaction types appear in the training split?
distinct_types = F.countDistinct("type")
train.agg(distinct_types).show()

+-----------+
|count(type)|
+-----------+
|          5|
+-----------+



In [18]:
# Number of training rows per transaction type.
type_counts = train.groupBy("type").count()
type_counts.show()

+--------+-------+
|    type|  count|
+--------+-------+
|TRANSFER| 372791|
| CASH_IN| 979384|
|CASH_OUT|1565928|
| PAYMENT|1504894|
|   DEBIT|  28880|
+--------+-------+



In [19]:
from pyspark.ml.feature import(
    OneHotEncoder,
    StringIndexer
)

In [21]:
# One StringIndexer per categorical column, writing to "<column>_StringIndexer".
# NOTE(review): handleInvalid="skip" drops rows with invalid labels (e.g. a
# category not seen during fit) instead of raising -- confirm that losing such
# rows at transform time is intended.
string_indexer = [
    StringIndexer(
        inputCol=name,
        outputCol=f"{name}_StringIndexer",
        handleInvalid="skip",
    )
    for name in cat_cols
]

In [22]:
# Display the list of indexer stages (one per categorical column).
string_indexer

[StringIndexer_5b04446eb553]

In [23]:
# A single OneHotEncoder stage covers every indexed categorical column:
# it reads "<column>_StringIndexer" and writes "<column>_OneHotEncoder".
# It is wrapped in a list so it can be appended to the pipeline stages.
indexed_cols = [f"{name}_StringIndexer" for name in cat_cols]
encoded_cols = [f"{name}_OneHotEncoder" for name in cat_cols]
one_hot_encoder = [OneHotEncoder(inputCols=indexed_cols, outputCols=encoded_cols)]

In [24]:
# Display the list holding the single encoder stage.
one_hot_encoder

[OneHotEncoder_4402695e442a]

## Vector assembling

In [26]:
# Vector assembler combines the values of input columns into a single vector
from pyspark.ml.feature import VectorAssembler

In [30]:
# Assembler inputs: numeric columns first, then the one-hot encoded columns.
encoded_cat_cols = [f"{name}_OneHotEncoder" for name in cat_cols]
assembler_input = [*num_cols, *encoded_cat_cols]

In [31]:
# Display the ordered list of columns that will be assembled.
assembler_input

['amount', 'oldbalanceOrg', 'newbalanceOrig', 'type_OneHotEncoder']

In [32]:
# Combine all input columns into one vector column named
# "VectorAssembler_features".
vector_assembler = (
    VectorAssembler()
    .setInputCols(assembler_input)
    .setOutputCol("VectorAssembler_features")
)

In [33]:
# Pipeline stages in execution order: index -> one-hot encode -> assemble.
stages = [*string_indexer, *one_hot_encoder, vector_assembler]

In [34]:
# Display the assembled stage list.
stages

[StringIndexer_5b04446eb553,
 OneHotEncoder_4402695e442a,
 VectorAssembler_8627d4496d9f]

In [36]:
%%time
from pyspark.ml import Pipeline

pipeline = Pipeline().setStages(stages)

model = pipeline.fit(train)

pp_df = model.transform(test)

CPU times: user 15.8 ms, sys: 5.48 ms, total: 21.3 ms
Wall time: 4.17 s


In [37]:
# Show the original columns next to the assembled feature vector.
preview_cols = [
    "type",
    "amount",
    "oldbalanceOrg",
    "newbalanceOrig",
    "VectorAssembler_features",
]
pp_df.select(*preview_cols).show(truncate=False)

+-------+------+-------------+--------------+---------------------------------------------------+
|type   |amount|oldbalanceOrg|newbalanceOrig|VectorAssembler_features                           |
+-------+------+-------------+--------------+---------------------------------------------------+
|CASH_IN|4.58  |94241.0      |94245.58      |[4.58,94241.0,94245.58,0.0,0.0,1.0,0.0]            |
|CASH_IN|5.44  |0.0          |5.44          |(7,[0,2,5],[5.44,5.44,1.0])                        |
|CASH_IN|6.07  |400680.0     |400686.07     |[6.07,400680.0,400686.07,0.0,0.0,1.0,0.0]          |
|CASH_IN|6.76  |11322.0      |11328.76      |[6.76,11322.0,11328.76,0.0,0.0,1.0,0.0]            |
|CASH_IN|8.27  |8428410.94   |8428419.21    |[8.27,8428410.94,8428419.21,0.0,0.0,1.0,0.0]       |
|CASH_IN|8.44  |39384.0      |39392.44      |[8.44,39384.0,39392.44,0.0,0.0,1.0,0.0]            |
|CASH_IN|11.13 |4116439.74   |4116450.87    |[11.13,4116439.74,4116450.87,0.0,0.0,1.0,0.0]      |
|CASH_IN|12.79 |6017

## Logistic regression

In [38]:
from pyspark.ml.classification import LogisticRegression

In [39]:
# Keep only the assembled vector and the target, renamed to "features" and
# "label". `pp_df` comes from the test split, so `data` does as well.
renames = {"VectorAssembler_features": "features", "isFraud": "label"}
data = pp_df.select(*(F.col(old).alias(new) for old, new in renames.items()))

In [40]:
# First five feature/label rows, without truncating the vectors.
data.show(n=5, truncate=False)

+--------------------------------------------+-----+
|features                                    |label|
+--------------------------------------------+-----+
|[4.58,94241.0,94245.58,0.0,0.0,1.0,0.0]     |0    |
|(7,[0,2,5],[5.44,5.44,1.0])                 |0    |
|[6.07,400680.0,400686.07,0.0,0.0,1.0,0.0]   |0    |
|[6.76,11322.0,11328.76,0.0,0.0,1.0,0.0]     |0    |
|[8.27,8428410.94,8428419.21,0.0,0.0,1.0,0.0]|0    |
+--------------------------------------------+-----+
only showing top 5 rows



In [41]:
%%time 
model = LogisticRegression().fit(data)

CPU times: user 23.1 ms, sys: 3.89 ms, total: 26.9 ms
Wall time: 13.8 s


In [43]:
# `model.summary` only describes the rows the model was fitted on. Scoring
# `data` (the features built from the test split) explicitly ties the reported
# AUC to that dataset.
model.evaluate(data).areaUnderROC

0.9928744818975611

In [44]:
# Precision/recall curve computed explicitly on `data` (the features built from
# the test split) rather than read from the fit-time summary.
model.evaluate(data).pr.show()

+------------------+-------------------+
|            recall|          precision|
+------------------+-------------------+
|               0.0| 0.9127789046653144|
|0.3598560575769692| 0.9127789046653144|
|0.4838064774090364| 0.6813063063063063|
|0.5501799280287885| 0.5362431800467654|
| 0.594562175129948| 0.4430870083432658|
|0.6309476209516194|0.38060781476121563|
|0.6613354658136745|0.33508914100486226|
|0.6889244302279088| 0.3009081383164513|
|0.7113154738104758|0.27302025782688766|
|0.7305077968812475|0.25006843690117714|
|0.7477009196321471|0.23097826086956522|
|0.7616953218712516|0.21438217420661715|
|0.7752898840463814| 0.2003927242662257|
|0.7860855657736905|0.18784635964074145|
|0.7968812475009996|0.17706112295664533|
|0.8000799680127949|0.16611323260833472|
|0.8036785285885646|0.15659083826737302|
|0.8096761295481807|0.14861294583883752|
|0.8128748500599761|0.14102386237513873|
|0.8164734106357457| 0.1342890964093121|
+------------------+-------------------+
only showing top