In [2]:
# Obtain the session for this notebook (reuses an already-running one if present).
spark = (
    SparkSession.builder
    .appName("Paysim Notebook")
    .getOrCreate()
)

In [5]:
# Read every CSV under use_cases/paysim/ into one DataFrame.
# The first row of each file is the header; column types are inferred from the data.
df = spark.read.csv(
    "use_cases/paysim/*.csv",
    header=True,
    inferSchema=True,
)

In [7]:
# check schema
# Prints the column names, inferred types and nullability of the loaded DataFrame.
df.printSchema()

root
 |-- step: integer (nullable = true)
 |-- type: string (nullable = true)
 |-- amount: double (nullable = true)
 |-- nameOrig: string (nullable = true)
 |-- oldbalanceOrg: double (nullable = true)
 |-- newbalanceOrig: double (nullable = true)
 |-- nameDest: string (nullable = true)
 |-- oldbalanceDest: double (nullable = true)
 |-- newbalanceDest: double (nullable = true)
 |-- isFraud: integer (nullable = true)
 |-- isFlaggedFraud: integer (nullable = true)



In [8]:
# view basic stats
# Summary statistics (count, mean, stddev, ...) for every column of df.
df.describe().show()

+-------+------------------+--------+-----------------+-----------+------------------+-----------------+-----------+------------------+------------------+--------------------+--------------------+
|summary|              step|    type|           amount|   nameOrig|     oldbalanceOrg|   newbalanceOrig|   nameDest|    oldbalanceDest|    newbalanceDest|             isFraud|      isFlaggedFraud|
+-------+------------------+--------+-----------------+-----------+------------------+-----------------+-----------+------------------+------------------+--------------------+--------------------+
|  count|           6362620| 6362620|          6362620|    6362620|           6362620|          6362620|    6362620|           6362620|           6362620|             6362620|             6362620|
|   mean|243.39724563151657|    null|179861.9035491287|       null| 833883.1040744762|855113.6685785811|       null|1100701.6665196535|1224996.3982019224|0.001290820448180152| 2.51468734577894E-6|
| stddev|142.33

In [9]:
# check fraud by type
# Count transactions per `type`, with one column per isFraud value.
# The pivot values are listed explicitly (isFraud only takes 0 and 1) so Spark
# does not have to run an extra job just to discover the distinct values.
# A null cell means no rows exist for that (type, isFraud) combination.
df.groupBy("type").pivot("isFraud", [0, 1]).count().show()

+--------+-------+----+
|    type|      0|   1|
+--------+-------+----+
|TRANSFER| 528812|4097|
| CASH_IN|1399284|null|
|CASH_OUT|2233384|4116|
| PAYMENT|2151495|null|
|   DEBIT|  41432|null|
+--------+-------+----+



In [10]:
from pyspark.ml import Pipeline
from pyspark.ml.classification import DecisionTreeClassificationModel
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import IndexToString, StringIndexer, VectorIndexer
from pyspark.ml.feature import VectorAssembler

In [11]:
# Fit an indexer on the full dataset that maps each `type` string to a numeric
# category index, written to a new `typeLabel` column.
labelIndexer = StringIndexer(inputCol="type", outputCol="typeLabel").fit(df)

In [13]:
# Pack the model inputs into a single `features` vector column.
# The order here defines the feature indices (0..5) reported by the trained tree.
assembler = VectorAssembler(
    inputCols=[
        "typeLabel",
        "amount",
        "oldbalanceOrg",
        "newbalanceOrig",
        "oldbalanceDest",
        "newbalanceDest",
    ],
    outputCol="features",
)

In [15]:
# Split data: 70% for training, 30% held out for testing.
# A fixed seed makes the split repeatable on re-run (given the same input
# partitioning), so the fitted tree and the reported accuracy do not drift
# between kernel restarts.
trainingData, testData = df.randomSplit([0.7, 0.3], seed=42)

In [16]:
# Decision tree classifier: predicts `isFraud` from the assembled `features` vector.
dt = DecisionTreeClassifier(labelCol="isFraud", featuresCol="features")

In [17]:
# Chain the stages: index `type` -> assemble feature vector -> decision tree.
pipeline = Pipeline(stages=[labelIndexer, assembler, dt])

In [18]:
# generate model
# Fits the pipeline on the training split only; the result is the fitted pipeline model.
model = pipeline.fit(trainingData)

In [19]:
# predict
# Runs the fitted pipeline over the held-out test split.
predictions = model.transform(testData)

In [20]:
# check accuracy
# Fraction of test rows whose `prediction` equals `isFraud`.
# NOTE(review): the describe() output shows mean(isFraud) ~= 0.0013, so a model
# that always predicts 0 would already score ~0.9987 -- accuracy alone says
# little about how well fraud is caught; consider also reporting recall / f1.
evaluator = MulticlassClassificationEvaluator(
    labelCol="isFraud",
    predictionCol="prediction",
    metricName="accuracy",
)
accuracy = evaluator.evaluate(predictions)
# Function-call form prints the same text on Python 2 and also runs on Python 3.
print(accuracy)

0.999588600298


In [32]:
# print model tree
# Stage 2 of the fitted pipeline is the decision tree (after the indexer and the assembler).
# Feature indices in the dump follow the assembler's input order:
#   0=typeLabel, 1=amount, 2=oldbalanceOrg, 3=newbalanceOrig,
#   4=oldbalanceDest, 5=newbalanceDest
treeModel = model.stages[2]
# Single formatted argument: identical output on Python 2, and valid on Python 3.
print("Learned classification tree model: %s" % treeModel.toDebugString)

Learned classification tree model: DecisionTreeClassificationModel (uid=DecisionTreeClassifier_409287d863ebe06731c0) of depth 5 with 45 nodes
  If (feature 1 <= 711870.24)
   If (feature 0 in {1.0,2.0,4.0})
    Predict: 0.0
   Else (feature 0 not in {1.0,2.0,4.0})
    If (feature 5 <= 0.0)
     If (feature 0 in {0.0})
      If (feature 1 <= 1458.95)
       Predict: 0.0
      Else (feature 1 > 1458.95)
       Predict: 0.0
     Else (feature 0 not in {0.0})
      If (feature 4 <= 0.0)
       Predict: 1.0
      Else (feature 4 > 0.0)
       Predict: 0.0
    Else (feature 5 > 0.0)
     If (feature 2 <= 112544.0)
      If (feature 5 <= 114735.72)
       Predict: 0.0
      Else (feature 5 > 114735.72)
       Predict: 0.0
     Else (feature 2 > 112544.0)
      If (feature 3 <= 0.0)
       Predict: 0.0
      Else (feature 3 > 0.0)
       Predict: 0.0
  Else (feature 1 > 711870.24)
   If (feature 2 <= 773479.0)
    If (feature 5 <= 0.0)
     If (feature 2 <= 366833.0)
      Predict: 0.0
     El

In [34]:
# Persist the fitted pipeline model, replacing anything already stored at the target path.
(
    model.write()
    .overwrite()
    .save("use_cases/paysim/pymodel/")
)