In [1]:
# Set up the Spark session.
# SparkSession is imported here because no other cell imports it; without this
# the cell raises NameError on a fresh kernel unless the kernel pre-loads it.
from pyspark.sql import SparkSession

# getOrCreate() reuses an already-running session if there is one.
spark = SparkSession.builder.appName("FraudDetection Notebook").getOrCreate()

In [2]:
import pyspark.sql.functions as F
from __future__ import division

In [3]:
# Read the transaction log from CSV: the header row supplies the column names
# and Spark infers each column's type from the data.
transactions_path = "/user/edureka_524533/Datasets/PS_log.csv"
rawDF = spark.read.csv(transactions_path, header=True, inferSchema=True)

In [4]:
# Print the column names and the types Spark inferred while loading the CSV.
rawDF.printSchema()

root
 |-- step: integer (nullable = true)
 |-- type: string (nullable = true)
 |-- amount: double (nullable = true)
 |-- nameOrig: string (nullable = true)
 |-- oldbalanceOrg: double (nullable = true)
 |-- newbalanceOrig: double (nullable = true)
 |-- nameDest: string (nullable = true)
 |-- oldbalanceDest: double (nullable = true)
 |-- newbalanceDest: double (nullable = true)
 |-- isFraud: integer (nullable = true)
 |-- isFlaggedFraud: integer (nullable = true)



In [None]:
# Check which transaction types fraud has been reported on
#fraudTransDF = rawDF.filter('isFraud==1')
#nonfraudTransDF = rawDF.filter('isFraud==0')

In [None]:
# Only two transaction types, 'TRANSFER' and 'CASH_OUT', have been marked as fraudulent so far
#fraudTransDF.groupBy('type').count().show()

# Data Cleaning

In [None]:
# we know that fraud only occurs in 'TRANSFER's and 'CASH_OUT's. 
# So we assemble only the corresponding data in X for analysis.
# Eliminate columns shown to be irrelevant for analysis in the EDA : nameOrig, nameDest,isFlaggedFraud
#rawDF = rawDF.drop('nameOrig','nameDest','isFlaggedFraud','step')

# Convert categorical data into numeric and account for the amounts
# Create adjustedBalanceOrg & adjustedBalanceDest

In [6]:
# Keep only the two transaction types of interest (cash-outs and transfers).
# `where` is an alias of `filter`; the predicate is evaluated as a SQL expression.
type_predicate = "type=='CASH_OUT' OR type=='TRANSFER'"
df = rawDF.where(type_predicate)

In [None]:
# Binary-encoding of labelled data in 'type'
# convert to binary label
#from pyspark.ml.feature import OneHotEncoder, StringIndexer
#indexer = StringIndexer().setInputCol("type").setOutputCol("typeLabel").fit(df)
#indexed = indexer.transform(df)

In [7]:
# Column-name groups used when deriving the adjusted-balance features.
# Each list is ordered as: new balance, old balance, transaction amount.
# col1 holds the origin-account columns, col2 the destination-account columns.
col1=['newbalanceOrig','oldbalanceOrg','amount']
col2=['newbalanceDest','oldbalanceDest','amount']

In [8]:
# Balance-discrepancy features.
#
# adjustedBalanceOrg = newbalanceOrig + amount - oldbalanceOrg
#
# The earlier implementation added all three columns together
# (new + old + amount), which contradicts the formula above: the old balance
# has to be subtracted, not added.
#
# col1 / col2 are ordered (new balance, old balance, amount); the unpacking
# below relies on that order.
new_org, old_org, amount_org = (df[name] for name in col1)
df = df.withColumn('adjustedBalanceOrg', new_org + amount_org - old_org)

# NOTE(review): the destination-side formula is not written down in this
# notebook. The mirrored form is assumed here (the destination is credited, so
# old + amount - new) -- confirm against the intended feature definition.
# Anything that scores data with the saved model must compute both columns
# the same way.
new_dest, old_dest, amount_dest = (df[name] for name in col2)
df = df.withColumn('adjustedBalanceDest', old_dest + amount_dest - new_dest)

In [16]:
# Print the schema again to check that the two adjusted-balance columns were added.
df.printSchema()

root
 |-- step: integer (nullable = true)
 |-- type: string (nullable = true)
 |-- amount: double (nullable = true)
 |-- nameOrig: string (nullable = true)
 |-- oldbalanceOrg: double (nullable = true)
 |-- newbalanceOrig: double (nullable = true)
 |-- nameDest: string (nullable = true)
 |-- oldbalanceDest: double (nullable = true)
 |-- newbalanceDest: double (nullable = true)
 |-- isFraud: integer (nullable = true)
 |-- isFlaggedFraud: integer (nullable = true)
 |-- adjustedBalanceOrg: double (nullable = true)
 |-- adjustedBalanceDest: double (nullable = true)



In [None]:
#indexed = indexed.drop('type')

# Split the data

In [9]:
# Randomly partition the filtered transactions into a 70% training set and a
# 30% test set, using a fixed seed.
from pyspark.ml.tuning import ParamGridBuilder, TrainValidationSplit
split_weights = [0.7, 0.3]
training, test = df.randomSplit(split_weights, seed=12345)

In [11]:
from pyspark.ml import Pipeline
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.feature import StringIndexer, VectorIndexer, OneHotEncoder, VectorAssembler, IndexToString
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [None]:
# Map each transaction-type string to a numeric index in "typeLabel".
# The indexer is fitted on df, so `indexer` is already a fitted model.
indexer = StringIndexer(inputCol="type", outputCol="typeLabel").fit(df)
indexed = indexer.transform(df)
# One-hot encode the numeric type index into the "typeVec" column.
typeEncoder = OneHotEncoder(
    inputCol="typeLabel",
    outputCol="typeVec",
)

In [None]:
# Combine the amount, the two adjusted-balance columns and the encoded
# transaction type into a single "features" vector column.
feature_columns = ["amount", "adjustedBalanceOrg", "adjustedBalanceDest", "typeVec"]
assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")

In [None]:
# Random forest that predicts the fraud flag.
# The label has to be "isFraud". It was previously "typeLabel", which trained
# the forest to predict the transaction type -- a value that is also fed in as
# a feature through "typeVec" -- while the evaluation further down scores the
# predictions against "isFraud".
rf = RandomForestClassifier(labelCol="isFraud", featuresCol="features")

In [None]:
# Stages run in list order: index the type, one-hot encode it, assemble the
# feature vector, then classify.
pipeline_stages = [indexer, typeEncoder, assembler, rf]
pipeline = Pipeline(stages=pipeline_stages)

In [None]:
# Train the model on the training split. The pipeline applies its stages in
# order, so the type indexing, encoding and feature assembly run here as well.
model = pipeline.fit(training)

In [None]:
# Predictions: apply the fitted pipeline to the held-out test split.
# The result carries a `prediction` column, which the following cells select.
predictions = model.transform(test)

In [None]:
# Show the true fraud flag next to the model's prediction for the first rows.
predictions.select('isFraud','prediction').show()

In [None]:
# Print every column present after the pipeline has transformed the test split.
predictions.printSchema()

In [None]:
# Reduce the predictions to (true label, predicted label) and score accuracy.
# The integer isFraud flag is cast to a floating-point column for the evaluator.
true_label = F.col("isFraud").cast("Float")
predictions = predictions.select(true_label, F.col("prediction"))
evaluator = MulticlassClassificationEvaluator(
    metricName="accuracy",
    labelCol="isFraud",
    predictionCol="prediction",
)
accuracy = evaluator.evaluate(predictions)

In [None]:
# Report the accuracy and its complement (the test error).
test_error = 1.0 - accuracy
print("Accuracy = %g" % accuracy)
print("Test Error = %g" % test_error)

In [15]:
# Build a fresh pipeline, fit it on the full filtered dataset (df) and persist
# the fitted pipeline to disk.
from pyspark.ml import Pipeline, PipelineModel

# Index the transaction type, then one-hot encode the index.
indexer = StringIndexer().setInputCol("type").setOutputCol("typeLabel")
typeEncoder = OneHotEncoder(inputCol="typeLabel", outputCol="typeVec")

# Assemble the numeric inputs and the encoded type into one feature vector.
assembler = VectorAssembler(inputCols=["amount","adjustedBalanceOrg","adjustedBalanceDest","typeVec"],outputCol="features")

# The classifier must learn the fraud flag. The label was previously
# "typeLabel", which made the saved model predict the transaction type
# (already present in the features via "typeVec") instead of fraud.
rf = RandomForestClassifier(labelCol="isFraud", featuresCol="features")

pipeline = Pipeline(stages=[indexer, typeEncoder,assembler, rf])
model = pipeline.fit(df)

# overwrite() replaces any model previously saved at this path.
model.write().overwrite().save("use_cases/fraud_model1.1")