### Apply ANN 

There are several different methods to collect data with different methods. Lets try to solve the problem using ANN.

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler
from pyspark.ml import Pipeline
from pyspark.ml.classification import MultilayerPerceptronClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator, BinaryClassificationEvaluator

#### Start a simple Spark Session

In [2]:
spark = SparkSession.builder.appName('fraud_detection_ann').master("local[*]").getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/06/11 23:48:46 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
col_names = ['User', 'Card', 'Year', 'Month', 'Day', 'Time', 'Amount', 'Use Chip', 'Merchant Name', 'Merchant City', 
'Merchant State', 'MCC', 'Errors?', 'Is Fraud?', 'Hour', 'Minute', 'Date', 'Day_of_Week']

df = spark.read.option("delimiter", "|").csv('credit_card_transactions.csv', header=None, inferSchema=True).toDF(*col_names)
df = df.repartition(10)

24/06/11 23:48:58 WARN GarbageCollectionMetrics: To enable non-built-in garbage collector(s) List(G1 Concurrent GC), users should configure it(them) to spark.eventLog.gcMetrics.youngGenerationGarbageCollectors or spark.eventLog.gcMetrics.oldGenerationGarbageCollectors
                                                                                

In [4]:
Fraud = df[df['Is Fraud?']==1]
Normal = df.filter(col('Is Fraud?')==0).sample(withReplacement=True, fraction=1.0, seed=42).limit(300000)


In [5]:
df = Fraud.union(Normal)

In [6]:
# distinct_use_chip = 3 # df.select('Use Chip').distinct().count()
# distinct_day_of_week = 7 # df.select('Day_of_Week').distinct().count()
# input_size = 8 + distinct_use_chip + distinct_day_of_week

In [7]:

input_size = 16

In [8]:
stages = []
categorical_cols = ['Use Chip', 'Day_of_Week']
numerical_cols = ['Card', 'Year', 'Month', 'Day', 'Amount', 'MCC', 'Hour', 'Minute']

# Indexers for categorical columns
indexers = [StringIndexer(inputCol=col, outputCol=col+'_indexed') for col in categorical_cols]
# Encoders for categorical columns
encoders = [OneHotEncoder(inputCol=col+'_indexed', outputCol=col+'_OHE') for col in categorical_cols]

assembler = VectorAssembler(inputCols=(numerical_cols + ['Use Chip_OHE', 'Day_of_Week_OHE']),
                            outputCol='features')



In [9]:
# Define the ANN model with adjusted layers and hyperparameters
# Example: input size, two hidden layers with 5 and 4 neurons, output size 2

layers = [input_size, 5, 4, 2]
ann = MultilayerPerceptronClassifier(labelCol="Is Fraud?", featuresCol="features", maxIter=150, layers=layers, blockSize=128, seed=42)

stages = indexers + encoders + [assembler, ann]

pipeline = Pipeline(stages=stages)

In [10]:
# Split data into training and test sets
train, test = df.randomSplit([0.8, 0.2], seed=42)


In [11]:
model = pipeline.fit(train)

24/06/11 23:51:59 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
24/06/11 23:51:59 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.VectorBLAS

In [12]:
# Make predictions
predictions = model.transform(test)

In [13]:
# Evaluate the model
evaluator = BinaryClassificationEvaluator(labelCol="Is Fraud?", rawPredictionCol="rawPrediction", metricName="areaUnderROC")
roc_auc = evaluator.evaluate(predictions)
roc_auc

                                                                                

0.49979793348306667

In [14]:
evaluator = MulticlassClassificationEvaluator(labelCol="Is Fraud?", predictionCol="prediction")

# Compute metrics
accuracy = evaluator.evaluate(predictions, {evaluator.metricName: "accuracy"})
f1 = evaluator.evaluate(predictions, {evaluator.metricName: "f1"})


evaluator_precision = MulticlassClassificationEvaluator(labelCol="Is Fraud?", predictionCol="prediction", metricName="precisionByLabel")
precision_label_0 = evaluator_precision.evaluate(predictions, {evaluator_precision.metricLabel: 0.0})
precision_label_1 = evaluator_precision.evaluate(predictions, {evaluator_precision.metricLabel: 1.0})

evaluator_recall = MulticlassClassificationEvaluator(labelCol="Is Fraud?", predictionCol="prediction", metricName="recallByLabel")
recall_label_0 = evaluator_recall.evaluate(predictions, {evaluator_recall.metricLabel: 0.0})
recall_label_1 = evaluator_recall.evaluate(predictions, {evaluator_recall.metricLabel: 1.0})


f1 = evaluator.evaluate(predictions, {evaluator.metricName: "f1"})

# Print the metrics
print(f"Accuracy: {accuracy}")
print(f"F1 Score: {f1}")

print(f"Precision label 1: {precision_label_1}")
print(f"Recall label 1: {recall_label_1}")

print(f"Precision label 0: {precision_label_0}")
print(f"Recall label 0: {recall_label_0}")



Accuracy: 0.9116270561153711
F1 Score: 0.8694832883673324
Precision label 1: 0.0
Recall label 1: 0.0
Precision label 0: 0.9116270561153711
Recall label 0: 1.0


                                                                                