In [4]:
!pip install pyspark
from pyspark.sql import SparkSession



In [7]:
# Initialize Spark session
spark = SparkSession.builder \
 .appName("CreditCardFraudDetection") \
 .getOrCreate()


In [10]:
# Load the dataset
!wget https://raw.githubusercontent.com/nsethi31/Kaggle-Data-Credit-Card-Fraud-Detection/master/creditcard.csv
data = spark.read.csv('creditcard.csv', header=True, inferSchema=True)


--2024-08-20 02:14:29--  https://raw.githubusercontent.com/nsethi31/Kaggle-Data-Credit-Card-Fraud-Detection/master/creditcard.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 102634230 (98M) [text/plain]
Saving to: ‘creditcard.csv’


2024-08-20 02:14:33 (243 MB/s) - ‘creditcard.csv’ saved [102634230/102634230]



In [11]:
# Display the schema and a few rows of data
data.printSchema()
data.show(5)


root
 |-- Time: double (nullable = true)
 |-- V1: double (nullable = true)
 |-- V2: double (nullable = true)
 |-- V3: double (nullable = true)
 |-- V4: double (nullable = true)
 |-- V5: double (nullable = true)
 |-- V6: double (nullable = true)
 |-- V7: double (nullable = true)
 |-- V8: double (nullable = true)
 |-- V9: double (nullable = true)
 |-- V10: double (nullable = true)
 |-- V11: double (nullable = true)
 |-- V12: double (nullable = true)
 |-- V13: double (nullable = true)
 |-- V14: double (nullable = true)
 |-- V15: double (nullable = true)
 |-- V16: double (nullable = true)
 |-- V17: double (nullable = true)
 |-- V18: double (nullable = true)
 |-- V19: double (nullable = true)
 |-- V20: double (nullable = true)
 |-- V21: double (nullable = true)
 |-- V22: double (nullable = true)
 |-- V23: double (nullable = true)
 |-- V24: double (nullable = true)
 |-- V25: double (nullable = true)
 |-- V26: double (nullable = true)
 |-- V27: double (nullable = true)
 |-- V28: double (nulla

In [12]:
from pyspark.ml.feature import VectorAssembler
from pyspark.sql.functions import col, when


In [14]:
# Select features and label
feature_columns = data.columns[:-1] # All columns except the last one, which is the label
assembler = VectorAssembler(inputCols=feature_columns,
outputCol="features")
data = assembler.transform(data)


In [15]:
# Rename the label column
final_data = data.select(col("features"),
col("Class").alias("label"))


In [16]:
# Display the transformed data
final_data.show(5)

+--------------------+-----+
|            features|label|
+--------------------+-----+
|[0.0,-1.359807134...|    0|
|[0.0,1.191857111,...|    0|
|[1.0,-1.358354062...|    0|
|[1.0,-0.966271712...|    0|
|[2.0,-1.158233093...|    0|
+--------------------+-----+
only showing top 5 rows



In [17]:
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator


In [18]:
# Split the data into training and testing sets
train_data, test_data = final_data.randomSplit([0.7, 0.3])


In [19]:
# Train the RandomForest model
rf = RandomForestClassifier(labelCol="label", featuresCol="features", numTrees=10)
model = rf.fit(train_data)


In [20]:
# Make predictions
predictions = model.transform(test_data)
predictions.show(5)


+--------------------+-----+--------------------+--------------------+----------+
|            features|label|       rawPrediction|         probability|prediction|
+--------------------+-----+--------------------+--------------------+----------+
|[1.0,-0.966271712...|    0|[9.99728746735658...|[0.99972874673565...|       0.0|
|[4.0,1.229657635,...|    0|[9.99728746735658...|[0.99972874673565...|       0.0|
|[12.0,-2.79185476...|    0|[9.99722639344249...|[0.99972263934424...|       0.0|
|[13.0,-0.43690507...|    0|[9.99728746735658...|[0.99972874673565...|       0.0|
|[15.0,1.492935977...|    0|[9.99728746735658...|[0.99972874673565...|       0.0|
+--------------------+-----+--------------------+--------------------+----------+
only showing top 5 rows



In [21]:
# Evaluate the model
evaluator = BinaryClassificationEvaluator(labelCol="label",
metricName="areaUnderROC")
roc_auc = evaluator.evaluate(predictions)
print(f"Model ROC-AUC: {roc_auc}")


Model ROC-AUC: 0.9550508422317052


In [22]:
from pyspark.ml.linalg import Vectors
# Create a sample new example with the same structure as the features in the dataset
new_example = spark.createDataFrame([
 (0, Vectors.dense([0.0, 0.0, 1.0, 0.0, 2.0, 0.0, -1.0, 0.5, -0.5, 0.5, -0.5, 0.5, -0.5, 0.5, -0.5, 0.5, -0.5, 0.5, -0.5,
0.5, -0.5, 0.5, -0.5, 0.5, -0.5, 0.5, -0.5, 0.5, -0.5, 1.0]))], ["label", "features"])


In [23]:
# Use the trained model to predict the label for this new example
sample_prediction = model.transform(new_example)
sample_prediction.show(truncate=False)


+-----+-------------------------------------------------------------------------------------------------------------------------------------+-----------------------------------------+--------------------------------------+----------+
|label|features                                                                                                                             |rawPrediction                            |probability                           |prediction|
+-----+-------------------------------------------------------------------------------------------------------------------------------------+-----------------------------------------+--------------------------------------+----------+
|0    |[0.0,0.0,1.0,0.0,2.0,0.0,-1.0,0.5,-0.5,0.5,-0.5,0.5,-0.5,0.5,-0.5,0.5,-0.5,0.5,-0.5,0.5,-0.5,0.5,-0.5,0.5,-0.5,0.5,-0.5,0.5,-0.5,1.0]|[9.997287467356582,0.0027125326434193006]|[0.999728746735658,2.7125326434193E-4]|0.0       |
+-----+---------------------------------------------------------

In [24]:
# Extract the prediction and probability
sample_prediction.select("features", "prediction", "probability").show(truncate=False)

+-------------------------------------------------------------------------------------------------------------------------------------+----------+--------------------------------------+
|features                                                                                                                             |prediction|probability                           |
+-------------------------------------------------------------------------------------------------------------------------------------+----------+--------------------------------------+
|[0.0,0.0,1.0,0.0,2.0,0.0,-1.0,0.5,-0.5,0.5,-0.5,0.5,-0.5,0.5,-0.5,0.5,-0.5,0.5,-0.5,0.5,-0.5,0.5,-0.5,0.5,-0.5,0.5,-0.5,0.5,-0.5,1.0]|0.0       |[0.999728746735658,2.7125326434193E-4]|
+-------------------------------------------------------------------------------------------------------------------------------------+----------+--------------------------------------+

