In [1]:
!pip install pyspark

Collecting pyspark
  Downloading pyspark-3.5.2.tar.gz (317.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.3/317.3 MB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.2-py2.py3-none-any.whl size=317812365 sha256=03be36a490f2d5f8642ffc7e1855c9659a2fec6d1f013a3f6120a1c096132919
  Stored in directory: /root/.cache/pip/wheels/34/34/bd/03944534c44b677cd5859f248090daa9fb27b3c8f8e5f49574
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.2


In [2]:
from pyspark.sql import SparkSession

In [3]:
# Start (or reuse) the Spark session used by every DataFrame in this notebook
session_builder = SparkSession.builder.appName("CreditCardFraudDetection")
spark = session_builder.getOrCreate()


In [5]:
# Load the dataset
!wget https://raw.githubusercontent.com/nsethi31/Kaggle-Data-Credit-Card-Fraud-Detection/master/creditcard.csv
data = spark.read.csv('creditcard.csv', header=True, inferSchema=True)
# Display the schema and a few rows of data
data.printSchema()
data.show(5)

--2024-08-20 01:59:33--  https://raw.githubusercontent.com/nsethi31/Kaggle-Data-Credit-Card-Fraud-Detection/master/creditcard.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 102634230 (98M) [text/plain]
Saving to: ‘creditcard.csv’


2024-08-20 01:59:36 (205 MB/s) - ‘creditcard.csv’ saved [102634230/102634230]

root
 |-- Time: double (nullable = true)
 |-- V1: double (nullable = true)
 |-- V2: double (nullable = true)
 |-- V3: double (nullable = true)
 |-- V4: double (nullable = true)
 |-- V5: double (nullable = true)
 |-- V6: double (nullable = true)
 |-- V7: double (nullable = true)
 |-- V8: double (nullable = true)
 |-- V9: double (nullable = true)
 |-- V10: double (nullable = true)
 |-- V11: double (nullable = true)
 |-- V12: double (nullable = true)


In [7]:
from pyspark.ml.feature import VectorAssembler
from pyspark.sql.functions import col, when
# Select features by name instead of by position: every column except the
# label ("Class") and any "features" vector left over from an earlier run.
# Slicing with [:-1] would pick up the label as a feature once this cell has
# already appended "features" to `data`.
feature_columns = [name for name in data.columns if name not in ("Class", "features")]
assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")
# drop() is a no-op when "features" is absent, so the first run is unchanged,
# while a re-run rebuilds the vector instead of failing on an existing output column.
data = assembler.transform(data.drop("features"))

In [8]:
# Keep only the assembled feature vector and the target, exposing "Class" as "label"
label_column = col("Class").alias("label")
final_data = data.select("features", label_column)
# Preview the first rows of the model-ready frame
final_data.show(5)



+--------------------+-----+
|            features|label|
+--------------------+-----+
|[0.0,-1.359807134...|    0|
|[0.0,1.191857111,...|    0|
|[1.0,-1.358354062...|    0|
|[1.0,-0.966271712...|    0|
|[2.0,-1.158233093...|    0|
+--------------------+-----+
only showing top 5 rows



In [9]:
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator
# Split the data into training and testing sets. A fixed seed keeps the 70/30
# split, and therefore the metrics computed from it, repeatable for the same
# input instead of changing on every run.
train_data, test_data = final_data.randomSplit([0.7, 0.3], seed=42)
# Train the RandomForest model; seeding it as well makes the fitted trees repeatable
rf = RandomForestClassifier(labelCol="label", featuresCol="features", numTrees=10, seed=42)
model = rf.fit(train_data)
# Make predictions on the held-out rows
predictions = model.transform(test_data)
predictions.show(5)

+--------------------+-----+--------------------+--------------------+----------+
|            features|label|       rawPrediction|         probability|prediction|
+--------------------+-----+--------------------+--------------------+----------+
|[0.0,1.191857111,...|    0|[9.99777637281796...|[0.99977763728179...|       0.0|
|[1.0,-1.358354062...|    0|[9.99761895007863...|[0.99976189500786...|       0.0|
|[2.0,-0.425965884...|    0|[9.99777637281796...|[0.99977763728179...|       0.0|
|[7.0,-0.894286082...|    0|[9.99777637281796...|[0.99977763728179...|       0.0|
|[10.0,0.384978215...|    0|[9.99777637281796...|[0.99977763728179...|       0.0|
+--------------------+-----+--------------------+--------------------+----------+
only showing top 5 rows



In [10]:
# Score the test-set predictions by area under the ROC curve
evaluator = (
    BinaryClassificationEvaluator()
    .setLabelCol("label")
    .setMetricName("areaUnderROC")
)
roc_auc = evaluator.evaluate(predictions)
print(f"Model ROC-AUC: {roc_auc}")


Model ROC-AUC: 0.9650840418885587


In [11]:
from pyspark.ml.linalg import Vectors
# Build a one-row DataFrame with the same layout the model was trained on:
# a label plus a 30-value dense feature vector
sample_features = Vectors.dense([
    0.0, 0.0, 1.0, 0.0, 2.0, 0.0, -1.0, 0.5, -0.5, 0.5,
    -0.5, 0.5, -0.5, 0.5, -0.5, 0.5, -0.5, 0.5, -0.5, 0.5,
    -0.5, 0.5, -0.5, 0.5, -0.5, 0.5, -0.5, 0.5, -0.5, 1.0,
])
new_example = spark.createDataFrame([(0, sample_features)], schema=["label", "features"])
# Run the trained model on the example and print every output column
sample_prediction = model.transform(new_example)
sample_prediction.show(truncate=False)
# Print only the input vector with its predicted class and class probabilities
result_columns = ["features", "prediction", "probability"]
sample_prediction.select(*result_columns).show(truncate=False)

+-----+-------------------------------------------------------------------------------------------------------------------------------------+---------------------------------------+-----------------------------------------+----------+
|label|features                                                                                                                             |rawPrediction                          |probability                              |prediction|
+-----+-------------------------------------------------------------------------------------------------------------------------------------+---------------------------------------+-----------------------------------------+----------+
|0    |[0.0,0.0,1.0,0.0,2.0,0.0,-1.0,0.5,-0.5,0.5,-0.5,0.5,-0.5,0.5,-0.5,0.5,-0.5,0.5,-0.5,0.5,-0.5,0.5,-0.5,0.5,-0.5,0.5,-0.5,0.5,-0.5,1.0]|[9.99777637281796,0.002223627182041135]|[0.9997776372817959,2.223627182041135E-4]|0.0       |
+-----+-----------------------------------------------------