<a href="https://colab.research.google.com/github/islington-college-ing/week-7-advanced-machine-learning-with-pyspark-silwalprabin/blob/main/Week_7_Advanced_Machine_Learning_with_PySpark.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install pyspark

Collecting pyspark
  Downloading pyspark-3.5.1.tar.gz (317.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.0/317.0 MB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.1-py2.py3-none-any.whl size=317488490 sha256=fa74be789264356c972ee7c4883112c2c90d77f7b9f2f7ac7e1a26f4ff98a7e3
  Stored in directory: /root/.cache/pip/wheels/80/1d/60/2c256ed38dddce2fdd93be545214a63e02fbd8d74fb0b7f3a6
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.1


In [6]:
from pyspark.sql import SparkSession

# Initialize Spark session
spark = SparkSession.builder \
    .appName("CreditCardFraudDetection") \
    .getOrCreate()

# Load the dataset
!wget https://raw.githubusercontent.com/nsethi31/Kaggle-Data-Credit-Card-Fraud-Detection/master/creditcard.csv
data = spark.read.csv('creditcard.csv', header=True, inferSchema=True)

# Display the schema and a few rows of data
data.printSchema()
data.show(5)


--2024-08-04 03:44:12--  https://raw.githubusercontent.com/nsethi31/Kaggle-Data-Credit-Card-Fraud-Detection/master/creditcard.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.108.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 102634230 (98M) [text/plain]
Saving to: ‘creditcard.csv.1’


2024-08-04 03:44:13 (208 MB/s) - ‘creditcard.csv.1’ saved [102634230/102634230]

root
 |-- Time: double (nullable = true)
 |-- V1: double (nullable = true)
 |-- V2: double (nullable = true)
 |-- V3: double (nullable = true)
 |-- V4: double (nullable = true)
 |-- V5: double (nullable = true)
 |-- V6: double (nullable = true)
 |-- V7: double (nullable = true)
 |-- V8: double (nullable = true)
 |-- V9: double (nullable = true)
 |-- V10: double (nullable = true)
 |-- V11: double (nullable = true)
 |-- V12: double (nullable = tr

In [7]:
from pyspark.ml.feature import VectorAssembler
from pyspark.sql.functions import col, when

# Select features and label
# The label is identified by name rather than by position, so the feature list
# stays correct even if the column order of `data` changes.
LABEL_COLUMN = "Class"
feature_columns = [name for name in data.columns if name != LABEL_COLUMN]
assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")

# The assembled frame gets its own name instead of overwriting `data`.
# Rebinding `data` made this cell fail on a second run: the "features" column
# would already exist, and a positional slice of the columns would then pull
# the label into the feature list.
assembled_data = assembler.transform(data)

# Rename the label column
final_data = assembled_data.select(col("features"), col(LABEL_COLUMN).alias("label"))

# Display the transformed data
final_data.show(5)


+--------------------+-----+
|            features|label|
+--------------------+-----+
|[0.0,-1.359807134...|    0|
|[0.0,1.191857111,...|    0|
|[1.0,-1.358354062...|    0|
|[1.0,-0.966271712...|    0|
|[2.0,-1.158233093...|    0|
+--------------------+-----+
only showing top 5 rows



In [8]:
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# Fixed seed shared by the split and the forest, so the train/test partition,
# the fitted trees and the reported metric are repeatable across runs.
SEED = 42

# Split the data into training (70%) and testing (30%) sets
train_data, test_data = final_data.randomSplit([0.7, 0.3], seed=SEED)

# Train the RandomForest model
rf = RandomForestClassifier(
    labelCol="label",
    featuresCol="features",
    numTrees=10,
    seed=SEED,
)
model = rf.fit(train_data)

# Make predictions on the held-out test set
predictions = model.transform(test_data)
predictions.show(5)

# Evaluate the model (the evaluator reads the default "rawPrediction" column
# that model.transform adds)
evaluator = BinaryClassificationEvaluator(labelCol="label", metricName="areaUnderROC")
roc_auc = evaluator.evaluate(predictions)
print(f"Model ROC-AUC: {roc_auc}")


+--------------------+-----+--------------------+--------------------+----------+
|            features|label|       rawPrediction|         probability|prediction|
+--------------------+-----+--------------------+--------------------+----------+
|[0.0,-1.359807134...|    0|[9.99787856344974...|[0.99978785634497...|       0.0|
|[0.0,1.191857111,...|    0|[9.99787856344974...|[0.99978785634497...|       0.0|
|[2.0,-0.425965884...|    0|[9.99787856344974...|[0.99978785634497...|       0.0|
|[12.0,1.103215435...|    0|[9.99787856344974...|[0.99978785634497...|       0.0|
|[17.0,0.96249607,...|    0|[9.99609142355976...|[0.99960914235597...|       0.0|
+--------------------+-----+--------------------+--------------------+----------+
only showing top 5 rows

Model ROC-AUC: 0.9640043360941284


In [9]:
from pyspark.ml.linalg import Vectors

# Hand-written feature values for a single new example (30 entries, laid out
# like the feature vectors the model was trained on).
sample_features = Vectors.dense([
    0.0, 0.0, 1.0, 0.0, 2.0, 0.0, -1.0,
    0.5, -0.5, 0.5, -0.5, 0.5, -0.5, 0.5, -0.5, 0.5, -0.5, 0.5, -0.5,
    0.5, -0.5, 0.5, -0.5, 0.5, -0.5, 0.5, -0.5, 0.5, -0.5,
    1.0,
])

# Wrap the example in a one-row DataFrame with the same column names the model
# consumes; the label value 0 is only a placeholder.
sample_rows = [(0, sample_features)]
new_example = spark.createDataFrame(sample_rows, ["label", "features"])

# Score the example with the trained model and show every output column
sample_prediction = model.transform(new_example)
sample_prediction.show(truncate=False)

# Narrow the view to the input vector, the predicted class and its probability
summary_columns = ["features", "prediction", "probability"]
sample_prediction.select(*summary_columns).show(truncate=False)


+-----+-------------------------------------------------------------------------------------------------------------------------------------+---------------------------------------+------------------------------------------+----------+
|label|features                                                                                                                             |rawPrediction                          |probability                               |prediction|
+-----+-------------------------------------------------------------------------------------------------------------------------------------+---------------------------------------+------------------------------------------+----------+
|0    |[0.0,0.0,1.0,0.0,2.0,0.0,-1.0,0.5,-0.5,0.5,-0.5,0.5,-0.5,0.5,-0.5,0.5,-0.5,0.5,-0.5,0.5,-0.5,0.5,-0.5,0.5,-0.5,0.5,-0.5,0.5,-0.5,1.0]|[9.996091423559767,0.00390857644023359]|[0.9996091423559766,3.9085764402335894E-4]|0.0       |
+-----+-------------------------------------------------