# PySpark ML Essentials

Quick examples of:
- Linear Regression
- Logistic Regression  
- Model persistence (PySpark native save/load, with pickle for the metadata)
- Time series patterns
- Text-based fraud detection

In [47]:
# Essential imports for PySpark ML
#
# NOTE: the wildcard import from pyspark.sql.functions below rebinds the Python
# builtin `sum` (among other names) to Spark's column function for the rest of the
# notebook. The aggregation cells depend on that: they call sum("amount_numeric"),
# which only works with Spark's version.
from pyspark.sql import SparkSession          # Main Spark entry point
from pyspark.sql.functions import *           # SQL functions (col, sum, avg, etc.) - shadows builtin sum
from pyspark.ml.feature import VectorAssembler    # Combines features into vector
from pyspark.ml.regression import LinearRegression # For regression problems
from pyspark.ml.classification import LogisticRegression # For classification
from pyspark.ml.evaluation import RegressionEvaluator, BinaryClassificationEvaluator # Model metrics
from pyspark.ml import Pipeline              # Chains ML stages together
import pandas as pd                           # For small data operations
import matplotlib.pyplot as plt              # Plotting and visualization
import numpy as np                           # Numerical operations
import pickle                                # Model serialization
import os                                    # File system operations

# Confirmation message only; no Spark session exists yet at this point.
print("✓ All imports loaded - Ready for distributed ML!")

✓ All imports loaded - Ready for distributed ML!


In [None]:
# Initialize Spark session with optimizations and explicit bind address for local use.
#
# Driver memory is set explicitly: in local mode all work runs inside the driver JVM,
# and a previous run of this notebook failed with SparkOutOfMemoryError while fitting
# the fraud model. Override the size with the SPARK_DRIVER_MEMORY environment variable.
# NOTE: this setting only takes effect when this call actually launches the JVM, so
# restart the kernel after changing it (getOrCreate() otherwise reuses the live session).
driver_memory = os.environ.get("SPARK_DRIVER_MEMORY", "4g")

spark = SparkSession.builder \
    .appName("BankingML") \
    .config("spark.sql.adaptive.enabled", "true") \
    .config("spark.driver.bindAddress", "127.0.0.1") \
    .config("spark.driver.memory", driver_memory) \
    .getOrCreate()

# Reduce logging verbosity (only show warnings/errors)
spark.sparkContext.setLogLevel("WARN")
print(f"✓ Spark {spark.version} ready for distributed processing")

✓ Spark 3.5.3 ready for distributed processing


In [None]:
# Load transaction data from CSV (header row supplies column names, types are inferred)
df = spark.read.csv("../data/transactions_data.csv", header=True, inferSchema=True)
print(f"Loaded: {df.count():,} rows, {len(df.columns)} columns")

# Clean and prepare data for ML:
#   amount_numeric - "amount" with every '$' and ',' stripped, cast to double
#   is_online      - 1 when merchant_city equals "ONLINE", else 0
# The pattern is a raw string so the backslash reaches the regex engine unchanged;
# a plain "[\$,]" is an invalid Python escape sequence and triggers a warning.
# The final filter keeps strictly positive amounts; rows whose amount failed the cast
# (null) are dropped too, because a null comparison is not true.
df_clean = df \
    .withColumn("amount_numeric", regexp_replace(col("amount"), r"[\$,]", "").cast("double")) \
    .withColumn("is_online", (col("merchant_city") == "ONLINE").cast("int")) \
    .filter(col("amount_numeric") > 0)  # Remove invalid amounts

# Mark for caching; the count() below is the action that materializes the cache
df_clean.cache()
print(f"Cleaned: {df_clean.count():,} transactions with numeric amounts and channel flags")

  .withColumn("amount_numeric", regexp_replace(col("amount"), "[\$,]", "").cast("double")) \
[Stage 1:>                                                         (0 + 8) / 10]

## 1. Simple Linear Regression

Customer spending prediction example:

In [None]:
# Feature engineering: roll transaction rows up into one feature row per client_id
per_customer_aggregates = [
    sum("amount_numeric").alias("total_spend"),         # Sum of all amounts for the customer
    count("*").alias("transaction_count"),              # Number of transactions
    avg("amount_numeric").alias("avg_transaction"),     # Mean amount per transaction
    avg("is_online").alias("online_ratio"),             # Share of online transactions (0.0 - 1.0)
]

customer_data = (
    df_clean
    .groupBy("client_id")
    .agg(*per_customer_aggregates)
    .filter(col("transaction_count") >= 3)  # Drop customers with fewer than 3 transactions
)

# Cache customer features for reuse by the modelling cells
customer_data.cache()
print(f"Customer features: {customer_data.count():,} customers with ≥3 transactions")
customer_data.show(5)

25/08/13 09:56:26 WARN MemoryStore: Not enough space to cache rdd_21_7 in memory! (computed 8.5 MiB so far)
25/08/13 09:56:26 WARN MemoryStore: Not enough space to cache rdd_21_5 in memory! (computed 8.5 MiB so far)
25/08/13 09:56:26 WARN MemoryStore: Not enough space to cache rdd_21_9 in memory! (computed 8.5 MiB so far)
25/08/13 09:56:26 WARN MemoryStore: Not enough space to cache rdd_21_8 in memory! (computed 32.4 MiB so far)
25/08/13 09:56:26 WARN MemoryStore: Not enough space to cache rdd_21_9 in memory! (computed 8.5 MiB so far)
25/08/13 09:56:26 WARN MemoryStore: Not enough space to cache rdd_21_8 in memory! (computed 32.4 MiB so far)

Customer features: 1,219 customers with ≥3 transactions
+---------+------------------+-----------------+------------------+-------------------+
|client_id|       total_spend|transaction_count|   avg_transaction|       online_ratio|
+---------+------------------+-----------------+------------------+-------------------+
|      148|419429.97000000003|            10824|38.749997228381375|0.07501847745750184|
|     1591| 685332.9499999997|            21305| 32.16770476413986|0.49349917859657355|
|     1238|1166239.1800000006|            10344| 112.7454737045631|0.06902552204176333|
|     1645| 417498.7400000001|             8639| 48.32720685264499|0.06308600532469036|
|     1959|         492257.17|             4827|101.97993992127616|0.11912160762378289|
+---------+------------------+-----------------+------------------+-------------------+
only showing top 5 rows



                                                                                

In [None]:
# Linear Regression: predict each customer's total_spend from behavioral features
feature_cols = ["transaction_count", "avg_transaction", "online_ratio"]

# Hold out 20% of customers for testing (fixed seed keeps the split repeatable)
train_data, test_data = customer_data.randomSplit([0.8, 0.2], seed=42)

# Stage 1: pack the feature columns into the single vector column Spark ML expects
assembler = VectorAssembler(
    inputCols=feature_cols,
    outputCol="features",
)

# Stage 2: ordinary linear regression on that vector, with total_spend as the label
lr = LinearRegression(
    featuresCol="features",
    labelCol="total_spend",
)

# Fit both stages together on the training split, then score the held-out split
pipeline = Pipeline(stages=[assembler, lr])
model = pipeline.fit(train_data)
predictions = model.transform(test_data)

# R² on the test split (1.0 = perfect, 0.0 = no better than predicting the mean)
evaluator = RegressionEvaluator(
    labelCol="total_spend",
    predictionCol="prediction",
    metricName="r2",
)
r2 = evaluator.evaluate(predictions)

print(f"Linear Regression R² Score: {r2:.3f}")
print("Sample predictions (actual vs predicted spending):")
predictions.select("total_spend", "prediction").show(5)

25/08/13 09:56:27 WARN Instrumentation: [e84c59ec] regParam is zero, which might cause numerical instability and overfitting.


Linear Regression R² Score: 0.909
Sample predictions (actual vs predicted spending):
+------------------+-----------------+
|       total_spend|       prediction|
+------------------+-----------------+
|         400451.06| 399707.847138156|
|         492257.17|707361.1875022794|
| 423696.0899999999|538369.2478928273|
| 613557.3099999999|656883.6705653246|
|241462.70999999996|186911.5127122653|
+------------------+-----------------+
only showing top 5 rows



## 2. Simple Fraud Detection

Basic binary classification example (the fraud labels are synthetic and randomly assigned, so expect metrics close to random guessing):

In [None]:
# Fraud detection setup: Create features and synthetic fraud labels.
# NOTE: is_fraud is drawn from rand(42) independently of every feature, so this cell
# demonstrates the classification workflow only; the model has no real signal to learn.
fraud_data = df_clean \
    .withColumn("high_amount", (col("amount_numeric") > 100).cast("int")) \
    .withColumn("is_fraud", (rand(42) < 0.05).cast("int"))  # 5% synthetic fraud rate

# Features that might indicate fraud
fraud_features = ["amount_numeric", "is_online", "high_amount"]

# SAMPLE the data to avoid out-of-memory errors (adjust fraction as needed).
# Only the model's input columns and the label are kept, so the rows handled during
# the split and training stay narrow. The sample is cached because it is consumed
# several times below (fit, transform and the three count() calls); without the cache
# each of those recomputes it from the full transaction table.
fraud_data_sample = fraud_data \
    .select(*fraud_features, "is_fraud") \
    .sample(fraction=0.02, seed=42) \
    .cache()  # Use 2% of data

# VectorAssembler for classification features
fraud_assembler = VectorAssembler(inputCols=fraud_features, outputCol="features")

# Logistic Regression for binary classification (fraud=1, legitimate=0)
fraud_lr = LogisticRegression(
    featuresCol="features",
    labelCol="is_fraud",
    maxIter=100  # Maximum iterations for optimization
)

# Create classification pipeline
fraud_pipeline = Pipeline(stages=[fraud_assembler, fraud_lr])

# Split data for training and testing
fraud_train, fraud_test = fraud_data_sample.randomSplit([0.8, 0.2], seed=42)

# Train the fraud detection model
fraud_model = fraud_pipeline.fit(fraud_train)

# Generate predictions (includes probability scores)
fraud_predictions = fraud_model.transform(fraud_test)

print(f"Fraud detection data (sampled): {fraud_data_sample.count():,} transactions")
print(f"Training: {fraud_train.count():,}, Test: {fraud_test.count():,}")
print("Model trained - predictions include probability scores for each class")

25/08/13 09:56:28 WARN MemoryStore: Not enough space to cache rdd_21_5 in memory! (computed 16.4 MiB so far)
25/08/13 09:56:28 WARN MemoryStore: Not enough space to cache rdd_21_7 in memory! (computed 16.5 MiB so far)
25/08/13 09:56:28 ERROR Executor: Exception in task 2.0 in stage 32.0 (TID 1100)
org.apache.spark.memory.SparkOutOfMemoryError: [UNABLE_TO_ACQUIRE_MEMORY] Unable to acquire 65536 bytes of memory, got 0.
	at org.apache.spark.errors.SparkCoreErrors$.outOfMemoryError(SparkCoreErrors.scala:467)
	at org.apache.spark.errors.SparkCoreErrors.outOfMemoryError(SparkCoreErrors.scala)
	at org.apache.spark.memory.MemoryConsumer.throwOom(MemoryConsumer.java:157)
	at org.apache.spark.memory.MemoryConsumer.allocateArray(MemoryConsumer.java:98)
	at org.apache.spark.util.collection.unsafe.sort.UnsafeExternalSorter.growPointerArrayIfNecessary(UnsafeExternalSorter.java:384)
	at org.apache.spark.util.collection.unsafe.sort.UnsafeExternalSorter.allocateMemoryForRecordIfNecessary(UnsafeExternal

Py4JJavaError: An error occurred while calling o3144.fit.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 2 in stage 32.0 failed 1 times, most recent failure: Lost task 2.0 in stage 32.0 (TID 1100) (192.168.222.131 executor driver): org.apache.spark.memory.SparkOutOfMemoryError: [UNABLE_TO_ACQUIRE_MEMORY] Unable to acquire 65536 bytes of memory, got 0.
	at org.apache.spark.errors.SparkCoreErrors$.outOfMemoryError(SparkCoreErrors.scala:467)
	at org.apache.spark.errors.SparkCoreErrors.outOfMemoryError(SparkCoreErrors.scala)
	at org.apache.spark.memory.MemoryConsumer.throwOom(MemoryConsumer.java:157)
	at org.apache.spark.memory.MemoryConsumer.allocateArray(MemoryConsumer.java:98)
	at org.apache.spark.util.collection.unsafe.sort.UnsafeExternalSorter.growPointerArrayIfNecessary(UnsafeExternalSorter.java:384)
	at org.apache.spark.util.collection.unsafe.sort.UnsafeExternalSorter.allocateMemoryForRecordIfNecessary(UnsafeExternalSorter.java:467)
	at org.apache.spark.util.collection.unsafe.sort.UnsafeExternalSorter.insertRecord(UnsafeExternalSorter.java:487)
	at org.apache.spark.sql.execution.UnsafeExternalRowSorter.insertRow(UnsafeExternalRowSorter.java:138)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.sort_addToSorter_0$(Unknown Source)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenEvaluatorFactory$WholeStageCodegenPartitionEvaluator$$anon$1.hasNext(WholeStageCodegenEvaluatorFactory.scala:43)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at scala.collection.Iterator.foreach(Iterator.scala:943)
	at scala.collection.Iterator.foreach$(Iterator.scala:943)
	at scala.collection.AbstractIterator.foreach(Iterator.scala:1431)
	at scala.collection.TraversableOnce.foldLeft(TraversableOnce.scala:199)
	at scala.collection.TraversableOnce.foldLeft$(TraversableOnce.scala:192)
	at scala.collection.AbstractIterator.foldLeft(Iterator.scala:1431)
	at scala.collection.TraversableOnce.aggregate(TraversableOnce.scala:260)
	at scala.collection.TraversableOnce.aggregate$(TraversableOnce.scala:260)
	at scala.collection.AbstractIterator.aggregate(Iterator.scala:1431)
	at org.apache.spark.rdd.RDD.$anonfun$treeAggregate$4(RDD.scala:1264)
	at org.apache.spark.rdd.RDD.$anonfun$treeAggregate$6(RDD.scala:1265)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitions$2(RDD.scala:858)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitions$2$adapted(RDD.scala:858)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:367)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:331)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:367)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:331)
	at org.apache.spark.shuffle.ShuffleWriteProcessor.write(ShuffleWriteProcessor.scala:59)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:104)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:54)
	at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:166)
	at org.apache.spark.scheduler.Task.run(Task.scala:141)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$4(Executor.scala:620)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally(SparkErrorUtils.scala:64)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally$(SparkErrorUtils.scala:61)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:94)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:623)
	at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1136)
	at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:635)
	at java.base/java.lang.Thread.run(Thread.java:833)

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2856)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2792)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2791)
	at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
	at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2791)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:1247)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:1247)
	at scala.Option.foreach(Option.scala:407)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1247)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:3060)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2994)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2983)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:989)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2393)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2488)
	at org.apache.spark.rdd.RDD.$anonfun$fold$1(RDD.scala:1202)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:410)
	at org.apache.spark.rdd.RDD.fold(RDD.scala:1196)
	at org.apache.spark.rdd.RDD.$anonfun$treeAggregate$2(RDD.scala:1289)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:410)
	at org.apache.spark.rdd.RDD.treeAggregate(RDD.scala:1256)
	at org.apache.spark.rdd.RDD.$anonfun$treeAggregate$1(RDD.scala:1242)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:410)
	at org.apache.spark.rdd.RDD.treeAggregate(RDD.scala:1242)
	at org.apache.spark.ml.stat.Summarizer$.getClassificationSummarizers(Summarizer.scala:233)
	at org.apache.spark.ml.classification.LogisticRegression.$anonfun$train$1(LogisticRegression.scala:517)
	at org.apache.spark.ml.util.Instrumentation$.$anonfun$instrumented$1(Instrumentation.scala:191)
	at scala.util.Try$.apply(Try.scala:213)
	at org.apache.spark.ml.util.Instrumentation$.instrumented(Instrumentation.scala:191)
	at org.apache.spark.ml.classification.LogisticRegression.train(LogisticRegression.scala:497)
	at org.apache.spark.ml.classification.LogisticRegression.train(LogisticRegression.scala:287)
	at org.apache.spark.ml.Predictor.fit(Predictor.scala:114)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:77)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.base/java.lang.reflect.Method.invoke(Method.java:568)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.base/java.lang.Thread.run(Thread.java:833)
Caused by: org.apache.spark.memory.SparkOutOfMemoryError: [UNABLE_TO_ACQUIRE_MEMORY] Unable to acquire 65536 bytes of memory, got 0.
	at org.apache.spark.errors.SparkCoreErrors$.outOfMemoryError(SparkCoreErrors.scala:467)
	at org.apache.spark.errors.SparkCoreErrors.outOfMemoryError(SparkCoreErrors.scala)
	at org.apache.spark.memory.MemoryConsumer.throwOom(MemoryConsumer.java:157)
	at org.apache.spark.memory.MemoryConsumer.allocateArray(MemoryConsumer.java:98)
	at org.apache.spark.util.collection.unsafe.sort.UnsafeExternalSorter.growPointerArrayIfNecessary(UnsafeExternalSorter.java:384)
	at org.apache.spark.util.collection.unsafe.sort.UnsafeExternalSorter.allocateMemoryForRecordIfNecessary(UnsafeExternalSorter.java:467)
	at org.apache.spark.util.collection.unsafe.sort.UnsafeExternalSorter.insertRecord(UnsafeExternalSorter.java:487)
	at org.apache.spark.sql.execution.UnsafeExternalRowSorter.insertRow(UnsafeExternalRowSorter.java:138)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.sort_addToSorter_0$(Unknown Source)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenEvaluatorFactory$WholeStageCodegenPartitionEvaluator$$anon$1.hasNext(WholeStageCodegenEvaluatorFactory.scala:43)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at scala.collection.Iterator.foreach(Iterator.scala:943)
	at scala.collection.Iterator.foreach$(Iterator.scala:943)
	at scala.collection.AbstractIterator.foreach(Iterator.scala:1431)
	at scala.collection.TraversableOnce.foldLeft(TraversableOnce.scala:199)
	at scala.collection.TraversableOnce.foldLeft$(TraversableOnce.scala:192)
	at scala.collection.AbstractIterator.foldLeft(Iterator.scala:1431)
	at scala.collection.TraversableOnce.aggregate(TraversableOnce.scala:260)
	at scala.collection.TraversableOnce.aggregate$(TraversableOnce.scala:260)
	at scala.collection.AbstractIterator.aggregate(Iterator.scala:1431)
	at org.apache.spark.rdd.RDD.$anonfun$treeAggregate$4(RDD.scala:1264)
	at org.apache.spark.rdd.RDD.$anonfun$treeAggregate$6(RDD.scala:1265)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitions$2(RDD.scala:858)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitions$2$adapted(RDD.scala:858)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:367)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:331)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:367)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:331)
	at org.apache.spark.shuffle.ShuffleWriteProcessor.write(ShuffleWriteProcessor.scala:59)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:104)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:54)
	at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:166)
	at org.apache.spark.scheduler.Task.run(Task.scala:141)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$4(Executor.scala:620)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally(SparkErrorUtils.scala:64)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally$(SparkErrorUtils.scala:61)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:94)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:623)
	at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1136)
	at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:635)
	... 1 more


In [None]:
# Evaluate fraud detection model performance

# AUC-ROC: Area Under Curve - measures model's ability to distinguish classes
# 1.0 = perfect classifier, 0.5 = random guessing
auc_evaluator = BinaryClassificationEvaluator(labelCol="is_fraud", metricName="areaUnderROC")
auc = auc_evaluator.evaluate(fraud_predictions)

# Confusion matrix in a single pass: one grouped count keyed by (actual, predicted)
# instead of four separate filter+count jobs that each re-score the test set.
# prediction is a double column (0.0 / 1.0), so both keys are normalized to int.
confusion_counts = {
    (int(row["is_fraud"]), int(row["prediction"])): row["count"]
    for row in fraud_predictions.groupBy("is_fraud", "prediction").count().collect()
}
tp = confusion_counts.get((1, 1), 0)  # True Positives
tn = confusion_counts.get((0, 0), 0)  # True Negatives
fp = confusion_counts.get((0, 1), 0)  # False Positives
fn = confusion_counts.get((1, 0), 0)  # False Negatives

# Key classification metrics (each falls back to 0 when its denominator is empty)
precision = tp / (tp + fp) if (tp + fp) > 0 else 0    # Of predicted fraud, how many were actually fraud?
recall = tp / (tp + fn) if (tp + fn) > 0 else 0       # Of actual fraud, how many did we catch?
f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0  # Balanced metric

print("Fraud Detection Results:")
print(f"AUC-ROC: {auc:.3f} (discriminative ability)")
print(f"Precision: {precision:.3f} (accuracy of fraud predictions)")
print(f"Recall: {recall:.3f} (coverage of actual fraud)")
print(f"F1-Score: {f1_score:.3f} (harmonic mean of precision & recall)")
print(f"Confusion Matrix - TP: {tp}, TN: {tn}, FP: {fp}, FN: {fn}")

In [None]:
# Model persistence: Save trained models using PySpark's native save/load methods
models_dir = "../models"
os.makedirs(models_dir, exist_ok=True)  # Create directory if it doesn't exist

# Save both models (entire pipelines including preprocessing)
lr_model_path = os.path.join(models_dir, "linear_regression")
fraud_model_path = os.path.join(models_dir, "fraud_detection")

# A plain save() fails when the target path already exists; write().overwrite()
# replaces a previous save, so re-running this cell works without deleting the
# directories by hand first.
model.write().overwrite().save(lr_model_path)
fraud_model.write().overwrite().save(fraud_model_path)

# Save model metadata for documentation and model management.
# The values come from the evaluation cells (r2, auc, precision, recall) and the
# feature lists used to build each pipeline.
metadata = {
    'linear_regression': {
        'r2_score': r2,
        'features': feature_cols,
        'target': 'total_spend'
    },
    'fraud_detection': {
        'auc': auc,
        'precision': precision,
        'recall': recall,
        'features': fraud_features,
        'target': 'is_fraud'
    }
}

# Metadata is a plain dict of numbers, strings and lists, pickled next to the models
with open(os.path.join(models_dir, "metadata.pkl"), 'wb') as f:
    pickle.dump(metadata, f)

print(f"✓ Models saved to {models_dir}")
print(f"  Linear Regression: {lr_model_path}")
print(f"  Fraud Detection: {fraud_model_path}")
print("  Metadata: Contains performance metrics and feature info")

In [None]:
# Model loading example: Restore saved models for inference or further analysis
from pyspark.ml import PipelineModel

def load_models():
    """Restore both saved pipelines and the pickled metadata.

    Reads the notebook-level ``lr_model_path``, ``fraud_model_path`` and
    ``models_dir`` variables set by the persistence cell.

    Returns:
        tuple: ``(lr_model, fraud_model, metadata)`` on success, or
        ``(None, None, None)`` if anything raised while loading. In that case the
        error is printed rather than propagated.
    """
    try:
        # Complete ML pipelines, preprocessing stages included
        regression_pipeline = PipelineModel.load(lr_model_path)
        fraud_pipeline_model = PipelineModel.load(fraud_model_path)

        # Performance metrics and feature information.
        # pickle.load can execute code embedded in the file, so only read files
        # that this notebook wrote itself.
        metadata_file = os.path.join(models_dir, "metadata.pkl")
        with open(metadata_file, 'rb') as handle:
            saved_info = pickle.load(handle)

        print("✓ Models loaded successfully from disk")
        loaded = (regression_pipeline, fraud_pipeline_model, saved_info)
    except Exception as err:
        print(f"Models not found or error loading: {err}")
        loaded = (None, None, None)
    return loaded

# Load models for testing/inference
loaded_lr, loaded_fraud, model_info = load_models()

# load_models() returns (None, None, None) when loading fails, so success is only
# reported, and the inference demo only runs, when both pipelines came back.
if loaded_lr is not None and loaded_fraud is not None:
    print(f"Model info: {model_info}")
    print("✓ Models loaded successfully")

    # Test loaded models with sample data - demonstrates that persistence works.
    # Column names must match the feature columns each pipeline was trained on.
    sample_customer = spark.createDataFrame([(50, 75.50, 0.8)], ["transaction_count", "avg_transaction", "online_ratio"])
    spending_pred = loaded_lr.transform(sample_customer)
    print("Sample customer spending prediction:")
    spending_pred.select("prediction").show()

    sample_transaction = spark.createDataFrame([(1500.0, 1, 1)], ["amount_numeric", "is_online", "high_amount"])
    fraud_pred = loaded_fraud.transform(sample_transaction)
    print("Sample transaction fraud prediction:")
    fraud_pred.select("prediction", "probability").show()
else:
    print("Skipping inference demo: saved models could not be loaded")

In [None]:
# Simple visualizations - pull a small slice of the Spark predictions into pandas for plotting
sample_fraud = fraud_predictions.sample(0.01, seed=42).toPandas()  # Small sample for visualization
# Each probability entry is a per-class vector; position 1 is the fraud class
sample_fraud['prob_fraud'] = sample_fraud['probability'].map(lambda vec: float(vec[1]))

fig, (amount_ax, prob_ax) = plt.subplots(1, 2, figsize=(12, 4))

# Left panel: transaction amount distribution
amount_ax.hist(sample_fraud['amount_numeric'], bins=20, alpha=0.7)
amount_ax.set_title('Transaction Amount Distribution')
amount_ax.set_xlabel('Amount ($)')
amount_ax.set_ylabel('Frequency')

# Right panel: distribution of the model's predicted fraud probability
prob_ax.hist(sample_fraud['prob_fraud'], bins=20, alpha=0.7)
prob_ax.set_title('Fraud Probability Distribution')
prob_ax.set_xlabel('Predicted Probability of Fraud')
prob_ax.set_ylabel('Frequency')

fig.tight_layout()
plt.show()

## 3. Basic Time Series Analysis

Time-based patterns in transaction data can reveal business insights and fraud trends.

In [None]:
# Time series feature extraction - derive temporal columns from each transaction's "date"
# NOTE(review): assumes the CSV reader inferred "date" as a timestamp - confirm
transaction_date = col("date")
df_with_time = (
    fraud_data
    .withColumn("hour", hour(transaction_date))
    .withColumn("weekday", date_format(transaction_date, "EEEE"))
    # dayofweek() numbers days 1=Sunday ... 7=Saturday
    .withColumn("is_weekend", dayofweek(transaction_date).isin([1, 7]).cast("int"))
)

# Hourly transaction patterns - when do transactions occur?
hourly_patterns = (
    df_with_time
    .groupBy("hour")
    .agg(
        sum("amount_numeric").alias("hourly_volume"),    # Total amount per hour
        count("*").alias("transaction_count"),           # Transactions per hour
        avg("is_fraud").alias("fraud_rate"),             # Mean of the 0/1 fraud label per hour
    )
    .orderBy("hour")
)

print("Hourly transaction patterns:")
hourly_patterns.show(5)

# Weekend vs weekday behavior, grouped on the 0/1 is_weekend flag
weekend_comparison = (
    df_with_time
    .groupBy("is_weekend")
    .agg(
        avg("amount_numeric").alias("avg_amount"),       # Average transaction amount
        count("*").alias("transaction_count"),           # Total transactions
        avg("is_fraud").alias("fraud_rate"),             # Mean of the 0/1 fraud label
    )
)

print("Weekend vs Weekday patterns:")
weekend_comparison.show()

In [None]:
# Time series visualization - plot the per-hour aggregates
hourly_pd = hourly_patterns.toPandas()  # One row per hour, small enough for pandas/matplotlib

fig, (volume_ax, fraud_ax) = plt.subplots(1, 2, figsize=(10, 4))

# Left panel: total transaction volume for each hour of the day
volume_ax.bar(hourly_pd['hour'], hourly_pd['hourly_volume'])
volume_ax.set_title('Transaction Volume by Hour')
volume_ax.set_xlabel('Hour of Day (24h format)')
volume_ax.set_ylabel('Total Volume ($)')

# Right panel: fraud rate for each hour of the day
fraud_ax.bar(hourly_pd['hour'], hourly_pd['fraud_rate'])
fraud_ax.set_title('Fraud Rate by Hour')
fraud_ax.set_xlabel('Hour of Day (24h format)')
fraud_ax.set_ylabel('Fraud Rate (0-1)')

fig.tight_layout()
plt.show()

In [None]:
# Simple fraud detection using transaction text
#
# This cell holds two small demos that share one Spark session:
#   1. bag-of-words (CountVectorizer) + logistic regression on 5 labelled descriptions
#   2. TF-IDF (HashingTF + IDF) + logistic regression on 12 labelled descriptions
# NOTE: both demos rebind `pipeline`, `fraud_model`, `predictions`, `test_data` and
# `fraud_pipeline` from the earlier numeric-feature cells. Re-run those cells before
# using the persistence or evaluation cells again.
from pyspark.ml.feature import Tokenizer, StopWordsRemover, CountVectorizer, HashingTF, IDF
from pyspark.ml.classification import LogisticRegression
from pyspark.ml import Pipeline
from pyspark.sql import SparkSession

# getOrCreate() reuses the notebook's session when one is already running, so this
# cell can also be run on its own in a fresh kernel.
spark = SparkSession.builder \
    .appName("FraudDetectionText") \
    .config("spark.sql.adaptive.enabled", "true") \
    .config("spark.driver.bindAddress", "127.0.0.1") \
    .getOrCreate()

# --- Demo 1: bag-of-words -------------------------------------------------------

# Transaction descriptions with fraud labels
fraud_text_data = spark.createDataFrame([
    (1, "URGENT ACCOUNT VERIFICATION REQUIRED", 1),  # Fraud
    (2, "WALMART GROCERY PURCHASE", 0),               # Normal
    (3, "SECURITY UPDATE EXPIRES TODAY", 1),          # Fraud
    (4, "STARBUCKS COFFEE SHOP", 0),                  # Normal
    (5, "VERIFY IDENTITY IMMEDIATELY", 1)             # Fraud
], ["id", "description", "is_fraud"])

# Simple text processing pipeline: split into words, drop stop words, count terms
tokenizer = Tokenizer(inputCol="description", outputCol="words")
stopwords_remover = StopWordsRemover(inputCol="words", outputCol="filtered_words")
count_vectorizer = CountVectorizer(inputCol="filtered_words", outputCol="features")
fraud_classifier = LogisticRegression(featuresCol="features", labelCol="is_fraud")

# Train fraud detection model
pipeline = Pipeline(stages=[tokenizer, stopwords_remover, count_vectorizer, fraud_classifier])
fraud_model = pipeline.fit(fraud_text_data)

# Predict fraud from text (scored on the training rows themselves)
predictions = fraud_model.transform(fraud_text_data)
print("Fraud detection results:")
predictions.select("description", "is_fraud", "prediction").show(truncate=False)

# Test model with new unseen transactions.
# is_fraud holds the expected label for each row; transform() does not read it.
test_data = spark.createDataFrame([
    (6, "CONFIRM DETAILS URGENT ACTION NEEDED", 1),    # Should predict fraud
    (7, "MCDONALDS BREAKFAST ORDER", 0),               # Should predict normal
    (8, "EXPIRES TONIGHT VERIFY NOW", 1),              # Should predict fraud
    (9, "TARGET STORE CLOTHING PURCHASE", 0)           # Should predict normal
], ["id", "description", "is_fraud"])

print("\nTesting model on new transactions:")
test_predictions = fraud_model.transform(test_data)
print("Test predictions:")
test_predictions.select("description", "prediction").show(truncate=False)

# The session is deliberately NOT stopped here: the TF-IDF demo below still needs it.
# The final cleanup cell of the notebook calls spark.stop().

# --- Demo 2: TF-IDF -------------------------------------------------------------

# Fraud Detection using Transaction Text Analysis
# Create realistic transaction descriptions for fraud detection
transaction_data = spark.createDataFrame([
    (1, "URGENT ACCOUNT VERIFICATION REQUIRED - IMMEDIATE ACTION", 1),  # Fraud
    (2, "SECURITY UPDATE NEEDED NOW - EXPIRES TODAY", 1),               # Fraud
    (3, "WALMART SUPERCENTER PURCHASE GROCERIES", 0),                   # Legitimate
    (4, "VERIFY IDENTITY PREMIUM SERVICE LIMITED TIME", 1),             # Fraud
    (5, "STARBUCKS COFFEE DOWNTOWN LOCATION", 0),                       # Legitimate
    (6, "AMAZON PRIME MONTHLY SUBSCRIPTION", 0),                        # Legitimate
    (7, "CONFIRM DETAILS URGENT SECURITY BREACH", 1),                   # Fraud
    (8, "SHELL GAS STATION FUEL PURCHASE", 0),                          # Legitimate
    (9, "MCDONALDS DRIVE THRU ORDER", 0),                               # Legitimate
    (10, "IMMEDIATE RESPONSE REQUIRED ACCOUNT SUSPENDED", 1),           # Fraud
    (11, "TARGET STORE HOUSEHOLD ITEMS", 0),                            # Legitimate
    (12, "URGENT VERIFICATION PREMIUM OFFER EXPIRES", 1)                # Fraud
], ["transaction_id", "description", "is_fraud"])

print(f"Transaction fraud dataset: {transaction_data.count()} records")
print("Sample data:")
transaction_data.show(5, truncate=False)

# Text preprocessing pipeline for fraud detection
tokenizer = Tokenizer(inputCol="description", outputCol="words")
stopwords_remover = StopWordsRemover(inputCol="words", outputCol="filtered_words")

# TF-IDF for better text feature representation than simple count
# (terms are hashed into 100 buckets, then re-weighted by inverse document frequency)
hashingTF = HashingTF(inputCol="filtered_words", outputCol="raw_features", numFeatures=100)
idf = IDF(inputCol="raw_features", outputCol="features")

# Logistic regression for fraud classification
fraud_classifier = LogisticRegression(featuresCol="features", labelCol="is_fraud")

# Create complete ML pipeline
fraud_pipeline = Pipeline(stages=[tokenizer, stopwords_remover, hashingTF, idf, fraud_classifier])

# Train the fraud detection model
fraud_model = fraud_pipeline.fit(transaction_data)

# Generate predictions with fraud probabilities (scored on the training rows themselves)
predictions = fraud_model.transform(transaction_data)

print("\nFraud Detection Results:")
predictions.select("transaction_id", "description", "is_fraud", "prediction", "probability").show(truncate=False)

In [None]:
# Cleanup - Always stop Spark session to free resources.
# This must stay the last cell: every earlier cell that touches `spark` needs the
# session alive, and after stop() the session object can no longer run jobs.
spark.stop()
print("✅ Spark session terminated cleanly")