<a href="https://colab.research.google.com/github/iraj259/Machine-Learning/blob/main/FraudDetectionAnalysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# -----------------------------------------
# 1) Import libraries
# -----------------------------------------
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DoubleType
from pyspark.sql.functions import col, when
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler
from pyspark.ml import Pipeline
from pyspark.ml.classification import RandomForestClassifier, GBTClassifier, LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator
from pyspark.ml.classification import RandomForestClassificationModel

In [2]:
# -----------------------------------------
# 2) Initialize Spark session
# -----------------------------------------
# getOrCreate() hands back the already-running session when there is one,
# so re-running this cell does not start a second session.
spark = (
    SparkSession.builder
    .appName("FraudDetection")
    .getOrCreate()
)

In [3]:
# -----------------------------------------
# 3) Define schema
# -----------------------------------------
# Explicit schema for the transactions CSV; every column is nullable.
# Columns are listed in the order they are expected in the file.
schema = (
    StructType()
    .add("step", IntegerType(), True)
    .add("type", StringType(), True)
    .add("amount", DoubleType(), True)
    .add("nameOrig", StringType(), True)
    .add("oldbalanceOrg", DoubleType(), True)
    .add("newbalanceOrig", DoubleType(), True)
    .add("nameDest", StringType(), True)
    .add("oldbalanceDest", DoubleType(), True)
    .add("newbalanceDest", DoubleType(), True)
    .add("isFraud", IntegerType(), True)
    .add("isFlaggedFraud", IntegerType(), True)
)

In [5]:
# -----------------------------------------
# 4) Load dataset
# -----------------------------------------
# Read the CSV with the explicit schema defined above (no type inference);
# the first line of the file is treated as the header row.
df = (
    spark.read
    .schema(schema)
    .option("header", True)
    .csv("fraudDetection.csv")
)


In [6]:
# -----------------------------------------
# 5) Preprocessing
# -----------------------------------------
# The target column is exposed as "label", the name the classifiers and
# evaluators below are configured with.
df = df.withColumnRenamed("isFraud", "label")

# Transaction type: string -> numeric index -> one-hot vector.
# handleInvalid="keep" was added on every stage so that unseen categories
# do not raise errors at transform time.
indexer = StringIndexer(
    inputCol="type",
    outputCol="type_index",
    handleInvalid="keep",
)
encoder = OneHotEncoder(
    inputCols=["type_index"],
    outputCols=["type_encoded"],
    handleInvalid="keep",
)

# Numeric inputs followed by the encoded transaction type, combined into
# the single "features" vector the models consume.
numeric_inputs = [
    "step",
    "amount",
    "oldbalanceOrg",
    "newbalanceOrig",
    "oldbalanceDest",
    "newbalanceDest",
]
assembler = VectorAssembler(
    inputCols=numeric_inputs + ["type_encoded"],
    outputCol="features",
    handleInvalid="keep",
)

In [7]:
# -----------------------------------------
# 6) Handle class imbalance
# -----------------------------------------
# Downsample the non-fraud rows (instead of oversampling fraud) so both
# classes end up at roughly the same size.
fraud_df = df.filter(col("label") == 1)
nonfraud_df = df.filter(col("label") == 0)

# Count each class once; every count() is a full Spark job.
fraud_count = fraud_df.count()
nonfraud_count = nonfraud_df.count()
if fraud_count == 0 or nonfraud_count == 0:
    # Without this guard an empty non-fraud class raises ZeroDivisionError
    # below, and an empty fraud class yields an empty training set.
    raise ValueError(
        f"Cannot balance classes: {fraud_count} fraud rows, "
        f"{nonfraud_count} non-fraud rows"
    )

# Sampling without replacement needs a fraction in [0, 1]; clamp it in case
# fraud rows outnumber non-fraud rows.
sampling_fraction = min(1.0, fraud_count / nonfraud_count)
nonfraud_sampled = nonfraud_df.sample(withReplacement=False,
                                      fraction=sampling_fraction,
                                      seed=42)

# NOTE(review): balanced_df is what the train/test split consumes, so the
# test set is balanced as well; metrics measured on it may not reflect the
# original class ratio -- confirm this is intended.
balanced_df = fraud_df.union(nonfraud_sampled)

In [8]:
# -----------------------------------------
# 7) Train/test split
# -----------------------------------------
# 70% of the balanced rows go to training, 30% to testing; the seed is
# fixed so the split is repeatable.
train_df, test_df = balanced_df.randomSplit(
    weights=[0.7, 0.3],
    seed=42,
)

In [9]:
# -----------------------------------------
# 8) Define models
# -----------------------------------------
# The two tree-based models take seed=42 for reproducibility.
rf = RandomForestClassifier(
    featuresCol="features",
    labelCol="label",
    seed=42,
)
gbt = GBTClassifier(
    featuresCol="features",
    labelCol="label",
    seed=42,
)
lr = LogisticRegression(
    featuresCol="features",
    labelCol="label",
    maxIter=10,
)

# One pipeline per classifier; all of them share the same preprocessing
# stage objects and differ only in the final estimator.
classifiers = {"RandomForest": rf, "GBT": gbt, "LogisticRegression": lr}
pipelines = {
    label: Pipeline(stages=[indexer, encoder, assembler, classifier])
    for label, classifier in classifiers.items()
}

In [11]:
# -----------------------------------------
# 9) Evaluators
# -----------------------------------------
# Two evaluators over the "label" column: area under the ROC curve and
# area under the precision-recall curve.
evaluator_roc, evaluator_pr = (
    BinaryClassificationEvaluator(labelCol="label", metricName=metric_name)
    for metric_name in ("areaUnderROC", "areaUnderPR")
)

# Filled by the training loop: model name -> dict of metric values.
metrics = dict()

In [12]:
# -----------------------------------------
# 10) Train and evaluate
# -----------------------------------------
# Sections 11 and 12 live inside this loop: they must run once per fitted
# pipeline, while `model` and `name` still refer to the current one. Left
# outside the loop they would only ever see the last pipeline fitted.
for name, pipeline in pipelines.items():
    model = pipeline.fit(train_df)

    # The predictions feed both evaluators and the confusion-matrix
    # aggregation, so cache them instead of recomputing them per action.
    preds = model.transform(test_df).cache()

    auc_roc = evaluator_roc.evaluate(preds)
    auc_pr = evaluator_pr.evaluate(preds)

    # Confusion matrix in a single aggregation, keyed by
    # (label, prediction). Combinations that never occur count as 0.
    confusion = {
        (int(row["label"]), int(row["prediction"])): row["count"]
        for row in preds.groupBy("label", "prediction").count().collect()
        if row["label"] is not None and row["prediction"] is not None
    }
    tp = confusion.get((1, 1), 0)
    tn = confusion.get((0, 0), 0)
    fp = confusion.get((0, 1), 0)
    fn = confusion.get((1, 0), 0)

    # The small epsilon keeps the ratios defined when a denominator is 0.
    precision = tp / (tp + fp + 1e-8)
    recall = tp / (tp + fn + 1e-8)
    f1 = 2 * precision * recall / (precision + recall + 1e-8)

    metrics[name] = {
        "AUC-ROC": auc_roc,
        "AUC-PR": auc_pr,
        "Precision": precision,
        "Recall": recall,
        "F1-score": f1,
        "TP": tp, "TN": tn, "FP": fp, "FN": fn
    }

    # -----------------------------------------
    # 11) Feature importances (RandomForest only)
    # -----------------------------------------
    for stage in model.stages:
        if isinstance(stage, RandomForestClassificationModel):
            print(f"Top 10 feature importances for {name}:")
            importances = stage.featureImportances
            # Rank by importance (highest first) and keep the ten largest;
            # idx is the position in the assembled "features" vector.
            ranked = sorted(
                enumerate(importances.toArray()),
                key=lambda pair: pair[1],
                reverse=True,
            )
            for idx, imp in ranked[:10]:
                print(f"  Feature {idx}: {imp}")

    # -----------------------------------------
    # 12) Save model
    # -----------------------------------------
    # Best effort: a failed save is reported but must not abort the
    # evaluation of the remaining models.
    try:
        model.write().overwrite().save(f"{name}_fraud_model")
    except Exception as e:
        print(f"Could not save {name} model: {e}")

    # Release the cached predictions before the next model is fitted.
    preds.unpersist()

In [16]:
# -----------------------------------------
# 13) Print metrics
# -----------------------------------------
# One section per model. Integer counts (TP/TN/FP/FN) are printed as
# integers; all other metrics are rounded to four decimal places.
for name, vals in metrics.items():
    print(f"\nModel: {name}")
    for k, v in vals.items():
        if isinstance(v, int):
            print(f"  {k}: {v}")
        else:
            print(f"  {k}: {v:.4f}")


Model: RandomForest
  AUC-ROC: 0.9944
  AUC-PR: 0.9939
  Precision: 0.9598
  Recall: 0.9536
  F1-score: 0.9567
  TP: 1027.0000
  TN: 1092.0000
  FP: 43.0000
  FN: 50.0000

Model: GBT
  AUC-ROC: 0.9980
  AUC-PR: 0.9977
  Precision: 0.9816
  Recall: 0.9916
  F1-score: 0.9866
  TP: 1068.0000
  TN: 1115.0000
  FP: 20.0000
  FN: 9.0000

Model: LogisticRegression
  AUC-ROC: 0.9501
  AUC-PR: 0.9483
  Precision: 0.8772
  Recall: 0.8422
  F1-score: 0.8593
  TP: 907.0000
  TN: 1008.0000
  FP: 127.0000
  FN: 170.0000
