In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler, StandardScaler, MinMaxScaler
from pyspark.ml.classification import GBTClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [0]:
# Obtain the Spark session for this notebook (getOrCreate reuses an existing one if present).
spark = (
    SparkSession.builder
    .appName("FraudDetectionModel")
    .getOrCreate()
)


In [0]:
# Copy the dataset from the workspace filesystem into DBFS so it can be read via a dbfs:/ path.
# NOTE(review): the source path points at one user's workspace folder — consider making it configurable.
workspace_csv = "file:/Workspace/Users/20btrcd025@jainuniversity.ac.in/fraud_detection_dataset.csv"
dbfs_csv = "dbfs:/FileStore/fraud_detection_dataset.csv"
dbutils.fs.cp(workspace_csv, dbfs_csv)


True

In [0]:
# Load the CSV from DBFS: the first row supplies column names and column types are inferred.
df = spark.read.csv(
    "dbfs:/FileStore/fraud_detection_dataset.csv",
    header=True,
    inferSchema=True,
)



In [0]:
# Categorical features: each is mapped to a numeric index and then one-hot encoded.
categorical_cols = [
    "transaction_location",
    "device_type",
    "merchant_category",
    "transaction_channel",
    "user_location",
]

# handleInvalid="keep" sends labels that were not seen while fitting (and nulls) to an extra
# index instead of raising at transform time, so a category value that appears only in the
# test split or in later scoring data does not crash the pipeline.
# The loop variable is `name` rather than `col` to avoid shadowing the imported `col` function.
indexers = [
    StringIndexer(inputCol=name, outputCol=name + "_index", handleInvalid="keep")
    for name in categorical_cols
]

# One encoder per indexed column; output columns are named "<column>_encoded".
encoders = [
    OneHotEncoder(inputCol=name + "_index", outputCol=name + "_encoded")
    for name in categorical_cols
]

In [0]:
# Columns fed to the feature vector as-is (no indexing or one-hot encoding).
# NOTE(review): confirm that is_account_compromised is known at prediction time; if it is
# derived from the fraud outcome it would leak the label into the features.
numeric_cols = [
    "transaction_amount",
    "previous_transactions",
    "failed_transactions",
    "account_age_days",
    "user_credit_score",
    "is_foreign_transaction",
    "is_high_risk_country",
    "is_vpn_used",
    "user_income",
    "is_account_compromised",
]

In [0]:
# Concatenate the numeric columns and the one-hot encoded columns into one vector column.
assembler = VectorAssembler(
    inputCols=numeric_cols + [f"{name}_encoded" for name in categorical_cols],
    outputCol="features_raw",
)


In [0]:
# First scaling stage: reads the assembled vector and writes "features_scaled",
# using StandardScaler's default settings.
scaler = StandardScaler(
    inputCol="features_raw",
    outputCol="features_scaled",
)


In [0]:
# Second scaling stage: min-max rescales the standardized vector into the final "features" column.
# NOTE(review): tree ensembles are generally insensitive to monotonic feature rescaling, so the
# two scaling stages may be unnecessary — confirm before removing.
normalizer = MinMaxScaler(
    inputCol="features_scaled",
    outputCol="features",
)


In [0]:
# Gradient-boosted tree classifier predicting is_fraud from the "features" vector, 50 boosting rounds.
# The seed is pinned explicitly (matching the seed used for the train/test split) so that any
# randomized steps in training are repeatable across sessions rather than relying on the default seed.
gbt = GBTClassifier(
    labelCol="is_fraud",
    featuresCol="features",
    maxIter=50,
    seed=42,
)


In [0]:
# Stage order: index categoricals -> one-hot encode -> assemble -> scale -> min-max -> classify.
pipeline = Pipeline(
    stages=[*indexers, *encoders, assembler, scaler, normalizer, gbt]
)


In [0]:
# Random 80/20 split into training and held-out rows; the fixed seed keeps the split repeatable.
train_df, test_df = df.randomSplit(
    weights=[0.8, 0.2],
    seed=42,
)


In [0]:
# Fit every pipeline stage (encoders, scalers and the classifier) on the training split only.
model = pipeline.fit(train_df)


Downloading artifacts:   0%|          | 0/145 [00:00<?, ?it/s]

Uploading artifacts:   0%|          | 0/4 [00:00<?, ?it/s]

In [0]:
# Apply the fitted pipeline to the held-out split to produce prediction columns for evaluation.
predictions = model.transform(test_df)


In [0]:
# Score the held-out predictions by area under the ROC curve against the is_fraud label.
# NOTE(review): if the fraud class is rare, also consider reporting areaUnderPR.
evaluator = BinaryClassificationEvaluator(
    labelCol="is_fraud",
    metricName="areaUnderROC",
)
roc_auc = evaluator.evaluate(predictions)
print(f"Model AUC: {roc_auc}")

Model AUC: 0.9999782978575645


In [0]:
# Persist the fitted pipeline model to DBFS.
# write().overwrite() replaces a model left at this path by an earlier run; a plain save()
# raises when the destination already exists, which breaks re-running the notebook end to end.
model.write().overwrite().save("dbfs:/FileStore/models/fraud_detection_model")
