In [0]:
from pyspark.sql.functions import col
from pyspark.ml.feature import VectorAssembler, StringIndexer
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark import StorageLevel
#import mlflow
#import mlflow.spark


In [0]:
# Load the credit-card transactions CSV from DBFS. The first row is used as
# the header and Spark infers each column's type from the data.
file_path = "dbfs:/FileStore/tables/credit_card_info.csv"
df = spark.read.csv(
    path=file_path,
    header=True,
    inferSchema=True,
)

In [0]:
# Columns to encode. Each one gets a companion "<name>_index" column produced
# by a StringIndexer fitted on that column alone.
indexers = ["repeat_retailer", "used_chip", "used_pin_number", "online_order"]
for column in indexers:
    index_column = column + "_index"
    # Drop any index column left over from a previous run of this cell:
    # StringIndexer rejects an output column that already exists, so without
    # this the cell fails the second time it is executed. DataFrame.drop is a
    # no-op when the column is absent, so the first run is unaffected.
    unindexed = df.drop(index_column)
    indexer = StringIndexer(inputCol=column, outputCol=index_column)
    df = indexer.fit(unindexed).transform(unindexed)

In [0]:
# Model inputs, in order: the three continuous measurements, followed by the
# "<name>_index" column of each of the four indexed columns.
feature_columns = [
    "distance_from_home",
    "distance_from_last_transaction",
    "ratio_to_median_purchase_price",
    *(
        f"{base}_index"
        for base in ("repeat_retailer", "used_chip", "used_pin_number", "online_order")
    ),
]

In [0]:
# Pack the columns listed in feature_columns into one vector column named
# "features", which is the column the classifier reads.
assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")
# Drop a "features" column left by an earlier run before assembling:
# VectorAssembler raises if its output column already exists, which made this
# cell fail on re-execution. Dropping (rather than skipping) also guarantees
# the vector is rebuilt from the current feature_columns. DataFrame.drop is a
# no-op when the column is absent.
df = assembler.transform(df.drop("features"))

In [0]:
# 80/20 train/test split; the fixed seed keeps the split repeatable.
train_data, test_data = df.randomSplit(
    weights=[0.8, 0.2],
    seed=42,
)

In [0]:
# Random forest over the assembled "features" vector, predicting the "fraud"
# label with 100 trees of depth at most 5.
# seed is set explicitly so that retraining gives the same forest from run to
# run; without it the model (and the reported AUC) can change between kernel
# sessions. 42 matches the seed used for the train/test split.
rf = RandomForestClassifier(
    labelCol="fraud",
    featuresCol="features",
    numTrees=100,
    maxDepth=5,
    seed=42,
)
rf_model = rf.fit(train_data)

In [0]:
# Score the held-out split with the fitted forest.
predictions = rf_model.transform(dataset=test_data)

In [0]:
# Measure ranking quality on the held-out predictions as area under the ROC
# curve, using "fraud" as the ground-truth label.
evaluator = BinaryClassificationEvaluator(
    metricName="areaUnderROC",
    labelCol="fraud",
)
auc = evaluator.evaluate(predictions)
print(f"AUC Score on Test Data (Unseen Data): {auc:.4f}")

AUC Score on Test Data (Unseen Data): 0.9915


In [0]:
# Persist the fitted forest to DBFS, replacing any model already saved at
# this location.
model_uri = "dbfs:/FileStore/models/credit_card_fraud_detection_rf"
(
    rf_model.write()
    .overwrite()
    .save(model_uri)
)
print(f"model saved at: {model_uri}")

model saved at: dbfs:/FileStore/models/credit_card_fraud_detection_rf
