In [None]:
#spark_session
from pyspark.sql import SparkSession
spark_session =  SparkSession.builder\
    .master('local[*]')\
    .config("spark.driver.memory", "16g")\
    .config("spark.executor.memory", "16g") \
    .config("spark.memory.fraction", "0.8") \
    .appName('AML_project')\
    .getOrCreate()

In [2]:
df = spark_session.read.csv("../Data/SAML-D.csv",header=True)
df.show()

+--------+----------+--------------+----------------+--------+----------------+-----------------+--------------------+----------------------+------------+-------------+--------------------+
|    Time|      Date|Sender_account|Receiver_account|  Amount|Payment_currency|Received_currency|Sender_bank_location|Receiver_bank_location|Payment_type|Is_laundering|     Laundering_type|
+--------+----------+--------------+----------------+--------+----------------+-----------------+--------------------+----------------------+------------+-------------+--------------------+
|10:35:19|2022-10-07|    8724731955|      2769355426| 1459.15|       UK pounds|        UK pounds|                  UK|                    UK|Cash Deposit|            0|Normal_Cash_Deposits|
|10:35:20|2022-10-07|    1491989064|      8401255335| 6019.64|       UK pounds|           Dirham|                  UK|                   UAE|Cross-border|            0|      Normal_Fan_Out|
|10:35:20|2022-10-07|     287305149|      44047670

In [3]:
from pyspark.sql.functions import year, month, dayofmonth, to_date

df = df.withColumn("Date", to_date(df["Date"], "yyyy-MM-dd"))

df = df.withColumn("year", year(df["Date"])) \
       .withColumn("month", month(df["Date"])) \
       .withColumn("day", dayofmonth(df["Date"]))


In [4]:
from pyspark.sql.functions import to_timestamp,hour,minute,second 

df = df.withColumn("Time", to_timestamp(df["Time"], "HH:mm:ss"))

df = df.withColumn("hour", hour(df["Time"])) \
       .withColumn("minute", minute(df["Time"])) \
       .withColumn("second", second(df["Time"]))


In [5]:
df.columns

['Time',
 'Date',
 'Sender_account',
 'Receiver_account',
 'Amount',
 'Payment_currency',
 'Received_currency',
 'Sender_bank_location',
 'Receiver_bank_location',
 'Payment_type',
 'Is_laundering',
 'Laundering_type',
 'year',
 'month',
 'day',
 'hour',
 'minute',
 'second']

In [6]:
train_df, test_df = df.randomSplit([0.8, 0.2], seed=42)
print(f"Training Set: {train_df.count()} rows")
print(f"Testing Set: {test_df.count()} rows")

Training Set: 7603866 rows
Testing Set: 1900986 rows


In [7]:
out

NameError: name 'out' is not defined

In [7]:
train_df = train_df.limit(603866)

In [8]:
train_df.count()

603866

In [9]:
from pyspark.sql.functions import col
numeric_cols = ['Is_laundering']

for col_name in numeric_cols:
    train_df = train_df.withColumn(col_name, col(col_name).cast("double"))
    test_df = test_df.withColumn(col_name, col(col_name).cast("double"))

In [10]:
drop_cols = ['Date', 'Time']
train_df = train_df.drop(*drop_cols)
test_df = test_df.drop(*drop_cols)

In [11]:
train_df.columns

['Sender_account',
 'Receiver_account',
 'Amount',
 'Payment_currency',
 'Received_currency',
 'Sender_bank_location',
 'Receiver_bank_location',
 'Payment_type',
 'Is_laundering',
 'Laundering_type',
 'year',
 'month',
 'day',
 'hour',
 'minute',
 'second']

In [None]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator, BinaryClassificationEvaluator

from pyspark.ml.feature import StringIndexer, VectorAssembler, MinMaxScaler
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml import Pipeline
# Categorical and numerical columns
categorical_cols = ['Payment_currency', 'Received_currency', 'Sender_bank_location',
                    'Receiver_bank_location', 'Payment_type', 'Laundering_type']
numeric_cols = ['Sender_account', 'Receiver_account', 'Amount', 'year', 'month', 'day', 'hour', 'minute', 'second','Is_laundering']
scaled_cols = ['Amount', 'year', 'month', 'day', 'hour', 'minute', 'second']

# String Indexing for categorical columns
indexers = [StringIndexer(inputCol=col, outputCol=col + "_index", handleInvalid="keep") for col in categorical_cols]

# Casting numeric columns to double
train_df = train_df.select([col(c).cast("double").alias(c) for c in numeric_cols] + categorical_cols)
test_df = test_df.select([col(c).cast("double").alias(c) for c in numeric_cols] + categorical_cols)

# Handling class imbalance using oversampling
fraud_cases = train_df.filter(col("Is_laundering") == 1)
non_fraud_cases = train_df.filter(col("Is_laundering") == 0)
fraud_count = fraud_cases.count()
non_fraud_count = non_fraud_cases.count()

if fraud_count < non_fraud_count:
    fraud_cases = fraud_cases.sample(withReplacement=True, fraction=non_fraud_count / fraud_count, seed=42)
elif non_fraud_count < fraud_count:
    non_fraud_cases = non_fraud_cases.sample(withReplacement=True, fraction=fraud_count / non_fraud_count, seed=42)

# Ensuring class balance
train_df = fraud_cases.union(non_fraud_cases)

# Assembling features excluding 'Is_laundering'
assembler = VectorAssembler(inputCols=[c for c in numeric_cols if c != 'Is_laundering'] + [c + "_index" for c in categorical_cols], outputCol="features")

# Scaling numeric columns
scaler = MinMaxScaler(inputCol="features", outputCol="scaled_features")

# Defining Random Forest model
rf = RandomForestClassifier(featuresCol="scaled_features", labelCol="Is_laundering", numTrees=100)

# Creating pipeline
pipeline = Pipeline(stages=indexers + [assembler, scaler, rf])

In [14]:
# Train model
model = pipeline.fit(train_df)

# Predictions
predictions = model.transform(test_df)

# Show results
predictions.select("Is_laundering", "prediction", "probability").show()

+-------------+----------+--------------------+
|Is_laundering|prediction|         probability|
+-------------+----------+--------------------+
|          0.0|       0.0|[0.88679275349344...|
|          0.0|       0.0|[0.94362789924139...|
|          0.0|       0.0|[0.96862865490560...|
|          0.0|       0.0|[0.96862865490560...|
|          0.0|       0.0|[0.56948556142227...|
|          0.0|       0.0|[0.94362789924139...|
|          0.0|       0.0|[0.93752777435884...|
|          0.0|       0.0|[0.94362789924139...|
|          0.0|       0.0|[0.94004278936793...|
|          0.0|       0.0|[0.94362789924139...|
|          0.0|       0.0|[0.94362789924139...|
|          0.0|       0.0|[0.94362789924139...|
|          0.0|       0.0|[0.56948556142227...|
|          0.0|       0.0|[0.96202870477199...|
|          0.0|       0.0|[0.94106618379789...|
|          0.0|       1.0|[0.45212826573220...|
|          0.0|       0.0|[0.94191246467544...|
|          0.0|       0.0|[0.94191246467

In [None]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator, BinaryClassificationEvaluator

# Accuracy
accuracy_evaluator = MulticlassClassificationEvaluator(labelCol="Is_laundering", predictionCol="prediction", metricName="accuracy")
accuracy = accuracy_evaluator.evaluate(predictions)

# Precision
precision_evaluator = MulticlassClassificationEvaluator(labelCol="Is_laundering", predictionCol="prediction", metricName="weightedPrecision")
precision = precision_evaluator.evaluate(predictions)

# Recall
recall_evaluator = MulticlassClassificationEvaluator(labelCol="Is_laundering", predictionCol="prediction", metricName="weightedRecall")
recall = recall_evaluator.evaluate(predictions)

# F1-score
f1_evaluator = MulticlassClassificationEvaluator(labelCol="Is_laundering", predictionCol="prediction", metricName="f1")
f1_score = f1_evaluator.evaluate(predictions)

# AUC-ROC (for imbalanced data)
roc_evaluator = BinaryClassificationEvaluator(labelCol="Is_laundering", rawPredictionCol="probability", metricName="areaUnderROC")
roc_auc = roc_evaluator.evaluate(predictions)

# Print results
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-score: {f1_score:.4f}")
print(f"ROC-AUC: {roc_auc:.4f}")


Accuracy: 0.8911
Precision: 0.9990
Recall: 0.8911
F1-score: 0.9414
ROC-AUC: 0.9914


**Accuracy**: 0.8911 — This means that the model correctly classified 89.11% of all instances. While not perfect, it is relatively high.</br>

**Precision**: 0.9990 — This is a very strong score, meaning that when the model predicts a positive (fraudulent transaction), it is almost always correct. This is crucial in fraud detection, as false positives are costly. </br>

**Recall**: 0.8911 — This is also fairly high, indicating that the model is good at identifying fraudulent transactions. However, it is lower than precision, meaning there might be some false negatives (fraudulent transactions missed by the model).</br>

**F1-score**: 0.9414 — The F1-score is the harmonic mean of precision and recall, and it's high, which suggests that the model has a good balance between precision and recall. </br>

**ROC-AUC**: 0.9914 — The ROC-AUC score close to 1.0 indicates that the model has an excellent ability to discriminate between the positive and negative classes (fraud and non-fraud). </br>

# Hyperparameter tunning

In [11]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator, BinaryClassificationEvaluator

from pyspark.ml.feature import StringIndexer, VectorAssembler, MinMaxScaler
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml import Pipeline
# Categorical and numerical columns
categorical_cols = ['Payment_currency', 'Received_currency', 'Sender_bank_location',
                    'Receiver_bank_location', 'Payment_type', 'Laundering_type']
numeric_cols = ['Sender_account', 'Receiver_account', 'Amount', 'year', 'month', 'day', 'hour', 'minute', 'second','Is_laundering']
scaled_cols = ['Amount', 'year', 'month', 'day', 'hour', 'minute', 'second']

# String Indexing for categorical columns
indexers = [StringIndexer(inputCol=col, outputCol=col + "_index", handleInvalid="keep") for col in categorical_cols]

# Casting numeric columns to double
train_df = train_df.select([col(c).cast("double").alias(c) for c in numeric_cols] + categorical_cols)
test_df = test_df.select([col(c).cast("double").alias(c) for c in numeric_cols] + categorical_cols)

# Handling class imbalance using oversampling
fraud_cases = train_df.filter(col("Is_laundering") == 1)
non_fraud_cases = train_df.filter(col("Is_laundering") == 0)
fraud_count = fraud_cases.count()
non_fraud_count = non_fraud_cases.count()

if fraud_count < non_fraud_count:
    fraud_cases = fraud_cases.sample(withReplacement=True, fraction=non_fraud_count / fraud_count, seed=42)
elif non_fraud_count < fraud_count:
    non_fraud_cases = non_fraud_cases.sample(withReplacement=True, fraction=fraud_count / non_fraud_count, seed=42)

# making class balance
train_df = fraud_cases.union(non_fraud_cases)

# Assemble features excluding 'Is_laundering'
assembler = VectorAssembler(inputCols=[c for c in numeric_cols if c != 'Is_laundering'] + [c + "_index" for c in categorical_cols], outputCol="features")

# Scaling numeric columns
scaler = MinMaxScaler(inputCol="features", outputCol="scaled_features")

# Defining Random Forest model
rf = RandomForestClassifier(featuresCol="scaled_features", labelCol="Is_laundering")

# Creating pipeline
pipeline = Pipeline(stages=indexers + [assembler, scaler, rf])

In [12]:
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
import random
num_trees = random.choice([30, 50, 70]) 
max_depth = random.choice([5, 10, 15])  
max_bins = random.choice([32, 64, 128])  

paramGrid = (ParamGridBuilder()
             .addGrid(rf.numTrees, [num_trees])
             .addGrid(rf.maxDepth, [max_depth])
             .addGrid(rf.maxBins, [max_bins])
             .build())

In [13]:
evaluator = BinaryClassificationEvaluator(labelCol="Is_laundering")

crossval = CrossValidator(estimator=pipeline,
                          estimatorParamMaps=paramGrid,
                          evaluator=evaluator,
                          numFolds=3) 

In [14]:
#training the model
cvModel = crossval.fit(train_df)

In [15]:
#best model
bestModel = cvModel.bestModel

# Evaluating on the test data
predictions = bestModel.transform(test_df)
roc_auc = evaluator.evaluate(predictions)

print("Best Model ROC-AUC: ", roc_auc)

Best Model ROC-AUC:  0.9682567110290056


In [None]:
best_rf_model = bestModel.stages[-1]  
best_numTrees = best_rf_model.getOrDefault('numTrees')
best_maxDepth = best_rf_model.getOrDefault('maxDepth')
best_maxBins = best_rf_model.getOrDefault('maxBins')

print(f"Best numTrees: {best_numTrees}")
print(f"Best maxDepth: {best_maxDepth}")
print(f"Best maxBins: {best_maxBins}")

Best numTrees: 70
Best maxDepth: 5
Best maxBins: 128
