In [1]:
# 🚀 Spark Session Setup
from pyspark.sql import SparkSession
# Create (or reuse) the Spark session for this notebook.
# The default filesystem is set to the same NameNode address that the reads in
# this notebook use explicitly (hdfs://namenode:9000). It was previously
# hdfs://hadoop-namenode:8020, which did not match those URIs, so any relative
# path would have resolved against a different host/port than the absolute ones.
# TODO confirm hadoop-namenode:8020 was not a deliberate second endpoint.
spark = (
    SparkSession.builder
    .appName("EDA and MapReduce for Fraud Detection")
    .config("spark.hadoop.fs.defaultFS", "hdfs://namenode:9000")
    .getOrCreate()
)

In [2]:
# Load the sample transactions CSV from HDFS: the first row is treated as the
# header and column types are inferred by Spark.
df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("hdfs://namenode:9000/user/fraude/input/frauddetectionsmall.csv")
)

# Preview the first five rows, then print the inferred schema.
df.show(n=5)
df.printSchema()

+----+--------+--------+-----------+-------------+--------------+-----------+--------------+--------------+-------+--------------+
|step|    type|  amount|   nameOrig|oldbalanceOrg|newbalanceOrig|   nameDest|oldbalanceDest|newbalanceDest|isFraud|isFlaggedFraud|
+----+--------+--------+-----------+-------------+--------------+-----------+--------------+--------------+-------+--------------+
|   1| PAYMENT| 9839.64|C1231006815|     170136.0|     160296.36|M1979787155|           0.0|           0.0|      0|             0|
|   1| PAYMENT| 1864.28|C1666544295|      21249.0|      19384.72|M2044282225|           0.0|           0.0|      0|             0|
|   1|TRANSFER|   181.0|C1305486145|        181.0|           0.0| C553264065|           0.0|           0.0|      1|             0|
|   1|CASH_OUT|   181.0| C840083671|        181.0|           0.0|  C38997010|       21182.0|           0.0|      1|             0|
|   1| PAYMENT|11668.14|C2048537720|      41554.0|      29885.86|M1230701703|      

In [3]:
# Summary statistics (count, mean, stddev, ...) for every column.
summary_stats = df.describe()
summary_stats.show()

# Number of rows for each value of the isFraud label.
fraud_counts = df.groupBy("isFraud").count()
fraud_counts.show()

+-------+------------------+--------+------------------+----------+-----------------+------------------+-----------+------------------+------------------+--------------------+--------------+
|summary|              step|    type|            amount|  nameOrig|    oldbalanceOrg|    newbalanceOrig|   nameDest|    oldbalanceDest|    newbalanceDest|             isFraud|isFlaggedFraud|
+-------+------------------+--------+------------------+----------+-----------------+------------------+-----------+------------------+------------------+--------------------+--------------+
|  count|             10200|   10200|             10200|     10200|            10200|             10200|      10200|             10200|             10200|               10200|         10200|
|   mean|  4.23421568627451|    NULL|104785.11960196093|      NULL|877593.0435343134| 898338.6177441188|       NULL| 930100.7737617663|1107144.6106558836|0.006666666666666667|           0.0|
| stddev|2.4863478344801186|    NULL| 270283.

In [9]:
from pyspark.sql.functions import split
# HDFS directory that holds the MapReduce job outputs read below.
STATS_DIR = 'hdfs://namenode:9000/user/fraude/stats'


def load_key_value_output(name, key_col, value_col):
    """Read one MapReduce output directory and split each line into two columns.

    Each line is read as text and split on the tab character; the first field
    becomes ``key_col`` and the second becomes ``value_col``.

    Args:
        name: Sub-directory of ``STATS_DIR`` to read.
        key_col: Name given to the first tab-separated field.
        value_col: Name given to the second tab-separated field.

    Returns:
        A DataFrame with exactly two columns, ``key_col`` and ``value_col``,
        in that order. Both are strings (no numeric cast is applied here);
        a line without a tab yields NULL in ``value_col``.
    """
    # Build the split expression once and reuse it for both fields.
    fields = split("value", "\t")
    return spark.read.text(f"{STATS_DIR}/{name}").select(
        fields[0].alias(key_col),
        fields[1].alias(value_col),
    )


# 1. Total transaction amount per customer
df_total = load_key_value_output("total_by_customer_java", "Customer_ID", "Total_Amount")

# 2. Number of transactions per customer
df_freq = load_key_value_output("freq_by_customer", "Customer_ID", "Transaction_Count")

# 3. Fraud count per transaction type
df_fraud_type = load_key_value_output("fraud_by_type", "Transaction_Type", "Fraud_Count")

# 4. Fraud count per hour
df_fraud_hour = load_key_value_output("fraud_by_hour", "Hour", "Fraud_Count")

# 5. Distribution of transaction types
df_trans_type = load_key_value_output("transaction_type_distribution", "Transaction_Type", "Count")

# Display each result without truncating cell contents.
for stats_df in (df_total, df_freq, df_fraud_type, df_fraud_hour, df_trans_type):
    stats_df.show(truncate=False)


+-----------+------------+
|Customer_ID|Total_Amount|
+-----------+------------+
|C10001825  |2655.04     |
|C100033245 |1980.51     |
|C1000600589|67017.13    |
|C1000719581|1280020.98  |
|C1001304973|168393.3    |
|C1001830827|7860.26     |
|C100196201 |7350.01     |
|C1002041276|2444.49     |
|C1002872041|6064.7      |
|C1002911155|1246.74     |
|C1003000306|1556.49     |
|C1003206025|871.75      |
|C1003307628|264.93      |
|C1003603952|8770.72     |
|C1003663195|132382.57   |
|C1003755748|9029.12     |
|C1004229504|45656.63    |
|C1004271827|25975.86    |
|C1004430079|1617.9      |
|C100445376 |59390.46    |
+-----------+------------+
only showing top 20 rows

+-----------+-----------------+
|Customer_ID|Transaction_Count|
+-----------+-----------------+
|C10001825  |1                |
|C100033245 |1                |
|C1000600589|1                |
|C1000719581|1                |
|C1001304973|1                |
|C1001830827|1                |
|C100196201 |1                |
|C1002