In [None]:
!pip install graphframes

In [None]:
from pyspark.sql import SparkSession
from graphframes import GraphFrame
from pyspark.sql.functions import col

# Create (or reuse) the Spark session for this notebook.
# The "spark.jars.packages" setting names the GraphFrames JVM artifact by its
# Maven coordinates so that it is available to the session.
spark = (
    SparkSession.builder
    .appName("FraudDetectionGraph")
    .config("spark.jars.packages", "graphframes:graphframes:0.8.2-spark3.2-s_2.12")
    .getOrCreate()
)

In [None]:
# Vertex table: one row per bank account -> (id, account_name, balance).
account_rows = [
    (101, "Account_A", 5000),
    (102, "Account_B", 10000),
    (103, "Account_C", 1500),
    (104, "Account_D", 20000),
    (105, "Account_E", 3000),
    (106, "Account_F", 8000),
]
vertices = spark.createDataFrame(
    account_rows,
    schema=["id", "account_name", "balance"],
)

# Edge table: one row per transfer -> (src account id, dst account id, amount, date).
# Dates are kept as ISO "YYYY-MM-DD" strings, so plain string comparison
# orders them chronologically.
transaction_rows = [
    (101, 102, 1000, "2023-10-01"),
    (102, 103, 500, "2023-10-02"),
    (103, 101, 1000, "2023-10-03"),
    (104, 105, 2000, "2023-10-04"),
    (105, 106, 1500, "2023-10-05"),
    (106, 104, 2000, "2023-10-06"),
    (101, 104, 3000, "2023-10-07"),
]
edges = spark.createDataFrame(
    transaction_rows,
    schema=["src", "dst", "amount", "date"],
)

In [None]:
# Assemble the transaction graph: accounts are the vertices and
# transfers between them are the directed edges.
graph = GraphFrame(v=vertices, e=edges)

In [None]:

# Display both halves of the graph: the account table first, then the
# transaction table, each under its own heading.
for heading, frame in (
    ("Accounts (Vertices):", graph.vertices),
    ("Transactions (Edges):", graph.edges),
):
    print(heading)
    frame.show()

In [None]:
# Label every account with its strongly connected component (SCC).
# In a directed graph, an SCC holding more than one account means money can
# flow from each member back to itself, i.e. the members sit on a cycle.
scc = graph.stronglyConnectedComponents(maxIter=10)

print("Strongly Connected Components (Fraud Cycles):")
component_sizes = scc.groupBy("component").count()
# Keep only components that contain more than one account.
component_sizes.where("count > 1").show()

In [None]:
# Detect three-hop money loops (a -> b -> c -> a) with motif finding.
three_hop_loop = "(a)-[e1]->(b); (b)-[e2]->(c); (c)-[e3]->(a)"
# Require the three transfers to happen in chronological order; this also
# keeps a single rotation of each loop when all three dates differ.
in_date_order = "e1.date < e2.date AND e2.date < e3.date"

cyclic_transactions = graph.find(three_hop_loop).where(in_date_order)

print("Cyclic Transaction Chains (A -> B -> C -> A):")
cyclic_transactions.show()

In [None]:
# Count, per account, the number of transaction triangles it takes part in;
# accounts with a non-zero count belong to a densely connected cluster.
triangle_count = graph.triangleCount()

accounts_in_triangles = triangle_count.where("count > 0")
print("Accounts in Transaction Triangles (Dense Clusters):")
accounts_in_triangles.show()

In [None]:
# Flag high-risk accounts: any account that is part of a transaction triangle
# OR belongs to a multi-account strongly connected component (a cycle).

# Accounts that participate in at least one triangle. Without the `count > 0`
# filter every account would be flagged, since triangleCount returns a row
# for each vertex.
triangle_ids = triangle_count.filter(col("count") > 0).select("id")

# SCC component labels are arbitrary identifiers, not 0/1 flags, so the
# cyclic components are derived from their size instead of hard-coded labels.
cyclic_components = (
    scc.groupBy("component")
    .count()
    .filter(col("count") > 1)
    .select("component")
)
# Semi-join keeps only the accounts whose component is one of the cyclic ones.
scc_ids = scc.join(cyclic_components, "component", "left_semi").select("id")

# An account may be flagged by both signals; keep each id once.
high_risk_accounts = triangle_ids.union(scc_ids).distinct()

print("High-Risk Accounts:")
(
    high_risk_accounts
    .join(vertices, "id")
    .select("id", "account_name", "balance")
    .orderBy("id")  # stable, readable ordering of the report
    .show()
)

In [None]:
# Shut down the Spark session created at the top of the notebook.
# Run this last: every earlier cell depends on `spark` being active.
spark.stop()