🚨 AML Network Analysis with Graph + Machine Learning

This project analyzes a banking transaction dataset to detect suspicious accounts using:

- **NetworkX** for transaction graph modeling
- **Isolation Forest (Scikit-learn)** for anomaly detection
- **PostgreSQL + SQLAlchemy** for data integration

## Output
- `all_accounts.csv`: all accounts with money sent/received and connections
- `suspicious_accounts.csv`: flagged suspicious accounts



In [2]:
import pandas as pd
import networkx as nx
from sklearn.ensemble import IsolationForest
from sqlalchemy import create_engine
import os

# Connection settings are read from the environment so real credentials never
# have to be saved inside the notebook. The placeholder values are only used
# when the corresponding variable is not set.
db_user = os.environ.get('DB_USER', 'urdata')
db_password = os.environ.get('DB_PASSWORD', 'urpass')
db_host = os.environ.get('DB_HOST', 'urhost')
db_port = os.environ.get('DB_PORT', 'urport')
db_name = os.environ.get('DB_NAME', 'urdb')
engine = create_engine(f'postgresql://{db_user}:{db_password}@{db_host}:{db_port}/{db_name}')

# Load data from PostgreSQL
banking_data = pd.read_sql("SELECT * FROM banking_data", engine)

print('Starting Analysis')
n_accounts = pd.concat([banking_data['nameorig'], banking_data['namedest']]).nunique()
print(f" Dataset: {len(banking_data)} transactions, {n_accounts} accounts")

# Build transaction network
print('Building transaction graph')
# One edge per (sender, receiver) pair, weighted by the TOTAL amount moved.
# DiGraph.add_edge() replaces the attributes of an existing edge, so adding
# rows one by one would keep only the last transfer of each pair and
# under-report money sent/received.
# Exact duplicate rows are removed first so that a transaction stored twice in
# the table is not summed twice.
# sort=False keeps pairs in first-seen order, so nodes are inserted in the same
# order as a row-by-row build would insert them.
edge_totals = (
    banking_data
    .drop_duplicates()
    .groupby(['nameorig', 'namedest'], sort=False, as_index=False)['amount']
    .sum()
    .rename(columns={'amount': 'weight'})
)
G = nx.from_pandas_edgelist(
    edge_totals,
    source='nameorig',
    target='namedest',
    edge_attr='weight',
    create_using=nx.DiGraph,
)

# Feature engineering
print('Calculating features')
# Weighted out-/in-degree is the sum of edge weights leaving/entering a node;
# the unweighted degree of a DiGraph is in-degree + out-degree.
df = pd.DataFrame({
    'account_id': list(G.nodes()),
    'money_sent': [total for _, total in G.out_degree(weight='weight')],
    'money_received': [total for _, total in G.in_degree(weight='weight')],
    'connections': [degree for _, degree in G.degree()],
})

# Drop inactive accounts (nothing sent and nothing received).
# .copy() makes the result an independent frame so the column assignments
# below do not trigger SettingWithCopyWarning.
df = df[(df['money_sent'] > 0) | (df['money_received'] > 0)].copy()

# Isolation Forest (anomaly detection)
print('Finding suspicious accounts')
X = df[['money_sent', 'money_received', 'connections']]
# contamination=0.1 makes the model flag roughly 10% of the accounts.
model = IsolationForest(contamination=0.1, random_state=42)
df['is_suspicious'] = model.fit_predict(X) == -1   # -1 marks an outlier
df['suspicion_score'] = model.decision_function(X)  # lower = more anomalous

# Most anomalous accounts first.
suspicious = df[df['is_suspicious']].sort_values('suspicion_score')

print(f"\n Found {len(suspicious)} suspicious accounts:")
print(suspicious[['account_id', 'money_sent', 'money_received', 'suspicion_score']].head(10))

suspicious.to_csv('suspicious_accounts.csv', index=False)
df.to_csv('all_accounts.csv', index=False)

print("\n Analysis complete!")
print(" Files saved: suspicious_accounts.csv, all_accounts.csv")
print(f" Most suspicious account: {suspicious.iloc[0]['account_id'] if len(suspicious) > 0 else 'None'}")


Starting Analysis
 Dataset: 2097150 transactions, 1497908 accounts
Building transaction graph
Calculating features
Finding suspicious accounts

 Found 149108 suspicious accounts:
        account_id  money_sent  money_received  suspicion_score
721      C11003494         0.0     24351218.72        -0.367223
367    C1789550256         0.0     28376404.29        -0.367223
179    C1286084959         0.0     33821294.10        -0.367223
746    C1816757085         0.0     25861438.32        -0.367223
16879   C423580937         0.0     24536518.17        -0.367223
341    C1504109395         0.0     22572409.85        -0.367223
1152    C667346055         0.0     25382158.07        -0.367223
204    C1870252780         0.0     20814145.76        -0.367223
206      C97730845         0.0     41811504.75        -0.367223
1139    C306206744         0.0     24331178.88        -0.367223

 Analysis complete!
 Files saved: suspicious_accounts.csv, all_accounts.csv
 Most suspicious account: C11003494


In [4]:
#cleaningdata
import pandas as pd
from sqlalchemy import create_engine

import os

# Connect to DB
# Connection settings are read from the environment so real credentials never
# have to be saved inside the notebook. The placeholder values are only used
# when the corresponding variable is not set.
db_user = os.environ.get('DB_USER', 'urdata')
db_password = os.environ.get('DB_PASSWORD', 'urpass')
db_host = os.environ.get('DB_HOST', 'urhost')
db_port = os.environ.get('DB_PORT', 'urport')
db_name = os.environ.get('DB_NAME', 'urdb')
engine = create_engine(f'postgresql://{db_user}:{db_password}@{db_host}:{db_port}/{db_name}')

# Load data (kept under its own name so re-running the cleaning steps below
# always starts from the untouched table contents)
raw_df = pd.read_sql("SELECT * FROM banking_data", engine)

# Clean: remove exact duplicate rows, keep only positive amounts, then drop
# rows with any missing value.
print("Cleaning data...")
df = (
    raw_df
    .drop_duplicates()
    .loc[lambda frame: frame['amount'] > 0]
    .dropna()
)

# Optional sanity check
print(" Cleaned shape:", df.shape)
print(df.describe())

# Create 'data' folder if it doesn't exist, then save the cleaned file once.
os.makedirs('data', exist_ok=True)
df.to_csv('data/cleaned_banking_data.csv', index=False)
print(" Saved: data/cleaned_banking_data.csv")

Cleaning data...
 Cleaned shape: (1048575, 11)
               step        amount    oldbalorig    newbalorig    oldbaldest  \
count  1.048575e+06  1.048575e+06  1.048575e+06  1.048575e+06  1.048575e+06   
mean   2.696617e+01  1.586670e+05  8.740095e+05  8.938089e+05  9.781600e+05   
std    1.562325e+01  2.649409e+05  2.971751e+06  3.008271e+06  2.296780e+06   
min    1.000000e+00  1.000000e-01  0.000000e+00  0.000000e+00  0.000000e+00   
25%    1.500000e+01  1.214907e+04  0.000000e+00  0.000000e+00  0.000000e+00   
50%    2.000000e+01  7.634333e+04  1.600200e+04  0.000000e+00  1.263772e+05   
75%    3.900000e+01  2.137619e+05  1.366420e+05  1.746000e+05  9.159235e+05   
max    9.500000e+01  1.000000e+07  3.890000e+07  3.890000e+07  4.210000e+07   

         newbaldest       isfraud  isflaggedfraud  
count  1.048575e+06  1.048575e+06       1048575.0  
mean   1.114198e+06  1.089097e-03             0.0  
std    2.416593e+06  3.298351e-02             0.0  
min    0.000000e+00  0.000000e+00

In [6]:
#cleaned data analysis

import pandas as pd
import networkx as nx
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import StandardScaler
import os

# Load cleaned data
df = pd.read_csv('data/cleaned_banking_data.csv')
print(f"Loaded cleaned data: {df.shape[0]} transactions")

# Transaction graph
print("Building transaction network...")
# One edge per (sender, receiver) pair, weighted by the TOTAL amount moved.
# DiGraph.add_edge() replaces the attributes of an existing edge, so adding
# rows one by one would keep only the last transfer of each pair and
# under-report money sent/received.
# sort=False keeps pairs in first-seen order, so nodes are inserted in the same
# order as a row-by-row build would insert them.
edge_totals = (
    df
    .groupby(['nameorig', 'namedest'], sort=False, as_index=False)['amount']
    .sum()
    .rename(columns={'amount': 'weight'})
)
G = nx.from_pandas_edgelist(
    edge_totals,
    source='nameorig',
    target='namedest',
    edge_attr='weight',
    create_using=nx.DiGraph,
)

# Feature engineering
print("Extracting graph features")
# Weighted out-/in-degree is the sum of edge weights leaving/entering a node;
# the unweighted degree of a DiGraph is in-degree + out-degree.
account_df = pd.DataFrame({
    'account_id': list(G.nodes()),
    'money_sent': [total for _, total in G.out_degree(weight='weight')],
    'money_received': [total for _, total in G.in_degree(weight='weight')],
    'connections': [degree for _, degree in G.degree()],
})
# Keep only accounts that moved money. .copy() makes the result an independent
# frame so the column assignments below do not trigger SettingWithCopyWarning.
account_df = account_df[
    (account_df['money_sent'] > 0) | (account_df['money_received'] > 0)
].copy()

# Z-score normalization
X = account_df[['money_sent', 'money_received', 'connections']]
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Isolation Forest
print(" Detecting suspicious accounts")
# contamination=0.1 makes the model flag roughly 10% of the accounts.
model = IsolationForest(contamination=0.1, random_state=42)
account_df['is_suspicious'] = model.fit_predict(X_scaled) == -1   # -1 marks an outlier
account_df['suspicion_score'] = model.decision_function(X_scaled)  # lower = more anomalous

# Select the flagged accounts once; the sorted view puts the most anomalous first.
suspicious = account_df[account_df['is_suspicious']]
top_suspicious = suspicious.sort_values('suspicion_score')
print("\n Top 10 Suspicious Accounts:")
print(top_suspicious[['account_id', 'money_sent', 'money_received', 'connections', 'suspicion_score']].head(10))

# Export results
os.makedirs('output', exist_ok=True)

account_df.to_csv('output/all_accounts.csv', index=False)
suspicious.to_csv('output/suspicious_accounts.csv', index=False)

# Report the path the file was actually written to.
print("saved to output/suspicious_accounts.csv")
# Guard against an empty result so the cell does not end with an IndexError.
print(f"Most suspicious account: {top_suspicious.iloc[0]['account_id'] if len(top_suspicious) > 0 else 'None'}")


Loaded cleaned data: 1048575 transactions
Building transaction network...
Extracting graph features
 Detecting suspicious accounts

 Top 10 Suspicious Accounts:
        account_id  money_sent  money_received  connections  suspicion_score
721      C11003494         0.0     24351218.72           69        -0.367223
367    C1789550256         0.0     28376404.29           73        -0.367223
179    C1286084959         0.0     33821294.10           96        -0.367223
746    C1816757085         0.0     25861438.32           69        -0.367223
16879   C423580937         0.0     24536518.17           59        -0.367223
341    C1504109395         0.0     22572409.85           69        -0.367223
1152    C667346055         0.0     25382158.07           73        -0.367223
204    C1870252780         0.0     20814145.76           60        -0.367223
206      C97730845         0.0     41811504.75           79        -0.367223
1139    C306206744         0.0     24331178.88           79        -0