# Crypto Flow Visualization

In [None]:
import pandas as pd
from pyvis.network import Network
import math

# PARAMETERS (adjust here)
CSV_PATH = "transactions.csv"
OFAC_TAGGED = { # could help for difference-in difference-out analysis?
    # Lazarus Group ETH address added by OFAC on 2022-04-14
    "0x098B716B8Aaf21512996dC57EB0615e2383E2f96": "LAZARUS_GROUP"
}
MIN_ETH_EDGE = 0.0          # filter out edges below this ETH value (after aggregation)
TOP_N_ADDRESSES = None       # if set (e.g. 100), keep only top-N by total flow (in+out) for clarity
FOCUS_ADDRESSES = []         # list of addresses to force-keep (e.g. ["0x..."])
LIMIT_TXS = None             # if set (e.g. 5000), only read first N rows
TIME_COLUMN = None           # e.g. 'timestamp' if exists
TIME_START = None            # e.g. '2022-04-01'
TIME_END = None              # e.g. '2022-04-30'

# Load data
raw_df = pd.read_csv(CSV_PATH, nrows=LIMIT_TXS)

df = raw_df.copy()

# Optional filtering by time
if TIME_COLUMN and TIME_COLUMN in df.columns:
    df[TIME_COLUMN] = pd.to_datetime(df[TIME_COLUMN], errors='coerce')
    if TIME_START:
        df = df[df[TIME_COLUMN] >= pd.to_datetime(TIME_START)]
    if TIME_END:
        df = df[df[TIME_COLUMN] <= pd.to_datetime(TIME_END)]

# Ensure expected columns exist
essential_cols = {"from_addr", "to_addr"}
if not essential_cols.issubset(df.columns):
    raise ValueError(f"CSV must contain columns: {essential_cols}")

value_col = None
for cand in ["eth_value", "value", "amount"]:
    if cand in df.columns:
        value_col = cand
        break
if value_col is None:
    # Default to 1 per transaction if no value column
    df[value_col := "_unit_value"] = 1.0

# Clean addresses (go lowercase too)
for c in ["from_addr", "to_addr"]:
    df[c] = df[c].astype(str).str.strip()

# Aggregate edges
agg = (df.groupby(["from_addr", "to_addr"], dropna=False)
         .agg(total_eth=(value_col, 'sum'), txs=(value_col, 'count'))
         .reset_index())

# Filter small edges
agg = agg[agg.total_eth >= MIN_ETH_EDGE]

# Compute node metrics
from collections import defaultdict
in_flow = defaultdict(float)
out_flow = defaultdict(float)
for _, r in agg.iterrows():
    out_flow[r['from_addr']] += r['total_eth']
    in_flow[r['to_addr']]    += r['total_eth']

nodes = set(agg['from_addr']) | set(agg['to_addr'])
node_stats = []
for n in nodes:
    total = in_flow[n] + out_flow[n]
    node_stats.append({
        'address': n,
        'in_eth': in_flow[n],
        'out_eth': out_flow[n],
        'total_eth': total
    })
node_df = pd.DataFrame(node_stats).sort_values('total_eth', ascending=False)

# Apply top-N reduction 
if TOP_N_ADDRESSES is not None and TOP_N_ADDRESSES < len(node_df):
    preserve = set(FOCUS_ADDRESSES) | set(OFAC_TAGGED.keys())
    top_keep = set(node_df.head(TOP_N_ADDRESSES)['address']) | preserve
    agg = agg[agg['from_addr'].isin(top_keep) & agg['to_addr'].isin(top_keep)]
    # Recompute flows after pruning
    in_flow.clear(); out_flow.clear()
    for _, r in agg.iterrows():
        out_flow[r['from_addr']] += r['total_eth']
        in_flow[r['to_addr']]    += r['total_eth']
    nodes = set(agg['from_addr']) | set(agg['to_addr'])

# Build PyVis network
net = Network(height="100vh", width="100%", directed=True, bgcolor="#FFFFFF")
net.force_atlas_2based(gravity=-55, central_gravity=0.006, spring_length=150, spring_strength=0.08, damping=0.45)

# Scaling helpers
max_total = max((in_flow[n] + out_flow[n]) for n in nodes) if nodes else 1

# Color scheme
COLOR_DEFAULT = "#5DADE2"
COLOR_OFAC = "#E74C3C"
COLOR_FOCUS = "#F1C40F"

# Add nodes
for n in nodes:
    total = in_flow[n] + out_flow[n]
    if max_total == 0:
        norm = 0
    else:
        norm = total / max_total
    size = 8 + 40 * math.sqrt(norm)  # perceptual scaling
    tag = None
    color = COLOR_DEFAULT
    if n in OFAC_TAGGED:
        tag = OFAC_TAGGED[n]
        color = COLOR_OFAC
    elif n in FOCUS_ADDRESSES:
        color = COLOR_FOCUS
    title = (f"{n}\n" + (f"{tag}" if tag else "") +
             f"In: {in_flow[n]:.6f} ETH\n" +
             f"Out: {out_flow[n]:.6f} ETH\n" +
             f"Total: {total:.6f} ETH")
    short_label = n[:6] + '…' + n[-4:]
    net.add_node(n, label=short_label, title=title, value=total, size=size, color=color)

# Edge scaling
max_edge = agg['total_eth'].max() if len(agg) else 1

for _, r in agg.iterrows():
    w_norm = (r['total_eth'] / max_edge) if max_edge else 0
    width = 1 + 7 * math.sqrt(w_norm)
    title = f"{r['txs']} transaction(s) Total: {r['total_eth']:.8f} ETH"
    net.add_edge(r['from_addr'], r['to_addr'], value=r['total_eth'], title=title, width=width, arrows='to')

# Configure network options directly on the object (avoids template issues)
net.barnes_hut()

# Use write_html instead of show to avoid template issues
OUTPUT_HTML = "eth_flow_advanced.html"
try:
    net.write_html(OUTPUT_HTML, notebook=False)
    print(f"Interactive network saved to {OUTPUT_HTML}")
    print(f"Open {OUTPUT_HTML} in your browser to view the visualization")
except Exception as e:
    print(f"PyVis HTML generation failed: {e}")
    print("Falling back to simple network export...")
    
    # Fallback: create simple HTML manually
    nodes_data = []
    edges_data = []
    
    for n in nodes: # add nodes
        total = in_flow[n] + out_flow[n]
        norm = total / max_total if max_total > 0 else 0
        size = 8 + 40 * math.sqrt(norm)
        color = COLOR_OFAC if n in OFAC_TAGGED else (COLOR_FOCUS if n in FOCUS_ADDRESSES else COLOR_DEFAULT)
        nodes_data.append({
            'id': n,
            'label': n[:6] + '…' + n[-4:],
            'size': size,
            'color': color,
            'title': f"{n} | In: {in_flow[n]:.6f} ETH | Out: {out_flow[n]:.6f} ETH"
        })
    
    for _, r in agg.iterrows(): # add edges
        w_norm = (r['total_eth'] / max_edge) if max_edge > 0 else 0
        width = 1 + 7 * math.sqrt(w_norm)
        edges_data.append({
            'from': r['from_addr'],
            'to': r['to_addr'],
            'width': width,
            'title': f"{r['txs']} tx(s) | {r['total_eth']:.8f} ETH"
        })
    
    # Write basic vis.js HTML
    html_content = f"""
    <!DOCTYPE html>
    <html>
    <head>
        <script type="text/javascript" src="https://unpkg.com/vis-network/standalone/umd/vis-network.min.js"></script>
        <style>
            body {{ margin: 0; padding: 0; }}
            #mynetworkid {{ width: 100%; height: 100vh; border: 1px solid lightgray; }}
        </style>
    </head>
    <body>
        <div id="mynetworkid"></div>
        <script>
            var nodes = new vis.DataSet({nodes_data});
            var edges = new vis.DataSet({edges_data});
            var container = document.getElementById('mynetworkid');
            var data = {{ nodes: nodes, edges: edges }};
            var options = {{
                arrows: {{ to: {{ enabled: true }} }},
                physics: {{ stabilization: {{ iterations: 100 }} }}
            }};
            var network = new vis.Network(container, data, options);
        </script>
    </body>
    </html>
    """
    
    with open(OUTPUT_HTML, 'w') as f:
        f.write(html_content)
    print(f"Fallback visualization saved to {OUTPUT_HTML}")

# Display top addresses table (optional preview)
display(node_df.head(20))

Interactive network saved to eth_flow_advanced.html
Open eth_flow_advanced.html in your browser to view the visualization


Unnamed: 0,address,in_eth,out_eth,total_eth
283,0x47666FAB8BD0AC7003BCE3F5C3585383F09486E2,0.244824,800002.0,800002.244824
220,0xDD90071D52F20E85C89802E5DC1EC0A7B6475F92,196097.650145,180000.0,376097.650145
736,0xA4B2FD68593B6F34E51CB9EDB66E71C1B4AB449E,2.00003,196097.7896,196099.78963
226,0x36ED3C0213565530C35115D93A80F9C04D94E4CB,20000.011299,20000.0,40000.011299
1314,0x4571BD67D14280E40BF3910BD39FBF60834F900A,20000.002366,10486.70622,30486.708586
990,0xAF620E6D32B1C67F3396EF5D2F7D7642DC2E6CE9,20133.191919,10070.202497,30203.394416
975,0xFC926659DD8808F6E3E0A8D61B20B871F3FA6465,20052.954111,10026.480644,30079.434756
921,0xFA3FCCCB897079FD83BFBA690E7D47EB402D6C49,20026.945867,10039.947195,30066.893062
1134,0x3A21F4E6BBE527D347CA7C157F4233C935779847,20007.213974,10017.076706,30024.290679
721,0x83C7678492D623FB98834F0FBCB2E7B7F5AF8950,20000.007178,10015.452682,30015.45986


# Helper Analysis (focusing on OFAC sanctioned)

In [23]:
# Helper analysis: counterparties to OFAC / focus addresses
import pandas as pd
from collections import defaultdict

# Reuse agg, in_flow, out_flow if still in memory; if not, reload lightweight
try:
    _ = agg
except NameError:
    tmp_df = pd.read_csv(CSV_PATH if 'CSV_PATH' in globals() else 'transactions.csv', nrows=LIMIT_TXS if 'LIMIT_TXS' in globals() else None)
    if 'from_addr' not in tmp_df.columns or 'to_addr' not in tmp_df.columns:
        raise ValueError('transactions.csv must contain from_addr and to_addr')
    val_col = None
    for cand in ['eth_value','value','amount']:
        if cand in tmp_df.columns:
            val_col = cand
            break
    if val_col is None:
        tmp_df[val_col := '_unit_value'] = 1.0
    tmp_df['from_addr'] = tmp_df['from_addr'].astype(str).str.strip()
    tmp_df['to_addr'] = tmp_df['to_addr'].astype(str).str.strip()
    agg = (tmp_df.groupby(['from_addr','to_addr'])
                 .agg(total_eth=(val_col,'sum'), txs=(val_col,'count'))
                 .reset_index())

OFAC_LIST = list(OFAC_TAGGED.keys()) if 'OFAC_TAGGED' in globals() else []
TARGETS = set(OFAC_LIST) | set(FOCUS_ADDRESSES if 'FOCUS_ADDRESSES' in globals() else [])

if not TARGETS:
    print('No focus or OFAC addresses defined. Set FOCUS_ADDRESSES or OFAC_TAGGED and rerun.')
else:
    rows = []
    for t in TARGETS:
        sub_out = agg[agg['from_addr'] == t]
        sub_in  = agg[agg['to_addr'] == t]
        top_out = sub_out.sort_values('total_eth', ascending=False).head(15)
        top_in  = sub_in.sort_values('total_eth', ascending=False).head(15)
        rows.append({'address': t,
                     'tag': OFAC_TAGGED.get(t,''),
                     'out_partners': len(sub_out),
                     'in_partners': len(sub_in),
                     'out_eth': sub_out.total_eth.sum(),
                     'in_eth': sub_in.total_eth.sum(),
                     'net_eth': sub_in.total_eth.sum() - sub_out.total_eth.sum()})
        print(f"===== TARGET {t} {OFAC_TAGGED.get(t,'')} =====")
        if len(top_in):
            print('Top inbound counterparties:')
            display(top_in[['from_addr','total_eth','txs']])
        if len(top_out):
            print('Top outbound counterparties:')
            display(top_out[['to_addr','total_eth','txs']])
    summary_df = pd.DataFrame(rows).sort_values('in_eth', ascending=False)
    display(summary_df)

===== TARGET 0x098B716B8Aaf21512996dC57EB0615e2383E2f96 LAZARUS_GROUP =====


Unnamed: 0,address,tag,out_partners,in_partners,out_eth,in_eth,net_eth
0,0x098B716B8Aaf21512996dC57EB0615e2383E2f96,LAZARUS_GROUP,0,0,0.0,0.0,0.0
