# 03. Build Graph Features

This notebook builds a transaction graph from our fraud detection data and computes graph metrics.
We'll create features that capture the network structure of transactions to help identify fraud patterns.

In [2]:
# Import Libraries
import pandas as pd
import networkx as nx
import numpy as np
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')

print("✅ Libraries imported successfully")

✅ Libraries imported successfully


In [3]:
# Setup paths
data_dir = Path("../app/artifacts")
output_dir = Path("../app/artifacts")

# Create output directory if it doesn't exist
output_dir.mkdir(parents=True, exist_ok=True)

print(f"📁 Data directory: {data_dir}")
print(f"📁 Output directory: {output_dir}")

📁 Data directory: ..\app\artifacts
📁 Output directory: ..\app\artifacts


In [4]:
# Load Data
print("📊 Loading processed data...")

data_path = data_dir / "eda_processed_data.csv"
df = pd.read_csv(data_path)

print(f"✅ Data loaded successfully")
print(f"📊 Shape: {df.shape}")
print(f"📊 Columns: {len(df.columns)}")
print(f"📊 Memory usage: {df.memory_usage(deep=True).sum() / 1024**2:.2f} MB")

📊 Loading processed data...
✅ Data loaded successfully
📊 Shape: (118108, 434)
📊 Columns: 434
📊 Memory usage: 502.80 MB


In [5]:
# Display data info
print("📋 Data Overview:")
print("=" * 50)
print(f"Total transactions: {len(df):,}")
print(f"Fraud rate: {df['isFraud'].mean():.4f}")
print(f"Fraud count: {df['isFraud'].sum():,}")
print(f"Legitimate count: {(df['isFraud'] == 0).sum():,}")

📋 Data Overview:
Total transactions: 118,108
Fraud rate: 0.0350
Fraud count: 4,133
Legitimate count: 113,975


In [6]:
# Show sample of key columns
key_cols = ['card1', 'addr1', 'P_emaildomain', 'R_emaildomain', 'DeviceType', 'ProductCD']
print(f"\n�� Key entity columns: {key_cols}")
print("\n📊 Sample of key columns:")
print(df[key_cols].head())


�� Key entity columns: ['card1', 'addr1', 'P_emaildomain', 'R_emaildomain', 'DeviceType', 'ProductCD']

📊 Sample of key columns:
   card1  addr1 P_emaildomain R_emaildomain DeviceType ProductCD
0  14223  204.0     gmail.com           NaN        NaN         W
1   2516  315.0     yahoo.com           NaN        NaN         W
2   7585  272.0     yahoo.com           NaN        NaN         W
3  10823    NaN     gmail.com     gmail.com     mobile         C
4   9633    NaN   hotmail.com   hotmail.com     mobile         C


In [7]:
# Build Graph
print("🔗 Building transaction graph...")

# Initialize the graph
G = nx.Graph()

🔗 Building transaction graph...


In [8]:
# Define entity columns to use for graph construction
entity_cols = [
    'card1', 'addr1', 'P_emaildomain', 'R_emaildomain', 
    'DeviceType', 'ProductCD', 'card4', 'card6'
]

print(f"��️ Using {len(entity_cols)} entity columns: {entity_cols}")


��️ Using 8 entity columns: ['card1', 'addr1', 'P_emaildomain', 'R_emaildomain', 'DeviceType', 'ProductCD', 'card4', 'card6']


In [9]:
# Add nodes and edges
transaction_count = 0
for idx, row in df.iterrows():
    if idx % 10000 == 0:
        print(f"  Processing transaction {idx:,}/{len(df):,}")
    
    # Get entities for this transaction
    entities = []
    for col in entity_cols:
        if pd.notna(row[col]) and str(row[col]) != 'nan':
            entity_id = f"{col}_{str(row[col])}"
            entities.append(entity_id)
    
    # Add nodes
    for entity in entities:
        G.add_node(entity)
    
    # Add edges between all entities in this transaction
    for i in range(len(entities)):
        for j in range(i + 1, len(entities)):
            G.add_edge(entities[i], entities[j])
    
    transaction_count += 1

  Processing transaction 0/118,108
  Processing transaction 10,000/118,108
  Processing transaction 20,000/118,108
  Processing transaction 30,000/118,108
  Processing transaction 40,000/118,108
  Processing transaction 50,000/118,108
  Processing transaction 60,000/118,108
  Processing transaction 70,000/118,108
  Processing transaction 80,000/118,108
  Processing transaction 90,000/118,108
  Processing transaction 100,000/118,108
  Processing transaction 110,000/118,108


In [10]:
print(f"✅ Graph construction completed!")
print(f"�� Total transactions processed: {transaction_count:,}")
print(f"📊 Graph nodes: {G.number_of_nodes():,}")
print(f"📊 Graph edges: {G.number_of_edges():,}")

✅ Graph construction completed!
�� Total transactions processed: 118,108
📊 Graph nodes: 8,491
📊 Graph edges: 81,857


In [11]:
# Graph Analysis
print("📊 Analyzing graph structure...")

# Basic graph statistics
print("🔍 Basic Graph Statistics:")
print("=" * 40)
print(f"Number of nodes: {G.number_of_nodes():,}")
print(f"Number of edges: {G.number_of_edges():,}")
print(f"Number of connected components: {nx.number_connected_components(G)}")
print(f"Density: {nx.density(G):.6f}")

# Largest connected component
largest_cc = max(nx.connected_components(G), key=len)
print(f"Largest component size: {len(largest_cc):,}")

# Degree distribution
degrees = [d for n, d in G.degree()]
print(f"Average degree: {np.mean(degrees):.2f}")
print(f"Max degree: {max(degrees)}")
print(f"Min degree: {min(degrees)}")

📊 Analyzing graph structure...
🔍 Basic Graph Statistics:
Number of nodes: 8,491
Number of edges: 81,857
Number of connected components: 1
Density: 0.002271
Largest component size: 8,491
Average degree: 19.28
Max degree: 5850
Min degree: 2


In [12]:
# Calculate Graph Metrics
print("🧮 Calculating graph metrics...")

# Initialize metrics storage
metrics_data = {}

# Calculate metrics for each node
node_count = 0
total_nodes = G.number_of_nodes()


🧮 Calculating graph metrics...


In [14]:
# Calculate Graph Metrics (ULTRA-MINIMAL - FASTEST VERSION)
print("🧮 Calculating essential graph metrics only...")

print("  Calculating degree centrality...")
degree_cent = dict(G.degree())

print("  Calculating simple PageRank...")
# Use very fast PageRank with minimal iterations
pagerank = nx.pagerank(G, alpha=0.85, max_iter=20, tol=1e-2)

print("  Skipping expensive metrics (clustering & betweenness)...")
# Skip slow metrics entirely for large graphs
clustering = {node: 0.0 for node in G.nodes()}
betweenness = {node: 0.0 for node in G.nodes()}

print("✅ Essential metrics calculated in seconds!")

# Create metrics dictionary
metrics_data = {}
for node in G.nodes():
    metrics_data[node] = {
        'degree_centrality': degree_cent.get(node, 0),
        'pagerank': pagerank.get(node, 0.0),
        'clustering_coefficient': clustering.get(node, 0.0),
        'betweenness_centrality': betweenness.get(node, 0.0)
    }

print(f"✅ Metrics stored for {len(metrics_data):,} nodes")

🧮 Calculating essential graph metrics only...
  Calculating degree centrality...
  Calculating simple PageRank...
  Skipping expensive metrics (clustering & betweenness)...
✅ Essential metrics calculated in seconds!
✅ Metrics stored for 8,491 nodes


In [16]:
print("  Calculating clustering coefficient (optimized)...")
# Calculate clustering for high-degree nodes only (most important)
high_degree_nodes = [node for node, degree in degree_cent.items() if degree >= 5]
print(f"    Computing clustering for {len(high_degree_nodes):,} high-degree nodes...")

clustering_partial = nx.clustering(G, nodes=high_degree_nodes)
# Fill remaining nodes with 0
clustering = {node: clustering_partial.get(node, 0.0) for node in G.nodes()}

print("  Skipping betweenness (still too slow)...")
betweenness = {node: 0.0 for node in G.nodes()}

print("✅ Optimized metrics calculated!")

  Calculating clustering coefficient (optimized)...
    Computing clustering for 8,056 high-degree nodes...
  Skipping betweenness (still too slow)...
✅ Optimized metrics calculated!


In [17]:
print("  Calculating clustering for high-degree nodes...")
high_degree_nodes = [node for node, degree in degree_cent.items() if degree >= 5]
clustering_partial = nx.clustering(G, nodes=high_degree_nodes)
clustering = {node: clustering_partial.get(node, 0.0) for node in G.nodes()}

print("  Calculating sampled betweenness centrality...")
# Sample only 1000 most important nodes for betweenness
top_nodes = sorted(degree_cent.items(), key=lambda x: x[1], reverse=True)[:1000]
sample_nodes = [node for node, _ in top_nodes]

print(f"    Computing betweenness for top {len(sample_nodes)} nodes...")
betweenness_sample = nx.betweenness_centrality(G, k=len(sample_nodes))

# Assign betweenness values
betweenness = {}
for node in G.nodes():
    if node in betweenness_sample:
        betweenness[node] = betweenness_sample[node]
    else:
        # Estimate based on degree for non-sampled nodes
        betweenness[node] = degree_cent[node] / (G.number_of_nodes() * 1000)

print("✅ All metrics calculated with smart sampling!")

  Calculating clustering for high-degree nodes...
  Calculating sampled betweenness centrality...
    Computing betweenness for top 1000 nodes...
✅ All metrics calculated with smart sampling!


In [18]:
# Create metrics dictionary
metrics_data = {}
for node in G.nodes():
    metrics_data[node] = {
        'degree_centrality': degree_cent.get(node, 0),
        'pagerank': pagerank.get(node, 0.0),
        'clustering_coefficient': clustering.get(node, 0.0),
        'betweenness_centrality': betweenness.get(node, 0.0)
    }

print(f"✅ Metrics stored for {len(metrics_data):,} nodes")

✅ Metrics stored for 8,491 nodes


In [19]:
# Create Features DataFrame
print("📊 Creating features DataFrame...")

# Convert to DataFrame
graph_features_df = pd.DataFrame.from_dict(metrics_data, orient='index')

# Reset index to get node as a column
graph_features_df.reset_index(inplace=True)
graph_features_df.rename(columns={'index': 'node'}, inplace=True)

print(f"✅ Graph features DataFrame created")
print(f"📊 Shape: {graph_features_df.shape}")
print(f"📊 Columns: {graph_features_df.columns.tolist()}")

# Display sample
print("\n📋 Sample of graph features:")
print(graph_features_df.head(10))

📊 Creating features DataFrame...
✅ Graph features DataFrame created
📊 Shape: (8491, 5)
📊 Columns: ['node', 'degree_centrality', 'pagerank', 'clustering_coefficient', 'betweenness_centrality']

📋 Sample of graph features:
                      node  degree_centrality  pagerank  \
0              card1_14223                 11  0.000019   
1              addr1_204.0               1055  0.008494   
2  P_emaildomain_gmail.com               4990  0.061364   
3              ProductCD_W               5850  0.087432   
4               card4_visa               4847  0.069541   
5              card6_debit               5541  0.081945   
6               card1_2516                  5  0.000018   
7              addr1_315.0                755  0.007064   
8  P_emaildomain_yahoo.com               3124  0.034685   
9         card4_mastercard               3081  0.043342   

   clustering_coefficient  betweenness_centrality  
0                0.836364            2.468695e-09  
1                0.024324

In [20]:
# Feature Statistics
print("📊 Graph Features Statistics:")
print("=" * 50)

# Display statistics for each metric
for col in graph_features_df.columns:
    if col != 'node':
        print(f"\n🔍 {col}:")
        print(f"  Mean: {graph_features_df[col].mean():.6f}")
        print(f"  Std: {graph_features_df[col].std():.6f}")
        print(f"  Min: {graph_features_df[col].min():.6f}")
        print(f"  Max: {graph_features_df[col].max():.6f}")
        print(f"  Missing: {graph_features_df[col].isnull().sum()}")

# Check for any missing values
missing_count = graph_features_df.isnull().sum().sum()
print(f"\n⚠️ Total missing values: {missing_count}")

📊 Graph Features Statistics:

🔍 degree_centrality:
  Mean: 19.280886
  Std: 154.510367
  Min: 2.000000
  Max: 5850.000000
  Missing: 0

🔍 pagerank:
  Mean: 0.000118
  Std: 0.001951
  Min: 0.000018
  Max: 0.087432
  Missing: 0

🔍 clustering_coefficient:
  Mean: 0.854908
  Std: 0.246927
  Min: 0.000000
  Max: 1.000000
  Missing: 0

🔍 betweenness_centrality:
  Mean: 0.000127
  Std: 0.003725
  Min: 0.000000
  Max: 0.186888
  Missing: 0

⚠️ Total missing values: 0


In [21]:
# Handle missing values
print("🔧 Handling missing values...")

# Fill missing values with 0 (appropriate for graph metrics)
graph_features_df = graph_features_df.fillna(0)

print(f"✅ Missing values filled")
print(f"📊 Final shape: {graph_features_df.shape}")
print(f"📊 Memory usage: {graph_features_df.memory_usage(deep=True).sum() / 1024**2:.2f} MB")

🔧 Handling missing values...
✅ Missing values filled
📊 Final shape: (8491, 5)
📊 Memory usage: 0.74 MB


In [22]:
# Save Artifacts
print("💾 Saving graph features...")

# Save as parquet
output_path = output_dir / "graph_features.parquet"
graph_features_df.to_parquet(output_path, index=False)

print(f"✅ Graph features saved to: {output_path}")
print(f"�� File size: {output_path.stat().st_size / 1024**2:.2f} MB")

💾 Saving graph features...
✅ Graph features saved to: ..\app\artifacts\graph_features.parquet
�� File size: 0.17 MB


In [23]:
# Also save as CSV for inspection (optional)
csv_path = output_dir / "graph_features.csv"
graph_features_df.to_csv(csv_path, index=False)
print(f"✅ CSV backup saved to: {csv_path}")

print("\n�� GRAPH FEATURES CREATION COMPLETED!")
print(f"📊 Total nodes with features: {len(graph_features_df):,}")
print(f"�� Features per node: {len(graph_features_df.columns) - 1}")
print(f"\nNext step: Use these graph features to enhance your XGBoost model!")

✅ CSV backup saved to: ..\app\artifacts\graph_features.csv

�� GRAPH FEATURES CREATION COMPLETED!
📊 Total nodes with features: 8,491
�� Features per node: 4

Next step: Use these graph features to enhance your XGBoost model!


In [24]:
# Verification and Summary
print("�� Final Verification:")
print("=" * 40)

# Check file was created
if output_path.exists():
    print(f"✅ Output file exists: {output_path}")
    print(f"�� File size: {output_path.stat().st_size / 1024**2:.2f} MB")
else:
    print(f"❌ Output file not found!")

# Display final sample
print(f"\n📋 Final sample of graph features:")
print(graph_features_df.head())

print(f"\n�� Ready for the next step: Hybrid model training!")
print(f"These graph features will be merged with transaction features to create a powerful hybrid model.")

�� Final Verification:
✅ Output file exists: ..\app\artifacts\graph_features.parquet
�� File size: 0.17 MB

📋 Final sample of graph features:
                      node  degree_centrality  pagerank  \
0              card1_14223                 11  0.000019   
1              addr1_204.0               1055  0.008494   
2  P_emaildomain_gmail.com               4990  0.061364   
3              ProductCD_W               5850  0.087432   
4               card4_visa               4847  0.069541   

   clustering_coefficient  betweenness_centrality  
0                0.836364            2.468695e-09  
1                0.024324            4.835185e-03  
2                0.003461            1.301770e-01  
3                0.002523            1.868879e-01  
4                0.003682            1.397444e-01  

�� Ready for the next step: Hybrid model training!
These graph features will be merged with transaction features to create a powerful hybrid model.
