# 04. Hybrid Model Training & Evaluation

This notebook trains our final hybrid fraud detection model by combining:
- Traditional transaction features from our baseline model
- Graph network features from our transaction graph

Our goal is to beat the baseline ROC-AUC score of **0.9213** using graph-enhanced features.

In [1]:
# Import Libraries
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, classification_report
import joblib
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')

print("✅ Libraries imported successfully")

✅ Libraries imported successfully


In [2]:
# Setup paths
data_dir = Path("../app/artifacts")
output_dir = Path("../app/artifacts")

print(f"📁 Data directory: {data_dir}")
print(f"📁 Output directory: {output_dir}")

# Define baseline score to beat
BASELINE_ROC_AUC = 0.9213
print(f"🎯 Baseline ROC-AUC to beat: {BASELINE_ROC_AUC}")

📁 Data directory: ..\app\artifacts
📁 Output directory: ..\app\artifacts
🎯 Baseline ROC-AUC to beat: 0.9213


In [3]:
# Load Datasets
print("📊 Loading datasets...")

# Load main transaction data
transaction_path = data_dir / "eda_processed_data.csv"
df_transactions = pd.read_csv(transaction_path)

# Load graph features
graph_path = data_dir / "graph_features.parquet"
df_graph = pd.read_parquet(graph_path)

print(f"✅ Transaction data loaded: {df_transactions.shape}")
print(f"✅ Graph features loaded: {df_graph.shape}")
print(f"📊 Transaction columns: {len(df_transactions.columns)}")
print(f"📊 Graph feature columns: {df_graph.columns.tolist()}")

📊 Loading datasets...
✅ Transaction data loaded: (118108, 434)
✅ Graph features loaded: (8491, 5)
📊 Transaction columns: 434
📊 Graph feature columns: ['node', 'degree_centrality', 'pagerank', 'clustering_coefficient', 'betweenness_centrality']


In [4]:
# Inspect data for merging
print("🔍 Inspecting data for merging...")

# Show sample of transaction data key columns
key_entity_cols = ['card1', 'addr1', 'P_emaildomain', 'R_emaildomain', 'DeviceType', 'ProductCD']
print("\n📊 Sample transaction entities:")
print(df_transactions[key_entity_cols].head())

# Show sample of graph features
print("\n📊 Sample graph features:")
print(df_graph.head())

# Check graph node patterns
print(f"\n🔍 Graph node patterns:")
node_types = df_graph['node'].str.split('_', n=1, expand=True)[0].value_counts()
print(node_types)

🔍 Inspecting data for merging...

📊 Sample transaction entities:
   card1  addr1 P_emaildomain R_emaildomain DeviceType ProductCD
0  14223  204.0     gmail.com           NaN        NaN         W
1   2516  315.0     yahoo.com           NaN        NaN         W
2   7585  272.0     yahoo.com           NaN        NaN         W
3  10823    NaN     gmail.com     gmail.com     mobile         C
4   9633    NaN   hotmail.com   hotmail.com     mobile         C

📊 Sample graph features:
                      node  degree_centrality  pagerank  \
0              card1_14223                 11  0.000019   
1              addr1_204.0               1055  0.008494   
2  P_emaildomain_gmail.com               4990  0.061364   
3              ProductCD_W               5850  0.087432   
4               card4_visa               4847  0.069541   

   clustering_coefficient  betweenness_centrality  
0                0.836364            2.468695e-09  
1                0.024324            4.835185e-03  
2       

In [5]:
# Merge Features - Key Step!
print("🔗 Merging graph features with transaction data...")

# Create a copy to work with
df_merged = df_transactions.copy()

# Define primary and secondary entities for merging
primary_entities = ['card1', 'addr1', 'P_emaildomain']  # Most important for fraud
secondary_entities = ['ProductCD', 'DeviceType', 'card4']  # Secondary entities

print(f"🎯 Primary entities for merging: {primary_entities}")
print(f"🎯 Secondary entities for merging: {secondary_entities}")


🔗 Merging graph features with transaction data...
🎯 Primary entities for merging: ['card1', 'addr1', 'P_emaildomain']
🎯 Secondary entities for merging: ['ProductCD', 'DeviceType', 'card4']


In [6]:
# Merge function
def merge_graph_features(df, entity_col, suffix):
    """Merge graph features for a specific entity column"""
    # Create entity IDs that match graph node format
    entity_ids = df[entity_col].astype(str).apply(lambda x: f"{entity_col}_{x}" if pd.notna(x) and x != 'nan' else None)
    
    # Create temporary dataframe for merging
    temp_df = pd.DataFrame({
        'entity_id': entity_ids,
        'index': df.index
    }).dropna()
    
    # Merge with graph features
    merged = temp_df.merge(
        df_graph.rename(columns={'node': 'entity_id'}), 
        on='entity_id', 
        how='left'
    )
    
    # Rename columns with suffix
    feature_cols = ['degree_centrality', 'pagerank', 'clustering_coefficient', 'betweenness_centrality']
    for col in feature_cols:
        merged[f"{col}_{suffix}"] = merged[col]
    
    # Return only the new features aligned with original dataframe
    result = merged.set_index('index')[feature_cols].rename(columns={col: f"{col}_{suffix}" for col in feature_cols})
    return result.reindex(df.index, fill_value=0.0)

In [7]:
# Start merging
original_cols = len(df_merged.columns)
print(f"📊 Starting with {original_cols} columns")

📊 Starting with 434 columns


In [8]:
# Merge primary entities (most important)
print("🔗 Merging primary entity features...")

for entity in primary_entities:
    if entity in df_merged.columns:
        print(f"  Merging {entity}...")
        graph_features = merge_graph_features(df_merged, entity, entity.lower())
        df_merged = pd.concat([df_merged, graph_features], axis=1)
        print(f"    Added {len(graph_features.columns)} features")

print(f"✅ Primary entities merged. Total columns: {len(df_merged.columns)}")

🔗 Merging primary entity features...
  Merging card1...
    Added 4 features
  Merging addr1...
    Added 4 features
  Merging P_emaildomain...
    Added 4 features
✅ Primary entities merged. Total columns: 446


In [9]:
# Merge secondary entities
print("🔗 Merging secondary entity features...")

for entity in secondary_entities:
    if entity in df_merged.columns:
        print(f"  Merging {entity}...")
        graph_features = merge_graph_features(df_merged, entity, entity.lower())
        df_merged = pd.concat([df_merged, graph_features], axis=1)
        print(f"    Added {len(graph_features.columns)} features")

print(f"✅ Secondary entities merged. Total columns: {len(df_merged.columns)}")


🔗 Merging secondary entity features...
  Merging ProductCD...
    Added 4 features
  Merging DeviceType...
    Added 4 features
  Merging card4...
    Added 4 features
✅ Secondary entities merged. Total columns: 458


In [10]:
# Show new graph feature columns
graph_feature_cols = [col for col in df_merged.columns if any(metric in col for metric in ['degree_centrality', 'pagerank', 'clustering_coefficient', 'betweenness_centrality'])]
print(f"\n📊 Graph feature columns added ({len(graph_feature_cols)}):")
for col in graph_feature_cols:
    print(f"  - {col}")


📊 Graph feature columns added (24):
  - degree_centrality_card1
  - pagerank_card1
  - clustering_coefficient_card1
  - betweenness_centrality_card1
  - degree_centrality_addr1
  - pagerank_addr1
  - clustering_coefficient_addr1
  - betweenness_centrality_addr1
  - degree_centrality_p_emaildomain
  - pagerank_p_emaildomain
  - clustering_coefficient_p_emaildomain
  - betweenness_centrality_p_emaildomain
  - degree_centrality_productcd
  - pagerank_productcd
  - clustering_coefficient_productcd
  - betweenness_centrality_productcd
  - degree_centrality_devicetype
  - pagerank_devicetype
  - clustering_coefficient_devicetype
  - betweenness_centrality_devicetype
  - degree_centrality_card4
  - pagerank_card4
  - clustering_coefficient_card4
  - betweenness_centrality_card4


In [11]:
# Handle missing values from merging
print("🔧 Handling missing values from merging...")

# Check for missing values in graph features
missing_summary = df_merged[graph_feature_cols].isnull().sum()
print(f"📊 Missing values in graph features:")
print(missing_summary[missing_summary > 0])

# Fill missing graph features with 0 (entities not in graph)
df_merged[graph_feature_cols] = df_merged[graph_feature_cols].fillna(0.0)

print(f"✅ Missing values filled with 0.0")
print(f"📊 Final merged dataset shape: {df_merged.shape}")

🔧 Handling missing values from merging...
📊 Missing values in graph features:
Series([], dtype: int64)
✅ Missing values filled with 0.0
📊 Final merged dataset shape: (118108, 458)


In [12]:
# Feature summary
print("📊 Hybrid Feature Summary:")
print("=" * 50)

print(f"Original transaction features: {original_cols}")
print(f"Graph features added: {len(graph_feature_cols)}")
print(f"Total features: {len(df_merged.columns)}")

# Show sample of graph features
print(f"\n📋 Sample of graph features:")
print(df_merged[graph_feature_cols].describe())

📊 Hybrid Feature Summary:
Original transaction features: 434
Graph features added: 24
Total features: 458

📋 Sample of graph features:
       degree_centrality_card1  pagerank_card1  clustering_coefficient_card1  \
count            118108.000000   118108.000000                 118108.000000   
mean                 44.487240        0.000044                      0.636099   
std                  30.917079        0.000030                      0.181294   
min                   2.000000        0.000018                      0.000000   
25%                  17.000000        0.000020                      0.481422   
50%                  40.000000        0.000030                      0.628056   
75%                  64.000000        0.000063                      0.763636   
max                 121.000000        0.000126                      1.000000   

       betweenness_centrality_card1  degree_centrality_addr1  pagerank_addr1  \
count                  1.181080e+05            118108.000000   1

In [13]:
# Preprocessing (same as baseline)
print("🔧 Starting feature preprocessing (same as baseline)...")

# Separate features and target
target_col = 'isFraud'
X = df_merged.drop(columns=[target_col])
y = df_merged[target_col]

print(f"✅ Features (X): {X.shape}")
print(f"✅ Target (y): {y.shape}")
print(f"✅ Target distribution: {y.value_counts().to_dict()}")

# Identify categorical and numerical columns
categorical_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()
numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns.tolist()

print(f"📊 Categorical columns: {len(categorical_cols)}")
print(f"📊 Numerical columns: {len(numerical_cols)}")

🔧 Starting feature preprocessing (same as baseline)...
✅ Features (X): (118108, 457)
✅ Target (y): (118108,)
✅ Target distribution: {0: 113975, 1: 4133}
📊 Categorical columns: 31
📊 Numerical columns: 426


In [14]:
# Handle missing values in numerical columns
print("🔧 Handling missing values...")
X_numerical = X[numerical_cols].copy()
X_numerical = X_numerical.fillna(-999)

# Frequency encoding for categorical columns
print("🔧 Applying frequency encoding...")
X_categorical = X[categorical_cols].copy()
X_categorical = X_categorical.fillna('MISSING')

for col in categorical_cols:
    value_counts = X_categorical[col].value_counts()
    X_categorical[col] = X_categorical[col].map(value_counts)
    X_categorical[col] = X_categorical[col].fillna(0)

# Combine features
X_processed = pd.concat([X_numerical, X_categorical], axis=1)

print(f"✅ Preprocessing completed")
print(f"📊 Final feature matrix: {X_processed.shape}")
print(f"📊 Memory usage: {X_processed.memory_usage(deep=True).sum() / 1024**2:.2f} MB")

🔧 Handling missing values...
🔧 Applying frequency encoding...
✅ Preprocessing completed
📊 Final feature matrix: (118108, 457)
📊 Memory usage: 411.80 MB


In [15]:
# Data Split (same as baseline)
print("✂️ Splitting data (same as baseline)...")

X_train, X_val, y_train, y_val = train_test_split(
    X_processed, y, 
    test_size=0.2, 
    random_state=42, 
    stratify=y
)

print(f"✅ Training set: {X_train.shape[0]} samples ({X_train.shape[0]/len(X_processed)*100:.1f}%)")
print(f"✅ Validation set: {X_val.shape[0]} samples ({X_val.shape[0]/len(X_processed)*100:.1f}%)")
print(f"✅ Training fraud rate: {y_train.mean():.4f}")
print(f"✅ Validation fraud rate: {y_val.mean():.4f}")

✂️ Splitting data (same as baseline)...
✅ Training set: 94486 samples (80.0%)
✅ Validation set: 23622 samples (20.0%)
✅ Training fraud rate: 0.0350
✅ Validation fraud rate: 0.0350


In [16]:
# Train Hybrid Model (same parameters as baseline)
print("🚀 Training Hybrid XGBoost model...")

# Initialize with same parameters as baseline
hybrid_model = xgb.XGBClassifier(
    n_estimators=1000,
    learning_rate=0.1,
    max_depth=6,
    min_child_weight=1,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    eval_metric='auc',
    early_stopping_rounds=10,
    verbose=100
)

print("✅ Hybrid model initialized")
print(f"📊 Training on {X_train.shape[1]} features (including {len(graph_feature_cols)} graph features)")

🚀 Training Hybrid XGBoost model...
✅ Hybrid model initialized
📊 Training on 457 features (including 24 graph features)


In [17]:
# Train the hybrid model
print("🔥 Starting hybrid model training...")

hybrid_model.fit(
    X_train, y_train,
    eval_set=[(X_val, y_val)],
    verbose=True
)

print("✅ Hybrid model training completed!")
print(f"📊 Best iteration: {hybrid_model.best_iteration}")
print(f"📊 Best validation AUC: {hybrid_model.best_score:.4f}")

🔥 Starting hybrid model training...
[0]	validation_0-auc:0.79623
[1]	validation_0-auc:0.84421
[2]	validation_0-auc:0.85833
[3]	validation_0-auc:0.86202
[4]	validation_0-auc:0.86333
[5]	validation_0-auc:0.86961
[6]	validation_0-auc:0.87016
[7]	validation_0-auc:0.87275
[8]	validation_0-auc:0.87617
[9]	validation_0-auc:0.87790
[10]	validation_0-auc:0.87881
[11]	validation_0-auc:0.88120
[12]	validation_0-auc:0.88200
[13]	validation_0-auc:0.88364
[14]	validation_0-auc:0.88476
[15]	validation_0-auc:0.88517
[16]	validation_0-auc:0.88650
[17]	validation_0-auc:0.88656
[18]	validation_0-auc:0.88726
[19]	validation_0-auc:0.88864
[20]	validation_0-auc:0.88898
[21]	validation_0-auc:0.88950
[22]	validation_0-auc:0.89021
[23]	validation_0-auc:0.89101
[24]	validation_0-auc:0.89155
[25]	validation_0-auc:0.89193
[26]	validation_0-auc:0.89256
[27]	validation_0-auc:0.89313
[28]	validation_0-auc:0.89356
[29]	validation_0-auc:0.89464
[30]	validation_0-auc:0.89533
[31]	validation_0-auc:0.89524
[32]	validatio

In [18]:
# Evaluate and Compare Performance
print("📊 Evaluating hybrid model performance...")

# Make predictions
y_pred_proba = hybrid_model.predict_proba(X_val)[:, 1]
y_pred = hybrid_model.predict(X_val)

# Calculate metrics
hybrid_roc_auc = roc_auc_score(y_val, y_pred_proba)

print("🎯 PERFORMANCE COMPARISON:")
print("=" * 60)
print(f"Baseline ROC-AUC:     {BASELINE_ROC_AUC:.4f}")
print(f"Hybrid ROC-AUC:       {hybrid_roc_auc:.4f}")
print(f"Improvement:          {hybrid_roc_auc - BASELINE_ROC_AUC:+.4f}")
print(f"Relative Improvement: {((hybrid_roc_auc - BASELINE_ROC_AUC) / BASELINE_ROC_AUC * 100):+.2f}%")
print("=" * 60)

if hybrid_roc_auc > BASELINE_ROC_AUC:
    print("🎉 SUCCESS! Graph features improved the model!")
else:
    print("🤔 Graph features didn't improve performance. Need investigation.")

📊 Evaluating hybrid model performance...
🎯 PERFORMANCE COMPARISON:
Baseline ROC-AUC:     0.9213
Hybrid ROC-AUC:       0.9263
Improvement:          +0.0050
Relative Improvement: +0.54%
🎉 SUCCESS! Graph features improved the model!


In [19]:
# Detailed performance metrics
print("📋 Detailed Performance Metrics:")
print("=" * 50)
print(classification_report(y_val, y_pred, target_names=['Legitimate', 'Fraud']))
print("=" * 50)

📋 Detailed Performance Metrics:
              precision    recall  f1-score   support

  Legitimate       0.98      1.00      0.99     22795
       Fraud       0.90      0.42      0.57       827

    accuracy                           0.98     23622
   macro avg       0.94      0.71      0.78     23622
weighted avg       0.98      0.98      0.97     23622



In [20]:
# Feature Importance Analysis
print("🔍 Analyzing feature importance...")

# Get feature importance
feature_importance = pd.DataFrame({
    'feature': X_processed.columns,
    'importance': hybrid_model.feature_importances_
}).sort_values('importance', ascending=False)

print("🏆 Top 20 Most Important Features:")
print(feature_importance.head(20))

# Check how many graph features are in top 20
top_20_features = feature_importance.head(20)['feature'].tolist()
graph_features_in_top20 = [f for f in top_20_features if any(metric in f for metric in ['degree_centrality', 'pagerank', 'clustering_coefficient', 'betweenness_centrality'])]

print(f"\n📊 Graph features in top 20: {len(graph_features_in_top20)}/20")
for gf in graph_features_in_top20:
    rank = top_20_features.index(gf) + 1
    importance = feature_importance[feature_importance['feature'] == gf]['importance'].iloc[0]
    print(f"  #{rank}: {gf} (importance: {importance:.4f})")

🔍 Analyzing feature importance...
🏆 Top 20 Most Important Features:
                         feature  importance
297                         V258    0.210359
240                         V201    0.051369
228                         V189    0.050857
18                            C8    0.026764
109                          V70    0.019346
108                          V69    0.017547
129                          V90    0.016753
323                         V284    0.015767
130                          V91    0.014120
24                           C14    0.012966
284                         V245    0.010939
347                         V308    0.010584
333                         V294    0.007579
22                           C12    0.007442
245                         V206    0.006742
211                         V172    0.006633
87                           V48    0.006393
414  degree_centrality_productcd    0.006177
177                         V138    0.006157
14                            C4

In [21]:
# Feature Importance Visualization
print("📊 Creating feature importance visualization...")

# Plot top 20 features
plt.figure(figsize=(12, 10))
top_features = feature_importance.head(20)

# Color graph features differently
colors = ['red' if any(metric in feat for metric in ['degree_centrality', 'pagerank', 'clustering_coefficient', 'betweenness_centrality']) else 'steelblue' for feat in top_features['feature']]

bars = plt.barh(range(len(top_features)), top_features['importance'], color=colors)
plt.yticks(range(len(top_features)), top_features['feature'])
plt.xlabel('Feature Importance')
plt.title('Top 20 Most Important Features (Hybrid Model)\nRed = Graph Features, Blue = Traditional Features')
plt.gca().invert_yaxis()

# Add importance values on bars
for i, bar in enumerate(bars):
    width = bar.get_width()
    plt.text(width + 0.001, bar.get_y() + bar.get_height()/2, 
             f'{width:.3f}', ha='left', va='center', fontsize=8)

plt.tight_layout()
plt.savefig('hybrid_feature_importance.png', dpi=100, bbox_inches='tight')
plt.close()
print("✅ Feature importance plot saved as 'hybrid_feature_importance.png'")

📊 Creating feature importance visualization...
✅ Feature importance plot saved as 'hybrid_feature_importance.png'


In [22]:
# Save Final Model
print("💾 Saving hybrid model...")

# Save the trained hybrid model
model_path = output_dir / "hybrid_model.pkl"
joblib.dump(hybrid_model, model_path)
print(f"✅ Hybrid model saved to: {model_path}")

# Save hybrid feature names
feature_names_path = output_dir / "hybrid_feature_names.pkl"
joblib.dump(X_processed.columns.tolist(), feature_names_path)
print(f"✅ Hybrid feature names saved to: {feature_names_path}")

# Save hybrid feature importance
importance_path = output_dir / "hybrid_feature_importance.csv"
feature_importance.to_csv(importance_path, index=False)
print(f"✅ Hybrid feature importance saved to: {importance_path}")

print("\n🎉 HYBRID MODEL TRAINING COMPLETED!")
print(f"🏆 Final ROC-AUC Score: {hybrid_roc_auc:.4f}")
print(f"📈 Improvement over baseline: {hybrid_roc_auc - BASELINE_ROC_AUC:+.4f}")

💾 Saving hybrid model...
✅ Hybrid model saved to: ..\app\artifacts\hybrid_model.pkl
✅ Hybrid feature names saved to: ..\app\artifacts\hybrid_feature_names.pkl
✅ Hybrid feature importance saved to: ..\app\artifacts\hybrid_feature_importance.csv

🎉 HYBRID MODEL TRAINING COMPLETED!
🏆 Final ROC-AUC Score: 0.9263
📈 Improvement over baseline: +0.0050


In [23]:
# Final Summary
print("📋 FINAL HYBRID MODEL SUMMARY:")
print("=" * 50)
print(f"📊 Dataset size: {len(df_merged):,} transactions")
print(f"📊 Total features: {X_processed.shape[1]:,}")
print(f"📊 Graph features: {len(graph_feature_cols)}")
print(f"📊 Traditional features: {X_processed.shape[1] - len(graph_feature_cols)}")
print(f"📊 Fraud rate: {y.mean():.4f}")
print("\n🎯 PERFORMANCE:")
print(f"Baseline ROC-AUC:  {BASELINE_ROC_AUC:.4f}")
print(f"Hybrid ROC-AUC:    {hybrid_roc_auc:.4f}")
print(f"Improvement:       {hybrid_roc_auc - BASELINE_ROC_AUC:+.4f}")
print(f"Success:           {'✅ YES' if hybrid_roc_auc > BASELINE_ROC_AUC else '❌ NO'}")
print("\n🏆 Graph features impact:")
print(f"Graph features in top 20: {len(graph_features_in_top20)}/20")
print("=" * 50)

📋 FINAL HYBRID MODEL SUMMARY:
📊 Dataset size: 118,108 transactions
📊 Total features: 457
📊 Graph features: 24
📊 Traditional features: 433
📊 Fraud rate: 0.0350

🎯 PERFORMANCE:
Baseline ROC-AUC:  0.9213
Hybrid ROC-AUC:    0.9263
Improvement:       +0.0050
Success:           ✅ YES

🏆 Graph features impact:
Graph features in top 20: 1/20
