# 📊 AML Multi-Dataset Data Visualization

This notebook visualizes the preprocessed AML (anti-money-laundering) datasets to understand:
- Dataset distributions and sizes
- AML vs Non-AML ratios
- Graph structure and connectivity
- Feature distributions
- Network topology analysis


In [None]:
# Install required packages
# `%pip` (rather than `!pip`) installs into the environment of the running kernel.
# NOTE(review): versions are unpinned, so a re-run may pull different releases —
# consider pinning. plotly is installed here but not imported in the cells
# visible in this part of the notebook.
%pip install matplotlib seaborn networkx plotly


In [None]:
# Import libraries
import os
import sys
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import networkx as nx
from collections import Counter
import warnings
warnings.filterwarnings('ignore')

# Add project path
# Appends the Drive project folder to the module search path.
# NOTE(review): the folder lives on Google Drive, which is mounted in a later
# cell — imports from it presumably only resolve after that mount; confirm.
sys.path.append('/content/drive/MyDrive/LaunDetection')

# Set up plotting style
# NOTE(review): the 'seaborn-v0_8' style name presumably requires a recent
# matplotlib release — confirm against the Colab runtime.
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")


In [None]:
# Mount Google Drive (if not already mounted)
# The hard-coded '/content/drive/MyDrive/...' paths used in this notebook all
# sit under this mount point.
# NOTE(review): `google.colab` is presumably only importable inside Google
# Colab, so this cell is expected to fail in other Jupyter environments.
from google.colab import drive
drive.mount('/content/drive')


In [None]:
# Load processed datasets
def _print_dataset_summary(data):
    """Print node/edge counts and the AML edge ratio for one loaded dataset.

    Non-dict objects only get their type printed; dicts without a 'graph'
    entry only get their keys printed.
    """
    if not isinstance(data, dict):
        print(f"   📊 Data type: {type(data)}")
        return
    if 'graph' not in data:
        print(f"   📊 Data keys: {list(data.keys())}")
        return

    graph = data['graph']
    nodes = graph.number_of_nodes()
    edges = graph.number_of_edges()
    print(f"   📊 Nodes: {nodes:,}, Edges: {edges:,}")

    if 'edge_labels' not in data:
        return

    # NOTE(review): assumes edge_labels is a flat sequence/array of 0/1 flags
    # (1 = AML) — confirm against the preprocessing step.
    labels = data['edge_labels']
    total = len(labels)
    # Vectorised reduction: the builtin sum() walks element by element, which
    # is very slow on multi-million-edge arrays/tensors.
    aml_edges = int(np.asarray(labels).sum())
    non_aml_edges = total - aml_edges
    # Guard against an empty label set instead of raising ZeroDivisionError.
    aml_rate = (aml_edges / total) * 100 if total else 0.0
    print(f"   🚨 AML Edges: {aml_edges:,} ({aml_rate:.2f}%)")
    print(f"   ✅ Non-AML Edges: {non_aml_edges:,}")


def load_processed_datasets(processed_path=None, dataset_names=None):
    """Load the pickled, preprocessed datasets and print a short summary of each.

    Args:
        processed_path: Directory holding the ``<name>_processed.pkl`` files.
            Defaults to the project's Google Drive ``data/processed`` folder.
        dataset_names: Iterable of dataset names to look for. Defaults to
            HI-Small, LI-Small, HI-Medium and LI-Medium.

    Returns:
        dict mapping dataset name -> unpickled object, for every file that
        exists and could be unpickled. Missing or unreadable files are
        reported on stdout and skipped; this function does not raise for them.
    """
    if processed_path is None:
        processed_path = "/content/drive/MyDrive/LaunDetection/data/processed"
    if dataset_names is None:
        dataset_names = ['HI-Small', 'LI-Small', 'HI-Medium', 'LI-Medium']

    datasets = {}

    print("📊 Loading Processed Datasets...")
    print("=" * 50)

    for dataset_name in dataset_names:
        dataset_path = os.path.join(processed_path, f"{dataset_name}_processed.pkl")

        if not os.path.exists(dataset_path):
            print(f"❌ {dataset_name}: File not found")
            continue

        try:
            # SECURITY: pickle.load can execute arbitrary code; only point
            # this at files produced by the project's own preprocessing.
            with open(dataset_path, 'rb') as f:
                data = pickle.load(f)
        except Exception as e:
            # Best-effort: one broken file must not stop the other datasets.
            print(f"❌ {dataset_name}: Error loading - {str(e)}")
            continue

        datasets[dataset_name] = data
        print(f"✅ {dataset_name}: Loaded successfully")

        # Summarising is kept separate from loading so that a problem while
        # printing statistics is not misreported as a loading failure.
        try:
            _print_dataset_summary(data)
        except Exception as e:
            print(f"   ⚠️ Could not summarize {dataset_name}: {str(e)}")

    return datasets

# Load the datasets
# `datasets` maps each dataset name to its unpickled object; names whose file
# is missing or fails to load are absent from the dict.
datasets = load_processed_datasets()


In [None]:
# Quick visualization using the simple script
# Runs the script in a separate Python process via the shell, so it does not
# share this kernel's variables (e.g. `datasets`) or its sys.path changes.
# NOTE(review): the script path is relative to the notebook's current working
# directory — confirm the script is actually there (or use an absolute path).
!python quick_data_visualization.py
