# AML Multi-GNN - Phase 2: Data Exploration

This notebook implements Phase 2 of the AML (anti-money-laundering) Multi-GNN project: exploring and analyzing the IBM AML dataset that is already stored in Google Drive.

## Objectives:
1. Load existing data from Google Drive
2. Explore data structure and quality
3. Analyze transaction patterns
4. Create visualizations
5. Generate data quality report


In [None]:
# Environment setup: make the project importable, then initialise logging.
import sys
import os
from pathlib import Path

# Put the current working directory at the front of the import path (only if
# it is not already listed) before the `utils` imports below are resolved.
project_root = Path.cwd()
root_entry = str(project_root)
if root_entry not in sys.path:
    sys.path.insert(0, root_entry)

# Project-local helpers
from utils.gpu_utils import get_device, print_system_info
from utils.logging_utils import setup_logging
from utils.random_utils import set_random_seed
from utils.data_loader_colab import (
    load_existing_data,
    identify_data_structure,
    create_unified_dataframe,
    analyze_data_quality,
    print_data_summary,
)

# Create the logger for this phase, then print system information via the
# project helper.
logger = setup_logging(experiment_name="phase_2_data_exploration")
print_system_info()


In [None]:
# Load the existing dataset from Google Drive, timing the load, then identify
# its structure. On any failure both `data` and `structure` end up as None so
# the following cells can detect it and skip their work.
import time

print("Loading existing IBM AML dataset from Google Drive...")
print("This may take a few minutes for large datasets...")

try:
    # Timed load
    start_time = time.time()
    data = load_existing_data()
    load_time = time.time() - start_time
    print(f"✓ Successfully loaded {len(data)} data files in {load_time:.1f} seconds")

    # Structure detection on what was just loaded
    print("Identifying data structure...")
    structure = identify_data_structure(data)
    print(f"✓ Identified data structure: {structure}")
except Exception as err:
    print(f"✗ Error loading data: {err}")
    print("This might be due to large dataset size or memory issues")
    # Reset both names, even if only the structure step failed.
    data = structure = None


In [None]:
# Build the unified dataframe (timed) and compute its quality metrics.
# `unified_df` and `quality_metrics` are always defined afterwards; both are
# None when the inputs are missing or when any step raises.
import time

if data is None or structure is None:
    print("⚠️  Cannot create unified dataframe - data loading failed")
    unified_df = None
    quality_metrics = None
else:
    print("Creating unified dataframe...")
    print("This may take time for large datasets...")

    try:
        start_time = time.time()
        unified_df = create_unified_dataframe(data, structure)
        create_time = time.time() - start_time
        print(f"✓ Created unified dataframe: {unified_df.shape} in {create_time:.1f} seconds")

        # Quality analysis (noted as potentially slow), then the summary.
        print("Analyzing data quality...")
        quality_metrics = analyze_data_quality(unified_df)
        print_data_summary(unified_df, quality_metrics)
    except Exception as err:
        print(f"✗ Error creating unified dataframe: {err}")
        print("This might be due to memory constraints or data size")
        # Discard partial results so later cells see a consistent state.
        unified_df = quality_metrics = None


In [None]:
# Explore the unified dataframe: summary statistics, missing values per
# column, and column dtypes. Skipped with a warning when no dataframe exists.
if unified_df is None:
    print("⚠️  Cannot perform exploration - unified dataframe not available")
else:
    print("Performing data exploration...")

    import matplotlib.pyplot as plt
    import seaborn as sns
    import pandas as pd
    import numpy as np

    # Plot styling
    plt.style.use('default')
    sns.set_palette("husl")

    # Summary statistics
    print("\nBasic Statistics:")
    print(unified_df.describe())

    # Per-column missing counts and percentages; only columns with at least
    # one missing value are printed.
    print("\nMissing Values:")
    missing_data = unified_df.isnull().sum()
    missing_percent = (missing_data / len(unified_df)) * 100
    missing_df = pd.DataFrame(
        {'Missing Count': missing_data, 'Missing Percentage': missing_percent}
    )
    has_missing = missing_df['Missing Count'] > 0
    print(missing_df.loc[has_missing])

    # Column dtypes
    print("\nData Types:")
    print(unified_df.dtypes)


In [None]:
# Save processed data for next phase.
# Writes the unified dataframe (CSV) plus the data-structure description and
# the quality metrics (JSON) to the project's "processed" folder on Google
# Drive. The completion banner is printed only when every file was written.
if unified_df is not None:
    print("Saving processed data for next phase...")

    import json
    from pathlib import Path

    output_path = "/content/drive/MyDrive/LaunDetection/data/processed/unified_data.csv"
    structure_path = "/content/drive/MyDrive/LaunDetection/data/processed/data_structure.json"
    quality_path = "/content/drive/MyDrive/LaunDetection/data/processed/quality_metrics.json"

    save_succeeded = False
    try:
        # Refuse to continue when Drive is not mounted: creating the folder
        # below would otherwise succeed on the local disk and the files would
        # silently never reach Google Drive.
        drive_root = Path("/content/drive/MyDrive")
        if not drive_root.is_dir():
            raise FileNotFoundError(f"Google Drive is not mounted at {drive_root}")

        # Neither DataFrame.to_csv nor open() creates missing parent
        # directories, so make sure the output folder exists first.
        Path(output_path).parent.mkdir(parents=True, exist_ok=True)

        # Save to Google Drive
        unified_df.to_csv(output_path, index=False)
        print(f"✓ Saved unified data to: {output_path}")

        # Save data structure info. default=str matches the quality-metrics
        # dump below, so a value json cannot encode natively is stringified
        # instead of aborting the save halfway through.
        with open(structure_path, 'w') as f:
            json.dump(structure, f, indent=2, default=str)
        print(f"✓ Saved data structure to: {structure_path}")

        # Save quality metrics
        with open(quality_path, 'w') as f:
            json.dump(quality_metrics, f, indent=2, default=str)
        print(f"✓ Saved quality metrics to: {quality_path}")

        save_succeeded = True
    except Exception as e:
        print(f"✗ Error saving processed data: {e}")

    if save_succeeded:
        print("\n" + "=" * 60)
        print("Phase 2 - Data Exploration Completed!")
        print("=" * 60)
        print("\nNext steps:")
        print("1. Review the data exploration results above")
        print("2. Proceed to Phase 3: Graph Construction and Preprocessing")
        print("3. Run: %run notebooks/02_graph_construction.ipynb")
    else:
        # Do not announce completion when the outputs are missing or partial.
        print("⚠️  Processed data was not fully saved - fix the error above and re-run this cell")

else:
    print("⚠️  Cannot save processed data - unified dataframe not available")
