# AML Multi-GNN - Phase 2: Data Exploration

This notebook implements Phase 2 of the AML Multi-GNN project, focusing on data exploration and analysis of the existing IBM AML dataset in Google Drive.

## Objectives:
1. Load existing data from Google Drive
2. Explore data structure and quality
3. Analyze transaction patterns
4. Create visualizations
5. Generate data quality report


In [None]:
# Setup environment
import sys
import os
from pathlib import Path

# Put the current working directory at the front of sys.path (once) so that
# project-local packages can be imported from this notebook.
project_root = Path.cwd()
project_root_str = str(project_root)
if project_root_str not in sys.path:
    sys.path.insert(0, project_root_str)

# Import project utilities
from utils.gpu_utils import get_device, print_system_info
from utils.logging_utils import setup_logging
from utils.random_utils import set_random_seed
from utils.data_loader_colab import load_existing_data, identify_data_structure, create_unified_dataframe, analyze_data_quality, print_data_summary

# Setup logging
# NOTE(review): setup_logging is imported from utils.logging_utils, which is
# not visible in this notebook; presumably it returns a logger configured for
# the named experiment — confirm against that module.
logger = setup_logging(experiment_name="phase_2_data_exploration")

# Print system info
# NOTE(review): print_system_info comes from utils.gpu_utils; what exactly it
# reports cannot be told from here.
print_system_info()

# Memory monitoring function
def check_memory():
    """Print current system memory usage and return the percentage in use.

    The figures come from ``psutil.virtual_memory()``; used and total bytes
    are shown in decimal gigabytes (bytes / 1e9).

    Returns:
        The ``percent`` field reported by psutil.
    """
    import psutil  # imported lazily so the cell only needs psutil when called

    stats = psutil.virtual_memory()
    used_gb = stats.used / 1e9
    total_gb = stats.total / 1e9
    print(f"Memory: {used_gb:.1f}GB used / {total_gb:.1f}GB total ({stats.percent:.1f}%)")
    return stats.percent

# Check initial memory: print a baseline reading before any data is loaded.
# The returned percentage is not used here.
print("Initial memory usage:")
check_memory()


In [None]:
# Load existing data from Google Drive (ultra-conservative approach)
#
# Only a small sample of each HI-Small CSV is read. If that fails, the cell
# falls back to the first rows of the smallest file. The cell always leaves two
# names defined for the following cells:
#   data      - dict mapping file name (without ".csv") to a sample DataFrame,
#               or None when nothing could be loaded
#   structure - result of identify_data_structure(data), a minimal hand-built
#               structure on the fallback path, or None
import gc
import os

import pandas as pd

data_path = "/content/drive/MyDrive/LaunDetection/data/raw"


def find_hi_small_files(directory):
    """Return the sorted names of the HI-Small CSV files directly inside ``directory``.

    ``os.listdir`` returns entries in arbitrary order; sorting makes the load
    order (and therefore which files are loaded before the memory guard
    stops the loop) reproducible between runs.

    Raises:
        OSError: if ``directory`` cannot be listed.
    """
    return sorted(
        name for name in os.listdir(directory)
        if 'HI-Small' in name and name.endswith('.csv')
    )


def load_csv_sample(file_path, nrows):
    """Read only the first ``nrows`` data rows of a CSV file into a DataFrame.

    This replaces iterating a ``chunksize`` reader and breaking after the
    first chunk: the rows read are the same, but no partially consumed
    reader object is left behind.
    """
    return pd.read_csv(file_path, nrows=nrows)


def load_hi_small_samples(directory, nrows=500, memory_limit=80):
    """Load a small sample of every HI-Small CSV found in ``directory``.

    Args:
        directory: folder containing the raw CSV files.
        nrows: number of leading rows to read from each file.
        memory_limit: stop loading further files once ``check_memory()``
            reports a usage percentage above this value.

    Returns:
        Dict mapping each file name (with ".csv" removed) to its sample.

    Raises:
        Whatever ``os.listdir``, ``os.path.getsize``, ``pandas.read_csv`` or
        ``check_memory`` raise; the caller decides how to fall back.
    """
    file_names = find_hi_small_files(directory)
    print(f"Found {len(file_names)} HI-Small CSV files")

    samples = {}
    for file_name in file_names:
        file_path = os.path.join(directory, file_name)
        size_mb = os.path.getsize(file_path) / 1024 / 1024
        print(f"Loading {file_name} ({size_mb:.1f} MB)...")

        # Progress messages are kept unchanged so the cell's output stays the
        # same; the "first chunk" is simply the first ``nrows`` rows.
        print("  Reading in tiny chunks...")
        sample = load_csv_sample(file_path, nrows)
        print(f"    Chunk 1: {sample.shape}")
        print("    Limiting to first chunk only to save memory...")
        print(f"  ✓ Loaded sample of {file_name}: {sample.shape} (first chunk only)")

        samples[file_name.replace('.csv', '')] = sample

        # Release parser temporaries before measuring memory.
        gc.collect()

        memory_usage = check_memory()
        if memory_usage > memory_limit:
            print(f"⚠️  High memory usage ({memory_usage:.1f}%), stopping to prevent crash")
            break

    return samples


def load_smallest_sample(directory, nrows=100):
    """Fallback loader: read the first ``nrows`` rows of the smallest HI-Small CSV.

    Returns:
        A ``(data, structure)`` pair: ``data`` maps the file name (with
        ".csv" removed) to its sample, and ``structure`` is a minimal
        structure dict that lists that file under 'transactions'.

    Raises:
        FileNotFoundError: if ``directory`` contains no HI-Small CSV file.
        OSError: if ``directory`` or the file cannot be read.
    """
    file_names = find_hi_small_files(directory)
    if not file_names:
        raise FileNotFoundError(f"No HI-Small CSV files found in {directory}")

    smallest_file = min(
        file_names,
        key=lambda name: os.path.getsize(os.path.join(directory, name)),
    )
    print(f"Loading smallest file: {smallest_file}")

    sample = load_csv_sample(os.path.join(directory, smallest_file), nrows)
    key = smallest_file.replace('.csv', '')
    print(f"✓ Loaded sample of {smallest_file}: {sample.shape} (first {nrows} rows)")

    minimal_structure = {
        'transactions': key,
        'accounts': None,
        'labels': None,
        'other_files': [],
    }
    return {key: sample}, minimal_structure


print("Loading existing IBM AML dataset from Google Drive...")
print("Using ultra-conservative loading to prevent crashes...")

structure = None
try:
    data = load_hi_small_samples(data_path)
    print(f"✓ Successfully loaded {len(data)} data files (ultra-conservative)")
except Exception as e:
    # Notebook-level boundary: any loading failure triggers the minimal fallback.
    print(f"✗ Error loading data: {e}")
    print("Trying minimal sample loading...")

    try:
        data, structure = load_smallest_sample(data_path)
        print(f"✓ Created minimal data structure: {structure}")
    except Exception as e2:
        print(f"✗ Minimal loading also failed: {e2}")
        data = None
        structure = None
else:
    # Structure identification is handled separately from loading so that a
    # failure here no longer throws away samples that loaded successfully.
    if data:
        print("Identifying data structure...")
        try:
            structure = identify_data_structure(data)
            print(f"✓ Identified data structure: {structure}")
        except Exception as e:
            print(f"✗ Error identifying data structure: {e}")
            structure = None
    else:
        print("⚠️  No data files were loaded, skipping structure identification")


In [None]:
# Fix data structure identification
#
# Rebuilds `structure` by looking at each loaded file's column names, replacing
# whatever the loading cell produced. Each non-empty file is assigned to the
# first role whose indicator columns it contains; files matching no role go to
# 'other_files'. If several files match the same role, the last one wins.
print("Fixing data structure identification...")

if not (data and len(data) > 0):
    print("No data available to analyze structure")
    structure = None
else:
    # Roles in priority order, each with the column names that identify it.
    role_indicators = (
        ('transactions', ('Timestamp', 'From Bank', 'To Bank', 'Amount Received', 'Amount Paid')),
        ('accounts', ('Bank Name', 'Account Number', 'Entity ID', 'Entity Name')),
        ('labels', ('Is Laundering', 'Label', 'Class', 'Target')),
    )

    # Manual structure identification
    manual_structure = dict(transactions=None, accounts=None, labels=None, other_files=[])

    for key, df in data.items():
        if df is None or df.empty:
            continue

        columns = list(df.columns)
        print(f"Analyzing {key} with columns: {columns}")

        for role, indicators in role_indicators:
            if any(col in indicators for col in columns):
                manual_structure[role] = key
                print(f"  → Identified as {role}: {key}")
                break
        else:
            # No role matched any column of this file.
            manual_structure['other_files'].append(key)
            print(f"  → Added to other files: {key}")

    structure = manual_structure
    print(f"✓ Fixed data structure: {structure}")


In [None]:
# Create unified dataframe (with progress tracking)
#
# Leaves `unified_df` and `quality_metrics` defined on every path; both are
# None whenever no dataframe could be produced.
if data is None or len(data) == 0:
    print("⚠️  Cannot create unified dataframe - data loading failed")
    unified_df = None
    quality_metrics = None
else:
    print("Creating unified dataframe...")
    print("This may take time for large datasets...")

    try:
        import time
        start_time = time.time()

        # The project helper is only used when the structure names a
        # transactions or accounts file.
        has_known_structure = bool(structure) and bool(
            structure.get('transactions') or structure.get('accounts')
        )

        if has_known_structure:
            unified_df = create_unified_dataframe(data, structure)
        else:
            print("⚠️  No valid data structure found, creating minimal unified dataframe...")

            # Stack every non-empty file, tagging each row with the name of
            # the file it came from.
            unified_dfs = [
                df.assign(source_file=key)
                for key, df in data.items()
                if df is not None and not df.empty
            ]

            if not unified_dfs:
                print("✗ No data available for unified dataframe")
                unified_df = None
            else:
                unified_df = pd.concat(unified_dfs, ignore_index=True)
                print(f"✓ Created minimal unified dataframe: {unified_df.shape}")

        create_time = time.time() - start_time

        if unified_df is None:
            print("⚠️  Cannot create unified dataframe - no valid data")
            quality_metrics = None
        else:
            print(f"✓ Created unified dataframe: {unified_df.shape} in {create_time:.1f} seconds")

            # Analyze data quality (this can be slow)
            print("Analyzing data quality...")
            quality_metrics = analyze_data_quality(unified_df)

            # Print comprehensive summary
            print_data_summary(unified_df, quality_metrics)

    except Exception as e:
        print(f"✗ Error creating unified dataframe: {e}")
        print("This might be due to memory constraints or data size")
        unified_df = None
        quality_metrics = None


In [None]:
# Data exploration and visualization
#
# Prints summary statistics, a per-column missing-value table (only columns
# that have missing values) and the column dtypes of `unified_df`.
if unified_df is None:
    print("⚠️  Cannot perform exploration - unified dataframe not available")
else:
    print("Performing data exploration...")

    import matplotlib.pyplot as plt
    import seaborn as sns
    import pandas as pd
    import numpy as np

    # Plot styling is configured here; this cell itself draws no figures.
    plt.style.use('default')
    sns.set_palette("husl")

    print("\nBasic Statistics:")
    print(unified_df.describe())

    print("\nMissing Values:")
    missing_data = unified_df.isnull().sum()
    missing_percent = (missing_data / len(unified_df)) * 100
    missing_df = pd.DataFrame(
        {
            'Missing Count': missing_data,
            'Missing Percentage': missing_percent,
        }
    )
    has_missing = missing_df['Missing Count'] > 0
    print(missing_df.loc[has_missing])

    print("\nData Types:")
    print(unified_df.dtypes)


In [None]:
# Save processed data for next phase
#
# Writes three files into the processed-data folder on Google Drive:
#   unified_data.csv      - the unified dataframe
#   data_structure.json   - the `structure` dict
#   quality_metrics.json  - `quality_metrics` (non-JSON values are stringified)
if unified_df is not None:
    print("Saving processed data for next phase...")

    try:
        import json
        import os

        # Neither DataFrame.to_csv nor open() creates missing parent
        # directories, so create the output folder first; otherwise the save
        # fails whenever "processed" does not exist yet.
        processed_dir = "/content/drive/MyDrive/LaunDetection/data/processed"
        os.makedirs(processed_dir, exist_ok=True)

        # Save to Google Drive
        output_path = f"{processed_dir}/unified_data.csv"
        unified_df.to_csv(output_path, index=False)
        print(f"✓ Saved unified data to: {output_path}")

        # Save data structure info
        structure_path = f"{processed_dir}/data_structure.json"
        with open(structure_path, 'w', encoding='utf-8') as f:
            json.dump(structure, f, indent=2)
        print(f"✓ Saved data structure to: {structure_path}")

        # Save quality metrics; default=str turns values json cannot encode
        # natively into their string form instead of raising.
        quality_path = f"{processed_dir}/quality_metrics.json"
        with open(quality_path, 'w', encoding='utf-8') as f:
            json.dump(quality_metrics, f, indent=2, default=str)
        print(f"✓ Saved quality metrics to: {quality_path}")

    except Exception as e:
        # Best-effort save: report the problem but keep the notebook running.
        print(f"✗ Error saving processed data: {e}")

    print("\n" + "=" * 60)
    print("Phase 2 - Data Exploration Completed!")
    print("=" * 60)
    print("\nNext steps:")
    print("1. Review the data exploration results above")
    print("2. Proceed to Phase 3: Graph Construction and Preprocessing")
    print("3. Run: %run notebooks/02_graph_construction.ipynb")

else:
    print("⚠️  Cannot save processed data - unified dataframe not available")
