# Solana Data Explorer

This notebook explores the Solana tables that were migrated from PostgreSQL to MinIO storage.

## Tables Available:
- `token_list_v3` - Token data from BirdEye API
- `token_whales` - Large holders for tracked tokens
- `wallet_trade_history` - Trading history for whale wallets
- `token_metadata` - Token metadata (Twitter, website, descriptions)

In [1]:
---------------------------------------------------------------------------
ImportError                               Traceback (most recent call last)
Cell In[1], line 3
      1 # Import required libraries
      2 import duckdb
----> 3 import pandas as pd
      4 import os
      5 import json

File ~/.local/lib/python3.11/site-packages/pandas/__init__.py:19
     16         _missing_dependencies.append(f"{_dependency}: {_e}")
     18 if _missing_dependencies:  # pragma: no cover
---> 19     raise ImportError(
     20         "Unable to import required dependencies:\n" + "\n".join(_missing_dependencies)
     21     )
     22 del _hard_dependencies, _dependency, _missing_dependencies
     24 try:
     25     # numpy compat

ImportError: Unable to import required dependencies:
numpy: Error importing numpy: you should not try to import numpy from
        its source directory; please exit the numpy source tree, and relaunch
        your python interpreter from there.

SyntaxError: invalid syntax (923040672.py, line 1)

In [1]:
# Import required libraries
import duckdb
import pandas as pd
import os
import json
from datetime import datetime
import matplotlib.pyplot as plt
import seaborn as sns

# Plot appearance: matplotlib's stock style first, then seaborn's "husl" palette
plt.style.use('default')
sns.set_palette("husl")

# DataFrame rendering: never hide columns, cap displayed rows at 20
for option_name, option_value in (
    ('display.max_columns', None),
    ('display.max_rows', 20),
):
    pd.set_option(option_name, option_value)

ImportError: Unable to import required dependencies:
numpy: Error importing numpy: you should not try to import numpy from
        its source directory; please exit the numpy source tree, and relaunch
        your python interpreter from there.

In [2]:
# Installs pandas into the kernel's environment. As pip's own note says, the
# kernel may need to be restarted before the updated package is picked up.
# NOTE(review): the version is not pinned — consider pinning for reproducibility.
%pip install pandas

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [None]:
# Setup DuckDB connection with S3/MinIO configuration
def get_duckdb_connection(db_path=None):
    """Create a DuckDB connection configured for the S3-compatible MinIO store.

    Settings are read from environment variables so that credentials do not
    have to be committed with the notebook. Each variable falls back to the
    value that was previously hard-coded here, so an unconfigured environment
    behaves exactly as before.

    Environment variables (default):
        DUCKDB_PATH           ('/data/analytics.duckdb')
        S3_ENDPOINT           ('minio:9000')
        S3_ACCESS_KEY_ID      ('minioadmin')
        S3_SECRET_ACCESS_KEY  ('minioadmin123')
        S3_USE_SSL            ('false'; '1', 'true' or 'yes' enable SSL)
        S3_URL_STYLE          ('path')

    Args:
        db_path: Optional path of the DuckDB database file. When omitted,
            DUCKDB_PATH (or its default) is used.

    Returns:
        An open DuckDB connection with httpfs loaded and the S3 settings applied.

    Raises:
        Any exception raised while loading httpfs or applying a setting is
        re-raised after the half-configured connection has been closed.
    """
    if db_path is None:
        db_path = os.environ.get('DUCKDB_PATH', '/data/analytics.duckdb')

    def _quoted(value):
        # Double embedded single quotes so the value stays one SQL string literal.
        return "'" + value.replace("'", "''") + "'"

    use_ssl = os.environ.get('S3_USE_SSL', 'false').strip().lower() in ('1', 'true', 'yes')

    # (setting name, SQL literal) pairs, in the order they are applied.
    s3_settings = [
        ('s3_endpoint', _quoted(os.environ.get('S3_ENDPOINT', 'minio:9000'))),
        ('s3_access_key_id', _quoted(os.environ.get('S3_ACCESS_KEY_ID', 'minioadmin'))),
        ('s3_secret_access_key', _quoted(os.environ.get('S3_SECRET_ACCESS_KEY', 'minioadmin123'))),
        ('s3_use_ssl', 'true' if use_ssl else 'false'),
        ('s3_url_style', _quoted(os.environ.get('S3_URL_STYLE', 'path'))),
    ]

    conn = duckdb.connect(db_path)
    try:
        # Configure S3/MinIO
        conn.execute("LOAD httpfs;")
        for setting_name, sql_literal in s3_settings:
            conn.execute(f"SET {setting_name}={sql_literal};")
    except Exception:
        # Do not leak an open handle on the database file if configuration fails.
        conn.close()
        raise

    return conn

# Initialize connection
# `conn` is the notebook-wide handle that the query cells below execute against.
conn = get_duckdb_connection()
print("✅ DuckDB connection established")

## Data Overview

In [None]:
# Parquet glob for each migrated table: one folder per table under the bronze prefix
tables = {
    name: f's3://solana-data/bronze/{name}/*.parquet'
    for name in (
        'token_list_v3',
        'token_whales',
        'wallet_trade_history',
        'token_metadata',
    )
}

# Count the rows behind every glob; a table whose query fails is reported and recorded as 0
print("=== TABLE SUMMARY ===")
row_counts = {}
for table_name in tables:
    count_sql = f"SELECT COUNT(*) FROM read_parquet('{tables[table_name]}')"
    try:
        (n_rows,) = conn.execute(count_sql).fetchone()
        row_counts[table_name] = n_rows
        print(f"{table_name}: {n_rows:,} rows")
    except Exception as err:
        print(f"{table_name}: Error - {err}")
        row_counts[table_name] = 0

In [None]:
# Bar chart of the per-table row counts, drawn through the Axes interface
fig, ax = plt.subplots(figsize=(10, 6))
bar_labels = list(row_counts)
bar_heights = [row_counts[label] for label in bar_labels]

rects = ax.bar(bar_labels, bar_heights, color=['#FF6B6B', '#4ECDC4', '#45B7D1', '#96CEB4'])
ax.set_title('Solana Tables - Row Counts', fontsize=16, fontweight='bold')
ax.set_xlabel('Tables', fontsize=12)
ax.set_ylabel('Number of Rows', fontsize=12)
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')

# Annotate every bar with its count, centred just above the bar top
for rect, height in zip(rects, bar_heights):
    x_centre = rect.get_x() + rect.get_width()/2
    ax.text(x_centre, rect.get_height() + 0.05, str(height),
            ha='center', va='bottom', fontweight='bold')

fig.tight_layout()
plt.show()

## Token List Analysis

In [None]:
# Read the whole token list table from the bronze layer into a DataFrame
token_list_sql = """
    SELECT * FROM read_parquet('s3://solana-data/bronze/token_list_v3/*.parquet')
"""
token_df = conn.execute(token_list_sql).df()

print("=== TOKEN LIST DATA ===")
print(f"Shape: {token_df.shape}")

# List the column names, one bullet each
print("\nColumns:")
for column_name in token_df.columns:
    print(f"  - {column_name}")

# Preview the first rows with rich display
print("\nSample data:")
display(token_df.head())

In [None]:
# Headline figures for the token list
print("=== TOKEN ANALYSIS ===")
print(f"Total tokens: {len(token_df)}")
print(f"Unique symbols: {token_df['symbol'].nunique()}")

# Descriptive statistics for each numeric column that is actually present
for stat_column, heading in (
    ('market_cap', "\nMarket Cap Summary:"),
    ('price', "\nPrice Summary:"),
):
    if stat_column in token_df.columns:
        print(heading)
        print(token_df[stat_column].describe())

# One line per token; the price falls back to 'N/A' when the column is absent
print("\nToken Details:")
for _, token_row in token_df.iterrows():
    token_symbol = token_row['symbol']
    token_name = token_row['name']
    token_price = token_row.get('price', 'N/A')
    print(f"  {token_symbol}: {token_name} - ${token_price}")

## Token Whales Analysis

In [None]:
# Fetch every whale-holder row from the bronze layer as a DataFrame
whales_sql = """
    SELECT * FROM read_parquet('s3://solana-data/bronze/token_whales/*.parquet')
"""
whales_df = conn.execute(whales_sql).df()

# Banner, frame dimensions, then a preview of the first rows
for banner_line in (
    "=== TOKEN WHALES DATA ===",
    f"Shape: {whales_df.shape}",
    "\nSample data:",
):
    print(banner_line)
display(whales_df.head())

In [None]:
# Headline whale figures
print("=== WHALE ANALYSIS ===")
print(f"Total whale positions: {len(whales_df)}")
for caption, id_column in (
    ("Unique wallets", 'wallet_address'),
    ("Unique tokens", 'token_address'),
):
    print(f"{caption}: {whales_df[id_column].nunique()}")

# Per-token totals: summed value, number of positions, summed percentage
print("\nHoldings by Token:")
holdings_agg_spec = {
    'holdings_value': ['sum', 'count'],
    'holdings_pct': 'sum',
}
token_holdings = whales_df.groupby('token_address').agg(holdings_agg_spec).round(2)
display(token_holdings)

# The three largest positions by USD value
print("\nTop Whale Positions:")
top_whale_columns = ['wallet_address', 'token_address', 'holdings_value', 'holdings_pct']
top_whales = whales_df.nlargest(3, 'holdings_value')[top_whale_columns]
for _, position in top_whales.iterrows():
    wallet_prefix = position['wallet_address'][:8]
    value_usd = position['holdings_value']
    share_pct = position['holdings_pct']
    print(f"  {wallet_prefix}... holds ${value_usd:,.2f} ({share_pct:.2f}%)")

In [None]:
# Four-panel overview of whale holdings on a 2x2 grid of axes
fig, axes = plt.subplots(2, 2, figsize=(12, 8))
(ax_pie, ax_pct), (ax_value, ax_count) = axes

# Top-left: share of total holdings value per token
token_totals = whales_df.groupby('token_address')['holdings_value'].sum()
pie_labels = [address[:8] + '...' for address in token_totals.index]
ax_pie.pie(token_totals.values, labels=pie_labels, autopct='%1.1f%%')
ax_pie.set_title('Holdings Value by Token')

# Top-right: histogram of holdings percentage
ax_pct.hist(whales_df['holdings_pct'], bins=10, alpha=0.7, color='skyblue')
ax_pct.set_title('Distribution of Holdings Percentage')
ax_pct.set_xlabel('Holdings %')
ax_pct.set_ylabel('Frequency')

# Bottom-left: histogram of holdings value
ax_value.hist(whales_df['holdings_value'], bins=10, alpha=0.7, color='lightcoral')
ax_value.set_title('Distribution of Holdings Value')
ax_value.set_xlabel('Holdings Value ($)')
ax_value.set_ylabel('Frequency')

# Bottom-right: number of whale positions per token
token_counts = whales_df['token_address'].value_counts()
bar_positions = range(len(token_counts))
bar_tick_labels = [address[:8] + '...' for address in token_counts.index]
ax_count.bar(bar_positions, token_counts.values, color='lightgreen')
ax_count.set_title('Number of Whale Positions by Token')
ax_count.set_xlabel('Token')
ax_count.set_ylabel('Number of Positions')
ax_count.set_xticks(bar_positions)
ax_count.set_xticklabels(bar_tick_labels, rotation=45)

fig.tight_layout()
plt.show()

## Trading History Analysis

In [None]:
# Read the wallet trade history table from the bronze layer into a DataFrame
trades_sql = """
    SELECT * FROM read_parquet('s3://solana-data/bronze/wallet_trade_history/*.parquet')
"""
trades_df = conn.execute(trades_sql).df()

# Banner, frame dimensions, then a preview of the first rows
for banner_line in (
    "=== TRADING HISTORY DATA ===",
    f"Shape: {trades_df.shape}",
    "\nSample data:",
):
    print(banner_line)
display(trades_df.head())

In [None]:
# Headline trade figures
print("=== TRADING ANALYSIS ===")
print(f"Total trades: {len(trades_df)}")
print(f"Unique wallets: {trades_df['wallet_address'].nunique()}")
print(f"Unique signatures: {trades_df['signature'].nunique()}")

# USD volume statistics, only when the column exists
if 'usd_value' in trades_df.columns:
    usd_values = trades_df['usd_value']
    print("\nTrading Volume:")
    print(f"  Total USD value: ${usd_values.sum():,.2f}")
    print(f"  Average trade size: ${usd_values.mean():,.2f}")
    print(f"  Largest trade: ${usd_values.max():,.2f}")

# One line per trade: source symbol, destination symbol, USD value
print("\nTrading Pairs:")
for _, trade_row in trades_df.iterrows():
    sold_symbol = trade_row.get('from_token_symbol', 'N/A')
    bought_symbol = trade_row.get('to_token_symbol', 'N/A')
    trade_usd = trade_row.get('usd_value', 0)
    print(f"  {sold_symbol} → {bought_symbol}: ${trade_usd:,.2f}")

## Token Metadata Analysis

In [None]:
# Read the token metadata table from the bronze layer into a DataFrame
metadata_sql = """
    SELECT * FROM read_parquet('s3://solana-data/bronze/token_metadata/*.parquet')
"""
metadata_df = conn.execute(metadata_sql).df()

# Banner, frame dimensions, then a preview of the first rows
for banner_line in (
    "=== TOKEN METADATA ===",
    f"Shape: {metadata_df.shape}",
    "\nSample data:",
):
    print(banner_line)
display(metadata_df.head())

In [None]:
# Metadata analysis
print("=== METADATA ANALYSIS ===")
total_tokens = len(metadata_df)
print(f"Total tokens with metadata: {total_tokens}")

# Check data completeness: for each optional field that exists as a column,
# report how many rows carry a non-null value and what share that is.
completeness = {}
for col in ['twitter', 'website', 'description', 'coingecko_id']:
    if col in metadata_df.columns:
        non_null = int(metadata_df[col].notna().sum())
        # An empty frame has no meaningful ratio; report 0.0% instead of
        # dividing by zero (which yields nan plus a runtime warning, or raises).
        pct = non_null / total_tokens * 100 if total_tokens else 0.0
        completeness[col] = f"{non_null}/{total_tokens} ({pct:.1f}%)"

print("\nData Completeness:")
for field, stats in completeness.items():
    print(f"  {field}: {stats}")

# Per-token listing; optional fields are printed only when present and non-null.
print("\nToken Details:")
for _, token in metadata_df.iterrows():
    print(f"\n{token['symbol']} - {token['name']}")
    if pd.notna(token.get('description')):
        description = token['description']
        # Keep the listing compact: show at most the first 100 characters.
        desc = description[:100] + '...' if len(description) > 100 else description
        print(f"  Description: {desc}")
    if pd.notna(token.get('website')):
        print(f"  Website: {token['website']}")
    if pd.notna(token.get('twitter')):
        print(f"  Twitter: {token['twitter']}")

## Cross-Table Analysis

In [None]:
# Token list left-joined to its metadata on token_address, so tokens without
# metadata are still listed
enriched_tokens_sql = """
    SELECT 
        t.symbol,
        t.name,
        t.market_cap,
        t.price,
        t.volume_24h,
        m.website,
        m.twitter,
        m.coingecko_id
    FROM read_parquet('s3://solana-data/bronze/token_list_v3/*.parquet') t
    LEFT JOIN read_parquet('s3://solana-data/bronze/token_metadata/*.parquet') m
        ON t.token_address = m.token_address
"""
enriched_tokens = conn.execute(enriched_tokens_sql).df()

print("=== ENRICHED TOKEN VIEW ===")
display(enriched_tokens)

In [None]:
# Whale holdings left-joined to the token list on token_address, largest
# holdings value first
whale_positions_sql = """
    SELECT 
        w.wallet_address,
        t.symbol,
        t.name,
        w.holdings_amount,
        w.holdings_value,
        w.holdings_pct,
        t.price
    FROM read_parquet('s3://solana-data/bronze/token_whales/*.parquet') w
    LEFT JOIN read_parquet('s3://solana-data/bronze/token_list_v3/*.parquet') t
        ON w.token_address = t.token_address
    ORDER BY w.holdings_value DESC
"""
whale_positions = conn.execute(whale_positions_sql).df()

print("=== WHALE POSITIONS WITH TOKEN INFO ===")
display(whale_positions)

## Data Quality Report

In [None]:
# Data quality summary
# Take one timestamp so the report header and the future-date check agree.
report_time = datetime.now()

print("=== DATA QUALITY REPORT ===")
# This is the moment the report ran, not when the migration happened, so it is
# labelled as such.
print(f"Report Generated: {report_time.strftime('%Y-%m-%d %H:%M:%S')}")
print(f"Total Tables: {len(tables)}")
print(f"Total Records: {sum(row_counts.values()):,}")

# A table passes when it has at least one row; tables that fail are collected
# so the closing message can reflect them.
print("\nTable Status:")
tables_without_rows = []
for table, count in row_counts.items():
    has_rows = count > 0
    if not has_rows:
        tables_without_rows.append(table)
    status = "✅" if has_rows else "❌"
    print(f"  {status} {table}: {count:,} rows")

# Test data integrity
print("\nData Integrity Checks:")

# Check for valid token addresses (NULL or empty string counts as invalid)
invalid_addresses = conn.execute("""
    SELECT COUNT(*) FROM read_parquet('s3://solana-data/bronze/token_list_v3/*.parquet')
    WHERE token_address IS NULL OR token_address = ''
""").fetchone()[0]
print(f"  Token addresses - Invalid: {invalid_addresses}")

# Check for valid whale holdings (neither amount nor value may be negative)
negative_holdings = conn.execute("""
    SELECT COUNT(*) FROM read_parquet('s3://solana-data/bronze/token_whales/*.parquet')
    WHERE holdings_amount < 0 OR holdings_value < 0
""").fetchone()[0]
print(f"  Whale holdings - Negative values: {negative_holdings}")

# Check for future trade dates
# NOTE(review): report_time is a naive local timestamp compared as an ISO string;
# confirm block_time is a timestamp stored in the same timezone.
future_trades = conn.execute(f"""
    SELECT COUNT(*) FROM read_parquet('s3://solana-data/bronze/wallet_trade_history/*.parquet')
    WHERE block_time > '{report_time.isoformat()}'
""").fetchone()[0]
print(f"  Trade dates - Future dates: {future_trades}")

# Only claim success when every table has rows and every integrity check is clean.
integrity_findings = {
    "invalid token addresses": invalid_addresses,
    "negative whale holdings": negative_holdings,
    "future-dated trades": future_trades,
}
failed_checks = {name: found for name, found in integrity_findings.items() if found}

if not tables_without_rows and not failed_checks:
    print("\n🎉 Data exploration complete!")
    print("All Solana tables successfully migrated and analyzed.")
else:
    print("\n⚠️ Data exploration complete, but issues were found:")
    if tables_without_rows:
        print(f"  Tables with no rows: {', '.join(tables_without_rows)}")
    for name, found in failed_checks.items():
        print(f"  {found:,} {name}")

In [None]:
# Close connection
# Run this last: the query cells above all execute against `conn`.
conn.close()
print("Database connection closed.")