In [3]:
"""
DuckDB S3 Browser - List all parquet files in warehouse
"""

import duckdb
from dotenv import load_dotenv
import os

# Load environment variables
load_dotenv()

# Create DuckDB connection
con = duckdb.connect()

# Configure MinIO access from .env
MINIO_USER = os.getenv('MINIO_ROOT_USER')
MINIO_PASSWORD = os.getenv('MINIO_ROOT_PASSWORD')
MINIO_ENDPOINT = os.getenv('MINIO_ENDPOINT', 'localhost:9100')

con.execute(f"""
    CREATE SECRET minio_secret (
        TYPE S3,
        KEY_ID '{MINIO_USER}',
        SECRET '{MINIO_PASSWORD}',
        REGION 'eu-central-1',
        ENDPOINT '{MINIO_ENDPOINT}',
        USE_SSL false,
        URL_STYLE 'path'
    )
""")

print("=== Listing warehouse bucket contents ===")
files = con.execute("""
    SELECT * FROM glob('s3://warehouse/**/*.parquet')
""").df()
print(files)

# Close connection
con.close()

=== Listing warehouse bucket contents ===
                                                  file
0    s3://warehouse/warehouse/bronze_listens/data/u...
1    s3://warehouse/warehouse/bronze_listens/data/u...
2    s3://warehouse/warehouse/bronze_listens/data/u...
3    s3://warehouse/warehouse/bronze_listens/data/u...
4    s3://warehouse/warehouse/bronze_listens/data/u...
..                                                 ...
606  s3://warehouse/warehouse/silver_listens/data/u...
607  s3://warehouse/warehouse/silver_listens/data/u...
608  s3://warehouse/warehouse/silver_listens/data/u...
609  s3://warehouse/warehouse/silver_listens/data/u...
610  s3://warehouse/warehouse/silver_listens/data/u...

[611 rows x 1 columns]


### RUN AFTER MAKE JOBS

In [4]:
"""
DuckDB Analytics Query - Silver & Gold Layers
Quick queries for Iceberg tables via S3/MinIO
"""

import duckdb
from dotenv import load_dotenv
import os

# Load environment variables
load_dotenv()

# Create DuckDB connection
con = duckdb.connect()

# Configure MinIO access from .env
MINIO_USER = os.getenv('MINIO_ROOT_USER')
MINIO_PASSWORD = os.getenv('MINIO_ROOT_PASSWORD')
MINIO_ENDPOINT = os.getenv('MINIO_ENDPOINT', 'localhost:9100')

con.execute(f"""
    CREATE SECRET minio_secret (
        TYPE S3,
        KEY_ID '{MINIO_USER}',
        SECRET '{MINIO_PASSWORD}',
        REGION 'eu-central-1',
        ENDPOINT '{MINIO_ENDPOINT}',
        USE_SSL false,
        URL_STYLE 'path'
    )
""")

# Query silver layer
print("=== Silver Layer (Top 10 Recent Listens) ===")
silver_df = con.execute("""
    SELECT * 
    FROM read_parquet('s3://warehouse/warehouse/silver_listens/data/**/*.parquet')
    ORDER BY listened_at DESC
    LIMIT 10
""").df()
print(silver_df)

# Query gold layer with union_by_name to handle schema changes
print("\n=== Gold Layer (Top 10 Peak Days) ===")
gold_df = con.execute("""
    SELECT 
        user_name,
        listened_date,
        COALESCE(listen_count, count) as listen_count,
        unique_tracks,
        unique_artists
    FROM read_parquet(
        's3://warehouse/warehouse/gold/user_peaks/data/**/*.parquet',
        union_by_name=true
    )
    ORDER BY listen_count DESC NULLS LAST
    LIMIT 10
""").df()
print(gold_df)

# Get full counts
silver_count = con.execute("""
    SELECT COUNT(*) as count
    FROM read_parquet('s3://warehouse/warehouse/silver_listens/data/**/*.parquet')
""").fetchone()[0]

gold_count = con.execute("""
    SELECT COUNT(*) as count
    FROM read_parquet(
        's3://warehouse/warehouse/gold/user_peaks/data/**/*.parquet',
        union_by_name=true
    )
""").fetchone()[0]

print("\n=== Summary ===")
print(f"Total silver records: {silver_count:,}")
print(f"Total gold records: {gold_count:,}")

# User statistics from silver
print("\n=== Top 10 Users by Listen Count ===")
user_stats = con.execute("""
    SELECT 
        user_name,
        COUNT(*) as total_listens,
        COUNT(DISTINCT listened_date) as unique_days,
        MIN(listened_datetime) as first_listen,
        MAX(listened_datetime) as last_listen
    FROM read_parquet('s3://warehouse/warehouse/silver_listens/data/**/*.parquet')
    GROUP BY user_name
    ORDER BY total_listens DESC
    LIMIT 10
""").df()
print(user_stats)

# Close connection
con.close()

=== Silver Layer (Top 10 Recent Listens) ===
   listened_at                        recording_msid  \
0   1555286610  8968d9eb-e62f-4ecb-8f3e-96e16a1988b2   
1   1555286601  1d4d754c-2dc0-4fc0-874a-74a42cf0258d   
2   1555286560  1e1b2aa0-b2db-42ed-a8ba-89c303499408   
3   1555286560  1e1b2aa0-b2db-42ed-a8ba-89c303499408   
4   1555286549  c75d80b9-b7ef-47c7-8273-0830f7787ed8   
5   1555286462  f6e8e988-43b7-46ba-b827-0ab566eda2ad   
6   1555286387  950d2108-eae2-4116-b452-29ab58fbfa0a   
7   1555286383  f5d583f9-bd76-4f82-b109-b80007faab6b   
8   1555286378  283062c8-75e2-406a-8c5e-f38136aa5a68   
9   1555286378  283062c8-75e2-406a-8c5e-f38136aa5a68   

                    track_name                artist_name  \
0           Love Within Beauty             Vision Éternel   
1                Rollover D.J.                        Jet   
2  Love In the Time of Ecstacy              Withered Hand   
3  Love In the Time of Ecstacy              Withered Hand   
4                        Awake   

In [1]:
"""
DuckDB Query Script for Iceberg Tables via S3/MinIO
Queries silver and gold layers stored in Iceberg format
"""

import duckdb
from dotenv import load_dotenv
import os

# Load environment variables from .env
load_dotenv()

# Create DuckDB connection
con = duckdb.connect()

# Configure MinIO access from environment variables
MINIO_USER = os.getenv('MINIO_ROOT_USER', 'scalable')
MINIO_PASSWORD = os.getenv('MINIO_ROOT_PASSWORD', 'scalable123')
MINIO_ENDPOINT = os.getenv('MINIO_ENDPOINT', 'localhost:9100')

con.execute(f"""
    CREATE SECRET minio_secret (
        TYPE S3,
        KEY_ID '{MINIO_USER}',
        SECRET '{MINIO_PASSWORD}',
        REGION 'eu-central-1',
        ENDPOINT '{MINIO_ENDPOINT}',
        USE_SSL false,
        URL_STYLE 'path'
    )
""")

print("=" * 70)
print("ICEBERG DATA LAKE QUERY")
print("=" * 70)

# List all parquet files in warehouse
print("\n=== Warehouse Contents ===")
try:
    files = con.execute("""
        SELECT * FROM glob('s3://warehouse/**/*.parquet')
        LIMIT 20
    """).df()
    print(f"Found {len(files)} parquet files (showing first 20)")
    print(files)
except Exception as e:
    print(f"Could not list files: {e}")

# Query silver layer
print("\n=== Silver Layer (Top 10 Recent Listens) ===")
silver_df = con.execute("""
    SELECT * 
    FROM read_parquet('s3://warehouse/warehouse/silver_listens/data/**/*.parquet')
    ORDER BY listened_at DESC
    LIMIT 10
""").df()
print(silver_df)

# Query gold layer with union_by_name to handle schema evolution
print("\n=== Gold Layer (Top 10 Peak Days) ===")
gold_df = con.execute("""
    SELECT 
        user_name,
        listened_date,
        COALESCE(listen_count, count) as listen_count,
        unique_tracks,
        unique_artists
    FROM read_parquet(
        's3://warehouse/warehouse/gold/user_peaks/data/**/*.parquet',
        union_by_name=true
    )
    ORDER BY listen_count DESC NULLS LAST
    LIMIT 10
""").df()
print(gold_df)

# Get full counts
print("\n=== Summary ===")
silver_count = con.execute("""
    SELECT COUNT(*) as count
    FROM read_parquet('s3://warehouse/warehouse/silver_listens/data/**/*.parquet')
""").fetchone()[0]

gold_count = con.execute("""
    SELECT COUNT(*) as count
    FROM read_parquet(
        's3://warehouse/warehouse/gold/user_peaks/data/**/*.parquet',
        union_by_name=true
    )
""").fetchone()[0]

print(f"Total silver records: {silver_count:,}")
print(f"Total gold records: {gold_count:,}")

# User statistics from silver
print("\n=== Top 10 Users by Listen Count ===")
user_stats = con.execute("""
    SELECT 
        user_name,
        COUNT(*) as total_listens,
        COUNT(DISTINCT listened_date) as unique_days,
        MIN(listened_datetime) as first_listen,
        MAX(listened_datetime) as last_listen
    FROM read_parquet('s3://warehouse/warehouse/silver_listens/data/**/*.parquet')
    GROUP BY user_name
    ORDER BY total_listens DESC
    LIMIT 10
""").df()
print(user_stats)

# Most active listening days
print("\n=== Top 10 Most Active Days (Across All Users) ===")
active_days = con.execute("""
    SELECT 
        listened_date,
        COUNT(*) as total_listens,
        COUNT(DISTINCT user_name) as unique_users,
        COUNT(DISTINCT track_name) as unique_tracks
    FROM read_parquet('s3://warehouse/warehouse/silver_listens/data/**/*.parquet')
    GROUP BY listened_date
    ORDER BY total_listens DESC
    LIMIT 10
""").df()
print(active_days)

# Close connection
con.close()
print("\n" + "=" * 70)
print("Query completed successfully!")
print("=" * 70)

ICEBERG DATA LAKE QUERY

=== Warehouse Contents ===
Found 20 parquet files (showing first 20)
                                                 file
0   s3://warehouse/warehouse/bronze_listens/data/u...
1   s3://warehouse/warehouse/bronze_listens/data/u...
2   s3://warehouse/warehouse/bronze_listens/data/u...
3   s3://warehouse/warehouse/bronze_listens/data/u...
4   s3://warehouse/warehouse/bronze_listens/data/u...
5   s3://warehouse/warehouse/bronze_listens/data/u...
6   s3://warehouse/warehouse/bronze_listens/data/u...
7   s3://warehouse/warehouse/bronze_listens/data/u...
8   s3://warehouse/warehouse/bronze_listens/data/u...
9   s3://warehouse/warehouse/bronze_listens/data/u...
10  s3://warehouse/warehouse/bronze_listens/data/u...
11  s3://warehouse/warehouse/bronze_listens/data/u...
12  s3://warehouse/warehouse/bronze_listens/data/u...
13  s3://warehouse/warehouse/bronze_listens/data/u...
14  s3://warehouse/warehouse/bronze_listens/data/u...
15  s3://warehouse/warehouse/bronze_listen

In [5]:
"""
DuckDB Query Script for Iceberg Tables via S3/MinIO
Queries silver and gold layers stored in Iceberg format
"""
import duckdb
from dotenv import load_dotenv
import os

# Load environment variables from .env
load_dotenv()

# Create DuckDB connection
con = duckdb.connect()

# Configure MinIO access from environment variables
MINIO_USER = os.getenv('MINIO_ROOT_USER', 'scalable')
MINIO_PASSWORD = os.getenv('MINIO_ROOT_PASSWORD', 'scalable123')
MINIO_ENDPOINT = os.getenv('MINIO_ENDPOINT', 'localhost:9100')

con.execute(f"""
    CREATE SECRET minio_secret (
        TYPE S3,
        KEY_ID '{MINIO_USER}',
        SECRET '{MINIO_PASSWORD}',
        REGION 'eu-central-1',
        ENDPOINT '{MINIO_ENDPOINT}',
        USE_SSL false,
        URL_STYLE 'path'
    )
""")

print("=" * 70)
print("ICEBERG DATA LAKE QUERY")
print("=" * 70)

# List all parquet files in warehouse
print("\n=== Warehouse Contents ===")
try:
    files = con.execute("""
        SELECT * FROM glob('s3://warehouse/**/*.parquet')
        LIMIT 20
    """).df()
    print(f"Found {len(files)} parquet files (showing first 20)")
    print(files)
except Exception as e:
    print(f"Could not list files: {e}")

# Query silver layer - show actual data with all columns
print("\n=== Silver Layer (Top 10 Recent Listens - Full Details) ===")
silver_df = con.execute("""
    SELECT 
        user_name,
        artist_name,
        track_name,
        listened_datetime,
        listened_date,
        year,
        month,
        day,
        hour
    FROM read_parquet('s3://warehouse/warehouse/silver_listens/data/**/*.parquet')
    ORDER BY listened_datetime DESC
    LIMIT 10
""").df()
print(silver_df.to_string())

# Query gold layer with union_by_name to handle schema evolution
print("\n=== Gold Layer (Top 10 Peak Days) ===")
gold_df = con.execute("""
    SELECT 
        user_name,
        listened_date,
        COALESCE(listen_count, count) as listen_count,
        unique_tracks,
        unique_artists
    FROM read_parquet(
        's3://warehouse/warehouse/gold/user_peaks/data/**/*.parquet',
        union_by_name=true
    )
    ORDER BY listen_count DESC NULLS LAST
    LIMIT 10
""").df()
print(gold_df.to_string())

# Get full counts
print("\n=== Summary ===")
silver_count = con.execute("""
    SELECT COUNT(*) as count
    FROM read_parquet('s3://warehouse/warehouse/silver_listens/data/**/*.parquet')
""").fetchone()[0]

gold_count = con.execute("""
    SELECT COUNT(*) as count
    FROM read_parquet(
        's3://warehouse/warehouse/gold/user_peaks/data/**/*.parquet',
        union_by_name=true
    )
""").fetchone()[0]

print(f"Total silver records: {silver_count:,}")
print(f"Total gold records: {gold_count:,}")

# User statistics from silver
print("\n=== Top 10 Users by Listen Count ===")
user_stats = con.execute("""
    SELECT 
        user_name,
        COUNT(*) as total_listens,
        COUNT(DISTINCT listened_date) as unique_days,
        COUNT(DISTINCT artist_name) as unique_artists,
        COUNT(DISTINCT track_name) as unique_tracks,
        MIN(listened_datetime) as first_listen,
        MAX(listened_datetime) as last_listen
    FROM read_parquet('s3://warehouse/warehouse/silver_listens/data/**/*.parquet')
    GROUP BY user_name
    ORDER BY total_listens DESC
    LIMIT 10
""").df()
print(user_stats.to_string())

# Most active listening days
print("\n=== Top 10 Most Active Days (Across All Users) ===")
active_days = con.execute("""
    SELECT 
        listened_date,
        COUNT(*) as total_listens,
        COUNT(DISTINCT user_name) as unique_users,
        COUNT(DISTINCT track_name) as unique_tracks,
        COUNT(DISTINCT artist_name) as unique_artists
    FROM read_parquet('s3://warehouse/warehouse/silver_listens/data/**/*.parquet')
    GROUP BY listened_date
    ORDER BY total_listens DESC
    LIMIT 10
""").df()
print(active_days.to_string())

# Top 10 most played tracks
print("\n=== Top 10 Most Played Tracks ===")
top_tracks = con.execute("""
    SELECT 
        track_name,
        artist_name,
        COUNT(*) as play_count,
        COUNT(DISTINCT user_name) as unique_listeners
    FROM read_parquet('s3://warehouse/warehouse/silver_listens/data/**/*.parquet')
    GROUP BY track_name, artist_name
    ORDER BY play_count DESC
    LIMIT 10
""").df()
print(top_tracks.to_string())

# Top 10 most played artists
print("\n=== Top 10 Most Played Artists ===")
top_artists = con.execute("""
    SELECT 
        artist_name,
        COUNT(*) as play_count,
        COUNT(DISTINCT user_name) as unique_listeners,
        COUNT(DISTINCT track_name) as unique_tracks
    FROM read_parquet('s3://warehouse/warehouse/silver_listens/data/**/*.parquet')
    GROUP BY artist_name
    ORDER BY play_count DESC
    LIMIT 10
""").df()
print(top_artists.to_string())

# Listening patterns by hour
print("\n=== Listening Activity by Hour of Day ===")
hourly_patterns = con.execute("""
    SELECT 
        hour,
        COUNT(*) as total_listens,
        COUNT(DISTINCT user_name) as unique_users,
        ROUND(COUNT(*) * 100.0 / SUM(COUNT(*)) OVER (), 2) as percentage
    FROM read_parquet('s3://warehouse/warehouse/silver_listens/data/**/*.parquet')
    GROUP BY hour
    ORDER BY hour
""").df()
print(hourly_patterns.to_string())

# Monthly listening trends
print("\n=== Monthly Listening Trends ===")
monthly_trends = con.execute("""
    SELECT 
        year,
        month,
        COUNT(*) as total_listens,
        COUNT(DISTINCT user_name) as unique_users,
        COUNT(DISTINCT track_name) as unique_tracks
    FROM read_parquet('s3://warehouse/warehouse/silver_listens/data/**/*.parquet')
    GROUP BY year, month
    ORDER BY year, month
""").df()
print(monthly_trends.to_string())

# User diversity - users with most diverse listening
print("\n=== Top 10 Most Diverse Listeners (Unique Artists) ===")
diverse_listeners = con.execute("""
    SELECT 
        user_name,
        COUNT(DISTINCT artist_name) as unique_artists,
        COUNT(DISTINCT track_name) as unique_tracks,
        COUNT(*) as total_listens,
        ROUND(COUNT(DISTINCT artist_name) * 100.0 / COUNT(*), 2) as diversity_ratio
    FROM read_parquet('s3://warehouse/warehouse/silver_listens/data/**/*.parquet')
    GROUP BY user_name
    HAVING COUNT(*) >= 10  -- At least 10 listens
    ORDER BY unique_artists DESC
    LIMIT 10
""").df()
print(diverse_listeners.to_string())

# Close connection
con.close()
print("\n" + "=" * 70)
print("Query completed successfully!")
print("=" * 70)

ICEBERG DATA LAKE QUERY

=== Warehouse Contents ===
Found 20 parquet files (showing first 20)
                                                 file
0   s3://warehouse/warehouse/bronze_listens/data/u...
1   s3://warehouse/warehouse/bronze_listens/data/u...
2   s3://warehouse/warehouse/bronze_listens/data/u...
3   s3://warehouse/warehouse/bronze_listens/data/u...
4   s3://warehouse/warehouse/bronze_listens/data/u...
5   s3://warehouse/warehouse/bronze_listens/data/u...
6   s3://warehouse/warehouse/bronze_listens/data/u...
7   s3://warehouse/warehouse/bronze_listens/data/u...
8   s3://warehouse/warehouse/bronze_listens/data/u...
9   s3://warehouse/warehouse/bronze_listens/data/u...
10  s3://warehouse/warehouse/bronze_listens/data/u...
11  s3://warehouse/warehouse/bronze_listens/data/u...
12  s3://warehouse/warehouse/bronze_listens/data/u...
13  s3://warehouse/warehouse/bronze_listens/data/u...
14  s3://warehouse/warehouse/bronze_listens/data/u...
15  s3://warehouse/warehouse/bronze_listen