In [19]:
from typing import Optional

import polars as pl
import pyarrow
from google.cloud import storage

In [20]:
# GCS bucket holding the bronze layer, and the load_date partition to inspect.
BUCKET_NAME = "nfl-data-bronze"
LOAD_DATE = "2025-09-28"

# Logical dataset name -> object path inside BUCKET_NAME.
# Every dataset follows the layout bronze/<dataset>/load_date=<LOAD_DATE>/<file>,
# so only the file name has to be spelled out per dataset.
DATASETS = {
    dataset: f'bronze/{dataset}/load_date={LOAD_DATE}/{filename}'
    for dataset, filename in (
        ('fantasy_ids', 'ff_playerids.parquet'),
        ('play_by_play', 'pbp_all_seasons.parquet'),
        ('player_stats', 'player_stats_all_seasons.parquet'),
        ('players', 'players.parquet'),
        ('rosters', 'rosters_all_seasons.parquet'),
        ('schedules', 'schedules_all_seasons.parquet'),
        ('team_stats', 'team_stats.parquet'),
    )
}

In [21]:
def list_available_files(bucket_name: str, prefix: str = ""):
    """Print and collect the parquet objects found under a GCS prefix.

    Args:
        bucket_name: Name of the GCS bucket to scan.
        prefix: Object-name prefix passed to the listing call; the default
            empty string lists the whole bucket.

    Returns:
        List of blob names ending in ``.parquet``, in listing order.
        Each one is also printed together with its size in MB.
    """
    gcs_bucket = storage.Client().bucket(bucket_name)

    print(f"📁 Files in gs://{bucket_name}/{prefix}:")
    print("-" * 60)

    parquet_names = []
    for entry in gcs_bucket.list_blobs(prefix=prefix):
        # Anything that is not a parquet file is ignored entirely.
        if not entry.name.endswith('.parquet'):
            continue
        megabytes = entry.size / (1024 * 1024)
        print(f"  📄 {entry.name} ({megabytes:.1f} MB)")
        parquet_names.append(entry.name)

    return parquet_names

In [22]:
def load_dataset_sample(bucket_name: str, file_path: str, sample_size: int = 10) -> Optional[dict]:
    """Load a parquet file from GCS and summarise it for inspection.

    Despite the name, the whole file is read into memory: the row count and
    the estimated size describe the full dataset. Only the first
    ``sample_size`` rows are kept in the returned summary.

    Args:
        bucket_name: GCS bucket that holds the file.
        file_path: Object path inside the bucket (without the ``gs://`` prefix).
        sample_size: Number of leading rows to keep under ``'sample_data'``.

    Returns:
        A dict with the keys ``'path'``, ``'total_rows'``, ``'total_columns'``,
        ``'columns'``, ``'dtypes'``, ``'sample_data'`` and ``'memory_usage_mb'``,
        or ``None`` when loading fails for any reason (the error is printed,
        not raised).
    """
    try:
        gcs_path = f"gs://{bucket_name}/{file_path}"
        print(f"\n🔍 Loading sample from: {file_path}")

        # Full read, not a sample: the totals below need the complete frame.
        df = pl.read_parquet(gcs_path)

        # Get dataset info
        info = {
            'path': file_path,
            'total_rows': df.height,
            'total_columns': df.width,
            'columns': df.columns,
            'dtypes': dict(df.schema),
            'sample_data': df.head(sample_size),
            'memory_usage_mb': df.estimated_size('mb')
        }

        print(f"  ✅ Total rows: {info['total_rows']:,}")
        print(f"  ✅ Total columns: {info['total_columns']}")
        print(f"  ✅ Estimated size: {info['memory_usage_mb']:.1f} MB")

        return info

    except Exception as e:
        # Deliberately best-effort: a failed load is reported and signalled
        # with None so the caller can try another path.
        print(f"  ❌ Error loading {file_path}: {e}")
        return None

In [27]:
def _rank_alternative_paths(dataset_name: str, available_files: list) -> list:
    """Return fallback candidate paths for a dataset, best match first.

    Paths containing the full dataset name come before paths that only share
    one of its underscore-separated keywords; listing order is kept within
    each group.
    """
    # Empty fragments (e.g. from a double underscore) would match every path.
    keywords = [keyword for keyword in dataset_name.split('_') if keyword]
    full_matches, keyword_matches = [], []
    for path in available_files:
        lowered = path.lower()
        if dataset_name in lowered:
            full_matches.append(path)
        elif any(keyword in lowered for keyword in keywords):
            keyword_matches.append(path)
    return full_matches + keyword_matches


def inspect_all_datasets(bucket_name: str, datasets: dict, sample_size: int = 10):
    """Load and inspect all datasets - Pure Polars version.

    Lists the parquet files in the bucket, then loads every dataset at its
    expected path. When that load fails, the bucket listing is searched for
    alternative paths and the best-ranked one is tried instead. For each
    dataset that loads, the first 10 column dtypes and a sample of at most
    8 columns are printed.

    Args:
        bucket_name: GCS bucket to inspect.
        datasets: Mapping of dataset name -> expected object path.
        sample_size: Number of rows to request per dataset sample.

    Returns:
        Dict mapping dataset name -> info dict for every dataset that loaded.
        Empty when the bucket listing contains no parquet files.
    """
    print("=" * 70)
    print("🏈 NFL DATA INSPECTOR")
    print("=" * 70)

    # First, let's see what files are actually available
    print("\n📋 AVAILABLE FILES:")
    available_files = list_available_files(bucket_name)

    if not available_files:
        print("❌ No parquet files found. Check your bucket name and ensure data has been loaded.")
        return {}

    print(f"\n🔎 INSPECTING DATASETS (showing top {sample_size} rows each):")
    print("=" * 70)

    dataset_info = {}

    for dataset_name, expected_path in datasets.items():
        print(f"\n📊 DATASET: {dataset_name.upper()}")
        print("-" * 50)

        # Try the expected path first
        info = load_dataset_sample(bucket_name, expected_path, sample_size)

        if info is None:
            # If expected path fails, try to find alternative paths.
            # Candidates are ranked so that a file naming this dataset wins
            # over one that merely shares a keyword such as "stats".
            print("  🔍 Searching for alternative paths...")
            possible_paths = _rank_alternative_paths(dataset_name, available_files)

            if possible_paths:
                print("  📂 Found possible alternatives:")
                for path in possible_paths[:3]:  # Show up to 3 alternatives
                    print(f"    - {path}")

                # Try the best-ranked alternative
                info = load_dataset_sample(bucket_name, possible_paths[0], sample_size)

        if info:
            dataset_info[dataset_name] = info

            # Display column info
            print(f"\n  📋 COLUMNS ({len(info['columns'])}):")
            for col, dtype in list(info['dtypes'].items())[:10]:  # Show first 10 columns
                print(f"    • {col}: {dtype}")
            if len(info['dtypes']) > 10:
                print(f"    ... and {len(info['dtypes']) - 10} more columns")

            # Display sample data using pure Polars (NO PANDAS)
            print(f"\n  📊 SAMPLE DATA (top {sample_size} rows):")

            sample_df = info['sample_data']  # This is already a Polars DataFrame

            # Limit columns for display if too many
            if len(sample_df.columns) > 8:
                display_cols = sample_df.columns[:8]
                sample_display = sample_df.select(display_cols)
                print(f"    (Showing first 8 of {len(sample_df.columns)} columns)")
            else:
                sample_display = sample_df

            # Use Polars' built-in display (NO .to_pandas() call)
            print(sample_display)

        print("\n" + "=" * 50)

    return dataset_info

In [31]:
# Load every configured dataset once; the rest of this cell only reports on
# the info dicts returned here.
all_dataset_info = inspect_all_datasets(BUCKET_NAME, DATASETS, sample_size=10)

# Summary
print("\n📈 SUMMARY:")
print("=" * 50)

total_rows = sum(info['total_rows'] for info in all_dataset_info.values())
total_size_mb = sum(info['memory_usage_mb'] for info in all_dataset_info.values())

print(f"✅ Successfully loaded {len(all_dataset_info)} datasets")
print(f"📊 Total records across all datasets: {total_rows:,}")
print(f"💾 Total estimated size: {total_size_mb:.1f} MB")

print("\n📋 Dataset breakdown:")
for name, info in all_dataset_info.items():
    print(f"  • {name}: {info['total_rows']:,} rows, {info['total_columns']} cols, {info['memory_usage_mb']:.1f} MB")

print("\n🎯 Ready for silver layer processing!")

# Extended Overview - Columns and Head(3) for each dataset
print("\n" + "=" * 80)
print("📊 DETAILED DATASET OVERVIEW")
print("=" * 80)

for dataset_name, info in all_dataset_info.items():
    print(f"\n🏈 {dataset_name.upper()}")
    print("-" * 60)

    print(f"📏 Shape: {info['total_rows']:,} rows × {info['total_columns']} columns")
    print(f"📁 Path: {info['path']}")

    # Unlike the inspector above, list every column with its dtype.
    print(f"\n📋 COLUMNS ({info['total_columns']}):")
    for i, (col, dtype) in enumerate(info['dtypes'].items(), 1):
        print(f"  {i:2d}. {col:30} | {dtype}")

    print("\n📊 SAMPLE DATA (first 3 rows):")
    sample_data = info['sample_data'].head(3)  # Get just 3 rows

    # If too many columns, show first 8 (same limit as inspect_all_datasets)
    if len(sample_data.columns) > 8:
        display_cols = sample_data.columns[:8]
        display_df = sample_data.select(display_cols)
        print(f"    (Showing first 8 of {len(sample_data.columns)} columns)")
        print(display_df)
    else:
        print(sample_data)

    print("\n" + "=" * 80)

print("\n🚀 Overview complete! All datasets ready for analysis.")

🏈 NFL DATA INSPECTOR

📋 AVAILABLE FILES:
📁 Files in gs://nfl-data-bronze/:
------------------------------------------------------------
  📄 bronze/fantasy_ids/load_date=2025-09-28/ff_playerids.parquet (0.9 MB)
  📄 bronze/play_by_play/load_date=2025-09-28/pbp_2023.parquet (12.8 MB)
  📄 bronze/play_by_play/load_date=2025-09-28/pbp_2024.parquet (12.8 MB)
  📄 bronze/play_by_play/load_date=2025-09-28/pbp_2025.parquet (2.4 MB)
  📄 bronze/play_by_play/load_date=2025-09-28/pbp_all_seasons.parquet (291.1 MB)
  📄 bronze/player_stats/load_date=2025-09-28/player_stats_all_seasons.parquet (10.2 MB)
  📄 bronze/players/load_date=2025-09-27/players.parquet (3.2 MB)
  📄 bronze/players/load_date=2025-09-28/players.parquet (2.0 MB)
  📄 bronze/rosters/load_date=2025-09-28/rosters_all_seasons.parquet (2.8 MB)
  📄 bronze/schedules/load_date=2025-09-28/schedules_all_seasons.parquet (0.3 MB)
  📄 bronze/team_stats/load_date=2025-09-28/team_stats.parquet (0.9 MB)

🔎 INSPECTING DATASETS (showing top 10 rows each):

In [38]:
# Names of the datasets that loaded successfully; inspect_all_datasets only
# adds an entry when a load returned an info dict.
all_dataset_info.keys()

dict_keys(['fantasy_ids', 'play_by_play', 'player_stats', 'players', 'rosters', 'schedules', 'team_stats'])

In [45]:
# Print every column name of the fantasy_ids dataset, one per line.
fantasy_ids_columns = all_dataset_info['fantasy_ids']['columns']
for column_name in fantasy_ids_columns:
    print(column_name)

mfl_id
sportradar_id
fantasypros_id
gsis_id
pff_id
sleeper_id
nfl_id
espn_id
yahoo_id
fleaflicker_id
cbs_id
pfr_id
cfbref_id
rotowire_id
rotoworld_id
ktc_id
stats_id
stats_global_id
fantasy_data_id
swish_id
name
merge_name
position
team
birthdate
age
draft_year
draft_round
draft_pick
draft_ovr
twitter_username
height
weight
college
db_season
