# Check Azure Blob Storage Folder Size

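This notebook calculates the total size of all blobs under the `rsna_drrs_and_nifti/` prefix (the "folder") in the `ct-big-data` container of an Azure Blob Storage account. Blob Storage has no real directories, so the folder is simply every blob whose name starts with that prefix.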

In [None]:
# Install required package
!pip install azure-storage-blob -q

In [None]:
import os
import time
import traceback

from azure.storage.blob import BlobServiceClient
from tqdm import tqdm

In [None]:
# Azure Blob Storage configuration
#
# The connection string embeds the storage account key, which grants full access to
# the account, so it is read from the environment instead of being stored in the
# notebook.
#
# SECURITY: an account key used to be hard-coded in this cell. It is still present
# in the notebook's history and in any shared copies, so it should be treated as
# compromised and rotated in the Azure portal.
CONNECTION_STRING = os.environ.get("AZURE_STORAGE_CONNECTION_STRING")
if not CONNECTION_STRING:
    # Fail here with an actionable message rather than with an obscure SDK error
    # in the connection cell.
    raise RuntimeError(
        "AZURE_STORAGE_CONNECTION_STRING is not set. Export it in the shell that "
        "launches Jupyter so the notebook can authenticate to Azure Blob Storage."
    )

# Container to inspect, and the "folder" (blob name prefix) inside it.
CONTAINER_NAME = "ct-big-data"
FOLDER_PREFIX = "rsna_drrs_and_nifti"

In [None]:
# Connect to Azure Blob Storage
print("üîó Connecting to Azure Blob Storage...")
blob_service_client = BlobServiceClient.from_connection_string(CONNECTION_STRING)
container_client = blob_service_client.get_container_client(CONTAINER_NAME)

# Constructing the clients does not contact the service, so issue one cheap request
# before reporting success. Invalid credentials or a missing container raise here
# instead of surfacing later in the middle of the listing.
container_client.get_container_properties()
print("‚úÖ Connected successfully!")

In [None]:
# Calculate total size (with immediate feedback)
#
# Streams the blob listing page by page, accumulating the byte size and count of
# every blob under the folder, printing one progress line per page and a summary at
# the end. Results are left in total_size_bytes / file_count (plus the MB/GB/TB
# conversions) for later cells.

# Blobs requested per listing call; one progress line is printed per page.
RESULTS_PER_PAGE = 100

# Binary unit divisors used for reporting.
BYTES_PER_MB = 1024 ** 2
BYTES_PER_GB = 1024 ** 3
BYTES_PER_TB = 1024 ** 4

# Blob Storage has no real directories: a "folder" is only a shared name prefix.
# Anchor the prefix with a trailing "/" so that blobs whose names merely begin with
# the same characters (e.g. "<folder>_v2/..." or "<folder>.zip") are not counted.
# An empty FOLDER_PREFIX means "the whole container", i.e. no prefix filter.
_folder = FOLDER_PREFIX.rstrip("/")
listing_prefix = f"{_folder}/" if _folder else None

print(f"\nüìä Calculating size of '{FOLDER_PREFIX}' folder...\n")
print("‚è≥ Starting to stream files from Azure... (please wait)\n")

total_size_bytes = 0
file_count = 0
# perf_counter is monotonic, so the elapsed time is unaffected by system clock
# adjustments during a long listing.
start_time = time.perf_counter()

try:
    # Use list_blobs with pagination for better streaming
    blob_pages = container_client.list_blobs(
        name_starts_with=listing_prefix,
        results_per_page=RESULTS_PER_PAGE,
    ).by_page()

    for page_num, page in enumerate(blob_pages, 1):
        page_size = 0
        page_count = 0

        for blob in page:
            if blob.name.endswith('/'):  # Skip directory markers
                continue
            # Update the running totals per blob (not per page) so that the
            # partial results reported on interruption are as complete as possible.
            total_size_bytes += blob.size
            file_count += 1
            page_size += blob.size
            page_count += 1

        # Show progress after each page (every ~RESULTS_PER_PAGE files)
        elapsed = time.perf_counter() - start_time
        size_gb = total_size_bytes / BYTES_PER_GB
        print(f"üìÑ Page {page_num}: {page_count} files, {page_size / BYTES_PER_MB:.1f} MB | "
              f"Total so far: {file_count} files, {size_gb:.2f} GB | "
              f"Time: {elapsed:.1f}s")

    # Final summary
    elapsed = time.perf_counter() - start_time
    total_size_mb = total_size_bytes / BYTES_PER_MB
    total_size_gb = total_size_bytes / BYTES_PER_GB
    total_size_tb = total_size_bytes / BYTES_PER_TB

    print("\n" + "="*70)
    print("üì¶ FOLDER SIZE SUMMARY")
    print("="*70)
    print(f"üìÅ Folder: {FOLDER_PREFIX}")
    print(f"üìä Total files: {file_count:,}")
    print("üíæ Total size:")
    print(f"     {total_size_bytes:,} bytes")
    print(f"     {total_size_mb:,.2f} MB")
    print(f"     {total_size_gb:,.2f} GB")
    # Only show the TB line once the folder is at least 1 TB.
    if total_size_tb >= 1:
        print(f"     {total_size_tb:,.2f} TB")
    print(f"‚è±Ô∏è  Processing time: {elapsed:.1f} seconds")
    print("="*70)

except KeyboardInterrupt:
    # Interrupting the kernel is an expected way to stop a long listing; report
    # what was accumulated so far instead of showing a traceback.
    print("\n\n‚ö†Ô∏è Interrupted by user!")
    print(f"üìä Partial results: {file_count:,} files, {total_size_bytes / BYTES_PER_GB:.2f} GB")
except Exception as e:
    # Best-effort reporting: show the partial count and the full traceback. The
    # exception is deliberately not re-raised, so the cell itself does not fail.
    print(f"\n‚ùå Error occurred: {e}")
    print(f"üìä Partial results: {file_count:,} files processed")
    traceback.print_exc()