In [1]:
import os
from google.cloud import storage
from google.api_core.exceptions import NotFound

In [2]:
# --- Configuration ---
# Name of the GCS bucket to download from.
bucket_name = "stat288"
# Google Cloud project ID handed to the storage client.
project_id = "animated-way-451621-i3"
# Local directory that receives the downloaded files (a relative path is
# resolved against the current working directory).
local_destination_folder = "downloaded_gcs_data"
# --- End Configuration ---

In [3]:
def download_gcs_bucket(project_id, bucket_name, destination_folder):
    """Downloads all blobs from a GCS bucket to a local directory,
       preserving the folder structure.

    Each object is first written to a temporary ``<name>.part`` file next to
    its final location and only moved onto the final path once the transfer
    has finished. A failed or interrupted download therefore never leaves a
    truncated file under the real name and never clobbers an earlier good
    copy. Objects whose name would resolve outside ``destination_folder``
    (for example names containing ``..``) are skipped instead of written.

    Nothing is raised to the caller: a failure on a single object is printed
    and counted, and the loop continues; a failure while reaching or listing
    the bucket ends the run with a diagnostic message on stdout.

    Args:
        project_id (str): Your Google Cloud project ID.
        bucket_name (str): The name of the GCS bucket.
        destination_folder (str): The local path to download files into.
            Created if it does not exist.

    Returns:
        None
    """
    print(f"Starting download from gs://{bucket_name} in project '{project_id}'...")
    print(f"Local destination: '{os.path.abspath(destination_folder)}'")

    # Ensure the base destination directory exists
    os.makedirs(destination_folder, exist_ok=True)

    # Canonical form of the destination, used below to reject object names
    # that would land outside of it. The trailing separator on the prefix
    # prevents "dest" from matching a sibling such as "dest_other".
    destination_root = os.path.normcase(os.path.realpath(destination_folder))
    destination_prefix = os.path.join(destination_root, "")

    try:
        # Initialize the GCS client
        # The client uses Application Default Credentials (ADC)
        # automatically picked up from your gcloud login or environment.
        storage_client = storage.Client(project=project_id)

        # Get the bucket object
        bucket = storage_client.bucket(bucket_name)

        # List all blobs in the bucket
        # Use list_blobs() without prefix to get everything
        print(f"Fetching list of objects from bucket '{bucket_name}'...")
        blobs = bucket.list_blobs()
        download_count = 0
        skipped_count = 0

        for blob in blobs:
            # blob.name is the full path within the bucket and always uses '/'
            # (e.g., 'folder/subfolder/file.txt'). Joining the individual
            # segments yields native separators on every platform and stops a
            # leading '/' from discarding destination_folder.
            local_file_path = os.path.join(destination_folder, *blob.name.split('/'))

            # Refuse anything that resolves outside the destination directory.
            resolved_path = os.path.normcase(os.path.realpath(local_file_path))
            if resolved_path != destination_root and not resolved_path.startswith(destination_prefix):
                print(f"  Skipping (resolves outside destination): {blob.name}")
                skipped_count += 1
                continue

            # GCS represents empty folders often as zero-byte objects ending with '/'
            # Skip downloading these "folder markers" but ensure the directory exists
            if blob.name.endswith('/'):
                print(f"  Ensuring directory exists for: {blob.name}")
                os.makedirs(os.path.dirname(local_file_path), exist_ok=True)
                skipped_count += 1
                continue  # Don't try to download it as a file

            local_directory = os.path.dirname(local_file_path)
            temp_file_path = local_file_path + ".part"

            try:
                # Directory creation lives inside the per-object try so that
                # one unusable path does not abort the whole run.
                if not os.path.isdir(local_directory):
                    print(f"  Creating directory: {local_directory}")
                    os.makedirs(local_directory, exist_ok=True)

                print(f"  Downloading: {blob.name} -> {local_file_path}")
                blob.download_to_filename(temp_file_path)
                # Publish the file only after the transfer has completed.
                os.replace(temp_file_path, local_file_path)
                download_count += 1
            except Exception as e:
                print(f"  ERROR downloading {blob.name}: {e}")
                skipped_count += 1
            finally:
                # Also runs on KeyboardInterrupt; after a successful replace
                # the temp file is already gone and the removal is a no-op.
                try:
                    os.remove(temp_file_path)
                except OSError:
                    pass

        print("\n--------------------")
        print("Download Summary:")
        print(f"  Successfully downloaded: {download_count} files")
        print(f"  Skipped (folders/errors): {skipped_count} items")
        print("--------------------")

    except NotFound:
        print(f"Error: Bucket '{bucket_name}' not found in project '{project_id}'.")
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
        print("Please ensure you have authenticated (`gcloud auth application-default login`)")
        print("and that the specified project ID and bucket name are correct.")
        print("Also check network connectivity and permissions.")

In [3]:
def download_gcs_bucket_skip_existing(project_id, bucket_name, destination_folder):
    """Downloads all blobs from a GCS bucket to a local directory, skipping
       objects that already have a complete local copy.

    An existing local file is kept when its size matches the object's listed
    size. When the sizes differ the local file is treated as the leftover of
    an interrupted transfer and is fetched again. The size comparison is not
    attempted (and the file is simply kept) when the object reports no size,
    when it carries a Content-Encoding (the downloaded bytes may then
    legitimately differ in length from the stored object), or when the local
    path is not a regular file.

    Each object is first written to a temporary ``<name>.part`` file and only
    moved onto its final path once the transfer has finished, so a failed or
    interrupted download never leaves a truncated file that a later run would
    mistake for a finished one. Objects whose name would resolve outside
    ``destination_folder`` (for example names containing ``..``) are skipped.

    Nothing is raised to the caller: a failure on a single object is printed
    and counted, and the loop continues; a failure while reaching or listing
    the bucket ends the run with a diagnostic message on stdout.

    Args:
        project_id (str): Your Google Cloud project ID.
        bucket_name (str): The name of the GCS bucket.
        destination_folder (str): The local path to download files into.
            Created if it does not exist.

    Returns:
        None
    """
    print(f"Starting download from gs://{bucket_name} in project '{project_id}'...")
    print(f"Local destination: '{os.path.abspath(destination_folder)}'")

    # Ensure the base destination directory exists
    os.makedirs(destination_folder, exist_ok=True)

    # Canonical form of the destination, used below to reject object names
    # that would land outside of it. The trailing separator on the prefix
    # prevents "dest" from matching a sibling such as "dest_other".
    destination_root = os.path.normcase(os.path.realpath(destination_folder))
    destination_prefix = os.path.join(destination_root, "")

    try:
        # The client uses Application Default Credentials (ADC).
        storage_client = storage.Client(project=project_id)
        bucket = storage_client.bucket(bucket_name)
        print(f"Fetching list of objects from bucket '{bucket_name}'...")
        blobs = bucket.list_blobs()
        download_count = 0
        skipped_count = 0
        already_exist_count = 0  # Track files skipped because they exist

        for blob in blobs:
            # Object names always use '/'; joining the segments yields native
            # separators and stops a leading '/' from discarding
            # destination_folder.
            local_file_path = os.path.join(destination_folder, *blob.name.split('/'))

            # Refuse anything that resolves outside the destination directory.
            resolved_path = os.path.normcase(os.path.realpath(local_file_path))
            if resolved_path != destination_root and not resolved_path.startswith(destination_prefix):
                print(f"   Skipping (resolves outside destination): {blob.name}")
                skipped_count += 1
                continue

            # Zero-byte "folder marker" objects: create the directory only.
            if blob.name.endswith('/'):
                print(f"   Ensuring directory exists for: {blob.name}")
                os.makedirs(os.path.dirname(local_file_path), exist_ok=True)
                skipped_count += 1
                continue

            # --- Check if the local file already exists ---
            if os.path.exists(local_file_path):
                expected_size = getattr(blob, "size", None)
                size_is_comparable = (
                    expected_size is not None
                    and not getattr(blob, "content_encoding", None)
                    and os.path.isfile(local_file_path)
                )
                if not size_is_comparable or os.path.getsize(local_file_path) == expected_size:
                    print(f"   Skipping (already exists): {local_file_path}")
                    already_exist_count += 1
                    continue
                # A size mismatch means the local copy is incomplete or stale.
                print(f"   Re-downloading (local size differs from object size): {local_file_path}")
            # --- End Check ---

            local_directory = os.path.dirname(local_file_path)
            temp_file_path = local_file_path + ".part"

            try:
                # Directory creation lives inside the per-object try so that
                # one unusable path does not abort the whole run.
                if not os.path.isdir(local_directory):
                    print(f"   Creating directory: {local_directory}")
                    os.makedirs(local_directory, exist_ok=True)

                print(f"   Downloading: {blob.name} -> {local_file_path}")
                blob.download_to_filename(temp_file_path)
                # Publish the file only after the transfer has completed.
                os.replace(temp_file_path, local_file_path)
                download_count += 1
            except Exception as e:
                print(f"   ERROR downloading {blob.name}: {e}")
                skipped_count += 1
            finally:
                # Also runs on KeyboardInterrupt; after a successful replace
                # the temp file is already gone and the removal is a no-op.
                try:
                    os.remove(temp_file_path)
                except OSError:
                    pass

        print("\n--------------------")
        print("Download Summary:")
        print(f"   Successfully downloaded: {download_count} files")
        print(f"   Skipped (folders/errors): {skipped_count} items")
        print(f"   Skipped (already exists): {already_exist_count} files")
        print("--------------------")

    except NotFound:
        print(f"Error: Bucket '{bucket_name}' not found in project '{project_id}'.")
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
        print("Please ensure you have authenticated (`gcloud auth application-default login`)")
        print("and that the specified project ID and bucket name are correct.")
        print("Also check network connectivity and permissions.")


In [None]:
# Download every object in the configured bucket into the local folder.
download_gcs_bucket(
    project_id=project_id,
    bucket_name=bucket_name,
    destination_folder=local_destination_folder,
)

Starting download from gs://stat288 in project 'animated-way-451621-i3'...
Local destination: 'c:\Users\yuant\OneDrive\Desktop\STAT 288\project\downloaded_gcs_data'
Fetching list of objects from bucket 'stat288'...
  Creating directory: downloaded_gcs_data\data_dhs/senegal_1992
  Downloading: data_dhs/senegal_1992/senegal_1992_00000.json -> downloaded_gcs_data\data_dhs/senegal_1992/senegal_1992_00000.json
  Downloading: data_dhs/senegal_1992/senegal_1992_00000.tfrecord.gz -> downloaded_gcs_data\data_dhs/senegal_1992/senegal_1992_00000.tfrecord.gz
  Downloading: data_dhs/senegal_1992/senegal_1992_00001.json -> downloaded_gcs_data\data_dhs/senegal_1992/senegal_1992_00001.json
  Downloading: data_dhs/senegal_1992/senegal_1992_00001.tfrecord.gz -> downloaded_gcs_data\data_dhs/senegal_1992/senegal_1992_00001.tfrecord.gz
  Downloading: data_dhs/senegal_1992/senegal_1992_00002.json -> downloaded_gcs_data\data_dhs/senegal_1992/senegal_1992_00002.json
  Downloading: data_dhs/senegal_1992/senega

In [4]:
# Second pass over the bucket with the variant that skips files already
# present in the local folder.
download_gcs_bucket_skip_existing(
    project_id=project_id,
    bucket_name=bucket_name,
    destination_folder=local_destination_folder,
)

Starting download from gs://stat288 in project 'animated-way-451621-i3'...
Local destination: 'c:\Users\yuant\OneDrive\Desktop\STAT 288\project\downloaded_gcs_data'
Fetching list of objects from bucket 'stat288'...
   Skipping (already exists): downloaded_gcs_data\data_dhs/senegal_1992/senegal_1992_00000.json
   Skipping (already exists): downloaded_gcs_data\data_dhs/senegal_1992/senegal_1992_00000.tfrecord.gz
   Skipping (already exists): downloaded_gcs_data\data_dhs/senegal_1992/senegal_1992_00001.json
   Skipping (already exists): downloaded_gcs_data\data_dhs/senegal_1992/senegal_1992_00001.tfrecord.gz
   Skipping (already exists): downloaded_gcs_data\data_dhs/senegal_1992/senegal_1992_00002.json
   Skipping (already exists): downloaded_gcs_data\data_dhs/senegal_1992/senegal_1992_00002.tfrecord.gz
   Skipping (already exists): downloaded_gcs_data\data_dhs/senegal_1992/senegal_1992_00003.json
   Skipping (already exists): downloaded_gcs_data\data_dhs/senegal_1992/senegal_1992_00003.t