In [1]:
# imports to look in "leap-persistent" google cloud storage
import gcsfs
# `fs` is the GCS filesystem handle reused by the cells in this notebook
fs = gcsfs.GCSFileSystem()

# looking in a cloud storage directory

### To look at contents in a directory, use '.ls' and put your path in the parentheses. 

### The path will always start with " gs://leap-persistent/ "

We insert your username and a path associated with the default runs

In [2]:
your_username = 'dhruvc99'
base_path = f'gs://leap-persistent/{your_username}'

# List the immediate contents of base_path. Note: fs.ls is NOT recursive here
# (the output is a single entry); use print_tree / fs.find for a full walk.
# detail=False yields plain path strings; refresh=True presumably bypasses the
# cached listing -- TODO confirm against the gcsfs docs.
all_files = fs.ls(base_path, detail=False, refresh=True)

# Print one path per line
for f in all_files:
    print(f)

leap-persistent/dhruvc99/dhruvc99


In [3]:
def print_tree(fs, path, indent=0):
    """
    Print an indented tree of everything found under `path`.

    Each entry returned by `fs.ls(path)` is printed by its last path
    component, indented two spaces per nesting level. Directories are walked
    recursively, except those whose name ends in '.zarr', which are printed
    but not expanded.

    Parameters
    ----------
    fs : filesystem object providing `ls(path)` and `isdir(path)`
    path : str, location to list
    indent : int, current nesting depth (callers normally leave this at 0)
    """
    padding = "  " * indent
    for entry in fs.ls(path):
        print(padding + entry.split('/')[-1])
        # Only descend into real directories; .zarr stores are left collapsed
        should_descend = fs.isdir(entry) and not entry.endswith('.zarr')
        if not should_descend:
            continue
        print_tree(fs, entry, indent + 1)

# Show the tree under the user's folder (.zarr stores are listed but not expanded)
print_tree(fs, f'gs://leap-persistent/{your_username}')

dhruvc99
  pco2_residual
    baseline
      post02_xgb
        metrics
          xgb_test_performance_200401-202312.csv
          xgb_unseen_performance_200401-202312.csv
        reconstructions
          ACCESS-ESM1-5
            member_r10i1p1f1
              recon_pCO2_ACCESS-ESM1-5_member_r10i1p1f1_mon_1x1_200401_202312.zarr
              recon_pCO2residual_ACCESS-ESM1-5_member_r10i1p1f1_mon_1x1_200401_202312.zarr
            member_r2i1p1f1
              recon_pCO2_ACCESS-ESM1-5_member_r2i1p1f1_mon_1x1_200401_202312.zarr
              recon_pCO2residual_ACCESS-ESM1-5_member_r2i1p1f1_mon_1x1_200401_202312.zarr
            member_r5i1p1f1
              recon_pCO2_ACCESS-ESM1-5_member_r5i1p1f1_mon_1x1_200401_202312.zarr
              recon_pCO2residual_ACCESS-ESM1-5_member_r5i1p1f1_mon_1x1_200401_202312.zarr
          CanESM5
            member_r1i1p1f1
              recon_pCO2_CanESM5_member_r1i1p1f1_mon_1x1_200401_202312.zarr
              recon_pCO2residual_CanESM5_member_r1i1p1f1

# getting file sizes

### if we want to get info for a file on the cloud storage:

In [4]:
# Call blob.make_public() on every object stored under
# <owner_username>/<owner_username>/pco2_residual/ in the "leap-persistent" bucket.
owner_username = 'dhruvc99'

# Guard flag: any value other than -1 lets the block below run
runthiscell = 1

from google.cloud import storage

if runthiscell != -1:
    client = storage.Client()
    bucket = client.bucket("leap-persistent")
    
    prefix = f"{owner_username}/{owner_username}/pco2_residual/"
    blobs = bucket.list_blobs(prefix=prefix)
    
    seen_dirs = set()  # Track top-level directories (immediate subdirectories of `prefix`) already reported

    for blob in blobs:
        # Extract the path relative to `prefix`
        relative_path = blob.name[len(prefix):]
        top_level_dir = relative_path.split("/")[0]  # Get first component

        try:
            # Make the file public
            blob.make_public()
            
            # Only print if it's a new top-level directory.
            # Note: this fires after the FIRST successful object of that directory,
            # not once the whole directory has been processed.
            if top_level_dir not in seen_dirs:
                seen_dirs.add(top_level_dir)
                print(f"Made public: {top_level_dir}")
        except Exception as e:
            # Best effort: report the failing object and keep going with the rest
            print(f"Failed to make public: {blob.name}")
            print(e)

Made public: baseline
Made public: custom_loss
Failed to make public: dhruvc99/dhruvc99/pco2_residual/custom_loss/post02_xgb/reconstructions/CanESM5/member_r1i1p1f1/recon_pCO2residual_CanESM5_member_r1i1p1f1_mon_1x1_200401_202312.zarr/net_mask/6.2.2
503 PATCH https://storage.googleapis.com/storage/v1/b/leap-persistent/o/dhruvc99%2Fdhruvc99%2Fpco2_residual%2Fcustom_loss%2Fpost02_xgb%2Freconstructions%2FCanESM5%2Fmember_r1i1p1f1%2Frecon_pCO2residual_CanESM5_member_r1i1p1f1_mon_1x1_200401_202312.zarr%2Fnet_mask%2F6.2.2?projection=full&prettyPrint=false: We encountered an internal error. Please try again.
Made public: latitude_split
Made public: ocean_split
Failed to make public: dhruvc99/dhruvc99/pco2_residual/ocean_split/post02_xgb/reconstructions/CanESM5/member_r1i1p1f1/recon_pCO2residual_CanESM5_member_r1i1p1f1_mon_1x1_200401_202312.zarr/net_mask/4.2.3
503 PATCH https://storage.googleapis.com/storage/v1/b/leap-persistent/o/dhruvc99%2Fdhruvc99%2Fpco2_residual%2Focean_split%2Fpost02_xgb%

In [9]:
# Retry blob.make_public() for the individual objects that failed in the bulk
# pass (the failures reported there were HTTP 503 responses).
# NOTE: relies on `bucket` created by the make-public cell; run that cell first.
failed_cases = [
    'dhruvc99/dhruvc99/pco2_residual/custom_loss/post02_xgb/reconstructions/CanESM5/member_r1i1p1f1/recon_pCO2residual_CanESM5_member_r1i1p1f1_mon_1x1_200401_202312.zarr/net_mask/6.2.2',
    'dhruvc99/dhruvc99/pco2_residual/ocean_split/post02_xgb/reconstructions/CanESM5/member_r1i1p1f1/recon_pCO2residual_CanESM5_member_r1i1p1f1_mon_1x1_200401_202312.zarr/net_mask/4.2.3',
    'dhruvc99/dhruvc99/pco2_residual/ocean_split/post02_xgb/reconstructions/MPI-ESM1-2-LR/member_r15i1p1f1/recon_pCO2residual_MPI-ESM1-2-LR_member_r15i1p1f1_mon_1x1_200401_202312.zarr/pCO2_recon_full/1.0.0'
]

for failed_path in failed_cases:
    # Build the blob handle from the path being retried (the original code
    # referenced an undefined `file_path` here).
    blob = bucket.blob(failed_path)
    try:
        # Make the file public
        blob.make_public()
        # Report every successful retry by object name; the earlier per-directory
        # bookkeeping (`top_level_dir` / `seen_dirs`) is stale in this cell and
        # would have suppressed the message.
        print(f"Made public: {blob.name}")
    except Exception as e:
        # Best effort: report the failure and continue with the remaining paths
        print(f"Failed to make public: {blob.name}")
        print(e)

In [10]:
def get_directory_size(fs, path):
    """
    Return the combined size of everything found beneath `path`.

    Every entry reported by `fs.find(path)` is checked with `fs.isdir`;
    entries that are not directories contribute `fs.size(entry)` to the total.

    Parameters
    ----------
    fs : filesystem object providing `find`, `isdir` and `size`
    path : str, location to measure

    Returns
    -------
    Sum of the individual sizes (0 when nothing is found).
    """
    return sum(
        fs.size(entry)
        for entry in fs.find(path)  # recursive listing
        if not fs.isdir(entry)      # only count files, not directories
    )

# Usage
# Measure the user's whole folder and report it in GB and MB (1024-based units)
size_bytes = get_directory_size(fs, f'gs://leap-persistent/{your_username}')
size_mb = size_bytes / 1024 ** 2
size_gb = size_bytes / 1024 ** 3
print(f"Total size: {size_gb:.2f} GB ({size_mb:.2f} MB)")

Total size: 15.44 GB (15812.88 MB)


In [4]:
# Safety switch for the guarded deletion cell: -1 disables it, ANY other value lets it run
runthiscell = -1

In [5]:
# DESTRUCTIVE: deletes every object under the listed pco2_residual/<folder>/ prefixes
# of <your_username>/<your_username>/ in the "leap-persistent" bucket.
# Runs whenever runthiscell is anything other than -1 (not only when it equals 1).
from google.cloud import storage

if runthiscell != -1:
    client = storage.Client()
    bucket = client.bucket("leap-persistent")
    
    # List of folders to delete
    folders_to_delete = [
        "weighted-samples-diff-params",
        "nmse",
    ]
    
    for folder in folders_to_delete:
        # The trailing "/" keeps the prefix from also matching sibling folders
        # whose names merely start with `folder`
        prefix = f"{your_username}/{your_username}/pco2_residual/{folder}/"
        print(f"\nDeleting contents of: {prefix}")
        
        blobs = bucket.list_blobs(prefix=prefix)
        
        # Delete all blobs in the folder
        blob_count = 0  # counts successful deletions only
        for blob in blobs:
            try:
                blob.delete()
                blob_count += 1
            except Exception as e:
                # Best effort: report the failing object and continue
                print(f"Failed to delete: {blob.name}")
                print(e)
        
        print(f"Deleted {blob_count} files from {folder}")
else:
    print("Set runthiscell = 1 to execute the deletion")


Deleting contents of: dhruvc99/dhruvc99/pco2_residual/weighted-samples-diff-params/
Deleted 7418 files from weighted-samples-diff-params

Deleting contents of: dhruvc99/dhruvc99/pco2_residual/nmse/
Deleted 7420 files from nmse


In [4]:
# a random csv! fs.info returns the object's metadata as a dict (including its 'size')
fs.info('gs://leap-persistent/galenmckinley/test_folder/test.csv')

{'kind': 'storage#object',
 'id': 'leap-persistent/galenmckinley/test_folder/test.csv/1743448790690071',
 'selfLink': 'https://www.googleapis.com/storage/v1/b/leap-persistent/o/galenmckinley%2Ftest_folder%2Ftest.csv',
 'mediaLink': 'https://storage.googleapis.com/download/storage/v1/b/leap-persistent/o/galenmckinley%2Ftest_folder%2Ftest.csv?generation=1743448790690071&alt=media',
 'name': 'leap-persistent/galenmckinley/test_folder/test.csv',
 'bucket': 'leap-persistent',
 'generation': '1743448790690071',
 'metageneration': '1',
 'contentType': 'application/octet-stream',
 'storageClass': 'STANDARD',
 'size': 17,
 'md5Hash': 'owQ8WisrmPuOUx3Yq+PjjA==',
 'crc32c': 'JUM0Fg==',
 'etag': 'CJfS6e6EtYwDEAE=',
 'timeCreated': '2025-03-31T19:19:50.693Z',
 'updated': '2025-03-31T19:19:50.693Z',
 'timeStorageClassUpdated': '2025-03-31T19:19:50.693Z',
 'timeFinalized': '2025-03-31T19:19:50.693Z',
 'type': 'file',
 'mtime': datetime.datetime(2025, 3, 31, 19, 19, 50, 693000, tzinfo=datetime.timezon

### if we want specifically the size, we use 'fs.du()' which stands for "disk usage":

In [5]:
# For this single file the result equals the 'size' field reported by fs.info (17)
fs.du('gs://leap-persistent/galenmckinley/test_folder/test.csv')

17

### just "17" is unclear though -> this function will make this clearer with units!

In [6]:
def convert_to_human_readable(size):
    """
    Converts disk usage size to 'human readable' format!

    Parameters
    ----------
    size : int or float
        Size in bytes (e.g. the value returned by `fs.du`).

    Returns
    -------
    str
        The size with two decimals and a 1024-based unit, e.g. '17.00 B' or
        '20.85 MB'. Values of 1024 TB or more are still reported in TB
        (previously the function fell off the end of the loop and returned
        None for them).
    """
    units = ['B', 'KB', 'MB', 'GB', 'TB']

    # Step up one unit at a time while the value is too large for the current one
    for unit in units[:-1]:
        if size < 1024:
            return f"{size:.2f} {unit}"
        size /= 1024

    # Largest unit available: always return a string, even for huge inputs
    return f"{size:.2f} {units[-1]}"

In [7]:
# Format the 17 reported by fs.du for the csv
convert_to_human_readable(17)

'17.00 B'

### that csv is 17 bytes!

### here's another example, but with a .zarr file:

In [8]:
# Same call on a .zarr path (a "directory" as far as the file system is concerned)
fs.du('gs://leap-persistent/galenmckinley/pco2_residual/nmse/post02_xgb/reconstructions/CanESM5/member_r1i1p1f1/recon_pCO2residual_CanESM5_member_r1i1p1f1_mon_1x1_200401_202312.zarr')

166750349

In [9]:
# NOTE(review): this hard-coded value does not match the fs.du output shown for the
# .zarr path (166750349); pass the fs.du result directly to keep the two in sync.
convert_to_human_readable(21858071)

'20.85 MB'

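That .zarr file is 20.85 megabytes, based on the size 21858071 passed in above. (Note: the `fs.du` output currently shown for this path is 166750349 bytes, which would be about 159.03 MB; re-run both cells together to keep the numbers consistent.)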

# Deleting a file
To remove a file, you will use 'fs.rm(path)'

# ***Be very careful*** 

Double-check that you are removing exactly what you intend to remove, and that you are not deleting anything in someone else's directory.

In [11]:
# do not actually run this code, please!

# fs.rm('gs://leap-persistent/galenmckinley/test_folder/test.csv')

### Things are more complicated with .zarr files because .zarr files are considered "directories" by the file system
### If deleting a .zarr file, you will just need to add the argument 'recursive=True' inside the parentheses

In [12]:
# fs.rm('gs://leap-persistent/....../filename.zarr', recursive=True)