Data curation: What is in the data folder

In [12]:
import pandas as pd
import os
from pathlib import Path
from natsort import natsorted, natsort_keygen
import numpy as np
import datetime as dt

In [13]:
DATA_DIR = os.environ.get('DATA_DIR', "/data/etienne.doumazane/theremia/hest_data")
DATA_DIR = Path(DATA_DIR)
natsorted(os.listdir(DATA_DIR))


['HEST_v1_1_0.csv',
 'MahmoodLab___hest',
 'README.md',
 '_data_etienne.doumazane_theremia_hest_data_MahmoodLab___hest_custom_config-patterns=%2A_1.0.0_94127ca856cb5f26aa6d5ab751be03921c2cc400b324e6285c015001d87154f7.lock',
 'cellvit_seg',
 'fig1_preview.jpg',
 'fig1a.jpg',
 'hest.py',
 'human_gene_db.parquet',
 'metadata',
 'patches',
 'patches_vis',
 'pixel_size_vis',
 'spatial_plots',
 'st',
 'thumbnails',
 'tissue_seg',
 'transcripts',
 'wsis',
 'xenium_seg']

In [14]:
natsorted(DATA_DIR.glob('*'))

[PosixPath('/data/etienne.doumazane/theremia/hest_data/HEST_v1_1_0.csv'),
 PosixPath('/data/etienne.doumazane/theremia/hest_data/MahmoodLab___hest'),
 PosixPath('/data/etienne.doumazane/theremia/hest_data/README.md'),
 PosixPath('/data/etienne.doumazane/theremia/hest_data/_data_etienne.doumazane_theremia_hest_data_MahmoodLab___hest_custom_config-patterns=%2A_1.0.0_94127ca856cb5f26aa6d5ab751be03921c2cc400b324e6285c015001d87154f7.lock'),
 PosixPath('/data/etienne.doumazane/theremia/hest_data/cellvit_seg'),
 PosixPath('/data/etienne.doumazane/theremia/hest_data/fig1_preview.jpg'),
 PosixPath('/data/etienne.doumazane/theremia/hest_data/fig1a.jpg'),
 PosixPath('/data/etienne.doumazane/theremia/hest_data/hest.py'),
 PosixPath('/data/etienne.doumazane/theremia/hest_data/human_gene_db.parquet'),
 PosixPath('/data/etienne.doumazane/theremia/hest_data/metadata'),
 PosixPath('/data/etienne.doumazane/theremia/hest_data/patches'),
 PosixPath('/data/etienne.doumazane/theremia/hest_data/patches_vis')

In [15]:
def list_files_excluding_zarr(root_dir):
    root_path = Path(root_dir)

    for dirpath, dirnames, filenames in os.walk(root_path):
        # Skip directories that end with .zarr by removing them from dirnames
        dirnames[:] = [d for d in dirnames if not d.endswith('.zarr')]

        # Yield files from the current directory
        for filename in filenames:
            yield Path(dirpath) / filename

def create_table_1(dir):
    file_paths = natsorted(list_files_excluding_zarr(dir))
    df = pd.DataFrame(file_paths, columns=["path"])
    df["name"] = df["path"].apply(lambda x: x.name)
    # df["size_GB"] = df["path"].apply(lambda x: np.round(x.stat().st_size / 1e9, 1))
    df["size_MB"] = df["path"].apply(lambda x: np.round(x.stat().st_size / 1e6, 1))
    # df["size_kB"] = df["path"].apply(lambda x: np.round(x.stat().st_size / 1e3, 1))
    df["relative_path"] = df["path"].apply(lambda x: str(x.relative_to(DATA_DIR)))
    df["mtime"] = df["path"].apply(lambda x: dt.datetime.fromtimestamp(x.stat().st_mtime).isoformat(timespec="minutes"))
    df = df.sort_values("name", key=natsort_keygen()).reset_index(drop=True)
    df = df.sort_values("size_MB", ascending=False)
    df["extension"] = df["path"].apply(lambda x: x.suffix.upper())
    return df[df.columns.difference(["path"])].copy()

In [16]:
df = create_table_1(DATA_DIR)
df

Unnamed: 0,extension,mtime,name,relative_path,size_MB
15190,.PARQUET,2024-12-03T22:29,TENX110_transcripts.parquet,transcripts/TENX110_transcripts.parquet,22052.2
15913,.PARQUET,2024-12-03T22:30,TENX159_transcripts.parquet,transcripts/TENX159_transcripts.parquet,15263.5
15109,.TIF,2024-12-04T15:42,TENX98.tif,wsis/TENX98.tif,15062.2
15715,.PARQUET,2024-12-03T22:28,TENX143_transcripts.parquet,transcripts/TENX143_transcripts.parquet,13957.2
15127,.TIF,2024-12-04T15:41,TENX99.tif,wsis/TENX99.tif,11784.3
...,...,...,...,...,...
2885,.JSON,2024-12-03T21:55,MISC106.json,metadata/MISC106.json,0.0
2897,.JSON,2024-12-03T21:55,MISC107.json,metadata/MISC107.json,0.0
2909,.JSON,2024-12-03T21:55,MISC108.json,metadata/MISC108.json,0.0
2849,.JSON,2024-12-03T21:55,MISC103.json,metadata/MISC103.json,0.0


In [17]:
df.extension.value_counts()

extension
.PNG        2580
.GEOJSON    2458
.JPG        2218
.PARQUET    1425
.ZIP        1359
.JSON       1230
.H5         1229
.TIF        1229
.H5AD       1229
.JPEG       1229
.LOCK          3
.CSV           1
.ARROW         1
.PY            1
.MD            1
               1
Name: count, dtype: int64

In [18]:
df.groupby("extension")["size_MB"].mean()

extension
              0.000000
.ARROW        0.000000
.CSV          0.500000
.GEOJSON    105.496094
.H5         232.722132
.H5AD        61.336859
.JPEG         0.130757
.JPG          0.229982
.JSON         0.000000
.LOCK         0.000000
.MD           0.000000
.PARQUET    138.905754
.PNG          0.564225
.PY           0.000000
.TIF        788.512937
.ZIP         18.531126
Name: size_MB, dtype: float64

In [19]:
list(df[df["extension"] == ".H5AD"]["relative_path"])

['st/TENX70.h5ad',
 'st/TENX72.h5ad',
 'st/TENX81.h5ad',
 'st/ZEN61.h5ad',
 'st/TENX73.h5ad',
 'st/TENX83.h5ad',
 'st/TENX78.h5ad',
 'st/TENX152.h5ad',
 'st/TENX82.h5ad',
 'st/TENX79.h5ad',
 'st/ZEN60.h5ad',
 'st/TENX84.h5ad',
 'st/TENX77.h5ad',
 'st/TENX74.h5ad',
 'st/TENX65.h5ad',
 'st/TENX71.h5ad',
 'st/MISC62.h5ad',
 'st/TENX86.h5ad',
 'st/NCBI793.h5ad',
 'st/NCBI682.h5ad',
 'st/TENX85.h5ad',
 'st/MISC63.h5ad',
 'st/NCBI681.h5ad',
 'st/NCBI776.h5ad',
 'st/TENX88.h5ad',
 'st/NCBI683.h5ad',
 'st/MISC39.h5ad',
 'st/MISC41.h5ad',
 'st/INT23.h5ad',
 'st/NCBI855.h5ad',
 'st/TENX69.h5ad',
 'st/INT27.h5ad',
 'st/MEND76.h5ad',
 'st/MISC40.h5ad',
 'st/TENX75.h5ad',
 'st/TENX16.h5ad',
 'st/TENX51.h5ad',
 'st/TENX87.h5ad',
 'st/TENX76.h5ad',
 'st/INT21.h5ad',
 'st/MEND78.h5ad',
 'st/TENX62.h5ad',
 'st/NCBI684.h5ad',
 'st/NCBI625.h5ad',
 'st/INT18.h5ad',
 'st/TENX91.h5ad',
 'st/MISC65.h5ad',
 'st/TENX45.h5ad',
 'st/TENX90.h5ad',
 'st/NCBI762.h5ad',
 'st/MISC66.h5ad',
 'st/MEND75.h5ad',
 'st/MEN