In [4]:
from pathlib import Path
import gzip
import shutil

ROOT_DIR = Path("../data/raw")
# Threshold for the optional size filter inside the loop below; that filter is
# currently commented out, so every uncompressed CSV is processed.
MAX_SIZE_BYTES = 50 * 1024 * 1024       # 50 MB


def gzip_file(src_path: Path, dst_path: Path, compresslevel: int = 9) -> None:
    """Gzip *src_path* into *dst_path* without leaving a partial archive behind.

    The data is first written to a sibling ``<dst_path name>.part`` file and
    only renamed onto *dst_path* once the gzip stream has been fully written
    and closed. If anything goes wrong (including a keyboard/kernel
    interrupt), the temporary file is removed and the exception re-raised, so
    *dst_path* either does not exist or is a complete archive.

    Args:
        src_path: File to compress; it is read as raw bytes and left untouched.
        dst_path: Final location of the ``.gz`` archive.
        compresslevel: gzip compression level passed to ``gzip.open``.

    Raises:
        OSError: If the source cannot be read or the destination written.
    """
    tmp_path = dst_path.with_name(dst_path.name + ".part")
    try:
        with src_path.open("rb") as f_in, \
                gzip.open(tmp_path, "wb", compresslevel=compresslevel) as f_out:
            shutil.copyfileobj(f_in, f_out)
        # Rename only after both handles are closed, i.e. the archive is complete.
        tmp_path.replace(dst_path)
    except BaseException:
        # BaseException on purpose: an interrupted run must not leave a
        # truncated file that could later be mistaken for a finished one.
        tmp_path.unlink(missing_ok=True)
        raise


# Walk every *.csv under ROOT_DIR (recursively) and create a .csv.gz next to
# each one. The original CSV files are kept.
for csv_path in ROOT_DIR.rglob("*.csv"):
    # rglob() also yields directories whose name ends in ".csv"; skip them.
    if not csv_path.is_file():
        continue

    gz_path = csv_path.with_suffix(csv_path.suffix + ".gz")

    # Skip recompressing if .gz already exists
    if gz_path.exists():
        print(f"Skip (already compressed): {gz_path}")
        continue

    orig_size = csv_path.stat().st_size
    orig_mb = orig_size / (1024**2)

    # Optional size filter (disabled): uncomment to leave small files alone.
    # if orig_size <= MAX_SIZE_BYTES:
    #     print(f"Skip (small enough): {csv_path} ({orig_mb:.2f} MB)")
    #     continue

    print(f"Compressing {csv_path} ({orig_mb:.2f} MB) -> {gz_path}")

    gzip_file(csv_path, gz_path, compresslevel=9)

    comp_size = gz_path.stat().st_size
    comp_mb = comp_size / (1024**2)

    print(f"  Done. Compressed size: {comp_mb:.2f} MB "
          f"(saved {(orig_mb - comp_mb):.2f} MB)")

Skip (already compressed): ../data/raw/HD2024.csv.gz
Skip (already compressed): ../data/raw/All_Data_Combined_2004_2005_2006_2007_2008_2009_2010_2011_2012_2013_2014_2015_2016_2017_2018_2019_2020_2021_2022_2023.csv.gz
Skip (already compressed): ../data/raw/College_Scorecard_Raw_Data_05192025/MERGED2011_12_PP.csv.gz
Skip (already compressed): ../data/raw/College_Scorecard_Raw_Data_05192025/FieldOfStudyData1819_1920_PP.csv.gz
Skip (already compressed): ../data/raw/College_Scorecard_Raw_Data_05192025/MERGED1999_00_PP.csv.gz
Skip (already compressed): ../data/raw/College_Scorecard_Raw_Data_05192025/MERGED2006_07_PP.csv.gz
Skip (already compressed): ../data/raw/College_Scorecard_Raw_Data_05192025/MERGED2014_15_PP.csv.gz
Skip (already compressed): ../data/raw/College_Scorecard_Raw_Data_05192025/Most-Recent-Cohorts-Institution.csv.gz
Compressing ../data/raw/College_Scorecard_Raw_Data_05192025/MERGED2003_04_PP.csv (73.84 MB) -> ../data/raw/College_Scorecard_Raw_Data_05192025/MERGED2003_04_PP.cs