In [1]:
from pathlib import Path
import gzip
import shutil

# Directory searched recursively for gzip-compressed CSV files.
ROOT_DIR = Path("../data/raw")


def decompress_gz(gz_path, csv_path):
    """Decompress *gz_path* into *csv_path* without leaving partial output.

    The data is streamed into a sibling ``<name>.part`` file and renamed
    onto *csv_path* only after the whole gzip stream has been read
    successfully. The calling loop skips any ``.csv`` that already exists,
    so writing directly to the final path would let a truncated file from
    a failed or interrupted run be mistaken for a finished one forever.

    Args:
        gz_path: Path of the gzip-compressed source file.
        csv_path: Path where the decompressed file should end up.

    Raises:
        Whatever ``gzip``/file I/O raises (e.g. ``OSError``, ``EOFError``
        for a truncated archive). The partial file is removed first.
    """
    tmp_path = csv_path.with_name(csv_path.name + ".part")
    try:
        with gzip.open(gz_path, "rb") as f_in, tmp_path.open("wb") as f_out:
            shutil.copyfileobj(f_in, f_out)
        # Same-directory rename: the final name appears only when complete.
        tmp_path.replace(csv_path)
    except BaseException:
        # BaseException so a kernel interrupt (KeyboardInterrupt) also
        # cleans up the partial file before propagating.
        if tmp_path.exists():
            tmp_path.unlink()
        raise


for gz_path in ROOT_DIR.rglob("*.csv.gz"):
    # Target decompressed file
    csv_path = gz_path.with_suffix("")  # removes only the last suffix, so .csv.gz -> .csv

    # Skip if the .csv already exists
    if csv_path.exists():
        print(f"Skip (already decompressed): {csv_path}")
        continue

    comp_size = gz_path.stat().st_size
    comp_mb = comp_size / (1024**2)

    print(f"Decompressing {gz_path} ({comp_mb:.2f} MB) -> {csv_path}")

    decompress_gz(gz_path, csv_path)

    new_size = csv_path.stat().st_size
    new_mb = new_size / (1024**2)

    print(f"  Done. Decompressed size: {new_mb:.2f} MB "
          f"(expanded {(new_mb - comp_mb):.2f} MB)")

Skip (already decompressed): ../data/raw/HD2024.csv
Skip (already decompressed): ../data/raw/All_Data_Combined_2004_2005_2006_2007_2008_2009_2010_2011_2012_2013_2014_2015_2016_2017_2018_2019_2020_2021_2022_2023.csv
Skip (already decompressed): ../data/raw/College_Scorecard_Raw_Data_05192025/MERGED2020_21_PP.csv
Skip (already decompressed): ../data/raw/College_Scorecard_Raw_Data_05192025/MERGED2005_06_PP.csv
Skip (already decompressed): ../data/raw/College_Scorecard_Raw_Data_05192025/MERGED2007_08_PP.csv
Skip (already decompressed): ../data/raw/College_Scorecard_Raw_Data_05192025/MERGED2023_24_PP.csv
Skip (already decompressed): ../data/raw/College_Scorecard_Raw_Data_05192025/FieldOfStudyData1718_1819_PP.csv
Skip (already decompressed): ../data/raw/College_Scorecard_Raw_Data_05192025/MERGED2011_12_PP.csv
Skip (already decompressed): ../data/raw/College_Scorecard_Raw_Data_05192025/MERGED2015_16_PP.csv
Skip (already decompressed): ../data/raw/College_Scorecard_Raw_Data_05192025/MERGED2022