In [10]:
import os
import zipfile
import glob
import re
import shutil

import os
import zipfile
import shutil

def extract_zip_to_same_folder(zip_path,extract_dir):
    """Unpack the archive at ``zip_path`` into ``extract_dir``.

    Directory entries (names ending in ``/``) and any member whose name
    contains ``__MACOSX`` are left out. Despite the function's name, the
    destination is whatever ``extract_dir`` the caller passes.

    A file that cannot be read as a ZIP archive is reported on stdout and
    otherwise ignored; the function returns ``None`` in every case.
    """
    try:
        with zipfile.ZipFile(zip_path, 'r') as archive:
            # Select the wanted members first, then extract them in one call.
            wanted = [
                member
                for member in archive.namelist()
                if not ('__MACOSX' in member or member.endswith('/'))
            ]
            archive.extractall(extract_dir, members=wanted)
    except zipfile.BadZipFile:
        print(f"❌ Invalid ZIP file: {zip_path}")
    else:
        print(f"✅ Extracted: {zip_path}")

def find_all_csvs(folder_path, extract_dir):
    """
    Recursively find all CSV files under ``folder_path``.

    Every valid ZIP archive met on the way is unpacked into ``extract_dir``
    with ``extract_zip_to_same_folder`` and then deleted. The tree is
    re-scanned until a pass extracts nothing, so archives that were
    themselves unpacked from another archive are processed as well.
    Directories whose path contains ``__MACOSX`` are skipped.

    A ``.zip`` file that is not a valid archive is reported on stdout and
    left on disk instead of being deleted.

    Args:
        folder_path: Root directory to scan.
        extract_dir: Directory that receives the contents of extracted ZIPs.
            Only files that end up under ``folder_path`` are returned.

    Returns:
        A list of CSV paths (``os.path.join(root, name)``), each listed once.
        The ``.csv`` / ``.zip`` suffixes are matched case-insensitively.
    """
    # Paths already reported as invalid; they stay on disk, so they must be
    # remembered to keep the re-scan loop from looping on them forever.
    invalid_zips = set()

    extracted_any = True
    while extracted_any:
        extracted_any = False
        for root, _dirs, files in os.walk(folder_path):
            if '__MACOSX' in root:
                continue

            for file in files:
                if not file.lower().endswith('.zip'):
                    continue

                full_path = os.path.join(root, file)
                if full_path in invalid_zips:
                    continue

                # Validate first: the extraction helper swallows BadZipFile,
                # so removing unconditionally would destroy a corrupt archive.
                if not zipfile.is_zipfile(full_path):
                    print(f"❌ Invalid ZIP file (kept on disk): {full_path}")
                    invalid_zips.add(full_path)
                    continue

                extract_zip_to_same_folder(full_path, extract_dir)
                # The archive has been unpacked; drop it so it is not
                # extracted again on the next pass.
                os.remove(full_path)
                extracted_any = True

    # One sweep after all extraction is done: each path is seen exactly once,
    # so no de-duplication is needed.
    csv_files = []
    for root, _dirs, files in os.walk(folder_path):
        if '__MACOSX' in root:
            continue
        csv_files.extend(
            os.path.join(root, file)
            for file in files
            if file.lower().endswith('.csv')
        )

    return csv_files



# Source directory holding the downloaded ZIP archives
zip_folder = "..//tripdata_zips"

# Root of the extracted data; one sub-folder per year is created below
base_output_folder = "..//extracted_tripdata4"
os.makedirs(base_output_folder, exist_ok=True)

# First four-digit year in the range 2013-2025 found in a file name
year_pattern = re.compile(r"(201[3-9]|202[0-5])")

# Every *.zip sitting directly inside zip_folder
zip_files = glob.glob(os.path.join(zip_folder, "*.zip"))

for zip_file in zip_files:
    base_name = os.path.basename(zip_file)

    # The archive's file name decides which year folder it belongs to
    match = year_pattern.search(base_name)
    if match is None:
        print(f"No valid year found in {base_name}, skipping.")
        continue
    year = match[0]

    # Destination folder for this year (shared by all archives of that year)
    year_folder = os.path.join(base_output_folder, year)
    os.makedirs(year_folder, exist_ok=True)

    # Unpack the archive, then unpack any nested archives and list the CSVs
    extract_zip_to_same_folder(zip_file, year_folder)
    csv_files = find_all_csvs(year_folder, year_folder)
    print(f"  → Found {len(csv_files)} CSV file(s) in {year_folder}")

    # Flatten the structure: every CSV ends up directly in the year folder
    for csv_path in csv_files:
        filename = os.path.basename(csv_path)
        new_path = os.path.join(year_folder, filename)

        # Already in place (source and destination are the same file)
        if os.path.abspath(csv_path) == os.path.abspath(new_path):
            continue

        shutil.move(csv_path, new_path)
        print(f"    Moved {csv_path} → {new_path}")


✅ Extracted: ..//tripdata_zips\2013-citibike-tripdata.zip
  → Found 34 CSV file(s) in ..//extracted_tripdata4\2013
    Moved ..//extracted_tripdata4\2013\2013-citibike-tripdata\201306-citibike-tripdata.csv → ..//extracted_tripdata4\2013\201306-citibike-tripdata.csv
    Moved ..//extracted_tripdata4\2013\2013-citibike-tripdata\201307-citibike-tripdata.csv → ..//extracted_tripdata4\2013\201307-citibike-tripdata.csv
    Moved ..//extracted_tripdata4\2013\2013-citibike-tripdata\201308-citibike-tripdata.csv → ..//extracted_tripdata4\2013\201308-citibike-tripdata.csv
    Moved ..//extracted_tripdata4\2013\2013-citibike-tripdata\201309-citibike-tripdata.csv → ..//extracted_tripdata4\2013\201309-citibike-tripdata.csv
    Moved ..//extracted_tripdata4\2013\2013-citibike-tripdata\201310-citibike-tripdata.csv → ..//extracted_tripdata4\2013\201310-citibike-tripdata.csv
    Moved ..//extracted_tripdata4\2013\2013-citibike-tripdata\201311-citibike-tripdata.csv → ..//extracted_tripdata4\2013\201311-c

    Moved ..//extracted_tripdata4\2016\2016-citibike-tripdata\12_December\201612-citibike-tripdata_1.csv → ..//extracted_tripdata4\2016\201612-citibike-tripdata_1.csv
    Moved ..//extracted_tripdata4\2016\2016-citibike-tripdata\1_January\201601-citibike-tripdata_1.csv → ..//extracted_tripdata4\2016\201601-citibike-tripdata_1.csv
    Moved ..//extracted_tripdata4\2016\2016-citibike-tripdata\2_February\201602-citibike-tripdata_1.csv → ..//extracted_tripdata4\2016\201602-citibike-tripdata_1.csv
    Moved ..//extracted_tripdata4\2016\2016-citibike-tripdata\3_March\201603-citibike-tripdata_1.csv → ..//extracted_tripdata4\2016\201603-citibike-tripdata_1.csv
    Moved ..//extracted_tripdata4\2016\2016-citibike-tripdata\4_April\201604-citibike-tripdata_1.csv → ..//extracted_tripdata4\2016\201604-citibike-tripdata_1.csv
    Moved ..//extracted_tripdata4\2016\2016-citibike-tripdata\4_April\201604-citibike-tripdata_2.csv → ..//extracted_tripdata4\2016\201604-citibike-tripdata_2.csv
    Moved ../

    Moved ..//extracted_tripdata4\2018\2018-citibike-tripdata\201812-citibike-tripdata.csv → ..//extracted_tripdata4\2018\201812-citibike-tripdata.csv
    Moved ..//extracted_tripdata4\2018\2018-citibike-tripdata\10_October\201810-citibike-tripdata_1.csv → ..//extracted_tripdata4\2018\201810-citibike-tripdata_1.csv
    Moved ..//extracted_tripdata4\2018\2018-citibike-tripdata\10_October\201810-citibike-tripdata_2.csv → ..//extracted_tripdata4\2018\201810-citibike-tripdata_2.csv
    Moved ..//extracted_tripdata4\2018\2018-citibike-tripdata\11_November\201811-citibike-tripdata_1.csv → ..//extracted_tripdata4\2018\201811-citibike-tripdata_1.csv
    Moved ..//extracted_tripdata4\2018\2018-citibike-tripdata\11_November\201811-citibike-tripdata_2.csv → ..//extracted_tripdata4\2018\201811-citibike-tripdata_2.csv
    Moved ..//extracted_tripdata4\2018\2018-citibike-tripdata\12_December\201812-citibike-tripdata_1.csv → ..//extracted_tripdata4\2018\201812-citibike-tripdata_1.csv
    Moved ..//ex

✅ Extracted: ..//extracted_tripdata4\2020\2020-citibike-tripdata\202003-citibike-tripdata.zip
✅ Extracted: ..//extracted_tripdata4\2020\2020-citibike-tripdata\202004-citibike-tripdata.zip
✅ Extracted: ..//extracted_tripdata4\2020\2020-citibike-tripdata\202005-citibike-tripdata.zip
✅ Extracted: ..//extracted_tripdata4\2020\2020-citibike-tripdata\202006-citibike-tripdata.zip
✅ Extracted: ..//extracted_tripdata4\2020\2020-citibike-tripdata\202007-citibike-tripdata.zip
✅ Extracted: ..//extracted_tripdata4\2020\2020-citibike-tripdata\202008-citibike-tripdata.zip
✅ Extracted: ..//extracted_tripdata4\2020\2020-citibike-tripdata\202009-citibike-tripdata.zip
✅ Extracted: ..//extracted_tripdata4\2020\2020-citibike-tripdata\202010-citibike-tripdata.zip
✅ Extracted: ..//extracted_tripdata4\2020\2020-citibike-tripdata\202011-citibike-tripdata.zip
✅ Extracted: ..//extracted_tripdata4\2020\2020-citibike-tripdata\202012-citibike-tripdata.zip
  → Found 27 CSV file(s) in ..//extracted_tripdata4\2020
✅ E

  → Found 25 CSV file(s) in ..//extracted_tripdata4\2016
✅ Extracted: ..//tripdata_zips\JC-201606-citibike-tripdata.csv.zip
  → Found 26 CSV file(s) in ..//extracted_tripdata4\2016
✅ Extracted: ..//tripdata_zips\JC-201607-citibike-tripdata.csv.zip
  → Found 27 CSV file(s) in ..//extracted_tripdata4\2016
✅ Extracted: ..//tripdata_zips\JC-201608-citibike-tripdata.csv.zip
  → Found 28 CSV file(s) in ..//extracted_tripdata4\2016
✅ Extracted: ..//tripdata_zips\JC-201609-citibike-tripdata.csv.zip
  → Found 29 CSV file(s) in ..//extracted_tripdata4\2016
✅ Extracted: ..//tripdata_zips\JC-201610-citibike-tripdata.csv.zip
  → Found 30 CSV file(s) in ..//extracted_tripdata4\2016
✅ Extracted: ..//tripdata_zips\JC-201611-citibike-tripdata.csv.zip
  → Found 31 CSV file(s) in ..//extracted_tripdata4\2016
✅ Extracted: ..//tripdata_zips\JC-201612-citibike-tripdata.csv.zip
  → Found 32 CSV file(s) in ..//extracted_tripdata4\2016
✅ Extracted: ..//tripdata_zips\JC-201701-citibike-tripdata.csv.zip
  → Foun

  → Found 37 CSV file(s) in ..//extracted_tripdata4\2022
✅ Extracted: ..//tripdata_zips\JC-202202-citibike-tripdata.csv.zip
  → Found 38 CSV file(s) in ..//extracted_tripdata4\2022
✅ Extracted: ..//tripdata_zips\JC-202203-citibike-tripdata.csv.zip
  → Found 39 CSV file(s) in ..//extracted_tripdata4\2022
✅ Extracted: ..//tripdata_zips\JC-202204-citibike-tripdata.csv.zip
  → Found 40 CSV file(s) in ..//extracted_tripdata4\2022
✅ Extracted: ..//tripdata_zips\JC-202205-citibike-tripdata.csv.zip
  → Found 41 CSV file(s) in ..//extracted_tripdata4\2022
✅ Extracted: ..//tripdata_zips\JC-202206-citibike-tripdata.csv.zip
  → Found 42 CSV file(s) in ..//extracted_tripdata4\2022
✅ Extracted: ..//tripdata_zips\JC-202207-citbike-tripdata.csv.zip
  → Found 43 CSV file(s) in ..//extracted_tripdata4\2022
✅ Extracted: ..//tripdata_zips\JC-202208-citibike-tripdata.csv.zip
  → Found 44 CSV file(s) in ..//extracted_tripdata4\2022
✅ Extracted: ..//tripdata_zips\JC-202209-citibike-tripdata.csv.zip
  → Found