Testing on 2022 Ohio data on Google Drive

In [None]:
# Mount Google Drive in the Colab runtime at /content/drive; the NDVI paths
# used later in this notebook live under that mount point.
from google.colab import drive
drive.mount('/content/drive')


In [None]:
import os
import h5py
import numpy as np
import pandas as pd

# Base NDVI folder
base_folder = "/content/drive/MyDrive/MlOps_Project/NDVI"

# List of years to process
years = ["2017", "2018", "2019", "2020", "2021", "2022"]

# Columns of the per-tile records, and the keys the final summary is grouped by.
# Declared explicitly so the DataFrames keep a stable schema even when empty.
record_columns = ["year", "fips", "date", "tile_index", "mean_ndvi"]
group_keys = ["year", "fips", "date"]


def tile_mean_ndvi(tile):
    """Return the NaN-ignoring mean NDVI of a single tile.

    Args:
        tile: Array indexed as ``tile[row, col, band]``. Band 0 is treated as
            Red and band 1 as NIR -- this band order is an assumption carried
            over from the original script (TODO confirm against the dataset).

    Returns:
        The mean of the per-pixel NDVI values, each clipped to [-1, 1].
    """
    red = tile[:, :, 0].astype(np.float32)
    nir = tile[:, :, 1].astype(np.float32)

    # The small epsilon keeps the division defined where nir + red == 0.
    ndvi = np.clip((nir - red) / (nir + red + 1e-5), -1, 1)
    return np.nanmean(ndvi)


def append_file_records(file_path, year, records):
    """Append one record per tile with a positive mean NDVI found in an H5 file.

    The file is walked as ``f[fips][date]["data"]``. Records are appended to
    *records* in place (rather than returned) so that everything collected
    before an unexpected mid-file error is kept by the caller.

    Args:
        file_path: Path of the ``.h5`` file to read.
        year: Year label stored in every record produced from this file.
        records: List that receives the record dicts.

    Raises:
        Exception: Whatever ``h5py`` raises when the file cannot be opened or
            its top-level structure cannot be walked. Errors while reading a
            single ``data`` dataset are reported and skipped instead.
    """
    filename = os.path.basename(file_path)

    with h5py.File(file_path, "r") as f:
        for fips in f.keys():
            for date in f[fips].keys():
                group = f[fips][date]

                if "data" not in group:
                    print(f"⚠️ No 'data' in {filename} -> {fips} / {date}")
                    continue

                try:
                    # Presumably (time, height, width, bands) -- TODO confirm.
                    data = group["data"][:]

                    for i, tile in enumerate(data):
                        mean_ndvi = tile_mean_ndvi(tile)

                        # Keep only tiles with positive mean NDVI; a NaN mean
                        # compares False and is dropped as well.
                        if mean_ndvi > 0:
                            records.append({
                                "year": year,
                                "fips": fips,
                                "date": date,
                                "tile_index": i,
                                "mean_ndvi": mean_ndvi,
                            })
                except Exception as e:
                    # Best effort: one unreadable dataset must not stop the run.
                    print(f"Error processing {filename} -> {fips}/{date}: {e}")


# Placeholder for NDVI results
ndvi_records = []

for year in years:
    year_folder = os.path.join(base_folder, year)

    # A missing year folder used to abort the whole run in os.listdir.
    if not os.path.isdir(year_folder):
        print(f"Skipping missing year folder: {year_folder}")
        continue

    # Loop over states inside year
    for state_folder in os.listdir(year_folder):
        state_path = os.path.join(year_folder, state_folder)

        if not os.path.isdir(state_path):
            continue  # Skip non-folder files

        # Loop over H5 files inside state folder
        for filename in os.listdir(state_path):
            if not filename.endswith(".h5"):
                continue

            file_path = os.path.join(state_path, filename)
            print(f"Processing {file_path}")

            try:
                append_file_records(file_path, year, ndvi_records)
            except Exception as e:
                print(f"Cannot open file {filename}: {e}")

# Now create final DataFrame (explicit columns keep the schema when empty)
output_df = pd.DataFrame(ndvi_records, columns=record_columns)

# Save final big CSV
csv_path = os.path.join("/content/drive/MyDrive/MlOps_Project/", "ndvi_summary_all_years.csv")

if output_df.empty:
    # Nothing to aggregate: keep an empty summary with the expected columns and
    # do not overwrite any existing CSV with an empty file.
    grouped_df = pd.DataFrame(columns=group_keys + ["mean_ndvi"])
    print("No NDVI records were collected; summary CSV was not written.")
else:
    # Group by year, fips, date and take the mean NDVI
    grouped_df = output_df.groupby(group_keys).agg({
        "mean_ndvi": "mean"
    }).reset_index()

    grouped_df.to_csv(csv_path, index=False)

    print(f"Final NDVI summary saved to: {csv_path}")
