In [None]:
# Mount Google Drive at /content/drive. The google.colab module is only
# importable inside a Colab runtime; the paths under /content/drive/MyDrive
# used later in this file are only reachable after this mount succeeds.
from google.colab import drive
drive.mount('/content/drive')


In [None]:
import os
import pandas as pd

# Root of the HRRR dataset. The traversal below expects the layout
# <base_dir>/<year>/<state>/*.csv, so this must point at the "data" folder
# itself rather than at a single year/state sub-folder.
base_dir = "/content/drive/MyDrive/MlOps_Project/WRF-HRRR Computed Dataset/data"

# Output folder
output_folder = "/content/drive/MyDrive/MlOps_Project/"

# Target years (names of the year sub-folders to visit)
years = ["2017", "2018", "2019", "2020", "2021", "2022"]

# Columns that are folded into a single 'Date' column, and the full set of
# columns removed once 'Date' has been built.
DATE_PARTS = ['Year', 'Month', 'Day']
DROPPED_COLUMNS = DATE_PARTS + ['State', 'County', 'Daily/Monthly']


def load_hrrr_csv(file_path):
    """Read one HRRR CSV file and normalise its date columns.

    If the file has 'Year', 'Month' and 'Day' columns, they are combined into
    a 'Date' column formatted as 'YYYY-MM-DD', and the date parts plus
    'State', 'County' and 'Daily/Monthly' are dropped (columns that are
    absent are ignored). Files without all three date parts are returned
    unchanged.

    Args:
        file_path: Path of the CSV file to read.

    Returns:
        The loaded (and possibly transformed) DataFrame.

    Raises:
        Whatever ``pd.read_csv`` / ``pd.to_datetime`` raise for unreadable
        files or invalid date values.
    """
    df = pd.read_csv(file_path)

    if set(DATE_PARTS).issubset(df.columns):
        df['Date'] = pd.to_datetime(df[DATE_PARTS]).dt.strftime('%Y-%m-%d')
        df = df.drop(columns=DROPPED_COLUMNS, errors='ignore')

    return df


def collect_hrrr_records(data_root, target_years):
    """Load every CSV found under <data_root>/<year>/<state>/.

    Args:
        data_root: Directory that contains one sub-folder per year.
        target_years: Iterable of year folder names to visit.

    Returns:
        A list with one DataFrame per successfully loaded CSV file. Missing
        year folders, non-directory entries inside a year folder, non-CSV
        files and files that fail to load are skipped (failures are printed).
    """
    records = []

    for year in target_years:
        year_folder = os.path.join(data_root, year)

        # A year without data should not abort the whole run.
        if not os.path.isdir(year_folder):
            print(f"Year folder not found, skipping: {year_folder}")
            continue

        # Sorted listings make the row order of the output reproducible;
        # os.listdir returns entries in arbitrary order.
        for state_folder in sorted(os.listdir(year_folder)):
            state_path = os.path.join(year_folder, state_folder)

            if not os.path.isdir(state_path):
                continue  # Skip non-folder files

            # Read the CSV files of *this* state folder (not of data_root).
            for filename in sorted(os.listdir(state_path)):
                if not filename.endswith(".csv"):
                    continue

                file_path = os.path.join(state_path, filename)
                print(f"Processing {file_path}")

                # Best-effort batch load: one bad file is reported and
                # skipped instead of stopping the aggregation.
                try:
                    records.append(load_hrrr_csv(file_path))
                except Exception as e:
                    print(f"Cannot open file {filename}: {e}")

    return records


# Placeholder for all HRRR data
hrrr_records = collect_hrrr_records(base_dir, years)

# Now create final DataFrame
if hrrr_records:
    output_df = pd.concat(hrrr_records, ignore_index=True)

    # Save final big CSV
    csv_path = os.path.join(output_folder, "hrrr_summary_all_years.csv")
    output_df.to_csv(csv_path, index=False)

    print(f"Final HRRR summary saved to: {csv_path}")
else:
    print("No data processed.")
