# Data Download and Augmentation
A little notebook to do little notebook things. More specifically, this notebook downloads the observations and model data and removes some of the unneeded data from the HRRR files. 

In [1]:
#Import Modules
import requests
import xarray as xr
from datetime import datetime, timedelta
import os
import glob
import pandas as pd

## Observations Data Download
data archive found here https://mesonet.agron.iastate.edu/GIS/rasters.php?rid=4

In [2]:
#Create a time window for data (hourly steps from 2022-06-01 00:00 to 2022-08-31 23:00)
daterange = pd.date_range(datetime(2022, 6, 1, 0), datetime(2022, 8, 31, 23), freq= '1H')

#Create for loop of datetimes
for single_date in daterange:

    #Timestamp string shared by the request, the directory name and the file name.
    #It is built once from the datetime instead of being sliced back out of the url,
    #so a change to the url (e.g. another product name) cannot corrupt the paths.
    dstr = single_date.strftime("%Y%m%d%H00")

    #obtain the file for the MRMS lowest level reflectivity from the Iowa State website.
    #set the data and pathfile as well as create the directory.
    url = f'https://mesonet.agron.iastate.edu/cgi-bin/request/raster2netcdf.py?dstr={dstr}&prod=mrms_lcref'
    drt = f'/home/scratch/jcorner1/unidata/{dstr}'
    filepath = f'{drt}/mrms_lcref{dstr}.nc4'

    #Create the directory if it does not exist yet. exist_ok keeps the cell
    #re-runnable, and missing parent directories are created as well.
    os.makedirs(drt, exist_ok=True)

    #Get data from the website. A failed or timed-out request is reported and
    #skipped so that an error page is never saved under a .nc4 name.
    try:
        response = requests.get(url, timeout=60)
        response.raise_for_status()
    except requests.RequestException as err:
        print(f'Skipping {dstr}: {err}')
        continue

    #Write the data to a file; the with-block guarantees the handle is closed.
    with open(filepath, 'wb') as outfile:
        outfile.write(response.content)


## Model Data Download and Augmentation

information for HRRR https://rapidrefresh.noaa.gov/hrrr/

data archive https://console.cloud.google.com/storage/browser/high-resolution-rapid-refresh;tab=objects?prefix=&forceOnObjectsSortingFiltering=false

In [None]:
#array of all the forecast hours and variables to drop from files.
forecast_hours  = [3, 6, 9, 12, 15, 18, 24, 30, 36]
#NOTE(review): 'itng' looks like a typo of 'ltng' (already listed) -- TODO confirm.
#It is harmless because drop_vars is called with errors='ignore'.
var_drop = ['unknown', 'veril', 'hail', 'itng', 'tcolw', 'tcoli', 'tcc', 'ltng']

#Create a time window for data (6-hourly steps from 2022-06-01 00:00 to 2022-08-31 23:00)
daterange = pd.date_range(datetime(2022, 6, 1, 0), datetime(2022, 8, 31, 23), freq= '6H')

#Create for loop of datetimes
for single_date in daterange:

    #Directory for this time; created here so the cell does not depend on the
    #observations cell having been run first.
    drt = f'/home/scratch/jcorner1/unidata/{single_date.strftime("%Y%m%d%H00")}'
    os.makedirs(drt, exist_ok=True)
    print(single_date.strftime("%Y-%m-%dT%H:00"))

    #iterate through the forecast hours
    for hours in forecast_hours:

        #calculate the model run that will line up with the correct forecast hour.
        new_hour = single_date - timedelta(hours=hours)

        #Build the local file names once; the GRIB2 download and the trimmed
        #netCDF copy share the same stem.
        fxx = str(hours).zfill(2)
        stem = f'{drt}/hrrr_t{new_hour.strftime("%Y%m%dT%H00")}z_wrfsfcf{fxx}'
        grib_path = f'{stem}.grib2'
        nc_path = f'{stem}.nc4'

        #Get data from the website. A failed or timed-out request is reported and
        #skipped so that an error body is never written out and parsed as GRIB2.
        url = f'https://storage.googleapis.com/high-resolution-rapid-refresh/hrrr.{new_hour.strftime("%Y%m%d")}/conus/hrrr.t{new_hour.strftime("%H")}z.wrfsfcf{fxx}.grib2'
        try:
            response = requests.get(url, timeout=60)
            response.raise_for_status()
        except requests.RequestException as err:
            print(f'Skipping run {new_hour.strftime("%Y%m%dT%H00")} f{fxx}: {err}')
            continue

        #Write the download to disk; the with-block guarantees the handle is closed.
        with open(grib_path, 'wb') as grib_file:
            grib_file.write(response.content)

        #Open downloaded dataset; the with-block closes it even if the save fails.
        with xr.open_dataset(grib_path, filter_by_keys={'typeOfLevel': 'atmosphere'}) as ds:

            #drop_vars returns a NEW dataset and leaves ds untouched, so the result
            #must be kept -- calling it without assigning drops nothing.
            clean_copy = ds.drop_vars(var_drop, errors='ignore')
            clean_copy.to_netcdf(path=nc_path, format='NETCDF4')

        #Remove the old dataset
        os.remove(grib_path)
        

## Index File Remover

In [9]:
#Every entry under the data root, in sorted order.
dates = sorted(glob.glob('/home/scratch/jcorner1/unidata/*'))

#Walk each entry and delete the files matching *grib2*.idx inside it
#(presumably the index files left behind when the GRIB2 files were opened).
for date_dir in dates:
    for idx_path in glob.glob(f'{date_dir}/*grib2*.idx'):
        os.remove(idx_path)