# Ingest NCEP GFS 0.25 Degree Data for 6 hour forecasts. 

#### 1.) Conda package installations to environment and importing appropriate libraries. 

In [2]:
# conda install -c conda-forge gdal
# conda install -c conda-forge geopandas
# conda install -c conda-forge earthpy
# conda install -c conda-forge cloudpathlib
# conda install -c conda-forge pyhdf
# conda install -c anaconda basemap

#conda install -c conda-forge xarray
#conda install -c conda-forge ipywidgets
#conda install -c conda-forge cartopy
## For IO dependencies in xarray 
#conda install -c conda-forge xarray dask netCDF4 bottleneck
#conda install -c conda-forge cfgrib
#conda install -c conda-forge pygrib
#conda install -c yt87 pywgrib2_xr

#conda install -c conda-forge pyarrow

In [3]:
#Make sure we are in right conda env. 
!jupyter kernelspec list

Available kernels:
  python3    /home/ec2-user/.conda/envs/capstone/share/jupyter/kernels/python3


In [4]:
#Import Packages. 
import sys
import os
import requests
import warnings
import glob

import matplotlib.pyplot as plt
import seaborn as sns
import numpy.ma as ma
import numpy as np
#from shapely.geometry import mapping, box
import geopandas as gpd
import earthpy as et
import earthpy.spatial as es
import earthpy.plot as ep
from osgeo import gdal
import pandas as pd

#GFS data
import xarray # used for reading the data.
import xarray_extras.csv # used for writing data to csv format. 
import pygrib
import xarray # used for reading the data.
import ipywidgets as widgets
import matplotlib.pyplot as plt # used to plot the data.
import ipywidgets as widgets # For ease in selecting variables.
import cartopy.crs as ccrs # Used to georeference data.


# #from cloudpathlib import S3Path, S3Client
# from pyhdf.SD import SD, SDC

warnings.simplefilter('ignore')



#### 2.) Download data from NCAR servers. 

In [5]:
 ## First, we need to authenticate
try:
    import getpass
    input = getpass.getpass
except:
    try:
        input = raw_input
    except:
        pass

In [6]:
## Now, we need your password.
pswd = input('password: ')

password:  ···········


In [7]:
values = {'email' : 'jericojohns@berkeley.edu', 'passwd' : pswd, 'action' : 'login'}
login_url = 'https://rda.ucar.edu/cgi-bin/login'

In [8]:
ret = requests.post(login_url, data=values)
if ret.status_code != 200:
    print('Bad Authentication')
    print(ret.text)
    exit(1)

In [43]:
dspath = 'https://rda.ucar.edu/data/ds084.1/'
filelist = ['2018/20180101/gfs.0p25.2018010100.f006.grib2', '2018/20180102/gfs.0p25.2018010200.f006.grib2']

In [44]:
save_dir = '/local/train/GFS/'

 #### Now to download the files

In [45]:
for file in filelist:
    filename = dspath + file
    outfile = save_dir + os.path.basename(filename)
    print('Downloading', file)
    req = requests.get(filename, cookies = ret.cookies, allow_redirects=True)
    open(outfile, 'wb').write(req.content)

Downloading 2018/20180101/gfs.0p25.2018010100.f006.grib2
Downloading 2018/20180102/gfs.0p25.2018010200.f006.grib2


#### Once you have downloaded the data, the next part can help you plot it.

In [46]:
filelist_arr = [save_dir + os.path.basename(file) for file in filelist]
selected_file = widgets.Dropdown(options=filelist_arr, description='data file')
display(selected_file)

Dropdown(description='data file', options=('/local/train/GFS/gfs.0p25.2018010100.f006.grib2', '/local/train/GF…

In [47]:
# Now to load in the data to xarray
type_of_level1 = 'surface' # for Temperature and Planetary Boundary Layer Height
type_of_level2 = 'atmosphereSingleLayer' # for Relative Humidity
ds_level_surface = xarray.open_dataset(selected_file.value, filter_by_keys={'typeOfLevel': type_of_level1}, engine="cfgrib")
ds_level_atmosphere = xarray.open_dataset(selected_file.value, filter_by_keys={'typeOfLevel': type_of_level2}, engine="cfgrib")

Ignoring index file '/local/train/GFS/gfs.0p25.2018010100.f006.grib2.923a8.idx' incompatible with GRIB file


In [48]:
#Define variable names
var_t = 't' #temperature (K) 
var_hpbl = 'hpbl' #Planetary Boundary Layer Height (m)
var_r = 'r' #Relative Humidity %

#Define filtered datasets (for each variable). 
ds_t = ds_level_surface[var_t] 
ds_hpbl = ds_level_surface[var_hpbl]
ds_r = ds_level_atmosphere[var_r]

In [49]:
ds_t

In [50]:
ds_t.shape

(721, 1440)

In [51]:
ds_t

In [22]:
!pwd

/local/capstone


In [None]:
### Subset data to different regions by lat / lon boundaries. 

In [52]:
#Define lat/lon bounds of our regions of interest. 
#Note: We must convert the original lon bounds of -180, 180 --> 0, 360 to match the GFS data format. 

#https://docs.google.com/spreadsheets/d/1sZwAc0dSuYTlZNeZEEu0320iUIQPO-WBjSGu50nlFpE/edit?resourcekey=0-4yL7WtTJDHthV8swuq7SBg#gid=0
#Los Angeles
la_min_lat = 30.01
la_max_lat = 40.00
la_min_lon = 49.46
la_max_lon = 76.06

#Tapei
tp_min_lat = 20.01
tp_max_lat = 30.00
tp_min_lon = 297.07
tp_max_lon = 318.55

#Delhi
dl_min_lat = 20.01
dl_max_lat = 30.00
dl_min_lon = 243.85
dl_max_lon = 260.82

In [53]:
#Filter by appropriate lat/lon bounds
def subset_dataset(dataset, min_lat, max_lat, min_lon, max_lon): 
    '''Takes a dataset and bounding coordinates and returns a filtered subset for the region of interest'''
    mask_lat = np.logical_and(dataset.coords['latitude'] >= min_lat, dataset.coords['latitude'] <= max_lat)
    mask_lon = np.logical_and(dataset.coords['longitude'] >= min_lon, dataset.coords['longitude'] <= max_lon)
    ds_filt = dataset.where(mask_lat & mask_lon, drop = True)
    return ds_filt

In [54]:
#TODOs: 
# Do this for each region and concatenate the 3 dataframes into one dataframe. (Do we want to add column with region labeled?). 
# Create strings for each possible filename (i.e. 01 through 31 for 01 through 12 months for 2018 to 2020 years). 
# Use Srishti's S3 bucket and add a test csv file to the bucket (so we don't have to store locally). 
# Pull file download, df creation, df to csv save to s3 (forecast time) and file deletion into one loop function (based on dates above). Quick exit if error bc date doesn't exist (i.e. 31).
# Make sure we can pass tuples or some combination for level and variable name into function so that we can quickly change variables included. 
# Add a timeit call to understand how long it takes to run end-to-end pipeline. 

ds_t = subset_dataset(ds_t, la_min_lat, la_max_lat, la_min_lon, la_max_lon)
ds_hpbl = subset_dataset(ds_hpbl, la_min_lat, la_max_lat, la_min_lon, la_max_lon)
ds_r = subset_dataset(ds_r, la_min_lat, la_max_lat, la_min_lon, la_max_lon)

In [55]:
#Make sure we preserve the type of level (atmospheric) of the observation to preserve metadata within the variable names
df_t = ds_t.to_dataframe(name = var_t)
df_t = df_t.drop(columns = ['surface', 'time', 'step'])
df_t = df_t.rename(columns = {"t" : "t_surface", "hpbl" : "pbl_surface", "r" : "r_atmosphere_single_layer"})

df_pbl = ds_hpbl.to_dataframe(name = var_hpbl)
df_pbl = df_pbl.drop(columns = ['surface', 'time', 'step'])
df_pbl = df_pbl.rename(columns = {"t" : "t_surface", "hpbl" : "pbl_surface", "r" : "r_atmosphere_single_layer"})

df_r = ds_r.to_dataframe(name = var_r)
df_r = df_r.drop(columns = ['atmosphereSingleLayer', 'time', 'step'])
df_r = df_r.rename(columns = {"t" : "t_surface", "hpbl" : "pbl_surface", "r" : "r_atmosphere_single_layer"})

In [56]:
joined_df = pd.merge(df_t, df_pbl, on = ["latitude", "longitude", "valid_time"], how = "left")
joined_df = pd.merge(joined_df, df_r, on = ["latitude", "longitude", "valid_time"], how = "left")
joined_df

Unnamed: 0_level_0,Unnamed: 1_level_0,valid_time,t_surface,pbl_surface,r_atmosphere_single_layer
latitude,longitude,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
40.00,49.50,2018-01-01 06:00:00,282.839142,136.487656,42.0
40.00,49.75,2018-01-01 06:00:00,283.539124,284.327667,42.0
40.00,50.00,2018-01-01 06:00:00,283.639130,341.207672,41.0
40.00,50.25,2018-01-01 06:00:00,283.739136,337.687653,41.0
40.00,50.50,2018-01-01 06:00:00,283.939148,336.967651,40.0
...,...,...,...,...,...
30.25,75.00,2018-01-01 06:00:00,301.139130,468.567657,14.0
30.25,75.25,2018-01-01 06:00:00,300.539124,471.447662,16.0
30.25,75.50,2018-01-01 06:00:00,299.039124,470.407654,17.0
30.25,75.75,2018-01-01 06:00:00,297.639130,471.527649,18.0


In [62]:
#Convert to csv, with the appropriate metadata in file name (will extract as field names later). 
filepath = '../train/GFS/test.parquet'
joined_df.to_parquet(path = filepath)

## Jerico next steps: 
- Confirm t = 'temperature' (email response pending from NCAR w/ full schema)
- Pull in all of Planetary Boundary-Layer Height (PBL), Relative Humidity, Surface Air Temp for 1 file. 
- Before exporting csv: 
    - subset xarray DataArray to relevenat lat/lon bounds 
    - pivot the lat x lon DataArray into a time, (lat,lon), field1, etc. columnar dataset. 
    - Save this columnar, tabular dataset as a csv for a given forecast time and drop the stored grib file from memory. 
    - Iterate to next forecast time grib2 file and repeat until entire date rnage is covered. 
    - Will have ~1095 csv files. These can then be appended into a large DataFrame to join to other datasets (by time, (lat,lon)). 
    