# Consolidate GEFS Data

Notebook to get consolidate the GEFS data from all the netCDF4 format files. 

Note: this can consume a lot of memory. 

In [1]:
# Necessary for running code first time on SageMaker
!conda install -c anaconda netcdf4 --yes -q
!conda install xarray --yes -q

Solving environment: ...working... done

## Package Plan ##

  environment location: /home/ec2-user/anaconda3/envs/python3

  added / updated specs: 
    - netcdf4


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    ca-certificates-2019.1.23  |                0         126 KB  anaconda
    netcdf4-1.4.2              |   py36h4b4f87f_0         526 KB  anaconda
    cftime-1.0.3.4             |   py36hdd07704_0         304 KB  anaconda
    openssl-1.0.2p             |       h14c3975_0         3.5 MB  anaconda
    hdf4-4.2.13                |       h3ca952b_2         916 KB
    certifi-2018.11.29         |           py36_0         146 KB  anaconda
    libnetcdf-4.6.1            |       h13459d8_0         1.2 MB  anaconda
    ------------------------------------------------------------
                                           Total:         6.7 MB

The following NEW packages will be INSTALLED

In [1]:
import glob
import pandas as pd
import xarray as xr

Load all the training data

In [3]:
# Get all the GEFS training data file names
training_files = glob.glob("../data/gefs/train/*.nc")

# Load the first file data into a dataframe
# This will serve as the index information
df = xr.open_dataset(training_files[0]).to_dataframe().reset_index()

# Loop over all the remaining files and accumulate the last column,
# which is the physical quantity of interest
dfs_list = []
for f in training_files[1::]:
    tmp_df = xr.open_dataset(f).to_dataframe().reset_index()
    print("{0} dimensions: {1}".format(f, tmp_df.shape))
    dfs_list.append(tmp_df.iloc[:,-1])
    
# Append the first dataframe to the list for subsequent concatenation
dfs_list.append(df)

# Concatenate all the dataframes together
gefs_train_df = pd.concat(dfs_list, axis=1)

../data/gefs/train/ulwrf_sfc_latlon_subset_19940101_20071231.nc dimensions: (40494960, 8)
../data/gefs/train/uswrf_sfc_latlon_subset_19940101_20071231.nc dimensions: (40494960, 8)
../data/gefs/train/spfh_2m_latlon_subset_19940101_20071231.nc dimensions: (40494960, 8)
../data/gefs/train/dlwrf_sfc_latlon_subset_19940101_20071231.nc dimensions: (40494960, 8)
../data/gefs/train/tcolc_eatm_latlon_subset_19940101_20071231.nc dimensions: (40494960, 8)
../data/gefs/train/dswrf_sfc_latlon_subset_19940101_20071231.nc dimensions: (40494960, 8)
../data/gefs/train/pwat_eatm_latlon_subset_19940101_20071231.nc dimensions: (40494960, 8)
../data/gefs/train/apcp_sfc_latlon_subset_19940101_20071231.nc dimensions: (40494960, 8)
../data/gefs/train/tcdc_eatm_latlon_subset_19940101_20071231.nc dimensions: (40494960, 8)
../data/gefs/train/tmp_sfc_latlon_subset_19940101_20071231.nc dimensions: (40494960, 8)
../data/gefs/train/tmp_2m_latlon_subset_19940101_20071231.nc dimensions: (40494960, 8)
../data/gefs/trai

In [4]:
gefs_train_df.head()

Unnamed: 0,Upward_Long-Wave_Rad_Flux_surface,Upward_Short-Wave_Rad_Flux,Specific_humidity_height_above_ground,Downward_Long-Wave_Rad_Flux,Total_Column-Integrated_Condensate,Downward_Short-Wave_Rad_Flux,Precipitable_water,Total_precipitation,Total_cloud_cover,Temperature_surface,...,Maximum_temperature,Upward_Long-Wave_Rad_Flux,ens,fhour,lat,lon,time,intTime,intValidTime,Pressure
0,333.760559,0.0,0.00402,247.018982,0.0002,0.0,5.7,0.0,0.0,275.607605,...,280.903442,255.599884,0,12:00:00,31.0,254.0,1994-01-01,1994010100,1994010112,102244.9375
1,336.211304,0.0,0.00363,257.018982,0.0,0.0,6.798102,0.0,0.0,276.505676,...,281.626221,252.677521,0,12:00:00,31.0,254.0,1994-01-02,1994010200,1994010212,101962.007812
2,318.730194,0.0,0.00158,231.848587,0.0007,0.0,4.891023,0.0,0.0,272.480042,...,278.131714,242.086395,0,12:00:00,31.0,254.0,1994-01-03,1994010300,1994010312,102341.0
3,332.616638,0.0,0.00193,241.163208,0.0004,0.0,4.944365,0.0,0.0,274.793945,...,282.016174,254.670517,0,12:00:00,31.0,254.0,1994-01-04,1994010400,1994010412,102438.59375
4,329.197876,0.0,0.002092,242.197357,0.0002,0.0,4.567267,0.0,0.0,274.738159,...,280.722839,254.010925,0,12:00:00,31.0,254.0,1994-01-05,1994010500,1994010512,101882.703125


In [5]:
gefs_train_df.to_csv("../data/gefs/gefs_train.csv", index=False)

In [2]:
# Get all the GEFS testing data file names
testing_files = glob.glob("../data/gefs/test/*.nc")

# Load the first file data into a dataframe
# This will serve as the index information
df = xr.open_dataset(testing_files[0]).to_dataframe().reset_index()

# Loop over all the remaining files and accumulate the last column,
# which is the physical quantity of interest
dfs_list = []
for f in testing_files[1::]:
    tmp_df = xr.open_dataset(f).to_dataframe().reset_index()
    print("{0} dimensions: {1}".format(f, tmp_df.shape))
    dfs_list.append(tmp_df.iloc[:,-1])
    
# Append the first dataframe to the list for subsequent concatenation
dfs_list.append(df)

# Concatenate all the dataframes together
gefs_testing_df = pd.concat(dfs_list, axis=1)

../data/gefs/test/ulwrf_sfc_latlon_subset_20080101_20121130.nc dimensions: (14224320, 8)
../data/gefs/test/uswrf_sfc_latlon_subset_20080101_20121130.nc dimensions: (14224320, 8)
../data/gefs/test/tmp_sfc_latlon_subset_20080101_20121130.nc dimensions: (14224320, 8)
../data/gefs/test/tcdc_eatm_latlon_subset_20080101_20121130.nc dimensions: (14224320, 8)
../data/gefs/test/pres_msl_latlon_subset_20080101_20121130.nc dimensions: (14224320, 8)
../data/gefs/test/apcp_sfc_latlon_subset_20080101_20121130.nc dimensions: (14224320, 8)
../data/gefs/test/spfh_2m_latlon_subset_20080101_20121130.nc dimensions: (14224320, 8)
../data/gefs/test/dswrf_sfc_latlon_subset_20080101_20121130.nc dimensions: (14224320, 8)
../data/gefs/test/pwat_eatm_latlon_subset_20080101_20121130.nc dimensions: (14224320, 8)
../data/gefs/test/dlwrf_sfc_latlon_subset_20080101_20121130.nc dimensions: (14224320, 8)
../data/gefs/test/tmax_2m_latlon_subset_20080101_20121130.nc dimensions: (14224320, 8)
../data/gefs/test/tmp_2m_latl

In [3]:
gefs_testing_df.head()

Unnamed: 0,Upward_Long-Wave_Rad_Flux_surface,Upward_Short-Wave_Rad_Flux,Temperature_surface,Total_cloud_cover,Pressure,Total_precipitation,Specific_humidity_height_above_ground,Downward_Short-Wave_Rad_Flux,Precipitable_water,Downward_Long-Wave_Rad_Flux,...,Total_Column-Integrated_Condensate,Minimum_temperature,ens,fhour,lat,lon,time,intTime,intValidTime,Upward_Long-Wave_Rad_Flux
0,323.056244,0.0,272.827454,0.0,103451.4375,0.0,0.00237,0.0,4.6,230.338104,...,0.0002,274.581757,0,12:00:00,31.0,254.0,2008-01-01,2008010100,2008010112,247.413269
1,311.356201,0.0,270.866882,0.18,104056.453125,0.0,0.00127,0.0,6.084514,237.0,...,0.178,271.783356,0,12:00:00,31.0,254.0,2008-01-02,2008010200,2008010212,182.33313
2,309.691345,0.0,270.959564,0.0,103235.96875,0.0,0.00174,0.0,7.7,230.171143,...,0.006,272.651459,0,12:00:00,31.0,254.0,2008-01-03,2008010300,2008010312,235.133972
3,330.50827,0.0,275.10321,0.11,102106.59375,0.0,0.002562,0.0,11.6,282.661896,...,0.1075,277.632629,0,12:00:00,31.0,254.0,2008-01-04,2008010400,2008010412,170.600296
4,329.017456,0.0,274.84494,0.0,101985.15625,0.0,0.005176,0.0,10.74163,265.0,...,0.0001,278.268219,0,12:00:00,31.0,254.0,2008-01-05,2008010500,2008010512,251.538971


In [4]:
gefs_testing_df.to_csv("../data/gefs/gefs_test.csv", index=False)