# Consolidate GEFS Data

Notebook to get consolidate the GEFS data from all the netCDF4 format files. 

Note: this can consume a lot of memory. 

In [None]:
#!conda install -c anaconda netcdf4 --yes -q
#!conda install xarray --yes -q

In [4]:
import glob
import pandas as pd
import xarray as xr

Load all the training data

In [5]:
# Get all the GEFS training data file names
training_files = glob.glob("../data/gefs/train/*.nc")

# Load the first file data into a dataframe
# This will serve as the index information
df = xr.open_dataset(training_files[0]).to_dataframe().reset_index()

# Loop over all the remaining files and accumulate the last column,
# which is the physical quantity of interest
dfs_list = []
for f in training_files[1::]:
    tmp_df = xr.open_dataset(f).to_dataframe().reset_index()
    print("{0} dimensions: {1}".format(f, tmp_df.shape))
    dfs_list.append(tmp_df.iloc[:,-1])
    
# Append the first dataframe to the list for subsequent concatenation
dfs_list.append(df)

# Concatenate all the dataframes together
gefs_train_df = pd.concat(dfs_list, axis=1)

../data/gefs/train/apcp_sfc_latlon_subset_19940101_20071231.nc dimensions: (40494960, 8)
../data/gefs/train/dswrf_sfc_latlon_subset_19940101_20071231.nc dimensions: (40494960, 8)
../data/gefs/train/tmp_2m_latlon_subset_19940101_20071231.nc dimensions: (40494960, 8)
../data/gefs/train/spfh_2m_latlon_subset_19940101_20071231.nc dimensions: (40494960, 8)
../data/gefs/train/ulwrf_sfc_latlon_subset_19940101_20071231.nc dimensions: (40494960, 8)
../data/gefs/train/tmin_2m_latlon_subset_19940101_20071231.nc dimensions: (40494960, 8)
../data/gefs/train/tmax_2m_latlon_subset_19940101_20071231.nc dimensions: (40494960, 8)
../data/gefs/train/ulwrf_tatm_latlon_subset_19940101_20071231.nc dimensions: (40494960, 8)
../data/gefs/train/dlwrf_sfc_latlon_subset_19940101_20071231.nc dimensions: (40494960, 8)
../data/gefs/train/pwat_eatm_latlon_subset_19940101_20071231.nc dimensions: (40494960, 8)
../data/gefs/train/pres_msl_latlon_subset_19940101_20071231.nc dimensions: (40494960, 8)
../data/gefs/train/t

In [8]:
gefs_train_df.head()

Unnamed: 0,Total_precipitation,Precipitable_water,Pressure,Temperature_surface,Specific_humidity_height_above_ground,Downward_Short-Wave_Rad_Flux,Total_Column-Integrated_Condensate,Minimum_temperature,Upward_Long-Wave_Rad_Flux_surface,Maximum_temperature,...,Downward_Long-Wave_Rad_Flux,Temperature_height_above_ground,ens,fhour,lat,lon,time,intTime,intValidTime,Upward_Long-Wave_Rad_Flux
0,0.0,5.7,102244.9375,275.607605,0.00402,0.0,0.0002,278.543823,333.760559,280.903442,...,247.018982,278.527924,0,12:00:00,31.0,254.0,1994-01-01,1994010100,1994010112,255.599884
1,0.0,6.798102,101962.007812,276.505676,0.00363,0.0,0.0,279.856567,336.211304,281.626221,...,257.018982,279.874847,0,12:00:00,31.0,254.0,1994-01-02,1994010200,1994010212,252.677521
2,0.0,4.891023,102341.0,272.480042,0.00158,0.0,0.0007,275.893433,318.730194,278.131714,...,231.848587,275.895538,0,12:00:00,31.0,254.0,1994-01-03,1994010300,1994010312,242.086395
3,0.0,4.944365,102438.59375,274.793945,0.00193,0.0,0.0004,277.901978,332.616638,282.016174,...,241.163208,278.045502,0,12:00:00,31.0,254.0,1994-01-04,1994010400,1994010412,254.670517
4,0.0,4.567267,101882.703125,274.738159,0.002092,0.0,0.0002,278.60968,329.197876,280.722839,...,242.197357,278.777527,0,12:00:00,31.0,254.0,1994-01-05,1994010500,1994010512,254.010925


In [6]:
gefs_train_df.to_csv("../data/gefs/gefs_train.csv", index=False)

KeyboardInterrupt: 

In [5]:
# Get all the GEFS testing data file names
testing_files = glob.glob("../data/gefs/test/*.nc")

# Load the first file data into a dataframe
# This will serve as the index information
df = xr.open_dataset(testing_files[0]).to_dataframe().reset_index()

# Loop over all the remaining files and accumulate the last column,
# which is the physical quantity of interest
dfs_list = []
for f in testing_files[1::]:
    tmp_df = xr.open_dataset(f).to_dataframe().reset_index()
    print("{0} dimensions: {1}".format(f, tmp_df.shape))
    dfs_list.append(tmp_df.iloc[:,-1])
    
# Append the first dataframe to the list for subsequent concatenation
dfs_list.append(df)

# Concatenate all the dataframes together
gefs_testing_df = pd.concat(dfs_list, axis=1)

../data/gefs/train/apcp_sfc_latlon_subset_19940101_20071231.nc dimensions: (40494960, 8)
../data/gefs/train/dswrf_sfc_latlon_subset_19940101_20071231.nc dimensions: (40494960, 8)
../data/gefs/train/tmp_2m_latlon_subset_19940101_20071231.nc dimensions: (40494960, 8)
../data/gefs/train/spfh_2m_latlon_subset_19940101_20071231.nc dimensions: (40494960, 8)
../data/gefs/train/ulwrf_sfc_latlon_subset_19940101_20071231.nc dimensions: (40494960, 8)
../data/gefs/train/tmin_2m_latlon_subset_19940101_20071231.nc dimensions: (40494960, 8)
../data/gefs/train/tmax_2m_latlon_subset_19940101_20071231.nc dimensions: (40494960, 8)
../data/gefs/train/ulwrf_tatm_latlon_subset_19940101_20071231.nc dimensions: (40494960, 8)
../data/gefs/train/dlwrf_sfc_latlon_subset_19940101_20071231.nc dimensions: (40494960, 8)
../data/gefs/train/pwat_eatm_latlon_subset_19940101_20071231.nc dimensions: (40494960, 8)
../data/gefs/train/pres_msl_latlon_subset_19940101_20071231.nc dimensions: (40494960, 8)
../data/gefs/train/t

In [8]:
gefs_testing_df.head()

Unnamed: 0,Total_precipitation,Precipitable_water,Pressure,Temperature_surface,Specific_humidity_height_above_ground,Downward_Short-Wave_Rad_Flux,Total_Column-Integrated_Condensate,Minimum_temperature,Upward_Long-Wave_Rad_Flux_surface,Maximum_temperature,...,Downward_Long-Wave_Rad_Flux,Temperature_height_above_ground,ens,fhour,lat,lon,time,intTime,intValidTime,Upward_Long-Wave_Rad_Flux
0,0.0,5.7,102244.9375,275.607605,0.00402,0.0,0.0002,278.543823,333.760559,280.903442,...,247.018982,278.527924,0,12:00:00,31.0,254.0,1994-01-01,1994010100,1994010112,255.599884
1,0.0,6.798102,101962.007812,276.505676,0.00363,0.0,0.0,279.856567,336.211304,281.626221,...,257.018982,279.874847,0,12:00:00,31.0,254.0,1994-01-02,1994010200,1994010212,252.677521
2,0.0,4.891023,102341.0,272.480042,0.00158,0.0,0.0007,275.893433,318.730194,278.131714,...,231.848587,275.895538,0,12:00:00,31.0,254.0,1994-01-03,1994010300,1994010312,242.086395
3,0.0,4.944365,102438.59375,274.793945,0.00193,0.0,0.0004,277.901978,332.616638,282.016174,...,241.163208,278.045502,0,12:00:00,31.0,254.0,1994-01-04,1994010400,1994010412,254.670517
4,0.0,4.567267,101882.703125,274.738159,0.002092,0.0,0.0002,278.60968,329.197876,280.722839,...,242.197357,278.777527,0,12:00:00,31.0,254.0,1994-01-05,1994010500,1994010512,254.010925


In [None]:
gefs_testing_df.to_csv("../data/gefs/gefs_test.csv", index=False)