This notebook preprocesses the GEFS ensemble reforecast data into the input format used in Price and Rasp (2022).

The input data are 5 ensemble members of apcp, 5 ensemble members of pwat, and single-member (c00) cape, cin, and t2m.

In [1]:
import os

import cartopy
import cartopy.crs as ccrs  # for plotting map
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import xarray as xr
from sklearn.preprocessing import MinMaxScaler

from functions import *



In [2]:
# Directory holding the raw GEFSv12 reforecast NetCDF files.
# Set the GEFS_RAW_DIR environment variable to run the notebook on another
# machine; when it is unset, the original location is used.
PATH = os.environ.get(
    'GEFS_RAW_DIR',
    'C:\\Users\\bobby\\Desktop\\.vscode\\1 UROP Research\\UROP v2\\raw_preprocessing\\GEFS\\',
)


def _open_gefs(variable, member):
    """Open the reforecast file for one variable/member pair.

    Parameters
    ----------
    variable : str
        Variable tag as it appears in the file name ('apcp', 'pwat',
        'cape', 'tmp2m' or 'cin').
    member : str
        Ensemble member tag as it appears in the file name ('c00', 'p01', ...).

    Returns
    -------
    xarray.Dataset
        The result of ``xr.open_dataset`` on
        ``GEFSv12-Reforecast_<variable>_<member>.nc`` inside ``PATH``.
    """
    filename = f'GEFSv12-Reforecast_{variable}_{member}.nc'
    # os.path.join works whether or not PATH ends with a separator.
    return xr.open_dataset(os.path.join(PATH, filename))


# ensemble variables (files are opened in the order p01, p02, p03, p04, c00)
apcp_p01, apcp_p02, apcp_p03, apcp_p04, apcp_c00 = (
    _open_gefs('apcp', member) for member in ('p01', 'p02', 'p03', 'p04', 'c00')
)

pwat_p01, pwat_p02, pwat_p03, pwat_p04, pwat_c00 = (
    _open_gefs('pwat', member) for member in ('p01', 'p02', 'p03', 'p04', 'c00')
)

# single variables (only the c00 member is used)
cape_c00 = _open_gefs('cape', 'c00')
t2m_c00 = _open_gefs('tmp2m', 'c00')  # the file tag is 'tmp2m', the notebook calls it t2m
cin_c00 = _open_gefs('cin', 'c00')  # have duplicates

In [3]:
# Diagnostic scratch, kept commented out: look for repeated time stamps.
# Each pair compares the number of distinct time values with the total number
# of time values; a smaller distinct count means some time stamps repeat.
# print(len(set(cin_c00.time.values)))
# print(len(cin_c00.time.values))
# print(len(set(pwat_p01.time.values)))
# print(len(pwat_p01.time.values))

# Author's finding: duplicated dates are present in pwat_p01, p02, p03 and p04
# print(len(set(pwat_p01.time.values)))
# print(len(pwat_p01.time.values))

# Find the positions of the duplicated time values
# pwat_duplicates = list(pwat_p01.get_index("time").duplicated())
# indices = [i for i, x in enumerate(pwat_duplicates) if x == True]
# indices

# Check whether the duplicates are caused by an error in date naming by
# comparing the pwat values stored at two positions along time
# pwat_p01.isel(time=6200).pwat.values
# pwat_p01.isel(time=9116).pwat.values
# Author's finding: the duplicates are exactly the same, so it is not a date naming error

In [4]:
def _unique_times(ds):
    """Select the entries of ``ds`` whose "time" index value is not flagged as a duplicate."""
    is_repeat = ds.get_index("time").duplicated()
    return ds.sel(time=~is_repeat)


# Drop repeated time stamps from the four perturbed pwat members and from cin.
pwat_p01, pwat_p02, pwat_p03, pwat_p04 = (
    _unique_times(ds) for ds in (pwat_p01, pwat_p02, pwat_p03, pwat_p04)
)

cin_c00 = _unique_times(cin_c00)

# Split every dataset into its train / validation / test parts.
(
    (apcp_train_p01, apcp_val_p01, apcp_test_p01),
    (apcp_train_p02, apcp_val_p02, apcp_test_p02),
    (apcp_train_p03, apcp_val_p03, apcp_test_p03),
    (apcp_train_p04, apcp_val_p04, apcp_test_p04),
    (apcp_train_c00, apcp_val_c00, apcp_test_c00),
) = (trainvaltest(ds) for ds in (apcp_p01, apcp_p02, apcp_p03, apcp_p04, apcp_c00))

(
    (pwat_train_p01, pwat_val_p01, pwat_test_p01),
    (pwat_train_p02, pwat_val_p02, pwat_test_p02),
    (pwat_train_p03, pwat_val_p03, pwat_test_p03),
    (pwat_train_p04, pwat_val_p04, pwat_test_p04),
    (pwat_train_c00, pwat_val_c00, pwat_test_c00),
) = (trainvaltest(ds) for ds in (pwat_p01, pwat_p02, pwat_p03, pwat_p04, pwat_c00))

(
    (cape_train_c00, cape_val_c00, cape_test_c00),
    (t2m_train_c00, t2m_val_c00, t2m_test_c00),
    (cin_train_c00, cin_val_c00, cin_test_c00),
) = (trainvaltest(ds) for ds in (cape_c00, t2m_c00, cin_c00))

In [5]:
# Diagnostic scratch, kept commented out: run scipy's normaltest on a training
# array and on its logarithm, and report when the p-values fall below 0.01.
# import scipy as sp
# # a=cape_train_c00.cape.values
# # a = pwat_train_c00.pwat.values

# a=t2m_train_c00.t2m.values

# p=sp.stats.mstats.normaltest(a, axis=0).pvalue
# if p.all()<0.01:
#    print ('distribution is not normal')
# p=sp.stats.mstats.normaltest(np.log(a), axis=0).pvalue
# if p.all()<0.01:
#    print ('distribution is not log-normal')
# NOTE(review): `p.all()<0.01` reduces p to a single boolean before comparing
# it with 0.01; `(p < 0.01).all()` is presumably what was intended - confirm
# before re-enabling this cell.

In [6]:
# Diagnostic scratch, kept commented out: timecheck (from the local `functions`
# module) is called on the train/val/test split of every input.
# Author's finding from these checks:
# ALL MISSING DATES LIE IN TRAINING DATASET

# print(timecheck(apcp_train_p01, apcp_val_p01, apcp_test_p01))
# print(timecheck(apcp_train_p02, apcp_val_p02, apcp_test_p02))
# print(timecheck(apcp_train_p03, apcp_val_p03, apcp_test_p03))
# print(timecheck(apcp_train_p04, apcp_val_p04, apcp_test_p04))
# print(timecheck(apcp_train_c00, apcp_val_c00, apcp_test_c00))

# print(timecheck(pwat_train_p01, pwat_val_p01, pwat_test_p01))
# print(timecheck(pwat_train_p02, pwat_val_p02, pwat_test_p02))
# print(timecheck(pwat_train_p03, pwat_val_p03, pwat_test_p03))
# print(timecheck(pwat_train_p04, pwat_val_p04, pwat_test_p04))
# print(timecheck(pwat_train_c00, pwat_val_c00, pwat_test_c00))

# print(timecheck(cape_train_c00, cape_val_c00, cape_test_c00))
# print(timecheck(t2m_train_c00, t2m_val_c00, t2m_test_c00))
# print(timecheck(cin_train_c00, cin_val_c00, cin_test_c00))

In [7]:
# Resample every training split (author's note: 4min to run this).
# Only the training splits are passed through resampling() in this cell.
apcp_train_p01, apcp_train_p02, apcp_train_p03, apcp_train_p04, apcp_train_c00 = (
    resampling(ds, 'apcp')
    for ds in (apcp_train_p01, apcp_train_p02, apcp_train_p03, apcp_train_p04, apcp_train_c00)
)

pwat_train_p01, pwat_train_p02, pwat_train_p03, pwat_train_p04, pwat_train_c00 = (
    resampling(ds, 'pwat')
    for ds in (pwat_train_p01, pwat_train_p02, pwat_train_p03, pwat_train_p04, pwat_train_c00)
)

cape_train_c00 = resampling(cape_train_c00, 'cape')
t2m_train_c00 = resampling(t2m_train_c00, 't2m')
# Background on CIN: an area with a high convection inhibition number (CIN)
# is considered stable and has very little likelihood of developing a thunderstorm
cin_train_c00 = resampling(cin_train_c00, 'cin')

Transforming the train, validation, and test datasets (the scalers returned for the training splits are reused for validation and test)

In [8]:
def _fit_each(datasets, var):
    """Run ``transform_train(ds, var)`` on each dataset, in the order given.

    Returns a pair of tuples: the first items of every result (the scalers)
    and the second items of every result (the transformed training data).
    """
    scalers, transformed = zip(*(transform_train(ds, var) for ds in datasets))
    return scalers, transformed


# apcp: one scaler per ensemble member, processed in the order c00, p01..p04
(
    (scaler_train_apcp_c00, scaler_train_apcp_p01, scaler_train_apcp_p02,
     scaler_train_apcp_p03, scaler_train_apcp_p04),
    (apcp_train_c00, apcp_train_p01, apcp_train_p02, apcp_train_p03, apcp_train_p04),
) = _fit_each(
    (apcp_train_c00, apcp_train_p01, apcp_train_p02, apcp_train_p03, apcp_train_p04),
    'apcp',
)

# pwat: same member order as apcp
(
    (scaler_train_pwat_c00, scaler_train_pwat_p01, scaler_train_pwat_p02,
     scaler_train_pwat_p03, scaler_train_pwat_p04),
    (pwat_train_c00, pwat_train_p01, pwat_train_p02, pwat_train_p03, pwat_train_p04),
) = _fit_each(
    (pwat_train_c00, pwat_train_p01, pwat_train_p02, pwat_train_p03, pwat_train_p04),
    'pwat',
)

# single-member variables
scaler_train_t2m_c00, t2m_train_c00 = transform_train(t2m_train_c00, 't2m')
scaler_train_cape_c00, cape_train_c00 = transform_train(cape_train_c00, 'cape')
# author's observation for cin: a lot of close to 1 value
scaler_train_cin_c00, cin_train_c00 = transform_train(cin_train_c00, 'cin')

In [9]:
# apcp val
apcp_val_c00 = transform_val_test(apcp_val_c00.tp, scaler_train_apcp_c00, is_prec=True)
apcp_val_p01 = transform_val_test(apcp_val_p01.tp, scaler_train_apcp_p01, is_prec=True)
apcp_val_p02 = transform_val_test(apcp_val_p02.tp, scaler_train_apcp_p02, is_prec=True)
apcp_val_p03 = transform_val_test(apcp_val_p03.tp, scaler_train_apcp_p03, is_prec=True)
apcp_val_p04 = transform_val_test(apcp_val_p04.tp, scaler_train_apcp_p04, is_prec=True)
# apcp test
apcp_test_c00 = transform_val_test(apcp_test_c00.tp, scaler_train_apcp_c00, is_prec=True)
apcp_test_p01 = transform_val_test(apcp_test_p01.tp, scaler_train_apcp_p01, is_prec=True)
apcp_test_p02 = transform_val_test(apcp_test_p02.tp, scaler_train_apcp_p02, is_prec=True)
apcp_test_p03 = transform_val_test(apcp_test_p03.tp, scaler_train_apcp_p03, is_prec=True)
apcp_test_p04 = transform_val_test(apcp_test_p04.tp, scaler_train_apcp_p04, is_prec=True)

# pwat val
pwat_val_c00 = transform_val_test(pwat_val_c00.pwat, scaler_train_pwat_c00, is_prec=True)
pwat_val_p01 = transform_val_test(pwat_val_p01.pwat, scaler_train_pwat_p01, is_prec=True)
pwat_val_p02 = transform_val_test(pwat_val_p02.pwat, scaler_train_pwat_p02, is_prec=True)
pwat_val_p03 = transform_val_test(pwat_val_p03.pwat, scaler_train_pwat_p03, is_prec=True)
pwat_val_p04 = transform_val_test(pwat_val_p04.pwat, scaler_train_pwat_p04, is_prec=True)
# pwat test
pwat_test_c00 = transform_val_test(pwat_test_c00.pwat, scaler_train_pwat_c00, is_prec=True)
pwat_test_p01 = transform_val_test(pwat_test_p01.pwat, scaler_train_pwat_p01, is_prec=True)
pwat_test_p02 = transform_val_test(pwat_test_p02.pwat, scaler_train_pwat_p02, is_prec=True)
pwat_test_p03 = transform_val_test(pwat_test_p03.pwat, scaler_train_pwat_p03, is_prec=True)
pwat_test_p04 = transform_val_test(pwat_test_p04.pwat, scaler_train_pwat_p04, is_prec=True)

# val
t2m_val_c00 = transform_val_test(t2m_val_c00.t2m, scaler_train_t2m_c00, is_prec=False)
cape_val_c00 = transform_val_test(cape_val_c00.cape, scaler_train_cape_c00, is_prec=False)
cin_val_c00 = transform_val_test(cin_val_c00.cin, scaler_train_cin_c00, is_prec=False)

# test
t2m_test_c00 = transform_val_test(t2m_test_c00.t2m, scaler_train_t2m_c00, is_prec=False)
cape_test_c00 = transform_val_test(cape_test_c00.cape, scaler_train_cape_c00, is_prec=False)
cin_test_c00 = transform_val_test(cin_test_c00.cin, scaler_train_cin_c00, is_prec=False)

Converting the training datasets to NumPy arrays (they are saved to .npy files further below)

In [10]:
# Replace each training dataset by the raw array behind its data variable.
# apcp members: the data variable is `tp`
apcp_train_p01, apcp_train_p02, apcp_train_p03, apcp_train_p04, apcp_train_c00 = (
    ds.tp.values
    for ds in (apcp_train_p01, apcp_train_p02, apcp_train_p03, apcp_train_p04, apcp_train_c00)
)

# pwat members
pwat_train_p01, pwat_train_p02, pwat_train_p03, pwat_train_p04, pwat_train_c00 = (
    ds.pwat.values
    for ds in (pwat_train_p01, pwat_train_p02, pwat_train_p03, pwat_train_p04, pwat_train_c00)
)

# single-member variables
cape_train_c00 = cape_train_c00.cape.values
t2m_train_c00 = t2m_train_c00.t2m.values
cin_train_c00 = cin_train_c00.cin.values

Combining the 13 variables into a single 4D array for each split

In [16]:
# Channel order used for every split:
#   5 apcp members (p01..p04, c00), 5 pwat members (p01..p04, c00), cape, t2m, cin
# np.stack(..., axis=-1) puts the 13 inputs along a new trailing axis.

# training split
lst_train_ensemble = [
    *(apcp_train_p01, apcp_train_p02, apcp_train_p03, apcp_train_p04, apcp_train_c00),
    *(pwat_train_p01, pwat_train_p02, pwat_train_p03, pwat_train_p04, pwat_train_c00),
    cape_train_c00,
    t2m_train_c00,
    cin_train_c00,
]
X_train_ensemble = np.stack(lst_train_ensemble, axis=-1)

# validation split
lst_val_ensemble = [
    *(apcp_val_p01, apcp_val_p02, apcp_val_p03, apcp_val_p04, apcp_val_c00),
    *(pwat_val_p01, pwat_val_p02, pwat_val_p03, pwat_val_p04, pwat_val_c00),
    cape_val_c00,
    t2m_val_c00,
    cin_val_c00,
]
X_val_ensemble = np.stack(lst_val_ensemble, axis=-1)

# test split
lst_test_ensemble = [
    *(apcp_test_p01, apcp_test_p02, apcp_test_p03, apcp_test_p04, apcp_test_c00),
    *(pwat_test_p01, pwat_test_p02, pwat_test_p03, pwat_test_p04, pwat_test_c00),
    cape_test_c00,
    t2m_test_c00,
    cin_test_c00,
]
X_test_ensemble = np.stack(lst_test_ensemble, axis=-1)

In [20]:
# Directory the stacked arrays are written to.
# Set the GEFS_OUTPUT_DIR environment variable to write somewhere else; when
# it is unset, the original location is used.
datapath = os.environ.get(
    'GEFS_OUTPUT_DIR',
    'C:\\Users\\bobby\\Desktop\\.vscode\\1 UROP Research\\UROP v2\\data\\',
)
# np.save does not create missing directories, so make sure the target exists.
os.makedirs(datapath, exist_ok=True)

# One .npy file per split; os.path.join works whether or not datapath ends
# with a separator.
np.save(os.path.join(datapath, 'X_train_ensemble.npy'), X_train_ensemble)
np.save(os.path.join(datapath, 'X_val_ensemble.npy'), X_val_ensemble)
np.save(os.path.join(datapath, 'X_test_ensemble.npy'), X_test_ensemble)