In [1]:
# Importing libraries
import xarray as xr
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt



Example: the reforecast 'tp' variable at a lead time of 1 day

In [6]:
# Directory that holds the raw NetCDF inputs. Other cells build file names by
# plain string concatenation onto PATH, so the trailing separator is required.
PATH = 'C:\\Users\\bobby\\Documents\\GitHub\\attentionMedium\\raw_data\\'

# Open one reforecast file (lead time 1) and display the dataset summary.
dr_test = xr.open_dataset(f'{PATH}reforecast-leadtime1.nc')
dr_test

# Reanalysis data

In [7]:
# Empty ground-truth container: one row per calendar day from 2000-01-01 to
# 2019-12-31 and one column per (leadtime, variable, lat, lon) combination.
idx = pd.IndexSlice  # shorthand for building MultiIndex column keys

lat = [1.50, 1.25]
lon = [103.75, 104.00]
vars_ = ['tp']
leadtime = range(0,1)  # a single level, leadtime 0
ground_truth = pd.DataFrame(
    index=pd.date_range('20000101', '20191231', freq='D'),
    columns=pd.MultiIndex.from_product([leadtime, vars_, lat, lon],
                                       names=['leadtime', 'vars', 'lat', 'lon']),
    # Without an explicit dtype an empty frame gets object columns, which
    # pushes later numeric aggregation onto the slow object code path.
    dtype=float,
)

In [8]:
def _decode_fractional_days(raw_times):
    """Convert ``YYYYMMDD.f`` day stamps into a ``DatetimeIndex``.

    The string form of each value is split at the decimal point: the integer
    part is the calendar date, the fractional part is the fraction of the day
    that has elapsed (``.0`` -> 00h, ``.25`` -> 06h, ``.5`` -> 12h,
    ``.75`` -> 18h).

    Parameters
    ----------
    raw_times : iterable of float
        Encoded time stamps, e.g. ``20130920.25``.

    Returns
    -------
    pandas.DatetimeIndex
        One time stamp (hour resolution) per input value.
    """
    stamps = []
    for raw in raw_times:
        day, _, fraction = str(raw).partition('.')
        # Read the digits as a genuine decimal fraction so '5', '25' and '125'
        # are all handled by the same rule instead of per-length special cases.
        hour = round(float('0.' + (fraction or '0')) * 24)
        stamps.append('%s%02d' % (day, hour))
    return pd.to_datetime(stamps, format='%Y%m%d%H')


# Read the sub-daily reanalysis and aggregate it to daily totals
# (the original note lists the 06, 12, 18 and 00 steps).
reanalysis = xr.open_dataset(PATH + 'combined-reanalysis.nc')

# Replace an outlier: every grid value at one time step is set to 0. The 2x2
# array has to match the spatial grid of 'tp'.
# NOTE(review): 20252 is a positional index on the first axis; the original
# comment identifies it as 2013-09-20 00:00:00 - re-check if the file changes.
reanalysis['tp'][20252, :, :] = np.array([[0,0], [0,0]]) # 2013-09-20 00:00:00

# The time coordinate is stored as fractional day stamps; make it a datetime
# axis so it can be resampled.
reanalysis['time'] = _decode_fractional_days(reanalysis.time.data)

# Sum per calendar day, then pivot index levels 2 and 1 of the resulting frame
# into columns (the loop below addresses them as (lat, lon)).
# NOTE(review): resample('D') bins by calendar date, so each daily total uses
# that date's own 00 step - confirm this is the intended accumulation window.
reanalysis = reanalysis.resample(time='D').sum().to_dataframe()['tp'].unstack(level=[2,1])

# Take reanalysis data as data of leadtime=0
for i in lat:
    for j in lon:
        ground_truth.loc[:, idx[0,'tp',i,j]] = reanalysis.loc[:, idx[i,j]]

In [9]:
# Open the reanalysis file again and display the dataset summary. This rebinds
# `reanalysis` to the dataset as stored on disk (no outlier patch, no decoded
# time axis, no daily aggregation).
reanalysis = xr.open_dataset(f'{PATH}combined-reanalysis.nc')
reanalysis

In [12]:
# Reduce the ground truth to one column per (leadtime, variable) and write it
# out as CSV, keeping the date index as the first column.
post_PATH = 'C:\\Users\\bobby\\Documents\\GitHub\\attentionMedium\\postprocessed_data\\'
# Unweighted mean over the lat/lon column levels. Grouping the transposed frame
# by its first two index levels is the same aggregation as
# groupby(level=[0,1], axis=1), whose `axis` keyword is deprecated in pandas.
ground_truth = ground_truth.T.groupby(level=[0,1]).mean().T
ground_truth.to_csv(post_PATH + 'ground_truth.csv', index=True)

# Reforecast data

In [13]:
# Empty feature container for the reforecast inputs: one row per calendar day
# from 2000-01-01 to 2019-12-31 and one column per
# (leadtime, variable, lat, lon) combination.
idx = pd.IndexSlice  # shorthand for building MultiIndex column keys

lat = [1.50, 1.25]
lon = [103.75, 104.00]
# Column codes filled by the loading cells: 't2m' and 'tp' keep the file's
# variable name, 'H' takes spfh ('q'), 'C' takes tcdc ('tcc'), 'E' takes cape.
vars_ = ['t2m', 'tp', 'H', 'C', 'E']
leadtime = range(1, 11)
input_data = pd.DataFrame(
    index=pd.date_range('20000101', '20191231', freq='D'),
    columns=pd.MultiIndex.from_product([leadtime, vars_, lat, lon],
                                       names=['leadtime', 'vars', 'lat', 'lon']),
    # Without an explicit dtype an empty frame gets object columns, which
    # pushes later numeric aggregation onto the slow object code path.
    dtype=float,
)

Retrieve cape data

In [15]:
# Load the 'cape' variable for lead times 1..10 and store it in the 'E'
# columns of input_data.
# The loop variable is deliberately not called `leadtime`, so the `leadtime`
# range defined with input_data is not overwritten.
for lead in range(1, 11):
    cape = xr.open_dataset(PATH + 'reforecast-cape_sfc-leadtime%s.nc' % lead).to_dataframe()['cape']
    # Keep the first row of any repeated index entry (unstack cannot pivot
    # duplicates), then pivot index levels 1 and 2 into columns.
    cape = cape.loc[~cape.index.duplicated()].unstack(level=[1,2])
    # Put the rows on a regular daily grid. Days absent from the file take the
    # most recent earlier value (forward fill); they are NOT left as NaN.
    cape = cape.resample('D').ffill()
    # Optional re-alignment, currently disabled: cape = cape.shift(lead - 1)

    for i in lat:
        for j in lon:
            input_data.loc[:, idx[lead,'E',i,j]] = cape.loc[:, idx[i,j]]

Example of cape variable of lead time 1 day in CSV

In [7]:
# Worked example: the pivoted 'cape' table for a single lead time.
leadtime = 1
# Read from the shared raw-data directory like every other cell; a bare file
# name only resolves when the working directory happens to contain the file.
cape = xr.open_dataset(PATH + 'reforecast-cape_sfc-leadtime%s.nc'%leadtime).to_dataframe()['cape']
# Keep the first row of any repeated index entry, then pivot index levels 1
# and 2 into columns.
cape = cape.loc[~cape.index.duplicated()].unstack(level=[1,2])
cape

latitude,1.50,1.50,1.25,1.25
longitude,103.75,104.00,103.75,104.00
time,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
2000-01-01,972.50,1117.75,1059.25,1262.50
2000-01-02,886.25,1007.75,983.25,1154.00
2000-01-03,842.75,893.50,927.25,1075.25
2000-01-04,850.00,834.25,942.00,964.25
2000-01-05,594.50,681.25,619.50,790.75
...,...,...,...,...
2019-12-27,220.50,246.50,299.00,391.25
2019-12-28,151.25,182.25,257.75,385.25
2019-12-29,343.50,362.25,480.25,553.00
2019-12-30,325.25,426.50,464.00,671.50


Retrieve tcdc_eatm data (cloud cover)

In [17]:
# Load the cloud-cover variable ('tcc' in the tcdc_eatm files) for lead times
# 1..10 and store it in the 'C' columns of input_data.
# The loop variable is deliberately not called `leadtime`, so the `leadtime`
# range defined with input_data is not overwritten.
for lead in range(1, 11):
    tcdc = xr.open_dataset(PATH + 'reforecast-tcdc_eatm-leadtime%s.nc' % lead).to_dataframe()['tcc']
    # Keep the first row of any repeated index entry (unstack cannot pivot
    # duplicates), then pivot index levels 1 and 2 into columns.
    tcdc = tcdc.loc[~tcdc.index.duplicated()].unstack(level=[1,2])
    # Put the rows on a regular daily grid. Days absent from the file take the
    # most recent earlier value (forward fill); they are NOT left as NaN.
    tcdc = tcdc.resample('D').ffill()
    # Optional re-alignment, currently disabled: tcdc = tcdc.shift(lead - 1)

    for i in lat:
        for j in lon:
            input_data.loc[:, idx[lead,'C',i,j]] = tcdc.loc[:, idx[i,j]]

Retrieve spfh data

In [18]:
# Load the 'q' variable from the spfh_2m files for lead times 1..10 and store
# it in the 'H' columns of input_data.
# The loop variable is deliberately not called `leadtime`, so the `leadtime`
# range defined with input_data is not overwritten.
for lead in range(1, 11):
    spfh = xr.open_dataset(PATH + 'reforecast-spfh_2m-leadtime%s.nc' % lead).to_dataframe()['q']
    # Keep the first row of any repeated index entry (unstack cannot pivot
    # duplicates), then pivot index levels 1 and 2 into columns.
    spfh = spfh.loc[~spfh.index.duplicated()].unstack(level=[1,2])
    # Put the rows on a regular daily grid. Days absent from the file take the
    # most recent earlier value (forward fill); they are NOT left as NaN.
    spfh = spfh.resample('D').ffill()
    # Optional re-alignment, currently disabled: spfh = spfh.shift(lead - 1)

    for i in lat:
        for j in lon:
            input_data.loc[:, idx[lead,'H',i,j]] = spfh.loc[:, idx[i,j]]

Retrieve reforecast data (tp)

In [19]:
# Load the reforecast 'tp' variable for lead times 1..10 and store it in the
# 'tp' columns of input_data.
# The loop variable is deliberately not called `leadtime`, so the `leadtime`
# range defined with input_data is not overwritten.
for lead in range(1, 11):
    tp = xr.open_dataset(PATH + 'reforecast-leadtime%s.nc' % lead).to_dataframe()['tp']
    # Same guard as the other loaders: keep the first row of any repeated index
    # entry so unstack cannot fail on duplicates (no effect when the index is
    # already unique), then pivot index levels 1 and 2 into columns.
    tp = tp.loc[~tp.index.duplicated()].unstack(level=[1,2])
    # Put the rows on a regular daily grid. Days absent from the file take the
    # most recent earlier value (forward fill); they are NOT left as NaN.
    tp = tp.resample('D').ffill()
    # Optional re-alignment, currently disabled: tp = tp.shift(lead - 1)

    for i in lat:
        for j in lon:
            input_data.loc[:, idx[lead,'tp',i,j]] = tp.loc[:, idx[i,j]]

Retrieve t2m data

In [20]:
# Load the 't2m' variable for lead times 1..10 and store it in the 't2m'
# columns of input_data.
# The loop variable is deliberately not called `leadtime`, so the `leadtime`
# range defined with input_data is not overwritten.
for lead in range(1, 11):
    t2m = xr.open_dataset(PATH + 'reforecast-t2m-leadtime%s.nc' % lead).to_dataframe()['t2m']
    # Keep the first row of any repeated index entry (unstack cannot pivot
    # duplicates), then pivot index levels 1 and 2 into columns.
    t2m = t2m.loc[~t2m.index.duplicated()].unstack(level=[1,2])
    # Put the rows on a regular daily grid. Days absent from the file take the
    # most recent earlier value (forward fill); they are NOT left as NaN.
    t2m = t2m.resample('D').ffill()
    # Optional re-alignment, currently disabled: t2m = t2m.shift(lead - 1)

    for i in lat:
        for j in lon:
            input_data.loc[:, idx[lead,'t2m',i,j]] = t2m.loc[:, idx[i,j]]

In [21]:
# Sanity check on the filled container: (number of days, number of
# (leadtime, variable, lat, lon) columns).
input_data.shape

(7305, 200)

In [23]:
# Finalise the feature table and write it out as CSV.
# Keep only the days on which every column has a value.
input_data = input_data.dropna(axis=0)
# Unweighted mean over the lat/lon column levels, leaving one column per
# (leadtime, variable). Grouping the transposed frame by its first two index
# levels is the same aggregation as groupby(level=[0,1], axis=1), whose `axis`
# keyword is deprecated in pandas.
input_data = input_data.T.groupby(level=[0,1]).mean().T
# groupby returns its keys sorted, so put the columns back in the intended
# order. The order lives in its own name: rebinding `idx` here would replace
# the pd.IndexSlice alias that the loading cells rely on when they are re-run.
column_order = pd.MultiIndex.from_product([range(1,11),['t2m', 'tp', 'H', 'C', 'E']])
input_data = input_data[column_order]
input_data.to_csv(post_PATH + 'input_data.csv', index=True)

# Input data

In [24]:
# Display the final feature table: one column per (leadtime, variable) after
# the lat/lon averaging.
input_data

leadtime,1,1,1,1,1,2,2,2,2,2,...,9,9,9,9,9,10,10,10,10,10
vars,t2m,tp,H,C,E,t2m,tp,H,C,E,...,t2m,tp,H,C,E,t2m,tp,H,C,E
2000-01-01,298.732574,9.575000,0.017915,86.574997,1103.0000,299.111755,15.725000,0.017794,99.2500,1072.3125,...,298.886749,0.525000,0.016846,36.2500,925.3125,298.846619,9.975000,0.017213,49.5000,1169.3750
2000-01-02,299.032166,8.450000,0.017592,100.000000,1007.8125,298.414093,19.375000,0.017458,99.1875,767.8750,...,298.484497,8.100000,0.017432,65.7500,1015.0000,298.197449,19.775002,0.017117,96.3125,799.1875
2000-01-03,299.274414,2.492500,0.017432,90.868752,934.6875,299.059875,12.500000,0.017477,83.5000,790.7500,...,298.538849,26.824999,0.017166,86.8750,964.3125,297.618561,11.575001,0.017026,97.0000,824.5625
2000-01-04,299.242432,11.217500,0.017409,96.537498,897.6250,297.286194,13.475000,0.016981,98.1875,367.8750,...,298.658875,16.100000,0.017146,86.5000,787.5000,298.438538,11.450000,0.017258,90.0000,890.8125
2000-01-05,298.122131,14.474999,0.017249,98.793747,671.5000,295.831390,73.887505,0.016346,99.8125,113.7500,...,298.475403,5.925000,0.017026,65.0625,946.3125,298.594696,0.675000,0.016015,54.5625,390.4375
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2019-12-27,299.413849,0.000000,0.016638,56.112499,289.3125,299.304077,2.925000,0.016809,44.1875,266.0625,...,298.847687,0.075000,0.017107,87.3750,663.8750,299.307739,1.550000,0.017038,96.0000,623.2500
2019-12-28,299.767151,0.000000,0.016651,63.575001,244.1250,299.399933,0.025000,0.016865,49.1250,330.9375,...,299.033478,8.434999,0.017320,86.2500,611.1250,299.209320,9.825000,0.017156,95.5000,539.6250
2019-12-29,299.702393,0.175000,0.017191,32.112499,434.7500,299.147278,0.150000,0.017005,53.1250,345.6250,...,298.984619,7.875000,0.017670,88.8125,844.5625,299.174286,7.570000,0.017564,95.0000,840.6250
2019-12-30,299.547485,0.175000,0.017161,44.156250,471.8125,299.225006,2.500000,0.016858,95.0000,366.0625,...,298.759247,4.575000,0.016866,89.0000,446.3125,298.978607,1.325000,0.017175,99.0000,559.1875


# Ground Truth

In [25]:
# Display the daily ground-truth table: a single 'tp' column at leadtime 0
# after the lat/lon averaging.
ground_truth

leadtime,0
vars,tp
2000-01-01,11.800
2000-01-02,11.175
2000-01-03,4.125
2000-01-04,14.575
2000-01-05,12.000
...,...
2019-12-27,0.225
2019-12-28,0.125
2019-12-29,0.550
2019-12-30,0.800
