# Pre-extracting

In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import ipywidgets as widgets
import xarray as xr
import datetime
print('All packages imported.')

All packages imported.


## Site Selection

In [2]:
# ARM data product name; later combined with the selected site to form the
# data stream name.
product = 'armbeatm'
# Selection widget listing the ARM sites offered by this notebook, with
# 'sgp-C1' preselected.
# NOTE(review): later cells read site.value, so the notebook's results depend
# on the interactive selection made here.
site = widgets.Select(
    options=['sgp-C1', 'twp-C1', 'twp-C2', 'twp-C3'],
    value='sgp-C1',
    rows=6,
    description='ARM Site:',
    disabled=False
)


def on_change(change):
    """Report the newly selected site when the widget's value changes.

    Parameters
    ----------
    change : dict
        Change notification; the keys 'type', 'name' and 'new' are read.
        Notifications that are not a 'change' of the 'value' trait are ignored.
    """
    is_value_change = change['type'] == 'change' and change['name'] == 'value'
    if not is_value_change:
        return
    print(f"Switch site to {change['new']}")


# Narrow the widget so it does not span the whole output area.
site.layout.width = '175px'
# Subscribe to the 'value' trait only, so the handler is not invoked for every
# other trait change of the widget (it would ignore those notifications anyway).
site.observe(on_change, names='value')
display(site)

Select(description='ARM Site:', layout=Layout(width='175px'), options=('sgp-C1', 'twp-C1', 'twp-C2', 'twp-C3')…

Switch site to twp-C1
Switch site to twp-C2
Switch site to twp-C3


## Data Import

In [31]:
def site2stream(site, product):
    """Build the data stream name for a site and product.

    The site is split at its single '-' into a region and an index, and the
    product name is placed between them, e.g. 'twp-C3' with 'armbeatm' gives
    'twparmbeatmC3'. The resulting name is also printed.

    Parameters
    ----------
    site : str
        Site identifier containing exactly one '-'.
    product : str
        Data product name.

    Returns
    -------
    str
        The data stream name.

    Raises
    ------
    ValueError
        If ``site`` does not split into exactly two parts at '-'.
    """
    region, index = site.split('-')
    stream = ''.join((region, product, index))
    print(f'Ready for data stream: {stream}')
    return stream

def get_ARM_files_path(site, stream, ext='nc'):
    """Return the glob pattern matching all files of one ARM data stream.

    The pattern points two levels above the current working directory, into
    ``data/ARM/<site>/<stream>``, and matches ``<stream>.*.<ext>``.

    Parameters
    ----------
    site : str
        Site identifier used as a directory name.
    stream : str
        Data stream name, used both as directory name and file prefix.
    ext : str, optional
        File extension without the leading dot (default 'nc').

    Returns
    -------
    str
        The glob pattern; nothing is checked against the file system.
    """
    stream_dir = f'{os.getcwd()}/../../data/ARM/{site}/{stream}'
    return f'{stream_dir}/{stream}.*.{ext}'

def get_DS(mf):
    """Open the files described by ``mf`` as one dataset.

    ``mf`` is passed unchanged to ``xr.open_mfdataset``, whose result is
    returned.
    """
    dataset = xr.open_mfdataset(mf)
    return dataset

In [32]:
# Build the data stream name for the site currently selected in the widget.
stream = site2stream(site.value, product)
# Glob pattern for this stream's '.cdf' files.
FILE_PATH = get_ARM_files_path(site=site.value, stream=stream, ext='cdf')
# Open every file matching the pattern as a single dataset.
DS = get_DS(FILE_PATH)
# Bare last expression so the notebook displays the dataset.
DS

Ready for data stream: twparmbeatmC3


<xarray.Dataset>
Dimensions:        (p: 37, range: 2, time: 78888, z: 512)
Coordinates:
  * time           (time) datetime64[ns] 2002-01-01T00:30:00 ... 2010-12-31T23:30:00
  * p              (p) float32 1000.0 975.0 950.0 925.0 ... 150.0 125.0 100.0
  * z              (z) float32 15.0 60.0 105.0 150.0 ... 22920.0 22965.0 23010.0
    z10            float32 10.0
    z2             float32 2.0
Dimensions without coordinates: range
Data variables:
    base_time      (time) datetime64[ns] 2002-01-01 2002-01-01 ... 2010-01-01
    time_offset    (time) datetime64[ns] dask.array<shape=(78888,), chunksize=(8760,)>
    time_bounds    (time, range) datetime64[ns] dask.array<shape=(78888, 2), chunksize=(8760, 2)>
    time_frac      (time) datetime64[ns] dask.array<shape=(78888,), chunksize=(8760,)>
    p_bounds       (time, p, range) float64 dask.array<shape=(78888, 37, 2), chunksize=(8760, 37, 2)>
    z_bounds       (time, z, range) float64 dask.array<shape=(78888, 512, 2), chunksize=(8760, 512,

In [33]:
# Try to import newer measurements with different format (stored as '.nc' files).
# DS2 is reset first so that a dataset left over from an earlier run (for
# example for a previously selected site) cannot leak into the later cells
# when this import fails.
DS2 = None
try:
    stream = site2stream(site.value, product)
    FILE_PATH = get_ARM_files_path(site=site.value, stream=stream, ext='nc')
    DS2 = xr.open_mfdataset(FILE_PATH, decode_times=False, concat_dim='time')
    # Times are not decoded on load, so the time coordinate is rebuilt by
    # reading base_time + time_offset as POSIX timestamps in UTC. The
    # timezone-aware fromtimestamp call replaces datetime.utcfromtimestamp
    # (deprecated since Python 3.12); tzinfo is stripped again so the values
    # stay naive UTC exactly as before.
    DS2['time'] = [
        np.datetime64(
            datetime.datetime.fromtimestamp(seconds, datetime.timezone.utc).replace(tzinfo=None)
        )
        for seconds in (DS2.base_time + DS2.time_offset).values
    ]
    # Map the variable names of the newer files onto the short names used for
    # the variables of interest in the rest of the notebook.
    old_name = ['precip_rate_sfc', 'temperature_sfc', 'relative_humidity_sfc',
                'u_wind_sfc', 'v_wind_sfc', 'pressure_sfc',
                'temperature_p', 'relative_humidity_p', 'u_wind_p', 'v_wind_p']
    new_name = ['prec_sfc', 'T_sfc', 'rh_sfc',
                'u_sfc', 'v_sfc', 'p_sfc',
                'T_p', 'rh_p', 'u_p', 'v_p']
    name_dict = dict(zip(old_name, new_name))
    DS2 = DS2.rename(name_dict)
    print(DS2)
except OSError:
    # NOTE(review): presumably how "no matching '.nc' files" surfaces from
    # open_mfdataset -- confirm against the installed xarray version.
    DS2 = None
    print('Nothing to do.')
except Exception as err:
    # Still best-effort (the notebook keeps running), but the reason is shown
    # instead of being hidden, and a half-prepared DS2 is discarded.
    DS2 = None
    print(f'Newer measurements could not be prepared ({type(err).__name__}: {err}).')

Ready for data stream: twparmbeatmC3
Nothing to do.


In [34]:
# Ensure that qc is null: after dropping NaN entries along time, no QC value
# should remain.
try:
    if DS['qc_precip_sfc'].dropna(dim='time').values.size == 0:
        print('QC is empty.')
    else:
        print('QC IS NOT EMPTY!!!')
except KeyError:
    # Only a missing 'qc_precip_sfc' variable means "no QC". Any other failure
    # now propagates instead of being reported as a missing variable.
    print('There is no QC.')

QC is empty.


# Data Handling

## Variable Extraction

In [35]:
from utils import DS_count_valid, DS_extract

In [36]:
# Hand-picked variables of interest: the '_sfc' variables first, followed by
# the '_p' variables.
var_interest = [
    'prec_sfc', 'T_sfc', 'rh_sfc', 'u_sfc', 'v_sfc', 'p_sfc',
    'T_p', 'rh_p', 'u_p', 'v_p',
]

In [37]:
# Keep only the variables of interest; 'z2' and 'z10' are passed as drop_list.
DS_interest = DS_extract(DS,
                         extract_list=var_interest,
                         drop_list=['z2', 'z10'])
print(DS_interest)

# Reset first so a DS2_interest left over from an earlier run is never reused.
DS2_interest = None
if globals().get('DS2') is None:
    # No newer-format dataset is available; keep the blank-line output.
    print('')
else:
    try:
        DS2_interest = DS_extract(DS2,
                                  extract_list=var_interest)
        print(DS2_interest)
    except Exception as err:
        # Best-effort: continue without the newer data, but say why.
        DS2_interest = None
        print(f'Skipping newer measurements, extraction failed ({type(err).__name__}: {err}).')

<xarray.Dataset>
Dimensions:   (p: 37, time: 78888)
Coordinates:
  * p         (p) float32 1000.0 975.0 950.0 925.0 ... 175.0 150.0 125.0 100.0
  * time      (time) datetime64[ns] 2002-01-01T00:30:00 ... 2010-12-31T23:30:00
Data variables:
    prec_sfc  (time) float32 dask.array<shape=(78888,), chunksize=(8760,)>
    T_sfc     (time) float32 dask.array<shape=(78888,), chunksize=(8760,)>
    rh_sfc    (time) float32 dask.array<shape=(78888,), chunksize=(8760,)>
    u_sfc     (time) float32 dask.array<shape=(78888,), chunksize=(8760,)>
    v_sfc     (time) float32 dask.array<shape=(78888,), chunksize=(8760,)>
    p_sfc     (time) float32 dask.array<shape=(78888,), chunksize=(8760,)>
    T_p       (time, p) float32 dask.array<shape=(78888, 37), chunksize=(8760, 37)>
    rh_p      (time, p) float32 dask.array<shape=(78888, 37), chunksize=(8760, 37)>
    u_p       (time, p) float32 dask.array<shape=(78888, 37), chunksize=(8760, 37)>
    v_p       (time, p) float32 dask.array<shape=(78888, 3

## Predictand Shift

In [38]:
from utils import DS_shift_and_append

In [39]:
# Append 'prec_sfc_next', a copy of 'prec_sfc' shifted by one hour, as the
# predictand.
DS_shift = DS_shift_and_append(DS_interest,
                               var_name='prec_sfc',
                               new_var_name='prec_sfc_next',
                               shift_hour=1)
print(DS_shift)

# Reset first so a DS2_shift left over from an earlier run is never reused.
DS2_shift = None
if globals().get('DS2_interest') is None:
    # No newer-format data was extracted; keep the blank-line output.
    print('')
else:
    try:
        DS2_shift = DS_shift_and_append(DS2_interest,
                                        var_name='prec_sfc',
                                        new_var_name='prec_sfc_next',
                                        shift_hour=1)
        print(DS2_shift)
    except Exception as err:
        # Best-effort: continue without the newer data, but say why.
        DS2_shift = None
        print(f'Skipping newer measurements, shift failed ({type(err).__name__}: {err}).')

<xarray.Dataset>
Dimensions:        (p: 37, time: 78888)
Coordinates:
  * time           (time) datetime64[ns] 2002-01-01T00:30:00 ... 2010-12-31T23:30:00
  * p              (p) float32 1000.0 975.0 950.0 925.0 ... 150.0 125.0 100.0
Data variables:
    prec_sfc_next  (time) float32 dask.array<shape=(78888,), chunksize=(8760,)>
    prec_sfc       (time) float32 dask.array<shape=(78888,), chunksize=(8760,)>
    T_sfc          (time) float32 dask.array<shape=(78888,), chunksize=(8760,)>
    rh_sfc         (time) float32 dask.array<shape=(78888,), chunksize=(8760,)>
    u_sfc          (time) float32 dask.array<shape=(78888,), chunksize=(8760,)>
    v_sfc          (time) float32 dask.array<shape=(78888,), chunksize=(8760,)>
    p_sfc          (time) float32 dask.array<shape=(78888,), chunksize=(8760,)>
    T_p            (time, p) float32 dask.array<shape=(78888, 37), chunksize=(8760, 37)>
    rh_p           (time, p) float32 dask.array<shape=(78888, 37), chunksize=(8760, 37)>
    u_p      

In [40]:
# Sanity check: the position of the largest 'prec_sfc' value must be exactly
# one step after the position of the largest 'prec_sfc_next' value.
print(
    'Precipitation shift is correctly done.'
    if np.nanargmax(DS_shift['prec_sfc'].values) == np.nanargmax(DS_shift['prec_sfc_next'].values) + 1
    else 'Something wrong with the shift.'
)

Precipitation shift is correctly done.


## NaN Dropping

In [41]:
# Drop every time step that contains a NaN.
DS_shift = DS_shift.dropna(dim='time')
print(DS_shift)

if globals().get('DS2_shift') is None:
    # No newer-format data reached this step; keep the blank-line output.
    print('')
else:
    try:
        DS2_shift = DS2_shift.dropna(dim='time')
        print(DS2_shift)
    except Exception as err:
        # Best-effort: continue without the newer data, but say why. The data
        # is discarded so that rows which were not NaN-filtered cannot reach
        # the merge step.
        DS2_shift = None
        print(f'Skipping newer measurements, NaN dropping failed ({type(err).__name__}: {err}).')

<xarray.Dataset>
Dimensions:        (p: 37, time: 5470)
Coordinates:
  * time           (time) datetime64[ns] 2002-04-01T11:30:00 ... 2010-08-16T23:30:00
  * p              (p) float32 1000.0 975.0 950.0 925.0 ... 150.0 125.0 100.0
Data variables:
    prec_sfc_next  (time) float32 dask.array<shape=(5470,), chunksize=(487,)>
    prec_sfc       (time) float32 dask.array<shape=(5470,), chunksize=(487,)>
    T_sfc          (time) float32 dask.array<shape=(5470,), chunksize=(487,)>
    rh_sfc         (time) float32 dask.array<shape=(5470,), chunksize=(487,)>
    u_sfc          (time) float32 dask.array<shape=(5470,), chunksize=(487,)>
    v_sfc          (time) float32 dask.array<shape=(5470,), chunksize=(487,)>
    p_sfc          (time) float32 dask.array<shape=(5470,), chunksize=(487,)>
    T_p            (time, p) float32 dask.array<shape=(5470, 37), chunksize=(487, 37)>
    rh_p           (time, p) float32 dask.array<shape=(5470, 37), chunksize=(487, 37)>
    u_p            (time, p) flo

# DataSet Merging

In [42]:
if globals().get('DS2_shift') is None:
    # No newer-format data to merge; keep the blank-line output.
    print('')
else:
    try:
        # Align the pressure dimension name with DS_shift before merging.
        # The rename is applied only while 'pressure' is still present, so
        # re-running this cell does not silently skip the merge.
        if 'pressure' in DS2_shift or 'pressure' in DS2_shift.dims:
            DS2_shift = DS2_shift.rename({'pressure': 'p'})
        DS_shift = xr.merge([DS_shift, DS2_shift])
        print(DS_shift)
    except Exception as err:
        # Best-effort: DS_shift is left as it was, but the reason is reported
        # instead of being swallowed.
        print(f'Merge skipped ({type(err).__name__}: {err}); DS_shift is left unchanged.')




# Post-extracting

## NetCDF Saving

In [43]:
def save_netcdf(DS, FILE_PATH):
    """Write ``DS`` to ``FILE_PATH`` via ``DS.to_netcdf`` and report the location.

    The directory part of ``FILE_PATH`` is created first when it does not
    exist yet, so saving into a fresh output folder does not fail.

    Parameters
    ----------
    DS : object
        Anything exposing ``to_netcdf(path)``.
    FILE_PATH : str
        Destination file path.

    Returns
    -------
    None
    """
    target_dir = os.path.dirname(FILE_PATH)
    # A bare file name has no directory part; os.makedirs('') would raise.
    if target_dir:
        os.makedirs(target_dir, exist_ok=True)

    DS.to_netcdf(FILE_PATH)

    # Only the part after the last '../' is shown, to keep the message short.
    print_path = FILE_PATH.split('../')
    print(f'Saved to {print_path[-1]}')
    return None


def get_save_file_path(file_name, stage=3):
    """Return the output path for a cleaned data file.

    The path points two levels above the current working directory, into
    ``data/stage-<stage>_cleaned``.

    Parameters
    ----------
    file_name : str
        Name of the file to be stored.
    stage : int, optional
        Stage number used in the directory name (default 3).

    Returns
    -------
    str
        The full file path; nothing is created on disk.
    """
    cleaned_dir = f'{os.getcwd()}/../../data/stage-{stage}_cleaned'
    return f'{cleaned_dir}/{file_name}'

In [44]:
# The output file is named after the current data stream.
file_name = f'{stream}_standard_dropped.cdf'
# Resolve the destination inside the default stage-3 cleaned-data directory.
FILE_PATH = get_save_file_path(file_name)

# Write the NaN-dropped (and, when the merge succeeded, merged) dataset to disk.
save_netcdf(DS_shift, FILE_PATH)

Saved to data/stage-3_cleaned/twparmbeatmC3_standard_dropped.cdf
