# Pre-extracting

In [37]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import ipywidgets as widgets
import xarray as xr
import datetime
print('All packages imported.')

All packages imported.


## Site Selection

In [2]:
def on_change(change):
    """Announce the new selection when the widget's ``value`` trait changes.

    ``change`` is the notification dict passed by ``observe``; events that
    are not a change of ``value`` are ignored.
    """
    is_value_change = change['type'] == 'change' and change['name'] == 'value'
    if not is_value_change:
        return
    print(f"Switch site to {change['new']}")


# List box for choosing the ARM site; 'sgp-C1' is pre-selected.
site = widgets.Select(
    description='ARM Site:',
    options=['hfe-M1', 'nsa-C1', 'sgp-C1', 'twp-C1', 'twp-C2', 'twp-C3'],
    value='sgp-C1',
    rows=6,
    disabled=False,
)
site.layout.width = '175px'
site.observe(on_change)
display(site)

Select(description='ARM Site:', index=2, layout=Layout(width='175px'), options=('hfe-M1', 'nsa-C1', 'sgp-C1', …

In [3]:
# List box for choosing the data product; only 'armbeatm' is offered.
product = widgets.Select(
    options=['armbeatm'],
    value='armbeatm',
    rows=1,
    description='Product:',
    disabled=False
)


def on_change(change):
    """Announce the new selection when the product widget's value changes.

    ``change`` is the notification dict passed by ``observe``; events that
    are not a change of ``value`` are ignored.

    NOTE: this rebinds the module-level name ``on_change``. A handler that
    was already registered on another widget is unaffected, because
    ``observe`` holds a reference to the function object it was given.
    """
    if change['type'] == 'change' and change['name'] == 'value':
        # This selector switches the product, not the site.
        print(f"Switch product to {change['new']}")


product.layout.width = '175px'
product.observe(on_change)
display(product)

Select(description='Product:', layout=Layout(width='175px'), options=('armbeatm',), rows=1, value='armbeatm')

## Data Import

In [14]:
def site2stream(site, product):
    """Build the data-stream name for a site/product pair.

    ``site`` must have the form ``'<region>-<index>'`` (exactly one ``-``);
    the stream name is ``<region><product><index>``, e.g.
    ``('sgp-C1', 'armbeatm') -> 'sgparmbeatmC1'``.

    Prints the stream name and returns it.
    Raises ValueError if ``site`` does not split into exactly two parts.
    """
    prefix, suffix = site.split('-')
    stream_name = ''.join([prefix, product, suffix])
    print(f'Ready for data stream: {stream_name}')
    return stream_name

def get_ARM_files_path(site, stream, ext='nc'):
    """Return the glob pattern matching all files of one ARM data stream.

    The pattern is anchored at the current working directory:
    ``<cwd>/../../data/ARM/<site>/<stream>/<stream>.*.<ext>``.

    Parameters
    ----------
    site : site identifier used as the directory name (e.g. ``'sgp-C1'``).
    stream : data-stream name, used for both the directory and file prefix.
    ext : file extension without the dot (default ``'nc'``).
    """
    pieces = (
        os.getcwd(),
        '../../data/ARM',
        f'{site}',
        f'{stream}',
        f'{stream}.*.{ext}',
    )
    return '/'.join(pieces)

def get_DS(mf):
    """Open the files described by ``mf`` as one dataset.

    ``mf`` is forwarded unchanged to ``xr.open_mfdataset`` with default
    options; the resulting dataset is returned.
    """
    dataset = xr.open_mfdataset(mf)
    return dataset

In [13]:
# Resolve the stream from the current widget selections, then open its .cdf files.
stream = site2stream(site=site.value, product=product.value)
FILE_PATH = get_ARM_files_path(site.value, stream, 'cdf')
DS = get_DS(mf=FILE_PATH)
DS

Ready for data stream: sgparmbeatmC1


<xarray.Dataset>
Dimensions:        (p: 37, range: 2, time: 166560, z: 512)
Coordinates:
  * time           (time) datetime64[ns] 1994-01-01T00:30:00 ... 2012-12-31T23:30:00
  * p              (p) float32 1000.0 975.0 950.0 925.0 ... 150.0 125.0 100.0
  * z              (z) float32 15.0 60.0 105.0 150.0 ... 22920.0 22965.0 23010.0
    z10            float32 10.0
    z2             float32 2.0
Dimensions without coordinates: range
Data variables:
    base_time      (time) datetime64[ns] 1994-01-01 1994-01-01 ... 2012-01-01
    time_offset    (time) datetime64[ns] dask.array<shape=(166560,), chunksize=(8760,)>
    time_bounds    (time, range) datetime64[ns] dask.array<shape=(166560, 2), chunksize=(8760, 2)>
    time_frac      (time) datetime64[ns] dask.array<shape=(166560,), chunksize=(8760,)>
    p_bounds       (time, p, range) float64 dask.array<shape=(166560, 37, 2), chunksize=(8760, 37, 2)>
    z_bounds       (time, z, range) float64 dask.array<shape=(166560, 512, 2), chunksize=(8760

In [78]:
# Best-effort import of the '.nc' files of the selected stream. These are
# opened with decode_times=False, so the time axis is rebuilt by hand and
# the variables are renamed to the short names used later in the notebook.
try:
    stream = site2stream(site.value, product.value)
    FILE_PATH = get_ARM_files_path(site=site.value, stream=stream, ext='nc')
    # NOTE(review): recent xarray releases require combine='nested' together
    # with concat_dim — confirm against the installed version.
    DS2 = xr.open_mfdataset(FILE_PATH, decode_times=False, concat_dim='time')

    # base_time + time_offset is treated as seconds since the Unix epoch (UTC).
    # Convert the whole array at once instead of looping over
    # datetime.utcfromtimestamp, which is deprecated since Python 3.12.
    epoch_seconds = (DS2.base_time + DS2.time_offset).values
    DS2['time'] = pd.to_datetime(epoch_seconds, unit='s').values

    old_name = ['precip_rate_sfc', 'temperature_sfc', 'relative_humidity_sfc',
                'u_wind_sfc', 'v_wind_sfc', 'pressure_sfc',
                'temperature_p', 'relative_humidity_p', 'u_wind_p', 'v_wind_p']
    new_name = ['prec_sfc', 'T_sfc', 'rh_sfc',
                'u_sfc', 'v_sfc', 'p_sfc',
                'T_p', 'rh_p', 'u_p', 'v_p']
    name_dict = dict(zip(old_name, new_name))
    DS2 = DS2.rename(name_dict)
    print(DS2)
except Exception as err:
    # Stay best-effort (e.g. a site without '.nc' files), but say why the
    # import was skipped, and let KeyboardInterrupt/SystemExit propagate.
    print(f'Nothing to do. ({type(err).__name__}: {err})')

Ready for data stream: sgparmbeatmC1
<xarray.Dataset>
Dimensions:                    (bound: 2, height: 512, pressure: 37, time: 35064)
Coordinates:
  * pressure                   (pressure) float32 1000.0 975.0 ... 125.0 100.0
  * height                     (height) float32 15.0 60.0 ... 22965.0 23010.0
  * time                       (time) datetime64[ns] 2013-01-01T00:30:00 ... 2016-12-31T23:30:00
Dimensions without coordinates: bound
Data variables:
    base_time                  (time) int32 1356998400 1356998400 ... 1451606400
    time_offset                (time) float64 dask.array<shape=(35064,), chunksize=(8760,)>
    time_bounds                (time, bound) float64 dask.array<shape=(35064, 2), chunksize=(8760, 2)>
    time_frac                  (time) float32 dask.array<shape=(35064,), chunksize=(8760,)>
    pressure_bounds            (time, pressure, bound) float32 dask.array<shape=(35064, 37, 2), chunksize=(8760, 37, 2)>
    height_bounds              (time, height, bound) f

In [79]:
# Ensure that qc is null
# Only a missing 'qc_precip_sfc' variable means "no QC"; any other failure
# is a real error and must not be reported as an absent QC field.
try:
    qc_precip = DS['qc_precip_sfc']
except KeyError:
    print('There is no QC.')
else:
    # The QC field counts as empty when no time step survives dropna.
    if qc_precip.dropna(dim='time').values.size == 0:
        print('QC is empty.')
    else:
        print('QC IS NOT EMPTY!!!')

QC is empty.


# Data Handling

## Variable Extraction

In [80]:
from utils import DS_count_valid, DS_extract

In [81]:
# hand-pick var_interest
var_interest = [
    # '*_sfc' variables
    'prec_sfc',
    'T_sfc',
    'rh_sfc',
    'u_sfc',
    'v_sfc',
    'p_sfc',
    # '*_p' variables
    'T_p',
    'rh_p',
    'u_p',
    'v_p',
]

In [87]:
# Extract the hand-picked variables; 'z2' and 'z10' are passed as the drop list.
# NOTE(review): the recorded output has a `pressure` dimension, which matches
# the renamed DS2 rather than DS as first loaded (dimension `p`) — confirm
# which dataset `DS` refers to on a fresh Restart & Run All.
coords_to_drop = ['z2', 'z10']
DS_interest = DS_extract(DS, extract_list=var_interest, drop_list=coords_to_drop)
DS_interest

<xarray.Dataset>
Dimensions:   (pressure: 37, time: 35064)
Coordinates:
  * pressure  (pressure) float32 1000.0 975.0 950.0 925.0 ... 150.0 125.0 100.0
  * time      (time) datetime64[ns] 2013-01-01T00:30:00 ... 2016-12-31T23:30:00
Data variables:
    prec_sfc  (time) float32 dask.array<shape=(35064,), chunksize=(8760,)>
    T_sfc     (time) float32 dask.array<shape=(35064,), chunksize=(8760,)>
    rh_sfc    (time) float32 dask.array<shape=(35064,), chunksize=(8760,)>
    u_sfc     (time) float32 dask.array<shape=(35064,), chunksize=(8760,)>
    v_sfc     (time) float32 dask.array<shape=(35064,), chunksize=(8760,)>
    p_sfc     (time) float32 dask.array<shape=(35064,), chunksize=(8760,)>
    T_p       (time, pressure) float32 dask.array<shape=(35064, 37), chunksize=(8760, 37)>
    rh_p      (time, pressure) float32 dask.array<shape=(35064, 37), chunksize=(8760, 37)>
    u_p       (time, pressure) float32 dask.array<shape=(35064, 37), chunksize=(8760, 37)>
    v_p       (time, pressure

## Predictand Shift

In [83]:
from utils import DS_shift_and_append

In [88]:
# Append a shifted copy of 'prec_sfc' under the name 'prec_sfc_next'.
shift_options = dict(
    var_name='prec_sfc',
    new_var_name='prec_sfc_next',
    shift_hour=1,
)
DS_shift = DS_shift_and_append(DS_interest, **shift_options)
DS_shift

<xarray.Dataset>
Dimensions:        (pressure: 37, time: 35064)
Coordinates:
  * time           (time) datetime64[ns] 2013-01-01T00:30:00 ... 2016-12-31T23:30:00
  * pressure       (pressure) float32 1000.0 975.0 950.0 ... 150.0 125.0 100.0
Data variables:
    prec_sfc_next  (time) float32 dask.array<shape=(35064,), chunksize=(8760,)>
    prec_sfc       (time) float32 dask.array<shape=(35064,), chunksize=(8760,)>
    T_sfc          (time) float32 dask.array<shape=(35064,), chunksize=(8760,)>
    rh_sfc         (time) float32 dask.array<shape=(35064,), chunksize=(8760,)>
    u_sfc          (time) float32 dask.array<shape=(35064,), chunksize=(8760,)>
    v_sfc          (time) float32 dask.array<shape=(35064,), chunksize=(8760,)>
    p_sfc          (time) float32 dask.array<shape=(35064,), chunksize=(8760,)>
    T_p            (time, pressure) float32 dask.array<shape=(35064, 37), chunksize=(8760, 37)>
    rh_p           (time, pressure) float32 dask.array<shape=(35064, 37), chunksize=(87

In [90]:
# Sanity check: the maximum of the shifted series (NaNs ignored) must sit
# exactly one position before the maximum of the original series.
peak_index = np.nanargmax(DS_shift['prec_sfc'].values)
peak_index_next = np.nanargmax(DS_shift['prec_sfc_next'].values)
shift_is_ok = (peak_index - peak_index_next) == 1
print('Precipitation shift is correctly done.' if shift_is_ok
      else 'Something wrong with the shift.')

Precipitation shift is correctly done.


## NaN Dropping

In [94]:
# Drop every time step that has a NaN in any variable. how='any' is the
# xarray default, spelled out here because it is strict: in the recorded
# run only 517 time steps remain afterwards.
DS_shift = DS_shift.dropna(dim='time', how='any')
DS_shift

<xarray.Dataset>
Dimensions:        (p: 37, time: 517)
Coordinates:
  * time           (time) datetime64[ns] 2013-01-01T17:30:00 ... 2016-12-30T11:30:00
  * p              (p) float32 1000.0 975.0 950.0 925.0 ... 150.0 125.0 100.0
Data variables:
    prec_sfc_next  (time) float32 dask.array<shape=(517,), chunksize=(91,)>
    prec_sfc       (time) float32 dask.array<shape=(517,), chunksize=(91,)>
    T_sfc          (time) float32 dask.array<shape=(517,), chunksize=(91,)>
    rh_sfc         (time) float32 dask.array<shape=(517,), chunksize=(91,)>
    u_sfc          (time) float32 dask.array<shape=(517,), chunksize=(91,)>
    v_sfc          (time) float32 dask.array<shape=(517,), chunksize=(91,)>
    p_sfc          (time) float32 dask.array<shape=(517,), chunksize=(91,)>
    T_p            (time, p) float32 dask.array<shape=(517, 37), chunksize=(91, 37)>
    rh_p           (time, p) float32 dask.array<shape=(517, 37), chunksize=(91, 37)>
    u_p            (time, p) float32 dask.array<sha

# Post-extracting

## NetCDF Saving

In [97]:
def save_netcdf(DS, FILE_PATH):
    """Write ``DS`` to ``FILE_PATH`` via ``DS.to_netcdf`` and report where.

    The parent directory of ``FILE_PATH`` is created first if it does not
    exist, so saving into a fresh output folder does not fail.

    Parameters
    ----------
    DS : object exposing ``to_netcdf(path)``.
    FILE_PATH : str, destination file path.

    Returns
    -------
    None
    """
    target_dir = os.path.dirname(FILE_PATH)
    if target_dir:  # empty for a bare file name: nothing to create
        os.makedirs(target_dir, exist_ok=True)

    DS.to_netcdf(FILE_PATH)

    # Print only the part after the last '../' to keep the message short.
    print_path = FILE_PATH.split('../')
    print(f'Saved to {print_path[-1]}')
    return None


def get_save_file_path(file_name, stage=2):
    """Return the output path for ``file_name`` in the cleaned-data folder.

    The path is anchored at the current working directory:
    ``<cwd>/../../data/stage-<stage>_cleaned/<file_name>``.
    """
    cleaned_dir = '{}/../../data/stage-{}_cleaned'.format(os.getcwd(), stage)
    return '{}/{}'.format(cleaned_dir, file_name)

In [98]:
# The output file is named after the data stream it was derived from.
file_name = '{}_standard_dropped.cdf'.format(stream)
FILE_PATH = get_save_file_path(file_name=file_name)

save_netcdf(DS=DS_shift, FILE_PATH=FILE_PATH)

Saved to data/stage-2_cleaned/sgparmbeatmC1_standard_dropped.cdf
