# Libraries

In [1]:
import os

import numpy as np
import pandas as pd
from math import floor, ceil

import matplotlib
# matplotlib.use('TkAgg')
import matplotlib.pyplot as plt
import seaborn as sns

import xarray as xr
#import tensorflow as tf

print('All packages imported.')

All packages imported.


# Data Import

In [2]:
def get_file_path(file_name, stage=1):
    """Build the path of a data file inside a ``stage-<stage>_cleaned`` directory.

    The path is anchored at the current working directory, so the result
    depends on where the notebook kernel was started.

    Parameters
    ----------
    file_name : str
        Name of the file, e.g. ``'twparmbeatmC1.cdf'``.
    stage : int or str, optional
        Value inserted into the ``stage-<stage>_cleaned`` directory name.
        Defaults to 1, which reproduces the previously hard-coded directory
        and matches the default of ``get_save_file_path``.

    Returns
    -------
    str
        ``<cwd>/../../../data/stage-<stage>_cleaned/<file_name>``.
    """
    current_dir = os.getcwd()
    data_dir = f'{current_dir}/../../../data/stage-{stage}_cleaned'
    return f'{data_dir}/{file_name}'


def import_DS(FILE_PATH):
    """Open the file at ``FILE_PATH`` with ``xr.open_dataset`` and return the result.

    Parameters
    ----------
    FILE_PATH : str
        Path handed unchanged to ``xr.open_dataset``.
    """
    dataset = xr.open_dataset(FILE_PATH)
    return dataset

def DS_dropna(DS):
    """Return the result of ``DS.dropna(dim='time')``.

    Parameters
    ----------
    DS : object exposing ``dropna(dim=...)``
        The notebook passes xarray objects here.
    """
    without_missing = DS.dropna(dim='time')
    return without_missing

In [3]:
# Build the path of 'twparmbeatmC1.cdf' (resolved relative to the current working directory)
file_name = 'twparmbeatmC1.cdf'
FILE_PATH = get_file_path(file_name)

# Open the file as an xarray Dataset; the bare name on the last line displays its summary
armbeatm = import_DS(FILE_PATH)
armbeatm

<xarray.Dataset>
Dimensions:        (p: 37, time: 131496)
Coordinates:
  * time           (time) datetime64[ns] 1996-01-01T00:30:00 ... 2010-12-31T23:30:00
  * p              (p) float32 1000.0 975.0 950.0 925.0 ... 150.0 125.0 100.0
Data variables:
    prec_sfc_next  (time) float32 ...
    T_sfc          (time) float32 ...
    p_sfc          (time) float32 ...
    rh_sfc         (time) float32 ...
    u_sfc          (time) float32 ...
    v_sfc          (time) float32 ...
    prec_sfc       (time) float32 ...
    T_p            (time, p) float32 ...
    rh_p           (time, p) float32 ...
    u_p            (time, p) float32 ...
    v_p            (time, p) float32 ...

In [4]:
# Build the path of 'twpqcrad1longC1.cdf'; note this rebinds file_name and FILE_PATH from the previous cell
file_name = 'twpqcrad1longC1.cdf'
FILE_PATH = get_file_path(file_name)

# Open the file as an xarray Dataset; the bare name on the last line displays its summary
qcrad = import_DS(FILE_PATH)
qcrad

<xarray.Dataset>
Dimensions:                       (time: 8904960)
Coordinates:
  * time                          (time) datetime64[ns] 1996-10-10 ... 2014-07-06T23:59:00
Data variables:
    down_short_diffuse_hemisp     (time) float32 ...
    qc_down_short_diffuse_hemisp  (time) int32 ...

In [5]:
# Number of NaN samples in down_short_diffuse_hemisp before any quality-control filtering
np.isnan(qcrad['down_short_diffuse_hemisp'].values).sum()

204901

In [6]:
# Fraction of NaN entries in the QC flag.
# NOTE(review): the flag is listed as int32 in the dataset summary, and an integer
# array cannot hold NaN, so this is 0.0 by construction rather than a real check.
np.isnan(qcrad['qc_down_short_diffuse_hemisp'].values).mean()

0.0

# Data Filtering

In [7]:
def np_filter(np_array, qc_array, good_class=0):
    """Replace every value whose quality-control flag is not ``good_class`` with NaN.

    Parameters
    ----------
    np_array : array-like
        Measured values.
    qc_array : array-like
        Quality-control flag for each value; must have the same shape as
        ``np_array``.
    good_class : scalar, optional
        Flag value marking a sample to keep. Defaults to 0.

    Returns
    -------
    numpy.ndarray
        A new float64 array holding the original value where the flag equals
        ``good_class`` and NaN everywhere else. The inputs are not modified.

    Raises
    ------
    ValueError
        If the two inputs do not have the same shape. ``ValueError`` derives
        from ``Exception``, so callers catching ``Exception`` are unaffected.
    """
    values = np.asarray(np_array)
    flags = np.asarray(qc_array)
    if values.shape != flags.shape:
        raise ValueError('Input arrays should have the same sizes.')
    # Vectorized selection instead of a Python-level loop over every element.
    # The explicit float64 cast keeps the result dtype the same as the former
    # list-based construction, independent of NumPy's scalar promotion rules.
    return np.where(flags == good_class, values.astype(np.float64), np.nan)

In [8]:
# QC classes: 0 = good, 1 = suspicious, 2/3 = bad and missing.
# Expected NaN count after filtering: samples that are already NaN plus samples flagged as class 1.
# NOTE(review): this sum equals the post-filter NaN count only if no class-1 sample
# is already NaN and every class-2/3 sample is already NaN -- confirm for other files.
np.isnan(qcrad['down_short_diffuse_hemisp'].values).sum() + (qcrad['qc_down_short_diffuse_hemisp'].values == 1).sum()

278213

In [9]:
# Keep only the samples flagged as class 0 (np_filter's default good_class); all others become NaN.
# The NaN count printed here should equal the count computed in the previous cell.
np_filtered = np_filter(qcrad['down_short_diffuse_hemisp'].values, 
                        qcrad['qc_down_short_diffuse_hemisp'].values)
print(np.isnan(np_filtered).sum())
np_filtered

278213


array([407.3999939 , 407.5083313 , 414.06466675, ..., 298.5       ,
       289.20001221, 291.89001465])

In [10]:
# Number of samples flagged as class 2
(qcrad['qc_down_short_diffuse_hemisp'].values == 2).sum()

127589

In [11]:
# Sum of the raw values flagged as class 1. A NaN among them would make the sum NaN,
# so a finite result shows that the class-1 samples carry real measurements.
qcrad['down_short_diffuse_hemisp'].values[qcrad['qc_down_short_diffuse_hemisp'].values == 1].sum()

16389709.0

In [12]:
# Wrap the filtered values in a named DataArray that reuses qcrad's 'time' coordinate,
# so it can be combined with other xarray objects along 'time'.
da = xr.DataArray(np_filtered, coords=[('time', qcrad.time)], name='down_short_diffuse_hemisp')
da

<xarray.DataArray 'down_short_diffuse_hemisp' (time: 8904960)>
array([407.399994, 407.508331, 414.064667, ..., 298.5     , 289.200012,
       291.890015])
Coordinates:
  * time     (time) datetime64[ns] 1996-10-10 ... 2014-07-06T23:59:00

# Merge

In [None]:
# Combine the two sources on the timestamps they have in common, then drop every
# timestamp where any variable is NaN.
# join='inner' is passed explicitly: an outer join would first build the union of
# the two time axes (131,496 vs 8,904,960 steps) for every variable, including the
# (time, p) ones, only for dropna to discard all timestamps that are missing from
# either source. The inner join yields the same final dataset without that
# intermediate, and does not depend on the library's default join mode.
DS = xr.merge([armbeatm, da], join='inner').dropna(dim='time')
DS

In [1]:
def save_netcdf(DS, FILE_PATH):
    """Write ``DS`` to ``FILE_PATH`` through its ``to_netcdf`` method.

    Prints ``Saved.`` once the call has returned.

    Parameters
    ----------
    DS : object exposing ``to_netcdf(path)``
        The notebook passes an xarray Dataset here.
    FILE_PATH : str
        Destination path handed unchanged to ``to_netcdf``.

    Returns
    -------
    None
    """
    DS.to_netcdf(FILE_PATH)
    print('Saved.')


def get_save_file_path(file_name, stage=1):
    """Return the output path for ``file_name`` inside ``stage-<stage>_cleaned``.

    The path is anchored at the current working directory.

    Parameters
    ----------
    file_name : str
        Name of the file to be written.
    stage : int or str, optional
        Value inserted into the directory name. Defaults to 1.

    Returns
    -------
    str
        ``<cwd>/../../../data/stage-<stage>_cleaned/<file_name>``.
    """
    template = '{}/../../../data/stage-{}_cleaned/{}'
    return template.format(os.getcwd(), stage, file_name)

In [None]:
# Write the merged dataset as 'merged_dropped.cdf' into the stage-1 directory
# (get_save_file_path defaults to stage=1); this rebinds file_name and FILE_PATH.
file_name = 'merged_dropped.cdf'
FILE_PATH = get_save_file_path(file_name)

save_netcdf(DS, FILE_PATH)