In [33]:
from matplotlib import colors
from parse_dataset import *
from typing import Dict, List, Optional, Text, Tuple
import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
import yaml

In [34]:
# Local notebook configuration lives in '../config.yml'. The file is assumed to
# exist before this cell runs and is meant to be excluded from version control
# through .gitignore.
with open('../config.yml') as cfg_file:
    # safe_load only builds plain Python objects from the YAML document.
    cfg_dict = yaml.safe_load(cfg_file)

# Build the dataset from the configured file pattern. The second positional
# argument (64) is forwarded to get_dataset unchanged; its meaning is defined in
# parse_dataset -- presumably the tile side length, given the (64, 64) feature
# shapes printed further down; TODO confirm.
file_pattern = cfg_dict['file_pattern']
tf_dataset = get_dataset(file_pattern, 64)

In [35]:
# Materialise the whole dataset in memory: one list entry per dataset element,
# each entry being a mapping from feature name to a NumPy array.
train_cases = list(tf_dataset.as_numpy_iterator())
num_train_cases = len(train_cases)

# The first element is used as the reference for feature names, shapes and dtypes.
first_case = train_cases[0]
feat_names = sorted(first_case)

# Shared pieces of the three-column report printed below.
rule = '------------------------------------------------------------'
row_fmt = '{:>20s}{:>20s}{:>20s}'

print('* total # of training cases: {}'.format(num_train_cases))
print('* type of each train case: {}'.format(type(first_case)))
print('* feature names in sorted order: {}'.format(feat_names))
print('* feature config:')
print(rule)
print(row_fmt.format('feature name', 'feature shape', 'dtype'))
print(rule)
# Rows follow the mapping's own iteration order, not the sorted feat_names.
for name, arr in first_case.items():
    print(row_fmt.format(name, str(arr.shape), str(arr.dtype)))
print(rule)

* total # of training cases: 14979
* type of each train case: <class 'dict'>
* feature names in sorted order: ['FireMask', 'NDVI', 'PrevFireMask', 'elevation', 'erc', 'pdsi', 'population', 'pr', 'sph', 'th', 'tmmn', 'tmmx', 'vs']
* feature config:
------------------------------------------------------------
        feature name       feature shape               dtype
------------------------------------------------------------
            FireMask            (64, 64)             float32
                NDVI            (64, 64)             float32
        PrevFireMask            (64, 64)             float32
           elevation            (64, 64)             float32
                 erc            (64, 64)             float32
                pdsi            (64, 64)             float32
          population            (64, 64)             float32
                  pr            (64, 64)             float32
                 sph            (64, 64)             float32
                  th

In [37]:
# Summary statistics are computed for every feature except the two fire masks.
numeric_feat_names = [feat_name for feat_name in feat_names
                      if feat_name not in ['PrevFireMask', 'FireMask']]

# feat_val:     feature name -> 1-D ndarray holding every value of that feature
#               across all training cases (each case flattened, cases in order).
# feat_summary: feature name -> dict of summary statistics over those values.
feat_val = {}
feat_summary = {}

for feat_name in numeric_feat_names:
    # Join the flattened per-case arrays into one contiguous ndarray. Extending a
    # Python list with array contents would box every single value into its own
    # Python object and force each NumPy reduction below to convert the list back
    # into an array, which is very slow and memory hungry at this data size.
    values = np.concatenate(
        [train_case[feat_name].reshape(-1) for train_case in train_cases])
    feat_val[feat_name] = values

    # Both quartiles come from a single percentile call so the data is only
    # partitioned once.
    first_quartile, third_quartile = np.percentile(values, [25, 75])

    # NOTE(review): 'mean' sits between the quartiles where a median would
    # usually be reported -- kept as is; confirm this is intended.
    feat_summary[feat_name] = {
        'size': len(values),
        'min': np.min(values),
        '1stQtr': first_quartile,
        'mean': np.mean(values),
        '3rdQtr': third_quartile,
        'max': np.max(values),
        'std': np.std(values),
    }

# Spot-check a few of the computed statistics.
print(feat_summary['tmmn']['3rdQtr'])
print(feat_summary['pdsi']['mean'])
print(feat_summary['elevation']['std'])

287.880615234375
-0.7728697
842.61084
