# Parse & load the original '*.tfrecord' files

In [58]:
from parse_dataset import *
import json
import numpy as np
import tensorflow as tf
import yaml

In [59]:
# NOTE: '../config.yml' is git-ignored, so it has to be created locally
# before this cell is run.
config_path = '../config.yml'
with open(config_path, 'r') as config_file:
    cfg_dict = yaml.safe_load(config_file)

# NOTE(review): 64 is presumably the tile side length (each feature prints
# as a 64x64 array below) -- confirm against parse_dataset.get_dataset.
tf_dataset = get_dataset(cfg_dict['file_pattern'], 64)

In [60]:
# Pull every case out of the tf.data pipeline into an in-memory list.
train_cases = list(tf_dataset.as_numpy_iterator())
num_train_cases = len(train_cases)

# The first case serves as the reference for the feature names/shapes/dtypes.
first_case = train_cases[0]
feat_names = sorted(first_case.keys())

# Headline facts about the loaded data.
for template, value in [
    ('* total # of training cases: {}', num_train_cases),
    ('* type of each train case: {}', type(first_case)),
    ('* feature names in sorted order: {}', feat_names),
]:
    print(template.format(value))

# Three right-aligned columns: feature name, array shape, array dtype.
rule = '------------------------------------------------------------'
row_format = '{:>20s}{:>20s}{:>20s}'

print('* feature config:')
print(rule)
print(row_format.format('feature name', 'feature shape', 'dtype'))
print(rule)
for name, arr in first_case.items():
    print(row_format.format(name, str(arr.shape), str(arr.dtype)))
print(rule)

* total # of training cases: 14979
* type of each train case: <class 'dict'>
* feature names in sorted order: ['FireMask', 'NDVI', 'PrevFireMask', 'elevation', 'erc', 'pdsi', 'population', 'pr', 'sph', 'th', 'tmmn', 'tmmx', 'vs']
* feature config:
------------------------------------------------------------
        feature name       feature shape               dtype
------------------------------------------------------------
            FireMask            (64, 64)             float32
                NDVI            (64, 64)             float32
        PrevFireMask            (64, 64)             float32
           elevation            (64, 64)             float32
                 erc            (64, 64)             float32
                pdsi            (64, 64)             float32
          population            (64, 64)             float32
                  pr            (64, 64)             float32
                 sph            (64, 64)             float32
                  th

# Original authors' data stats

In [61]:
# Reference statistics published by the original authors, copied from:
# https://github.com/google-research/google-research/blob/master/simulation_research/next_day_wildfire_spread/data_export/dataset_demo.ipynb
#
# Every entry is a 4-tuple laid out as (min_clip, max_clip, mean, std).
DATA_STATS = {
    # Elevation: clipped to [0.1 percentile, 99.9 percentile].
    'elevation': (0.0, 3141.0, 657.3003, 649.0147),
    # pdsi: clipped to [0.1 percentile, 99.9 percentile].
    # The upstream comment labels this feature "Pressure".
    # NOTE(review): pdsi is presumably the Palmer Drought Severity Index -- confirm.
    'pdsi': (-6.1298, 7.8760, -0.0053, 2.6823),
    # NDVI: upstream gives no comment on how these clip bounds were chosen.
    'NDVI': (-9821.0, 9996.0, 5157.625, 2466.6677),
    # Precipitation in mm: negative values make no sense, so min is 0;
    # max is the 99.9 percentile.
    'pr': (0.0, 44.5304, 1.7398051, 4.4828),
    # Specific humidity: ranges from 0 to 100%.
    'sph': (0.0, 1.0, 0.0071658953, 0.0042835088),
    # Wind direction in degrees clockwise from north: min 0, max 360.
    'th': (0.0, 360.0, 190.3298, 72.5985),
    # Min temperature in Kelvin: min is -20 degree C, max is the 99.9 percentile.
    'tmmn': (253.15, 298.9489, 281.08768, 8.9824),
    # Max temperature in Kelvin: min is -20 degree C, max is the 99.9 percentile.
    'tmmx': (253.15, 315.0923, 295.17383, 9.8155),
    # Wind speed: negative values do not make sense given there is a wind
    # direction, so min is 0; max is the 99.9 percentile.
    'vs': (0.0, 10.0243, 3.8501, 1.4110),
    # NFDRS fire danger index energy release component, in BTU's per square
    # foot: negative values do not make sense, so min is 0; max is the
    # 99.9 percentile.
    'erc': (0.0, 106.2489, 37.3263, 20.8460),
    # Population: [min, 99.9 percentile].
    'population': (0.0, 2534.0630, 25.5314, 154.7233),
    # The fire masks must not be normalized, hence mean 0 and std 1.
    'PrevFireMask': (-1.0, 1.0, 0.0, 1.0),
    'FireMask': (-1.0, 1.0, 0.0, 1.0),
}

# min/max/mean/std for each raw numeric feature data vs original authors'

In [62]:
# Features to profile: everything except the two fire masks.
numeric_feat_names = [feat_name for feat_name in feat_names
                      if feat_name not in ['PrevFireMask', 'FireMask']]

# Per-feature results, all keyed by feature name.
feat_val = {}   # every value of the feature across all cases, as one flat array
feat_min = {}
feat_max = {}
feat_mean = {}
feat_std = {}

for feat_name in numeric_feat_names:
    # Flatten each case's array and join them in a single NumPy call.
    # Extending a Python list with the flattened arrays would box every
    # value into its own Python object and force each np.* reduction below
    # to convert the whole list back into an array.
    feat_val[feat_name] = np.concatenate(
        [train_case[feat_name].reshape(-1) for train_case in train_cases])
    feat_min[feat_name] = np.min(feat_val[feat_name])
    feat_max[feat_name] = np.max(feat_val[feat_name])
    feat_mean[feat_name] = np.mean(feat_val[feat_name])
    feat_std[feat_name] = np.std(feat_val[feat_name])

In [63]:
# Layout: a left-aligned name column sized to the longest feature name,
# followed by four right-aligned 20-character columns.
col1_width = max(len(name) for name in feat_min) + 1
name_column = '{:<' + str(col1_width) + 's}'
format_header = name_column + '{:>20s}' * 4
format_body = name_column + '{:>20.4f}' * 4

# Table 1: statistics measured on the data loaded in this notebook.
header = format_header.format('Feature', 'min', 'max', 'mean', 'std')
print('# raw numeric feature data as of Dec/2022\n')
print(header)
print('-' * len(header))
for name in sorted(feat_min):
    measured = (feat_min[name], feat_max[name], feat_mean[name], feat_std[name])
    print(format_body.format(name, *measured))

# Table 2: the original authors' values for the same features.
header = format_header.format('Feature', 'min_clip', 'max_clip', 'mean', 'std')
print("\n# original authors'\n")
print(header)
print('-' * len(header))
for name in sorted(feat_min):
    published = DATA_STATS[name]
    print(format_body.format(name, published[0], published[1],
                             published[2], published[3]))

# raw numeric feature data as of Dec/2022

Feature                     min                 max                mean                 std
-------------------------------------------------------------------------------------------
NDVI                 -9567.0000           9966.0000           5350.6748           2185.2170
elevation              -45.0000           4193.0000            896.5713            842.6105
erc                  -1196.0886           2470.8823             53.4690             25.0980
pdsi                  -125.7109             52.2690             -0.7729              2.4407
population               0.0000          27103.6055             30.4603            214.2003
pr                    -167.4483             56.2148              0.3234              1.5337
sph                     -0.1290              0.0855              0.0065              0.0037
th                 -505870.0625          37735.6289            146.6466           3435.0671
tmmn                  -444.6930      

# min_clip/max_clip by numpy.percentile

Recalculate min_clip/max_clip according to the original authors' logic, as the dataset seems to be different from that of the authors' paper.  
Assume no specific distribution for each numeric feature: `numpy.percentile` takes the percentiles empirically from the observed values.

In [64]:
# Percentile levels used wherever a clip bound is taken from the data.
low_pct, high_pct = 0.1, 99.9

# One rule per feature: (feature, min_clip, max_clip).
# A number is used as-is; None means "take the percentile of the data"
# (low_pct for min_clip, high_pct for max_clip).
clip_rules = [
    # elevation: 0.1 percentile, 99.9 percentile
    ('elevation', None, None),
    # pdsi: 0.1 percentile, 99.9 percentile
    ('pdsi', None, None),
    # NDVI: NA (0.1 percentile, 99.9 percentile by default)
    ('NDVI', None, None),
    # pr: precipitation in mm. Negative values make no sense, so min is 0.
    ('pr', 0.0, None),
    # sph: specific humidity ranges from 0 to 100%.
    ('sph', 0.0, 1.0),
    # th: wind direction in degrees clockwise from north, so 0 to 360.
    ('th', 0.0, 360.0),
    # tmmn: min temperature in Kelvin. -20 degree C, 99.9 percentile
    ('tmmn', 253.15, None),
    # tmmx: max temperature in Kelvin. -20 degree C, 99.9 percentile
    ('tmmx', 253.15, None),
    # vs: wind speed. Negative values do not make sense, given there is a
    #     wind direction.
    ('vs', 0.0, None),
    # erc: NFDRS fire danger index energy release component expressed in
    #      BTU's per square foot. Negative values do not make sense.
    ('erc', 0.0, None),
    # population: 0., 99.9 percentile
    ('population', 0.0, None),
]

# {feat_name: [min_clip, max_clip], ...}
my_clips = {feat_name: [0.0, 0.0] for feat_name in numeric_feat_names}

for clip_feat, min_clip, max_clip in clip_rules:
    if min_clip is None:
        min_clip = np.percentile(feat_val[clip_feat], low_pct, method='linear')
    if max_clip is None:
        max_clip = np.percentile(feat_val[clip_feat], high_pct, method='linear')
    my_clips[clip_feat][0] = min_clip
    my_clips[clip_feat][1] = max_clip

In [65]:
# my min_clip/max_clip using np.percentile vs original authors'

In [66]:
# my mean/std using np.percentile vs original authors'

# min_clip/max_clip by scipy.stats.norm.ppf

Assume a Gaussian (normal) distribution for each numeric feature: `scipy.stats.norm.ppf` is the inverse CDF of the normal distribution.

In [67]:
# my min_clip/max_clip using scipy.stats.norm.ppf vs original authors'

In [68]:
# my mean/std using scipy.stats.norm.ppf vs original authors'