In [None]:
import pandas as pd
import xarray as xr
from collections import Counter

In [None]:
# Destination of the assembled metadata table
metadata_output_file = '/home/pnorton/tmp/auxhist24_metadata.csv'

# auxhist24 model output file that supplies the initial variable attributes
auxhist24_file = '/caldera/projects/usgs/water/impd/wrf-conus404/kyoko/OUTPUT/WY1996/auxhist24_d01_1996-10-01_00:00:00'

# The override files live directly in base_dir
base_dir = '/home/pnorton/notebooks/02_conus404_metadata/auxhist24_overrides'
overrides_dir = base_dir

# Word map plus one override file per attribute that can be overridden
wrf_wordmap_file = overrides_dir + '/auxhist24_wordmap.csv'
wrf_long_name_override = overrides_dir + '/auxhist24_long_name_overrides.txt'
wrf_units_override = overrides_dir + '/auxhist24_units_overrides.txt'
wrf_notes_overrides = overrides_dir + '/auxhist24_notes_overrides.txt'
wrf_scale_factor_overrides = overrides_dir + '/auxhist24_scale_factor_overrides.txt'

# Variable groups that determine the 'accumulated' and 'integration_length'
# columns of the metadata table
vars_60min_accum = ['PREC_ACC_NC']
vars_model_accum = ['I_RAINNC']
vars_bucket_mm_accum = ['RAINNC']

# Report the size of each group
for group_name, group_members in (('vars_60min_accum', vars_60min_accum),
                                  ('vars_model_accum', vars_model_accum),
                                  ('vars_bucket_mm_accum', vars_bucket_mm_accum)):
    print(f'len({group_name})={len(group_members)}')

In [None]:
def read_override_file(filename):
    """Read a tab-separated override file into a dictionary.

    The first line of the file is treated as a header and skipped. On every
    other line the first field is the key and the second field is the value;
    any further fields are ignored. When a key occurs more than once the last
    occurrence wins. Blank (whitespace-only) lines are skipped.

    Parameters
    ----------
    filename : str or path-like
        Path of the ASCII-encoded override file.

    Returns
    -------
    dict
        Maps first-field strings to second-field strings. Empty when the file
        has no data rows (including a completely empty file).

    Raises
    ------
    ValueError
        If a non-blank data line has no tab-separated second field.
    """
    # The context manager closes the handle even if reading or decoding fails
    with open(filename, 'r', encoding='ascii') as fhdl:
        rawdata = fhdl.read().splitlines()

    override_map = {}
    # rawdata[0] is the header; slicing is also safe for an empty file
    for lineno, row in enumerate(rawdata[1:], start=2):
        if not row.strip():
            continue

        flds = row.split('\t')
        if len(flds) < 2:
            raise ValueError(
                f'{filename}: line {lineno} has no tab-separated value: {row!r}')
        override_map[flds[0]] = flds[1]
    return override_map

In [None]:
# Read the word map used to rewrite description strings into long_names.
# Layout read here: tab-separated text with one header line; the word is taken
# from the first column and its replacement from the third column, with double
# quotes stripped from both. The second column is not used.
with open(wrf_wordmap_file, 'r', encoding='ascii') as fhdl:
    # The context manager closes the handle even if reading or decoding fails
    rawdata = fhdl.read().splitlines()

word_map = {}
for row in rawdata[1:]:   # rawdata[0] is the header; safe for an empty file
    flds = row.split('\t')
    # A missing or empty third column means the word has no replacement
    if len(flds) > 2 and len(flds[2]) != 0:
        word_map[flds[0].replace('"', '')] = flds[2].replace('"', '')

In [None]:
# Display the word -> replacement mapping that was just read
word_map

In [None]:
# Load the long_name, notes, scale_factor and units override files (in that
# order); each becomes a dict keyed by the first column of its file
long_name_map, notes_map, scale_factor_map, units_map = map(
    read_override_file,
    (
        wrf_long_name_override,
        wrf_notes_overrides,
        wrf_scale_factor_overrides,
        wrf_units_override,
    ),
)

In [None]:
# No valid_range / flag_values / flag_meanings entries are defined here; the
# maps exist so the metadata loop can look variables up in them
valid_range_map = dict()
flag_values_map = dict()
flag_meanings_map = dict()

### Read dimensions, variables, and attributes from a single auxhist24 file

In [None]:
# Open the auxhist24 file as an xarray Dataset (note: `df` is a Dataset, not a DataFrame).
# NOTE(review): decode_coords=False presumably keeps each variable's 'coordinates'
# attribute in .attrs, where the metadata loop reads it, and chunks={} presumably
# requests lazily loaded, dask-backed arrays -- confirm against the xarray docs.
df = xr.open_dataset(auxhist24_file, decode_coords=False, chunks={})

In [None]:
# Inspect the Times variable of the opened dataset
df.Times

In [None]:
# attr_cnt tallies how often each kept attribute occurs; word_cnt tallies the
# description words of variables whose long_name is built from the word map
attr_cnt = Counter()
word_cnt = Counter()

# Per-variable metadata, keyed by variable name
wrfout_vars = {}

# Only these source attributes are carried into the metadata
kept_attrs = ('description', 'units', 'coordinates')

# Optional columns: added, in this order, when the variable has an entry in the map
extra_maps = (
    ('valid_range', valid_range_map),
    ('flag_values', flag_values_map),
    ('flag_meanings', flag_meanings_map),
    ('notes', notes_map),
    ('scale_factor', scale_factor_map),
)

# Accumulation groups and their integration_length text; the first group that
# contains the variable wins.
# The first entry deliberately does not say 'accumulated over prior 60 minutes':
# for daily variables the 60-min accumulation is not valid.
accum_groups = (
    (vars_60min_accum, 'accumulated since last top-of-hour'),
    (vars_model_accum, 'accumulated since 1979-10-01 00:00:00'),
    (vars_bucket_mm_accum, 'accumulated since last bucket_mm (100 mm) reset'),
)

for var_name in list(df.keys()):
    cvar = df[var_name]
    meta = wrfout_vars[var_name] = {}

    for attr_name, attr_val in cvar.attrs.items():
        if attr_name not in kept_attrs:
            continue
        attr_cnt[attr_name] += 1

        if attr_name == 'units':
            # An entry in units_map takes precedence over the units in the file
            meta['units'] = units_map.get(var_name, attr_val)
        elif attr_name == 'description':
            # The original description is always kept alongside the long_name
            meta['description'] = attr_val

            if var_name in long_name_map:
                # long_name is overridden
                meta['long_name'] = long_name_map[var_name]
            else:
                # Build long_name by replacing mapped words, then upper-case
                # only the first character (the rest is left untouched)
                words = attr_val.split(' ')
                word_cnt.update(words)
                long_name = ' '.join(word_map.get(word, word) for word in words)
                meta['long_name'] = long_name[:1].upper() + long_name[1:]
        else:
            # 'coordinates' is copied unchanged
            meta[attr_name] = attr_val

    meta['datatype'] = cvar.encoding['dtype'].name
    meta['dimensions'] = ' '.join(cvar.dims)

    if var_name == 'XTIME':
        # XTIME has no units attribute here, so it must come from the overrides
        meta['units'] = units_map[var_name]

    if var_name == 'Times':
        # Times has no description, so its long_name must come from the overrides
        meta['long_name'] = long_name_map[var_name]

    for column, source_map in extra_maps:
        if var_name in source_map:
            meta[column] = source_map[var_name]

    for members, span_text in accum_groups:
        if var_name in members:
            meta['accumulated'] = True
            meta['integration_length'] = span_text
            break
    else:
        # Not in any accumulation group
        meta['accumulated'] = False

In [None]:
# Show how many variables carried each of the kept attributes
attr_cnt

In [None]:
# wrfout_vars is keyed by variable name, so the frame is built with variables
# as columns and then flipped to one row per variable
out_df = pd.DataFrame(wrfout_vars).T
out_df.head()

In [None]:
# Summarize the columns of the metadata table and their non-null counts
out_df.info()

In [None]:
# Columns written to the metadata file, in output order
output_columns = [
    'long_name',
    'accumulated',
    'integration_length',
    'description',
    'notes',
    'units',
    'scale_factor',
    'dimensions',
    'coordinates',
    'datatype',
]

# Write one tab-separated row per variable, sorted by variable name
sorted_out_df = out_df.sort_index()
sorted_out_df.to_csv(
    metadata_output_file,
    sep='\t',
    index_label='varname',
    columns=output_columns,
)

### Don't run

In [None]:
# One row per counted word, with a single column (labelled 0) holding its count
word_df = pd.DataFrame(word_cnt, index=[0]).T
word_df.head()

In [None]:
#word_df.to_csv('wrfout_words.csv', sep='\t')

In [None]:
# Scratch reader for 'wrfout_words.txt': same layout as the word map file
# (tab-separated, one header line, word in the first column, replacement in
# the third column, double quotes stripped). Replaces word_map.
with open('wrfout_words.txt', 'r', encoding='ascii') as fhdl:
    # The context manager closes the handle even if reading or decoding fails
    rawdata = fhdl.read().splitlines()

word_map = {}
for row in rawdata[1:]:   # rawdata[0] is the header; safe for an empty file
    flds = row.split('\t')
    # A missing or empty third column means the word has no replacement
    if len(flds) > 2 and len(flds[2]) != 0:
        word_map[flds[0].replace('"', '')] = flds[2].replace('"', '')
    print(flds)
    

In [None]:
# Spot-check one entry; raises KeyError if 'LATITUDE,' is not a key of word_map
word_map['LATITUDE,']

In [None]:
# Length of the second field of the last row split by the reader loop
# (flds is the loop variable left over from that cell)
len(flds[1])

## Create word map

In [None]:
# Re-open the auxhist24 file, this time naming the netcdf4 engine explicitly,
# and display the resulting Dataset
df = xr.open_dataset(auxhist24_file, decode_coords=False, engine='netcdf4', chunks={})
df

In [None]:
# Fresh tallies for this pass over the dataset
attr_cnt = Counter()
word_cnt = Counter()

wrfout_vars = {}
# word_map starts empty and is not filled in this cell, so no word is replaced
# below; description_new differs from description only in its first character
word_map = {}

for var_name in list(df.keys()):
    cvar = df[var_name]
    meta = wrfout_vars[var_name] = {}

    for attr_name, attr_val in cvar.attrs.items():
        if attr_name not in ('description', 'units', 'coordinates'):
            continue
        attr_cnt[attr_name] += 1
        meta[attr_name] = attr_val

        if attr_name != 'description':
            continue

        # Count every space-separated word of the description
        words = attr_val.split(' ')
        word_cnt.update(words)

        # Apply the word map, then upper-case only the first character
        rewritten = ' '.join(word_map.get(word, word) for word in words)
        meta['description_new'] = rewritten[:1].upper() + rewritten[1:]

    meta['datatype'] = cvar.encoding['dtype'].name
    meta['dimensions'] = ' '.join(cvar.dims)

In [None]:
# Show the count of every space-separated word found in the descriptions
word_cnt

In [None]:
# One row per counted word, with a single column (labelled 0) holding its count
wordmap_df = pd.DataFrame(word_cnt, index=[0]).T
wordmap_df.head()

In [None]:
# Write the word counts as a tab-separated file: the word index plus one count column.
# NOTE(review): this is the same path as wrf_wordmap_file, so running this cell
# overwrites that file, including any replacement column that was added to it.
wordmap_df.to_csv(f'{overrides_dir}/auxhist24_wordmap.csv', sep='\t')