In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline


In [6]:
import numpy
import scipy
import pylab as plt
import math
#
import sys
import os
#
import csv
import xml
import xml.etree.ElementTree as ET


In [27]:

data_path = os.path.join(os.path.split(os.getcwd())[0], 'vtkOutput')
print('** data_path: ', data_path)




** data_path:  /scratch/users/myoder96/haycat_geos/vtkOutput


# GEOS output layout
The directory structure is something like,
```
$ tree -L 5
.
└── vtkOutput
    ├── 000000
    │   └── mesh
    │       └── Level0
    │           ├── carbonate_cement
    │           ├── coarse_sandstone
    │           ├── composite_high
    │           ├── composite_low
    │           ├── composite_mid
    │           ├── wellRegionCRC3
    │           └── wellRegionCRC8
    ├── 000000.vtm
    ├── 000007
    │   └── mesh
    │       └── Level0
    │           ├── carbonate_cement
    │           ├── coarse_sandstone
    │           ├── composite_high
    │           ├── composite_low
    │           ├── composite_mid
    │           ├── wellRegionCRC3
    │           └── wellRegionCRC8
    ├── 000007.vtm
```

Where under each Level-5 entry (carbonate_cement, coarse_sandstone, ...) are N XML files, `rank_{k_mpi}.vtu`, where `k_mpi` are the MPI rank.

The most obvious TODO is to consolidate the MPI rank collections into a single file. In most cases, this will reduce the iNode count by ~100. Then, it is pretty straight forward (I think?) to stack the outputs as a time series.

There is a lot of redundant information. For example, at least the way this GEOS output is produced, the `time` index is repeated in each MPI rank, at least in this implementation, `mesh` is the only output, so the only member of the `{time_step}` directory, and then we only have the one `Level0`. So really everything can be aggregated up to the `{time_step}` level.

Note that the directory structure is described in the `{timestep}.vtm` XML file, eg:

```
<?xml version="1.0"?>
<VTKFile type="vtkMultiBlockDataSet" version="1.0">
        <vtkMultiBlockDataSet>
                <Block name="mesh">
                        <Block name="Level0">
                                <Block name="CellElementRegion">
                                        <Block name="coarse_sandstone">
                                                <DataSet name="rank_000" file="000022/mesh/Level0/coarse_sandstone/rank_000.vtu" />
                                                <DataSet name="rank_001" file="000022/mesh/Level0/coarse_sandstone/rank_001.vtu" />
                                                <DataSet name="rank_002" file="000022/mesh/Level0/coarse_sandstone/rank_002.vtu" />
                                                <DataSet name="rank_003" file="000022/mesh/Level0/coarse_sandstone/rank_003.vtu" />
```

Then, of course, the data must be extracted from each `{rank}..vtu` file, the process for which is not obvious. So we will start with just a script to describe the structure to aggregate. We will base this aggregation process on the `{timestep}.vtm` XML file, rather than an external knowledge of the directory structure. 


In [49]:
# tree of the timeseries. Start with one file.
# as expected, at the root level, we have only the single 'vtkMultBlockDataSet' entry.
#
tree_ts = ET.parse(os.path.join(data_path, '000000.vtm'))
root_ts = tree_ts.getroot()
#
# I think we need to manually handle the recursion.
for k,child in enumerate(tree_ts.getroot()):
    print(f'** item[{k}]: {child}: {child.tag} :: {child.attrib}')
    for k_mesh, mesh in enumerate(child): 
        print(f'** [{k, k_mesh}]: {child.tag}:: {mesh.attrib}')
        for k_level, level in enumerate(mesh):
            print(f'** [{k, k_mesh, k_level}]: {mesh.attrib["name"]}:: {level.attrib["name"]}')
            for k_region, region in enumerate(level):
                print(f'**[{k, k_mesh, k_level, k_region, k_element}]: {level.attrib["name"]}:: {region.attrib["name"]}')
                for k_element, element in enumerate(region):
                    #
                    # Here is where would aggregate the ranks
                    for k_rank, rank in enumerate(element):
                        # read the file; aggregate...
                        pass
                    print(f'** [{k, k_mesh, k_level, k_element}]: {region.attrib["name"]}:: {element.attrib["name"]}:: {k_rank+1} rank files')

** item[0]: <Element 'vtkMultiBlockDataSet' at 0x7f5d7dd5e270>: vtkMultiBlockDataSet :: {}
** [(0, 0)]: vtkMultiBlockDataSet:: {'name': 'mesh'}
** [(0, 0, 0)]: mesh:: Level0
**[(0, 0, 0, 0, 1)]: Level0:: CellElementRegion
** [(0, 0, 0, 0)]: CellElementRegion:: coarse_sandstone:: 128 rank files
** [(0, 0, 0, 1)]: CellElementRegion:: composite_low:: 128 rank files
** [(0, 0, 0, 2)]: CellElementRegion:: composite_mid:: 128 rank files
** [(0, 0, 0, 3)]: CellElementRegion:: composite_high:: 128 rank files
** [(0, 0, 0, 4)]: CellElementRegion:: carbonate_cement:: 128 rank files
**[(0, 0, 0, 1, 4)]: Level0:: WellElementRegion
** [(0, 0, 0, 0)]: WellElementRegion:: wellRegionCRC3:: 128 rank files
** [(0, 0, 0, 1)]: WellElementRegion:: wellRegionCRC8:: 128 rank files


In [48]:
# can we do that with a generalized recursion? will it remember the variables in the correct scope?
#
k_recurse = 0
k_r_max = 5
#
items = tree_ts.getroot()
while k_recurse < k_r_max:
    for k, elem in enumerate(items):
        print(f'item [{k}]: {items.attrib}: {elem.attrib["name"]}')
        items = elem
        k_recurse +=1
        #
    

KeyError: 'name'