# Preprocessing

## Preparation

In [2]:
import re
import pyproj
import shapely
import rasterio
import threading
import numpy as np
import pandas as pd
import geopandas as gpd
from pathlib import Path
from collections import defaultdict
from src.utils import get_data_dir
from src.decorators import benchmark
from collections import namedtuple
from rasterio import warp, merge, mask


# Convenient access to the data directory. According to the original note this is
# a namedtuple with the folder names as attributes (used below as DIRS.gfc,
# DIRS.masks, DIRS.proc, ...).
DIRS = get_data_dir(str(Path('data').resolve()))

# Coordinate reference system used throughout the processing pipeline (EPSG:4326).
WGS84 = {'init': 'epsg:4326'}

# Parts of the processing pipeline are multi-threaded; this constant caps the
# number of worker threads that are started before the batch is joined.
THREADLIMIT = 12


def read_raster(item) -> rasterio.io.DatasetReader:
    """
    Helper method to return a raster file as an opened instance of
    rasterio.io.DatasetReader in read mode.

    :param item: str, pathlib.Path or rasterio.io.DatasetReader
        Should be the path to the raster file on the filesystem as a string
        or pathlib.Path object. If item already is an instance of
        DatasetReader it is returned unchanged.
    :return: rasterio.io.DatasetReader
        Returns an instance of rasterio.io.DatasetReader in read mode.
    :raises ValueError:
        If the item cannot be opened as a raster file. The original error
        is attached as the cause of the ValueError.
    """
    if isinstance(item, rasterio.io.DatasetReader):
        return item

    try:
        # str() turns a pathlib.Path into the plain path string rasterio expects
        return rasterio.open(str(item), 'r')
    except Exception as err:
        # Catch Exception rather than everything, so KeyboardInterrupt and
        # SystemExit are not disguised as a ValueError.
        msg = 'Attr {}, Type {} is not a valid raster file'.format(item, type(item))
        raise ValueError(msg) from err


def fetch_metadata(features: list, from_path_or_reader: str) -> namedtuple:
    """
    This method fetches user selected metadata features from a raster file and
    returns them as a named tuple where the attribute name is the selected
    metadata feature key and the assigned value the corresponding metadata
    feature. Please refer to the documentation of rasterio for a comprehensive
    list of fetchable metadata features provided by a raster file.

    The reader is always closed before this function returns or raises. This
    includes a reader that was handed in by the caller, so do not reuse such a
    reader afterwards.

    :param features: list or tuple of str
        The requested metadata features as a list or tuple of strings.
    :param from_path_or_reader: str, pathlib.Path or rasterio.io.DatasetReader
        Path to the raster file on drive as string or pathlib.Path object or an
        opened raster dataset.
    :return: namedtuple
        The requested metadata features as a namedtuple where the attribute
        name is the selected metadata feature key and the assigned value the
        corresponding metadata feature.
        Example:
        fetch_metadata(('bounds', 'crs'), path)
        (bounds=value, crs=value)
    :raises ValueError:
        If a requested feature is None on the dataset, or if the raster
        cannot be opened.
    """
    reader = read_raster(from_path_or_reader)

    try:
        values = []
        for feature in features:
            value = getattr(reader, feature)
            if value is None:
                raise ValueError('{} is not set'.format(feature))
            values.append(value)
    finally:
        # Release the file handle on the error path as well
        reader.close()

    Metadata = namedtuple('Metadata', features)
    return Metadata(*values)


# TODO refactor to def(left, right, top, y2)
def polygon_from(bounds: namedtuple) -> shapely.geometry.Polygon:
    """
    Builds a rectangular polygon from a bounds object.

    :param bounds: namedtuple
        Should be a namedtuple comprising the attributes
        left, right, top and bottom.
    :return: shapely.geometry.Polygon
        Polygon whose ring visits the corners in the order
        (left, top), (left, bottom), (right, bottom), (right, top).
    """
    corners = [
        (bounds.left, bounds.top),
        (bounds.left, bounds.bottom),
        (bounds.right, bounds.bottom),
        (bounds.right, bounds.top),
    ]
    return shapely.geometry.Polygon(corners)


def reproject_bounds(bounds: namedtuple, source_crs: dict, target_crs: dict) -> namedtuple:
    """
    Reprojects the corner coordinates of a bounds object into the requested
    coordinate reference system.

    Only the (left, bottom) and the (right, top) corner are transformed; the
    result is assembled from these two points.

    :param bounds: namedtuple
        Should be a namedtuple containing the attributes
        left, right, top and bottom.
    :param source_crs: dict
        The coordinate reference system of the bounds object as a dictionary
        with the following shape:
        {'init': 'epsg:<id>'} where <id> is the epsg number of the crs
    :param target_crs: dict
        The coordinate reference system the bounds should be reprojected to.
        Shape should be equal to source_crs.
    :return: namedtuple(left, bottom, right, top)
        Reprojected bounds object
    """
    src_proj = pyproj.Proj(**source_crs)
    dst_proj = pyproj.Proj(**target_crs)

    lower_left = pyproj.transform(src_proj, dst_proj, bounds.left, bounds.bottom)
    upper_right = pyproj.transform(src_proj, dst_proj, bounds.right, bounds.top)

    BoundingBox = namedtuple('BoundingBox', 'left bottom right top')
    return BoundingBox(*lower_left, *upper_right)


def polygoniz(paths_or_readers: list, target_crs: dict) -> gpd.GeoSeries:
    """
    Creates a tile index from a set of raster files: one bounding polygon per
    raster, all expressed in the same coordinate reference system.

    :param paths_or_readers: list
        The raster files to index. Each item is handed to fetch_metadata, so
        it may be a path or an opened raster dataset.
    :param target_crs: dict
        Coordinate reference system of the resulting index. Bounds of rasters
        whose crs differs from it are reprojected before the polygon is built.
    :return: geopandas.GeoSeries
        Each element of the geoseries is a polygon
        covering the corresponding raster file.
    """
    def footprint(item):
        # Bounding polygon of a single raster, expressed in target_crs
        meta = fetch_metadata(('bounds', 'crs'), item)
        extent = meta.bounds
        if meta.crs != target_crs:
            extent = reproject_bounds(extent, meta.crs, target_crs)
        return polygon_from(extent)

    tiles = gpd.GeoSeries([footprint(item) for item in paths_or_readers])
    tiles.crs = target_crs
    return tiles


def tile_index(rasters: list, target_crs: dict, **kwargs) -> gpd.GeoDataFrame:
    """
    Builds a tile index layer: the bounding polygons of the given rasters
    combined with user supplied attribute columns.

    :param rasters: list
        A list of str where each element is a path to a raster file
        on disk.
    :param target_crs: dict
        The coordinate reference system which should be applied on
        the tile index dataset.
    :param **kwargs:
        Attribute columns of the index. The keyword arguments are passed to
        pandas.DataFrame as a dict, i.e. column name -> column values.
    :return: geopandas.GeoDataFrame
        The attribute columns with the raster polygons as geometry.
    """
    footprints = polygoniz(rasters, target_crs)
    attributes = pd.DataFrame(kwargs)

    return gpd.GeoDataFrame(attributes, geometry=footprints)


# TODO accept **kwargs to alter write parameters
def reproject_from(in_path: str, to_crs: dict, to_out_path: str):
    """
    Reprojects a raster file to the selected coordinate reference system and
    writes the result to a new raster file.

    The output grid (transform, width, height) is the default one computed by
    rasterio for the source extent; all other profile settings are copied
    from the source raster.

    :param in_path: str
        Path to raster file on drive
    :param to_crs: dict
        Target coordinate reference system for reprojection
    :param to_out_path: str
        Path where the reprojected raster file should be stored
    :return: str
        Path where the reprojected raster file is stored
    """
    with rasterio.open(in_path, 'r') as src:
        extent = src.bounds._asdict()
        dst_transform, dst_width, dst_height = rasterio.warp.calculate_default_transform(
            src_crs=src.crs,
            dst_crs=to_crs,
            width=src.width,
            height=src.height,
            **extent
        )

        profile = src.profile.copy()
        profile.update(
            transform=dst_transform,
            width=dst_width,
            height=dst_height,
            crs=to_crs
        )

        with rasterio.open(to_out_path, 'w', **profile) as dst:
            # Warp band by band from the source into the new dataset
            for band_idx in src.indexes:
                rasterio.warp.reproject(
                    source=rasterio.band(src, band_idx),
                    destination=rasterio.band(dst, band_idx)
                )

    return to_out_path


def reproject_like(template: str, source: str, out_path: str) -> str:
    """
    Reprojects a raster file onto the grid of a template raster.

    The output takes crs, transform, width and height from the template; all
    other profile settings are copied from the source raster. All bands of
    the source are reprojected.

    :param template: str
        Path to the raster file whose grid should be adopted
    :param source: str
        Path to the raster file which should be reprojected
    :param out_path: str
        Path where the reprojected raster file should be stored
    :return: str
        Path where the reprojected raster file is stored
    """
    grid = fetch_metadata(('crs', 'transform', 'width', 'height'), template)

    with rasterio.open(source, 'r') as src:
        profile = src.profile.copy()
        profile.update(
            crs=grid.crs,
            transform=grid.transform,
            width=grid.width,
            height=grid.height
        )

        with rasterio.open(out_path, 'w', **profile) as dst:
            # rasterio band indexes start at one
            bands = list(range(1, src.count + 1))
            rasterio.warp.reproject(
                source=rasterio.band(src, bands),
                destination=rasterio.band(dst, list(bands))
            )

    return out_path


def merge_from(paths_or_readers: list, **kwargs) -> namedtuple:
    """
    Merges a list of raster files to one single raster dataset.
    This method is wrapped around the rasterio.merge.merge method,
    therefore it accepts the same keyword arguments.

    All readers are closed before the function returns or raises. This
    includes readers that were handed in by the caller.

    :param paths_or_readers: list
        A list where each element references a raster file, either as a
        path on drive or as an opened raster dataset.
    :param **kwargs:
        Please refer to the rasterio documentation for a full list
        of possible keyword arguments.
    :return: namedtuple(data, affine)
        A namedtuple with the attributes data and affine, where the parameter
        data contains the merged data of the raster files as a numpy.ndarray
        and affine an affine transformation matrix.
    """
    readers = []
    try:
        # Open one by one so that the already opened readers are released
        # when a later file turns out to be unreadable.
        for item in paths_or_readers:
            readers.append(read_raster(item))

        dest, affine = rasterio.merge.merge(readers, **kwargs)
    finally:
        for reader in readers:
            reader.close()

    Merge = namedtuple('Merge', 'data affine')
    return Merge(dest, affine)


def merge_alike(with_template: str, to_merge: list) -> namedtuple:
    """
    Merges the input raster files onto the extent of a template raster, i.e.
    the merged dataset has the same bounds and resolution as the template.
    Both datasets must have the same coordinate reference system.

    :param with_template: str
        Path to the template raster file
    :param to_merge: list
        A list of strings where each list element references a path to a
        raster file on drive.
    :return: namedtuple(data, affine)
        A namedtuple with the attributes data and affine, where the parameter
        data contains the merged data of the raster files as a numpy.ndarray
        and affine an affine transformation matrix.
    """
    template = fetch_metadata(('bounds', 'res'), with_template)
    return merge_from(to_merge, bounds=template.bounds, res=template.res)


def write(data: np.ndarray, to_path: str, **kwargs):
    """
    Writes a two- or three-dimensional numpy.ndarray as a raster dataset to
    file. This method is wrapped around the rasterio.open method, therefore
    the write behavior can be adjusted with the keyword arguments described
    in the rasterio documentation.

    The keyword arguments count, height, width and dtype are always derived
    from the array and override values passed by the caller.

    :param data: numpy.ndarray
        If the array has three dimensions, each entry along the first axis
        is written as one raster band. If the array has two dimensions
        the resulting raster file contains a single band.
    :param to_path: str
        Path where the new raster file should be stored
    :param **kwargs:
        Keyword arguments consumed by the rasterio.open function.
        Please refer to the rasterio documentation for a comprehensive
        list of possible keyword arguments.
    :return: str
        Path where the raster file is stored
    :raises ValueError:
        If the array is neither two- nor three-dimensional.
    """
    n_dims = len(data.shape)
    if n_dims not in (2, 3):
        raise ValueError('Please, provide a valid dataset')

    if n_dims == 2:
        # Promote a single band (y, x) to the band-first layout (1, y, x)
        data = np.reshape(data.copy(), (1,) + data.shape)

    bands, height, width = data.shape  # z, y, x
    kwargs.update(
        count=bands,
        height=height,
        width=width,
        dtype=data.dtype
    )

    with rasterio.open(to_path, 'w', **kwargs) as dst:
        # rasterio band indexes start at one, hence the enumeration from 1
        for band_no, band in enumerate(data, start=1):
            dst.write(band, band_no)

    return to_path


def int_to_orient(x, y):
    """
    Converts an x- and y-coordinate to an integer north/south,
    west/east string representation.
    Example: (x=-179.3457, y=80.2222) -> 80N_179W

    :param x: float
        Longitudinal coordinate
    :param y: float
        Latitudinal coordinate
    :return: str
        Lat/Lon coordinates rounded to integers, zero padded (two digits for
        the latitude, three for the longitude) and suffixed with the
        according orientation.
    """
    lng = round(x)
    lat = round(y)

    # Negative values lie west of the prime meridian / south of the equator
    we = 'W' if lng < 0 else 'E'
    ns = 'S' if lat < 0 else 'N'

    return '{:02d}{}_{:03d}{}'.format(abs(lat), ns, abs(lng), we)


def round_bounds(bounds: namedtuple) -> namedtuple:
    """
    Rounds the coordinates of a bounds object with the built-in round().

    :param bounds: namedtuple
        Should provide the attributes left, bottom, right and top.
    :return: namedtuple(left, bottom, right, top)
        A new BoundingBox holding the rounded coordinates.
    """
    attrs = ('left', 'bottom', 'right', 'top')
    BoundingBox = namedtuple('BoundingBox', attrs)
    return BoundingBox(*(round(getattr(bounds, attr)) for attr in attrs))


# TODO implement error handling 
def worker(to_reproject: list, to_crs: dict, to_merge_alike: list, out_path: str, generic_name: str):
    """
    Aligns one set of rasters: reprojects the rasters in to_reproject and
    merges every raster group in to_merge_alike onto the resulting grid.

    The first raster of to_reproject is reprojected to to_crs and serves as
    template; every further raster is reprojected like this template. Each
    group of to_merge_alike is merged with the bounds and resolution of the
    template and written with the template's profile.

    :param to_reproject: list
        Paths of the raster files to reproject; the first one defines the
        template grid.
    :param to_crs: dict
        Target coordinate reference system of the template
    :param to_merge_alike: list
        A list of lists, where each inner list holds the paths of raster
        files which should be merged to one dataset.
    :param out_path: str
        Directory where the output files should be stored
    :param generic_name: str
        Common file name suffix. Outputs are named
        reproject_<idx>_<generic_name> and merge_<idx>_<generic_name>.
    """
    out_dir = Path(out_path)
    template = None

    for idx, raster in enumerate(to_reproject):
        target = str(out_dir / 'reproject_{}_{}'.format(idx, generic_name))

        if idx:
            # TODO on error: log and proceed
            reproject_like(template, raster, target)
        else:
            # TODO on error: log as fatal, nothing else can be aligned
            template = reproject_from(raster, to_crs, target)

    profile = fetch_metadata(('profile',), template).profile

    for idx, rasters in enumerate(to_merge_alike):
        target = str(out_dir / 'merge_{}_{}'.format(idx, generic_name))

        # TODO on error: log and proceed
        merged = merge_alike(template, rasters)
        profile.update({'transform': merged.affine})
        write(merged.data, target, **profile)

## Masking

### GFC mask

In [3]:
gfc = sorted(DIRS.gfc.glob('*.tif'))

# The sorted listing is split into three equally sized runs which are used,
# in this order, as gain, loss and cover layer.
# NOTE(review): assumes every layer has the same number of tiles - confirm.
data_len = len(gfc) // 3
gain_tiles = gfc[:data_len]
loss_tiles = gfc[data_len:2 * data_len]
cover_tiles = gfc[2 * data_len:]

kwargs = {
    label: [tile.name for tile in tiles]
    for label, tiles in (('gain', gain_tiles), ('loss', loss_tiles), ('cover', cover_tiles))
}

# The geometry is taken from the first run of tiles
gfc_mask = tile_index(gain_tiles, WGS84, **kwargs)
gfc_mask.to_file(str(DIRS.masks / 'gfc_mask.shp'))
gfc_mask.head()

Unnamed: 0,cover,gain,loss,geometry
0,Hansen_GFC2013_treecover2000_00N_000E.tif,Hansen_GFC2013_gain_00N_000E.tif,Hansen_GFC2013_lossyear_00N_000E.tif,POLYGON ((-0.0001388888888982365 0.00013888888...
1,Hansen_GFC2013_treecover2000_00N_010E.tif,Hansen_GFC2013_gain_00N_010E.tif,Hansen_GFC2013_lossyear_00N_010E.tif,POLYGON ((9.999861111111102 0.0001388888888840...
2,Hansen_GFC2013_treecover2000_00N_010W.tif,Hansen_GFC2013_gain_00N_010W.tif,Hansen_GFC2013_lossyear_00N_010W.tif,POLYGON ((-10.0001388888889 0.0001388888888840...
3,Hansen_GFC2013_treecover2000_00N_020E.tif,Hansen_GFC2013_gain_00N_020E.tif,Hansen_GFC2013_lossyear_00N_020E.tif,POLYGON ((19.9998611111111 0.00013888888888402...
4,Hansen_GFC2013_treecover2000_00N_020W.tif,Hansen_GFC2013_gain_00N_020W.tif,Hansen_GFC2013_lossyear_00N_020W.tif,POLYGON ((-20.0001388888889 0.0001388888888840...


### GL30 mask

In [4]:
exclude = [
    'n01_00', 's01_00', 's01_10', 's01_15', 's01_20',
    's60_00', 's60_05', 's60_10', 's60_15', 'n53_00',
]

# Sort by characters 7-10 of the file name first and by the leading six
# characters (the tile key) second, then drop the excluded tile keys.
gl30 = [
    tile
    for tile in sorted(DIRS.gl30.glob('*.tif'), key=lambda p: (p.name[7:11], p.name[0:6]))
    if tile.name[0:6] not in exclude
]

# First half of the listing -> gl30_00, second half -> gl30_10
data_len = len(gl30) // 2
first_half = gl30[:data_len]
second_half = gl30[data_len:]

kwargs = {
    'gl30_00': [tile.name for tile in first_half],
    'gl30_10': [tile.name for tile in second_half],
    'key': [tile.name[0:6] for tile in first_half]
}

# The geometry is taken from the second half of the listing
gl30_mask = tile_index(second_half, WGS84, **kwargs)
gl30_mask.to_file(str(DIRS.masks / 'gl30_mask.shp'))
gl30_mask.head()

Unnamed: 0,gl30_00,gl30_10,key,geometry
0,n02_15_2000lc030.tif,n02_15_2010lc030.tif,n02_15,POLYGON ((-174.0053601744084 20.00401663536249...
1,n03_05_2000lc030.tif,n03_05_2010lc030.tif,n03_05,POLYGON ((-168.0054833302891 10.00519024901941...
2,n03_20_2000lc030.tif,n03_20_2010lc030.tif,n03_20,POLYGON ((-168.0051433812486 25.00312959291788...
3,n04_00_2000lc030.tif,n04_00_2010lc030.tif,n04_00,POLYGON ((-162.0055192236557 5.005478418984219...
4,n04_05_2000lc030.tif,n04_05_2010lc030.tif,n04_05,"POLYGON ((-162.0054833302891 10.0051902490194,..."


### Biomass mask

In [5]:
biomass = gpd.read_file(str(DIRS.masks / 'biomass.geojson'))
biomass_mask = biomass.drop(biomass.columns[[0, 1, 4, 5]], axis=1)
biomass_mask.rename(columns={'download': 'biomass'}, inplace=True)

# Both columns hold '/'-separated locations; only the trailing file name is
# kept. The transform is applied column-wise, so the source frame `biomass` is
# no longer rebound to a string as a side effect of a row loop.
for column in ('biomass', 'confidence'):
    biomass_mask[column] = biomass_mask[column].apply(lambda location: location.split('/')[-1])

biomass_mask.to_file(str(DIRS.masks / 'biomass_mask.shp'))
biomass_mask.head()

Unnamed: 0,biomass,confidence,geometry
0,10N_090W_merge.tif,merged_per_tropical_asia_10N_090W.tif,POLYGON ((-90.0001388885744 0.0001388894243932...
1,10N_050E_merge.tif,merged_per_tropical_asia_10N_050E.tif,"POLYGON ((49.9998611109019 7.999861110840482, ..."
2,10S_170E_merge.tif,merged_per_tropical_asia_10S_170E.tif,POLYGON ((169.9998611109663 -12.00013888921919...
3,20N_100W_merge.tif,merged_per_tropical_asia_20N_100W.tif,POLYGON ((-100.0001388891786 13.00013888876461...
4,20S_020E_merge.tif,merged_per_tropical_asia_20S_020E.tif,POLYGON ((19.99986111088579 -29.99986111075941...


## Raster alignment
- include soil layer

In [8]:
gl30_mask = gpd.read_file(str(DIRS.masks / 'gl30_mask.shp'))
gfc_mask = gpd.read_file(str(DIRS.masks / 'gfc_mask.shp'))
biomass_mask = gpd.read_file(str(DIRS.masks / 'biomass_mask.shp'))
gsoc_mask = None  # placeholder, the soil layer is not included yet

# Keep only the areas which are covered by all three tile indexes
intersect = gpd.overlay(gfc_mask, gl30_mask, how='intersection')
intersect = gpd.overlay(intersect, biomass_mask, how='intersection')

threads = []
for key, values in intersect.groupby(by='key', sort=False):
    # Start at most THREADLIMIT workers, then wait for the whole batch
    if len(threads) == THREADLIMIT:
        for thread in threads:
            thread.join()
        threads = []

    # gl30_10 comes first: the worker uses the first raster as the template
    to_reproject = [
        str(DIRS.gl30 / name)
        for name in sorted(set(values.gl30_10)) + sorted(set(values.gl30_00))
    ]
    # The position in this list determines the merge_<idx> output name
    to_merge = [
        [str(DIRS.gfc / name) for name in set(values.cover)],
        [str(DIRS.gfc / name) for name in set(values.loss)],
        [str(DIRS.gfc / name) for name in set(values.gain)],
        [str(DIRS.biomass / name) for name in set(values.biomass)],
        [str(DIRS.biomass / name) for name in set(values.confidence)],
    ]
    generic_name = '_{}.tif'.format(key)

    thread = threading.Thread(target=worker,
                              args=(to_reproject, WGS84, to_merge, str(DIRS.proc), generic_name))
    thread.start()
    threads.append(thread)

# Wait for the final (possibly partial) batch as well
for thread in threads:
    thread.join()

Exception in thread Thread-82:
Traceback (most recent call last):
  File "rasterio/_base.pyx", line 125, in rasterio._base.DatasetBase.__init__
  File "rasterio/_err.pyx", line 188, in rasterio._err.exc_wrap_pointer
rasterio._err.CPLE_OpenFailedError: /home/tobi/Documents/Master/code/python/Master/data/core/biomass/10S_170E_merge.tif: No such file or directory

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "<ipython-input-1-ef283db7988d>", line 46, in read_raster
    return rasterio.open(path, 'r')
  File "/home/tobi/.local/lib/python3.5/site-packages/rasterio/__init__.py", line 263, in open
    s = DatasetReader(fp)
  File "rasterio/_base.pyx", line 127, in rasterio._base.DatasetBase.__init__
rasterio.errors.RasterioIOError: /home/tobi/Documents/Master/code/python/Master/data/core/biomass/10S_170E_merge.tif: No such file or directory

During handling of the above exception, another exception occurred:

Traceback (most re

## Cropping and masking
- goal crop to a feasible extent (get rid of nodata values)
- create final mask with all layers attributes:
    - gain 
    - cover 
    - loss
    - gl30_00
    - gl30_10 
    - soil
    - biomass
    - biomass error
    - key = top, left coordinate of a layer set
    - region = continental orientation (america, africa or asia)
- TODO
    - create file dictionary key: list (sorted reproject_i, reproject_i+n, merge_0, merge_i, merge_i+n)
    - get bounds from first file 
    - round bounds to int
    - bounds to polygon and geojson geometry
    - crop files to bounds
    - save files with template name 1\_top\_left.tif (use int_to_orient)
    - put bounds polygon to a list
    - put files to dict with ...

In [20]:
# Make file dictionary: tile key (e.g. n02_15) -> sorted list of the processed
# raster files which carry this key in their path
files = defaultdict(list)
regex = re.compile(r'.*(?P<key>(?:n|s)\d{2}_\d{2}).*', re.I)
for path in DIRS.proc.glob('*.tif'):
    match = regex.match(str(path))
    if match is None:
        # No tile key in the path, the file does not belong to a layer set
        continue
    files[match.group('key')].append(path)

# Sort every group once, after all files have been collected
for paths in files.values():
    paths.sort()

merge_0, merge_i, merge_i+n, reproject_0, reproject_i

In [None]:
def map_filename(name: str) -> tuple:
    """
    Unfinished stub: only prepares two lists of layer names and an empty
    regular expression; nothing is returned yet (the function yields None
    despite the tuple annotation).

    NOTE(review): presumably meant to map a processed file name to its layer
    name - confirm the intended contract before completing it.

    :param name: str
        Currently unused.
    """
    # Layer names, split into the two groups named after the output prefixes
    reproject = 'gl30_00 gl30_10'.split()
    merge = 'cover loss gain biomass confidence'.split()
    # Empty placeholder pattern
    regex = re.compile(r'')
    

def clip_raster(raster: str, bounds: namedtuple):
    """
    Placeholder which is not implemented yet; it does nothing and returns None.

    :param raster: str
        Currently unused.
    :param bounds: namedtuple
        Currently unused.
    """
    pass

In [92]:
# NOTE(review): the file names are assigned to these attributes by their sorted
# position. The alignment step writes merge_<idx> files in the order cover,
# loss, gain, biomass(, confidence), so the labels gain/cover/loss probably do
# not match the file content, and a set with a confidence layer has seven
# files and is skipped below - confirm the intended mapping.
attrs = ('gain', 'cover', 'loss', 'biomass', 'gl30_10', 'gl30_00')
Props = namedtuple('Props', attrs)

polygons = []
props = []
for val in files.values():
    # Only complete layer sets (one file per attribute) are indexed; checking
    # this first avoids opening a raster of a set which is discarded anyway.
    if len(val) != len(attrs):
        continue

    # bounds issue
    bounds, *_ = fetch_metadata(('bounds',), val[0])
    bounds = round_bounds(bounds)
    polygons.append(polygon_from(bounds))
    props.append(Props(*(path.name for path in val))._asdict())

series = gpd.GeoSeries(polygons)
series.crs = WGS84
df = pd.DataFrame(props)

layer = gpd.GeoDataFrame(df, geometry=series)
layer.to_file(str(DIRS.tmp / 'test.shp'))

In [98]:
import fiona

# Crop the gl30_10 raster of every feature to the feature's polygon and store
# the result under the same file name in the tmp directory.
with fiona.open(str(DIRS.tmp / 'test.shp'), 'r') as layer:
    for feature in layer:
        geometry = feature['geometry']
        raster = DIRS.proc / feature['properties']['gl30_10']
        with rasterio.open(str(raster), 'r') as src:
            kwargs = src.meta.copy()
            data, transform = rasterio.mask.mask(src, [geometry], crop=True, all_touched=True)
            print(geometry)
            print(transform)
            # The cropped data starts at a new origin, hence the new transform
            kwargs.update(transform=transform, driver='GTiff')
            write(data, str(DIRS.tmp / raster.name), **kwargs)

{'type': 'Polygon', 'coordinates': [[(-114.0, 15.0), (-108.0, 15.0), (-108.0, 10.0), (-114.0, 10.0), (-114.0, 15.0)]]}
| 0.00, 0.00,-114.00|
| 0.00,-0.00, 15.00|
| 0.00, 0.00, 1.00|
{'type': 'Polygon', 'coordinates': [[(-54.0, 10.0), (-48.0, 10.0), (-48.0, 5.0), (-54.0, 5.0), (-54.0, 10.0)]]}
| 0.00, 0.00,-54.00|
| 0.00,-0.00, 10.00|
| 0.00, 0.00, 1.00|
{'type': 'Polygon', 'coordinates': [[(-114.0, 20.0), (-108.0, 20.0), (-108.0, 15.0), (-114.0, 15.0), (-114.0, 20.0)]]}
| 0.00, 0.00,-114.00|
| 0.00,-0.00, 20.00|
| 0.00, 0.00, 1.00|
{'type': 'Polygon', 'coordinates': [[(-60.0, 5.0), (-54.0, 5.0), (-54.0, 0.0), (-60.0, 0.0), (-60.0, 5.0)]]}
| 0.00, 0.00,-60.00|
| 0.00,-0.00, 5.00|
| 0.00, 0.00, 1.00|


KeyboardInterrupt: 