# Preprocessing

## Preparation

In [1]:
%matplotlib inline
import queue
import pyproj
import shapely
import rasterio
import threading
import numpy as np
import pandas as pd
import geopandas as gpd
from pathlib import Path
from src.utils import get_data_dir
from src.decorators import benchmark
from collections import namedtuple
from rasterio import warp, merge


DIRS = get_data_dir(str(Path('data').resolve()))
WGS84 = {'init': 'epsg:4326'}


def read_raster(item) -> rasterio.io.DatasetReader:
    """
    Helper to return a raster file as an opened instance of
    rasterio.io.DatasetReader in read mode.

    :param item: str, pathlib.Path or rasterio.io.DatasetReader
        Path to the raster file on the filesystem as a string or
        pathlib.Path object. If item already is an instance of
        DatasetReader it is returned unchanged.
    :return: rasterio.io.DatasetReader
        An instance of rasterio.io.DatasetReader in read mode.
    :raises ValueError:
        If the item can not be opened as a raster file. The original
        error is attached as the cause of the ValueError.
    """
    if isinstance(item, rasterio.io.DatasetReader):
        return item

    try:
        # Cast pathlib.Path (or anything else) to a string before opening
        return rasterio.open(str(item), 'r')
    except Exception as err:
        # Only real errors are translated; KeyboardInterrupt and SystemExit
        # pass through untouched, unlike with a bare ``except:``.
        msg = 'Attr {}, Type {} is not a valid raster file'.format(item, type(item))
        raise ValueError(msg) from err


def fetch_metadata(features: list, from_path_or_reader: str) -> namedtuple:
    """
    Fetch user selected metadata features from a raster file and return
    them as a named tuple, where each attribute name is a requested
    metadata key and its value the corresponding metadata feature.
    Please refer to the documentation of rasterio for a comprehensive
    list of metadata features provided by a raster dataset.

    :param features: list or tuple of str
        The requested metadata features as a list or tuple of strings.
    :param from_path_or_reader: str, pathlib.Path or rasterio.io.DatasetReader
        Path to the raster file on drive as string or pathlib.Path object,
        or an already opened raster dataset. An opened dataset handed in
        by the caller is left open; a dataset opened by this function is
        always closed again, even if an error occurs.
    :return: namedtuple
        The requested metadata features as a namedtuple.
        Example:
        fetch_metadata(('bounds', 'crs'), path)
        (bounds=value, crs=value)
    :raises ValueError:
        If a requested feature is None on the dataset.
    """
    reader = read_raster(from_path_or_reader)
    # read_raster returns the very same object when it is given an open
    # reader, so identity tells us whether the reader belongs to the caller.
    opened_here = reader is not from_path_or_reader

    try:
        values = []
        for feature in features:
            value = getattr(reader, feature)
            if value is None:
                raise ValueError('{} is not set'.format(feature))
            values.append(value)
    finally:
        # Never close a reader the caller still wants to use
        if opened_here:
            reader.close()

    Metadata = namedtuple('Metadata', features)
    return Metadata(*values)


# TODO refactor to def(left, right, top, y2)
def polygon_from(bounds: namedtuple) -> shapely.geometry.Polygon:
    """
    Create a polygon object from a bounds object.

    :param bounds: namedtuple
        Should be a namedtuple comprising the attributes
        left, right, top and bottom.
    :return: shapely.geometry.Polygon
        The rectangle spanned by the provided bounds object.
    """
    # Ring order: (left, top) -> (left, bottom) -> (right, bottom) -> (right, top)
    corners = [
        (bounds.left, bounds.top),
        (bounds.left, bounds.bottom),
        (bounds.right, bounds.bottom),
        (bounds.right, bounds.top),
    ]

    return shapely.geometry.Polygon(corners)


def reproject_bounds(bounds: namedtuple, source_crs: dict, target_crs: dict) -> namedtuple:
    """
    Reproject the coordinates of a bounds object to the requested
    coordinate reference system. Only the (left, bottom) and the
    (right, top) corner are transformed.

    :param bounds: namedtuple
        Should be a namedtuple containing the attributes
        left, right, top and bottom.
    :param source_crs: dict
        The coordinate reference system of the bounds object as a dictionary
        with the following shape:
        {'init': 'epsg:<id>'} where <id> is the epsg number of the crs
    :param target_crs: dict
        The coordinate reference system the bounds are reprojected to.
        Shape should be equal to source_crs.
    :return: namedtuple(left, bottom, right, top)
        Reprojected bounds object
    """
    src_proj = pyproj.Proj(**source_crs)
    dst_proj = pyproj.Proj(**target_crs)

    # Lower-left corner first, then the upper-right corner
    west, south = pyproj.transform(src_proj, dst_proj, bounds.left, bounds.bottom)
    east, north = pyproj.transform(src_proj, dst_proj, bounds.right, bounds.top)

    BoundingBox = namedtuple('BoundingBox', 'left bottom right top')
    return BoundingBox(left=west, bottom=south, right=east, top=north)


def polygoniz(paths_or_readers: list, target_crs: dict) -> gpd.GeoSeries:
    """
    Create the geometries of a tile index from a set of raster files.

    :param paths_or_readers: list
        Each element is handed to fetch_metadata, hence it may be a path
        to a raster file (str or pathlib.Path) or an opened
        rasterio.io.DatasetReader.
    :param target_crs: dict
        Coordinate reference system of the resulting geometries. Bounds
        of raster files with a different coordinate reference system are
        reprojected to it, which prevents a mixed-up dataset.
    :return: geopandas.GeoSeries
        Each element of the geoseries is a polygon
        covering the corresponding raster file.
    """
    def footprint(item):
        # Bounding polygon of a single raster, expressed in target_crs
        meta = fetch_metadata(('bounds', 'crs'), item)
        extent = meta.bounds
        if meta.crs != target_crs:
            extent = reproject_bounds(extent, meta.crs, target_crs)
        return polygon_from(extent)

    geometry = gpd.GeoSeries([footprint(item) for item in paths_or_readers])
    geometry.crs = target_crs
    return geometry


def tile_index(rasters: list, target_crs: dict, **kwargs) -> gpd.GeoDataFrame:
    """
    Build a tile index: a GeoDataFrame whose geometry column holds the
    bounding polygon of each raster file and whose remaining columns are
    taken from the keyword arguments.

    :param rasters: list
        A list of str where each element is a path to a raster file
        on disk.
    :param target_crs: dict
        The coordinate reference system which should be applied on
        the tile index dataset.
    :param **kwargs:
        Attribute columns, handed as-is to pandas.DataFrame
        (column name -> values). Presumably one value per raster is
        expected so that rows and geometries line up.
    :return: geopandas.GeoDataFrame
        The attribute columns together with the raster footprints.
    """
    footprints = polygoniz(rasters, target_crs)
    attributes = pd.DataFrame(kwargs)

    return gpd.GeoDataFrame(attributes, geometry=footprints)


# TODO accept **kwargs to alter write parameters
def reproject_from(in_path: str, to_crs: dict, to_out_path: str):
    """
    Reproject a raster file to a selected coordinate reference system
    and store the result as a new raster file.

    :param in_path: str
        Path to raster file on drive
    :param to_crs: dict
        Target coordinate reference system for reprojection
    :param to_out_path: str
        Path where the reprojected raster file should be stored
    :return: str
        Path where the reprojected raster file is stored
    """
    with rasterio.open(in_path, 'r') as src:
        extent = src.bounds
        affine, width, height = rasterio.warp.calculate_default_transform(
            src_crs=src.crs,
            dst_crs=to_crs,
            width=src.width,
            height=src.height,
            left=extent.left,
            bottom=extent.bottom,
            right=extent.right,
            top=extent.top,
        )

        # The output inherits the source profile, except for its grid and crs
        profile = src.profile.copy()
        profile.update(transform=affine, width=width, height=height, crs=to_crs)

        with rasterio.open(to_out_path, 'w', **profile) as dst:
            # Warp band by band from the source into the destination dataset
            for band in src.indexes:
                rasterio.warp.reproject(
                    source=rasterio.band(src, band),
                    destination=rasterio.band(dst, band)
                )

    return to_out_path
    
    
def merge_from(paths_or_readers: list, **kwargs) -> namedtuple:
    """
    Merge a list of raster files into one single raster dataset.
    This function wraps rasterio.merge.merge, therefore it accepts
    the keyword arguments of that function as well.

    :param paths_or_readers: list
        A list where each element references a raster file, either as a
        path on drive or as an opened rasterio.io.DatasetReader.
    :param **kwargs:
        Forwarded to rasterio.merge.merge. Please refer to the rasterio
        documentation for a full list of possible keyword arguments.
    :return: namedtuple(data, affine)
        A namedtuple with the attributes data and affine, where data
        contains the merged data of the raster files as a numpy.ndarray
        and affine an affine transformation matrix.
    """
    readers = []
    try:
        for item in paths_or_readers:
            readers.append(read_raster(item))
        dest, affine = rasterio.merge.merge(readers, **kwargs)
    finally:
        # Release every dataset collected so far, also when opening one of
        # the files or the merge itself fails.
        for reader in readers:
            reader.close()

    Merge = namedtuple('Merge', 'data affine')
    return Merge(dest, affine)


def merge_alike(with_template: str, to_merge: list) -> namedtuple:
    """
    Merge the input raster files like a template raster: the bounds and
    the resolution of the template are passed on to the merge, so the
    output matches the template grid. Both datasets must share the same
    coordinate reference system.

    :param with_template: str
        Path to the template raster file
    :param to_merge: list
        A list of strings where each list element references a path to a
        raster file on drive.
    :return: namedtuple(data, affine)
        A namedtuple with the attributes data and affine, where data
        contains the merged data of the raster files as a numpy.ndarray
        and affine an affine transformation matrix.
    """
    template = fetch_metadata(('bounds', 'res'), with_template)
    return merge_from(to_merge, bounds=template.bounds, res=template.res)


def write(data: np.ndarray, to_path: str, **kwargs):
    """
    Write a two- or three-dimensional numpy.ndarray as a raster dataset
    to file. This function wraps rasterio.open, therefore its behavior
    can be altered with the keyword arguments described in the rasterio
    documentation.

    :param data: numpy.ndarray
        If the array has three dimensions, each entry along the first
        axis depicts a raster band. If the array has two dimensions the
        resulting raster file contains a single band.
    :param to_path: str
        Path where the new raster file should be stored
    :param **kwargs:
        Keyword arguments consumed by the rasterio.open function. The
        keys count, height, width and dtype are always derived from data.
    :return: str
        Path where the raster file is stored
    :raises ValueError:
        If data is neither two- nor three-dimensional.
    """
    if data.ndim == 2:
        # Promote a single band to the (band, row, column) layout
        data = data[np.newaxis, ...]
    elif data.ndim != 3:
        raise ValueError('Please, provide a valid dataset')

    bands, height, width = data.shape  # z, y, x
    kwargs.update(count=bands, height=height, width=width, dtype=data.dtype)

    with rasterio.open(to_path, 'w', **kwargs) as dst:
        # rasterio band indexes start at one
        for band_index, layer in enumerate(data, start=1):
            dst.write(layer, band_index)

    return to_path


def int_to_orient(x, y):
    """
    Convert an x- and y-coordinate to an integer north/south,
    west/east string representation.
    Example: (x=-179.3457, y=80.2222) -> 80N_179W

    :param x: float
        Longitudinal coordinate
    :param y: float
        Latitudinal coordinate
    :return: str
        Lat/Lon coordinates as an integer string with the according
        orientation, latitude first.
    """
    lng, lat = round(x), round(y)

    # The hemisphere is decided after rounding, so e.g. -0.4 counts as 0 -> 'E'
    we = 'W' if lng < 0 else 'E'
    ns = 'S' if lat < 0 else 'N'

    return '{:02d}{}_{:03d}{}'.format(abs(lat), ns, abs(lng), we)

    
def binary_jaccard(arr1, arr2, return_matrix=False):
    """
    Calculate the Jaccard Index (JI) of two equal sized binary arrays or vectors.
    If return_matrix is set to true the JI and the underlying calculation
    matrix are returned. The inputs are not modified.

    :param arr1, arr2: numpy.ndarray, list, tuple
        Array-like objects of identical shape containing exclusively
        binary data (1, 0). Boolean values are accepted as well.
    :param return_matrix: boolean
        Optional, determines whether the calculation matrix is returned.
    :return: float OR (float, namedtuple(m11, m10, m01, m00))
        By default only the JI; if return_matrix is set to true the JI and
        the computation matrix. The JI is nan if neither input contains
        a 1, because the index is undefined in that case.
        The Matrix contains the following attributes:
        m11 = total number of attributes where arr1 == 1 and arr2 == 1
        m10 = total number of attributes where arr1 == 1 and arr2 == 0
        m01 = total number of attributes where arr1 == 0 and arr2 == 1
        m00 = not required, set to 0
    :raises ValueError:
        If an input contains a value other than 0 or 1, or if the shapes
        of the inputs differ.
    """
    raw1, raw2 = np.asarray(arr1), np.asarray(arr2)

    # Validate the untouched values: casting to a small integer type first
    # would wrap 256 to 0 or truncate 0.5 to 0 and hide invalid input.
    if not (np.isin(raw1, (0, 1)).all() and np.isin(raw2, (0, 1)).all()):
        raise ValueError('Attributes should contain only binary values')
    # Without this check numpy would silently broadcast unequal shapes
    if raw1.shape != raw2.shape:
        raise ValueError('Attributes should have the same shape')

    A, B = raw1 == 1, raw2 == 1

    # Total number of attributes where A == 1 and B == 1
    m11 = np.sum(A & B)
    # Total number of attributes where A == 1 and B == 0
    m10 = np.sum(A & ~B)
    # Total number of attributes where A == 0 and B == 1
    m01 = np.sum(~A & B)

    union = m10 + m01 + m11
    jaccard = m11 / union if union else np.nan

    if return_matrix:
        Matrix = namedtuple('Matrix', 'm11 m10 m01 m00')
        return jaccard, Matrix(m11, m10, m01, 0)
    return jaccard


def simple_matching_coefficient(arr1, arr2, return_matrix=False):
    """
    Calculate the Simple Matching Coefficient (SMC) of two equal sized arrays or vectors.
    If return_matrix is set to true the SMC and the underlying calculation
    matrix are returned. The inputs are not modified.

    :param arr1, arr2: numpy.ndarray, list, tuple
        Array-like objects of equal dimensions containing exclusively
        binary data (1, 0).
    :param return_matrix: boolean
        Optional, determines whether the calculation matrix is returned.
    :return: float OR (float, namedtuple(m11, m10, m01, m00))
        By default only the SMC; if return_matrix is set to true the SMC
        and the computation matrix.
        The Matrix contains the following attributes:
        m11 = total number of attributes where arr1 == 1 and arr2 == 1
        m10 = total number of attributes where arr1 == 1 and arr2 == 0
        m01 = total number of attributes where arr1 == 0 and arr2 == 1
        m00 = total number of attributes where arr1 == 0 and arr2 == 0
    """
    # Only the count matrix of the Jaccard computation is needed here
    matrix = binary_jaccard(arr1, arr2, True)[1]
    total = np.array(arr1, dtype=np.int8).size

    # Whatever is not counted as m11, m10 or m01 has to be a 0/0 pair
    m00 = total - sum(matrix)
    smc = (matrix.m11 + m00) / total

    if not return_matrix:
        return smc
    return smc, matrix._replace(m00=m00)

## Masking

### GFC mask

In [2]:
# Sorting by path is assumed to yield three equally sized runs of files in the
# order gain, loss, tree cover (see the file names in the output below).
gfc = sorted(DIRS.gfc.glob('*.tif'))

data_len = len(gfc) // 3
gain_tiles = gfc[:data_len]
loss_tiles = gfc[data_len:2 * data_len]
cover_tiles = gfc[2 * data_len:]

# One attribute column per product, holding the file name of each tile
kwargs = dict(
    gain=[tile.name for tile in gain_tiles],
    loss=[tile.name for tile in loss_tiles],
    cover=[tile.name for tile in cover_tiles],
)

# The footprints are derived from the first run of files only
gfc_mask = tile_index(gain_tiles, WGS84, **kwargs)
gfc_mask.to_file(str(DIRS.masks / 'gfc_mask.shp'))
gfc_mask.head()

Unnamed: 0,cover,gain,loss,geometry
0,Hansen_GFC2013_treecover2000_00N_000E.tif,Hansen_GFC2013_gain_00N_000E.tif,Hansen_GFC2013_lossyear_00N_000E.tif,POLYGON ((-0.0001388888888982365 0.00013888888...
1,Hansen_GFC2013_treecover2000_00N_010E.tif,Hansen_GFC2013_gain_00N_010E.tif,Hansen_GFC2013_lossyear_00N_010E.tif,POLYGON ((9.999861111111102 0.0001388888888840...
2,Hansen_GFC2013_treecover2000_00N_010W.tif,Hansen_GFC2013_gain_00N_010W.tif,Hansen_GFC2013_lossyear_00N_010W.tif,POLYGON ((-10.0001388888889 0.0001388888888840...
3,Hansen_GFC2013_treecover2000_00N_020E.tif,Hansen_GFC2013_gain_00N_020E.tif,Hansen_GFC2013_lossyear_00N_020E.tif,POLYGON ((19.9998611111111 0.00013888888888402...
4,Hansen_GFC2013_treecover2000_00N_020W.tif,Hansen_GFC2013_gain_00N_020W.tif,Hansen_GFC2013_lossyear_00N_020W.tif,POLYGON ((-20.0001388888889 0.0001388888888840...


### GL30 mask

In [3]:
# File names look like 'n02_15_2000lc030.tif': characters 0-5 hold the tile key
# and characters 7-10 the year, so the files are ordered by year, then by tile.
gl30 = sorted(DIRS.gl30.glob('*.tif'), key=lambda path: (path.name[7:11], path.name[0:6]))

# Tile keys which are left out of the mask
exclude = [
    'n01_00', 's01_00', 's01_10', 's01_15', 's01_20',
    's60_00', 's60_05', 's60_10', 's60_15', 'n53_00',
]
gl30 = [path for path in gl30 if path.name[0:6] not in exclude]

# First half of the listing: earlier year, second half: later year
data_len = len(gl30) // 2
first_year = gl30[:data_len]
second_year = gl30[data_len:]

kwargs = dict(
    gl30_00=[path.name for path in first_year],
    gl30_10=[path.name for path in second_year],
    key=[path.name[0:6] for path in first_year],
)

# The footprints are derived from the second half of the listing
gl30_mask = tile_index(second_year, WGS84, **kwargs)
gl30_mask.to_file(str(DIRS.masks / 'gl30_mask.shp'))
gl30_mask.head()

Unnamed: 0,gl30_00,gl30_10,key,geometry
0,n02_15_2000lc030.tif,n02_15_2010lc030.tif,n02_15,POLYGON ((-174.0053601744084 20.00401663536249...
1,n03_05_2000lc030.tif,n03_05_2010lc030.tif,n03_05,POLYGON ((-168.0054833302891 10.00519024901941...
2,n03_20_2000lc030.tif,n03_20_2010lc030.tif,n03_20,POLYGON ((-168.0051433812486 25.00312959291788...
3,n04_00_2000lc030.tif,n04_00_2010lc030.tif,n04_00,POLYGON ((-162.0055192236557 5.005478418984219...
4,n04_05_2000lc030.tif,n04_05_2010lc030.tif,n04_05,"POLYGON ((-162.0054833302891 10.0051902490194,..."


## Raster alignment
- store the output files in a folder named `processed`
- reproject all files to wgs84 epsg4326 for convenience (entire gl30 dataset must be reprojected)
- intersect gl30 mask with gfc mask
- find gfc datasets covering a gl30 tile 
- merge them and crop them to the extent of gl30 tile
- reproject 2000 like 2010

In [17]:
# TODO refactor
def merge_worker(template, to_merge, path, **kwargs):
    """
    Merge raster files onto the grid of a template raster and write the
    result to a file named after the origin of the merged raster.

    :param template: path to the template raster, handed to merge_alike
    :param to_merge: list of paths of the raster files to merge
    :param path: str
        Prefix of the output path; the orientation string and the
        extension '.tif' are appended to it.
    :param **kwargs:
        Forwarded to write; the key transform is always replaced by the
        transform of the merged raster.
    """
    raster, affine = merge_alike(template, to_merge)

    # Elements 2 and 5 of the transform are presumably the x/y offsets of the
    # raster origin -- confirm against the affine documentation.
    tile_name = int_to_orient(affine[2], affine[5])
    to_path = ''.join((path, tile_name, '.tif'))

    write(raster, to_path, **dict(kwargs, transform=affine))


# TODO refactor
def reproject_worker(queue, **kwargs):
    """
    Reproject a raster file, rename the result after its left/top corner
    and report the new path through a queue.

    :param queue: object with a put method (e.g. queue.Queue)
        Receives the pathlib.Path of the renamed raster file.
    :param **kwargs:
        in_path, to_crs and to_out_path are passed to reproject_from;
        rename is the prefix of the new file name.
    """
    path = reproject_from(kwargs['in_path'], kwargs['to_crs'], kwargs['to_out_path'])

    # Name the reprojected file after the left/top corner of its bounds
    bounds = fetch_metadata(features=('bounds',), from_path_or_reader=path).bounds
    orient = int_to_orient(bounds.left, bounds.top)
    name = kwargs['rename'] + orient + '.tif'

    src = Path(path)
    dst = src.parent / name
    src.rename(dst)

    queue.put(dst)

# TODO refactor
gfc_mask = gpd.read_file(str(DIRS.masks / 'gfc_mask.shp'))
gl30_mask = gpd.read_file(str(DIRS.masks / 'gl30_mask.shp'))

# Intersect both tile indexes; the rows are then processed per gl30 tile key
intersect = gpd.overlay(gfc_mask, gl30_mask, how='intersection')

for _, val in intersect.groupby(by='key', sort=False):
    # reproject gl30 tiles
    threads = []
    que = queue.Queue()
    rename = ('gl30_2000_', 'gl30_2010_')
    # Yields [name_2000, name_2010]; this only works while a group holds
    # exactly one distinct file name per year.
    gl30 = list(*zip(set(val.gl30_00), set(val.gl30_10)))

    for idx, item in enumerate(gl30):
        kwargs = {
            'rename': rename[idx],
            'in_path': str(DIRS.gl30/item),
            'to_crs': WGS84,
            'to_out_path': str(DIRS.proc/item)
        }
        thread = threading.Thread(target=reproject_worker, args=(que,), kwargs=kwargs)
        thread.start()
        threads.append(thread)

    for thread in threads:
        thread.join()

    # All workers are done here, so every result is already queued. A blocking
    # get() would hang forever if every worker died with an exception;
    # get_nowait() raises queue.Empty instead.
    # NOTE(review): the template is whichever tile was queued first, which may
    # be the 2000 or the 2010 tile -- confirm this is intended.
    template = que.get_nowait()

    # merge gfc tiles
    threads = []
    rename = (DIRS.proc / 'gfc_gain_', DIRS.proc / 'gfc_loss_', DIRS.proc / 'gfc_treecover_')
    gain = [DIRS.gfc / name for name in set(val.gain)]
    loss = [DIRS.gfc / name for name in set(val.loss)]
    cover = [DIRS.gfc / name for name in set(val.cover)]
    gfc = (gain, loss, cover)
    for idx, item in enumerate(gfc):
        kwargs = {
            'crs': WGS84,
            'driver': 'GTiff',
            'compress': 'lzw'
        }
        thread = threading.Thread(target=merge_worker, args=(template, item, str(rename[idx])), kwargs=kwargs)
        thread.start()
        threads.append(thread)

    for thread in threads:
        thread.join()

## Spatial harmonization
Workflow
- consider to use additional classes from gl30 wetlands or tundra
- initial
    - select forest (class value 20) from dataset gl30 - 2000
    - recode values to binary format 20 = 1, 0 = 0
    - select forest (class value 0 - 100) from hansen tree cover 2000
    - recode values to binary format 1 - 100 = 1, 0 = 0
    - calculate Jaccard Index with chen and hansen
- looping
    - select forest (0 + 10) - 100 from hansen tree cover 2000
    - recode values to binary format (0 + 10) - 100 = 1, 0 = 0
    - calculate Jaccard Index with chen and hansen
    - do till 30 or Jaccard Index is max
Potential Images
- world agreement map with different 
    - compare chen and hansen tree cover in one image
    - sum of both dataset
    - 2 = agreement, 1 = disagreement