# Preprocessing

## Preparation

In [17]:
%matplotlib inline
import pyproj
import shapely
import rasterio
import numpy as np
import pandas as pd
import geopandas as gpd
from pathlib import Path
from src.utils import get_data_dir
from collections import namedtuple
from rasterio import warp, merge


# Data directory object built by the project helper from the absolute path of ./data.
DIRS = get_data_dir(str(Path('data').resolve()))
# CRS mapping for EPSG:4326, written in the {'init': ...} form.
WGS84 = {'init': 'epsg:4326'}


def read_raster(item) -> rasterio.io.DatasetReader:
    """
    Returns a raster reader for the given item.

    :param item: rasterio.io.DatasetReader, str, pathlib.Path
        An already opened reader is handed back untouched. Anything else is
        cast to a string and opened in read mode.
    :return: rasterio.io.DatasetReader
    :raises ValueError:
        If the item can not be opened as a raster file. The underlying error
        is attached as the cause of the ValueError.
    """
    if isinstance(item, rasterio.io.DatasetReader):
        return item

    try:
        path = str(item)  # Cast pathlib.Path to string
        return rasterio.open(path, 'r')
    except Exception as err:
        # Exception (not a bare except) keeps KeyboardInterrupt and SystemExit
        # from being turned into a misleading ValueError.
        msg = '{}/{} is not a valid raster file'.format(item, type(item))
        raise ValueError(msg) from err


def fetch_metadata(attributes: list, from_path_or_reader: str) -> namedtuple:
    """
    Reads the requested attributes from a raster and returns them as a named tuple.

    :param attributes: list, tuple
        Names of the reader attributes to fetch. They become the field names
        of the returned tuple, in the given order.
    :param from_path_or_reader: str, pathlib.Path, rasterio.io.DatasetReader
        Raster to read from, resolved by read_raster. Attention, the reader is
        closed before this method returns, even if it was passed in open.
    :return: namedtuple
        A 'Metadata' tuple holding one value per requested attribute.
    :raises ValueError:
        If one of the requested attributes is None.
    """
    reader = read_raster(from_path_or_reader)

    try:
        values = []
        for attr in attributes:
            value = getattr(reader, attr)
            if value is None:
                raise ValueError('{} is not set'.format(attr))
            values.append(value)
    finally:
        # Close the reader on the error path as well, otherwise the file
        # handle leaks whenever an attribute is not set.
        reader.close()

    Metadata = namedtuple('Metadata', attributes)
    return Metadata(*values)

## Masking

### GFC mask

In [23]:
def polygon_from(bounds: namedtuple) -> shapely.geometry.Polygon:
    """
    Builds a rectangular polygon from a bounding box.

    :param bounds: namedtuple
        Object providing the attributes left, bottom, right and top.
    :return: shapely.geometry.Polygon
        Polygon with the corners ordered (left, top), (left, bottom),
        (right, bottom), (right, top).
    """
    corners = [
        (bounds.left, bounds.top),
        (bounds.left, bounds.bottom),
        (bounds.right, bounds.bottom),
        (bounds.right, bounds.top),
    ]
    return shapely.geometry.Polygon(corners)


def reproject_bounds(bounds: namedtuple, source_crs: dict, target_crs: dict) -> namedtuple:
    """
    Transforms a bounding box from the source to the target coordinate system.

    Only the (left, bottom) and the (right, top) corner are transformed; the
    new box is assembled from these two points.

    :param bounds: namedtuple
        Object providing the attributes left, bottom, right and top.
    :param source_crs, target_crs: dict
        Mappings which are unpacked as keyword arguments into pyproj.Proj.
    :return: namedtuple
        A 'BoundingBox' tuple with the fields left, bottom, right, top.
    """
    BoundingBox = namedtuple('BoundingBox', 'left bottom right top')

    src_proj = pyproj.Proj(**source_crs)
    dst_proj = pyproj.Proj(**target_crs)

    x_min, y_min = pyproj.transform(src_proj, dst_proj, bounds.left, bounds.bottom)
    x_max, y_max = pyproj.transform(src_proj, dst_proj, bounds.right, bounds.top)

    return BoundingBox(x_min, y_min, x_max, y_max)


def geoseries(rasters: list, target_crs: dict) -> gpd.GeoSeries:
    """
    Collects the bounding boxes of the given rasters as polygons in a GeoSeries.

    :param rasters: list
        Raster paths or readers, each is handed to fetch_metadata.
    :param target_crs: dict
        Coordinate system of the result. Bounds of rasters whose crs differs
        from it are reprojected first.
    :return: geopandas.GeoSeries
        One polygon per raster, in input order, with crs set to target_crs.
    """
    def footprint(raster):
        # Bounding box polygon of a single raster, expressed in target_crs
        bounds, crs = fetch_metadata(('bounds', 'crs'), raster)
        if crs != target_crs:
            bounds = reproject_bounds(bounds, crs, target_crs)
        return polygon_from(bounds)

    series = gpd.GeoSeries([footprint(raster) for raster in rasters])
    series.crs = target_crs
    return series


def attributes(**kwargs):
    """
    Placeholder, not implemented yet.

    :param kwargs:
        Accepted but ignored.
    :return: None
    """
    return None
 

# All GeoTIFFs of the GFC directory, sorted to get a stable order
gfc_files = sorted(DIRS.gfc.glob('*.tif'))

# Footprints of the first third of the sorted files only
# NOTE(review): presumably one third corresponds to one GFC layer - confirm
hansen_mask = geoseries(gfc_files[:len(gfc_files) // 3], WGS84)
hansen_mask

0      POLYGON ((-0.0001388888888982365 0.00013888888...
1      POLYGON ((9.999861111111102 0.0001388888888840...
2      POLYGON ((-10.0001388888889 0.0001388888888840...
3      POLYGON ((19.9998611111111 0.00013888888888402...
4      POLYGON ((-20.0001388888889 0.0001388888888840...
5      POLYGON ((29.9998611111111 0.00013888888888402...
6      POLYGON ((-30.0001388888889 0.0001388888888840...
7      POLYGON ((39.9998611111111 0.00013888888888402...
8      POLYGON ((-40.0001388888889 0.0001388888888840...
9      POLYGON ((49.9998611111111 0.00013888888888402...
10     POLYGON ((-50.0001388888889 0.0001388888888840...
11     POLYGON ((59.9998611111111 0.00013888888888402...
12     POLYGON ((-60.0001388888889 0.0001388888888840...
13     POLYGON ((69.9998611111111 0.00013888888888402...
14     POLYGON ((-70.0001388888889 0.0001388888888840...
15     POLYGON ((79.9998611111111 0.00013888888888402...
16     POLYGON ((-80.0001388888889 0.0001388888888840...
17     POLYGON ((89.99986111111

### GL30 mask

- Edge tiles of GL30 have coordinate system issues: their x coordinates overflow the boundaries of the applied coordinate system.
- Result: in WGS84, the polygon of a certain tile covers the entire globe.

## Raster alignment
- Store the output files in a folder named `processed`.
- Reproject all files to WGS84 (EPSG:4326) for convenience (the entire GL30 dataset must be reprojected).
- Intersect the GL30 mask with the GFC mask.
- Find the GFC datasets covering a GL30 tile.
- Merge them and crop them to the extent of the GL30 tile.

In [22]:
def reproject_from(in_path: str, to_crs: dict, to_out_path: str):
    """
    Reprojects every band of a raster file into a new file.

    :param in_path: str
        Path of the raster to read.
    :param to_crs: dict
        Coordinate system of the output raster.
    :param to_out_path: str
        Path the reprojected raster is written to.
    :return: str
        The output path, as passed in.
    """
    with rasterio.open(in_path, 'r') as src:
        # Transform and pixel dimensions of the raster in the target crs
        dst_transform, dst_width, dst_height = rasterio.warp.calculate_default_transform(
            src_crs=src.crs,
            dst_crs=to_crs,
            width=src.width,
            height=src.height,
            **src.bounds._asdict(),
        )

        # The output keeps the source profile except for the georeferencing
        profile = src.profile.copy()
        profile.update(
            transform=dst_transform,
            width=dst_width,
            height=dst_height,
            crs=to_crs
        )

        with rasterio.open(to_out_path, 'w', **profile) as dst:
            for band_idx in src.indexes:
                src_band = rasterio.band(src, band_idx)
                dst_band = rasterio.band(dst, band_idx)
                rasterio.warp.reproject(source=src_band, destination=dst_band)

    return to_out_path
    
    
def merge_from(paths_or_readers: list, **kwargs) -> namedtuple:
    """
    Merges several rasters into one array.

    :param paths_or_readers: list
        Raster paths or readers, each is resolved by read_raster. Attention,
        all readers are closed before this method returns, even those which
        were passed in open.
    :param kwargs:
        Optional, keyword arguments passed on to rasterio.merge.merge.
    :return: namedtuple
        A 'Merge' tuple with the fields dest (merged data) and affine
        (the transform returned by the merge).
    """
    Merge = namedtuple('Merge', 'dest affine')

    readers = []
    try:
        for item in paths_or_readers:
            readers.append(read_raster(item))
        dest, affine = rasterio.merge.merge(readers, **kwargs)
    finally:
        # Close whatever has been opened so far, also if opening a later
        # item or the merge itself fails.
        for reader in readers:
            reader.close()

    return Merge(dest, affine)


def merge_alike(with_template: str, to_merge: list) -> namedtuple:
    """
    Merges rasters using the bounds and the resolution of a template raster.

    :param with_template: str
        Raster whose bounds and res attributes are passed on to the merge.
    :param to_merge: list
        Raster paths or readers to merge.
    :return: namedtuple
        The result of merge_from.
    """
    template = fetch_metadata(('bounds', 'res'), with_template)
    return merge_from(to_merge, bounds=template.bounds, res=template.res)


def write(data: np.ndarray, to_path: str, **kwargs):
    """
    Writes an array band by band into a raster file.

    :param data: numpy.ndarray
        Either a 2D array (y, x), written as a single band, or a 3D array
        (z, y, x), written as z bands.
    :param to_path: str
        Path of the raster file to create.
    :param kwargs:
        Further keyword arguments passed on to rasterio.open. The keys count,
        height, width and dtype are always overwritten with the values
        derived from data.
    :return: str
        The output path, as passed in.
    :raises ValueError:
        If data is neither two nor three dimensional.
    """
    if data.ndim == 2:
        # Treat a single layer as a stack of one band (view, no copy)
        data = data[np.newaxis, ...]
    elif data.ndim != 3:
        msg = 'Expected a 2D or 3D array, got {} dimension(s)'.format(data.ndim)
        raise ValueError(msg)

    count, height, width = data.shape  # z, y, x
    kwargs.update(
        count=count,
        height=height,
        width=width,
        dtype=data.dtype
    )

    with rasterio.open(to_path, 'w', **kwargs) as dst:
        # rasterio band index start at one, thus enumerate starts at one
        for band_idx, band in enumerate(data, start=1):
            dst.write(band, band_idx)

    return to_path


def worker(to_reproject, to_merge):
    """
    Placeholder, not implemented yet.

    :param to_reproject, to_merge:
        Accepted but ignored.
    :return: None
    """
    return None

In [46]:
# Tile footprints of both datasets. The data directory object of this notebook
# is DIRS; a lowercase `dirs` is not defined anywhere in the visible cells.
gfc_mask = gpd.read_file(str(DIRS.core / 'gfc.shp'))
gl30_mask = gpd.read_file(str(DIRS.core / 'gl30.shp'))

intersect = gpd.overlay(gfc_mask, gl30_mask, how='intersection')

# Print the intersecting rows once per distinct REMARK value. Iterating the
# groupby yields each group's rows directly, so no manual round trip between
# index labels and row positions is needed.
for remark, tiles in intersect.groupby('REMARK'):
    print(tiles)

                               prop0                                 prop1  \
28  Hansen_GFC2013_gain_20N_090W.tif  Hansen_GFC2013_lossyear_20N_090W.tif   
29  Hansen_GFC2013_gain_20N_080W.tif  Hansen_GFC2013_lossyear_20N_080W.tif   
36  Hansen_GFC2013_gain_20N_090W.tif  Hansen_GFC2013_lossyear_20N_090W.tif   
37  Hansen_GFC2013_gain_20N_090W.tif  Hansen_GFC2013_lossyear_20N_090W.tif   
38  Hansen_GFC2013_gain_20N_080W.tif  Hansen_GFC2013_lossyear_20N_080W.tif   
44  Hansen_GFC2013_gain_20N_090W.tif  Hansen_GFC2013_lossyear_20N_090W.tif   

                                        prop2 NS  UTMZONE  ROW CONTINENT  \
28  Hansen_GFC2013_treecover2000_20N_090W.tif  N       17   15   America   
29  Hansen_GFC2013_treecover2000_20N_080W.tif  N       17   15   America   
36  Hansen_GFC2013_treecover2000_20N_090W.tif  N       17   15   America   
37  Hansen_GFC2013_treecover2000_20N_090W.tif  N       17   15   America   
38  Hansen_GFC2013_treecover2000_20N_080W.tif  N       17   15   America 

## Spatial harmonization
Workflow
- Consider using additional classes from GL30, e.g. wetlands or tundra
- Initial step
    - Select forest (class value 20) from the GL30 dataset of 2000
    - Recode the values to binary format: 20 = 1, everything else = 0
    - Select forest from the Hansen tree cover 2000 (value range 0 - 100)
    - Recode the values to binary format: 1 - 100 = 1, 0 = 0
    - Calculate the Jaccard Index between Chen and Hansen
- Looping
    - Select forest with values (0 + 10) - 100 from the Hansen tree cover 2000
    - Recode the values to binary format: (0 + 10) - 100 = 1, everything below = 0
    - Calculate the Jaccard Index between Chen and Hansen
    - Repeat until the threshold reaches 30 or the Jaccard Index reaches its maximum
Potential Images
- World agreement map with different 
    - Compare Chen and Hansen tree cover in one image
    - Sum of both datasets
    - 2 = agreement, 1 = disagreement

In [14]:
def binary_jaccard(arr1, arr2, return_matrix=False):
    """
    Calculates the Jaccard Index (JI) of two equal sized binary arrays or vectors.
    If return_matrix is set to true the method provides the JI and the necessary 
    calculation matrix as a named tuple. Attention, this method does not work in-place!
    
    :param arr1, arr2: numpy.ndarray, list, tuple
        Both array alike objects sized in equal dimensions should contain exclusively 
        binary data (1,0). 
    :param return_matrix: boolean
        Optional, a boolean value determining the return of the calculation matrix. 
    :return: float OR (float, namedtuple)
        Defaultly, the method returns only the JI if, return_matrix is set to true the 
        method returns the JI and the computation matrix. The JI is nan if neither
        array contains a 1, since the index is undefined in this case.
        The Matrix contains the following attributes:
        m11 = total number of attributes where arr1 == 1 and arr2 == 1
        m10 = total number of attributes where arr1 == 1 and arr2 == 0
        m01 = total number of attributes where arr1 == 0 and arr2 == 1
        m00 = not required, set to 0
    :raises ValueError:
        If the arrays differ in shape or contain values other than 0 and 1.
    """
    A, B = np.asarray(arr1), np.asarray(arr2)

    if A.shape != B.shape:
        raise ValueError('Arrays should be sized in equal dimensions')

    # Validate the raw values before any cast, a cast to a small integer type
    # would silently turn values like 0.5 into valid looking binary data.
    for arr in (A, B):
        if not np.all((arr == 0) | (arr == 1)):
            raise ValueError('Attributes should contain only binary values')

    A, B = A.astype(bool), B.astype(bool)

    # Total number of attributes where A == 1 and B == 1
    m11 = np.sum(A & B)
    # Total number of attributes where A == 1 and B == 0
    m10 = np.sum(A & ~B)
    # Total number of attributes where A == 0 and B == 1
    m01 = np.sum(~A & B)

    union = m10 + m01 + m11
    # 0 / 0 is undefined, return nan explicitly instead of triggering a
    # numpy RuntimeWarning
    jaccard = m11 / union if union else np.float64('nan')

    if return_matrix:
        Matrix = namedtuple('Matrix', 'm11 m10 m01 m00')
        return jaccard, Matrix(m11, m10, m01, 0)
    return jaccard


def simple_matching_coefficient(arr1, arr2, return_matrix=False):
    """
    Calculates the Simple Matching Coefficient (SMC) of two equal sized arrays or vectors.
    The SMC is the share of attributes on which both arrays agree, (m11 + m00) / size.
    Attention, this method does not work in-place!

    :param arr1, arr2: numpy.ndarray, list, tuple
        Both array alike objects sized in equal dimensions should contain exclusively
        binary data (1,0).
    :param return_matrix: boolean
        Optional, if set to true the computation matrix is returned alongside the SMC.
    :return: float OR (float, namedtuple)
        Only the SMC by default, the SMC and the computation matrix if return_matrix
        is set to true.
        The Matrix contains the following attributes:
        m11 = total number of attributes where arr1 == 1 and arr2 == 1
        m10 = total number of attributes where arr1 == 1 and arr2 == 0
        m01 = total number of attributes where arr1 == 0 and arr2 == 1
        m00 = total number of attributes where arr1 == 0 and arr2 == 0
    """
    # The Jaccard matrix provides m11, m10 and m01, its m00 is a 0 placeholder
    _, counts = binary_jaccard(arr1, arr2, True)
    total = np.array(arr1, dtype=np.int8).size

    # Total number of attributes where A == 0 and B == 0, which is everything
    # not counted in the matrix so far
    counted = sum(counts)
    m00 = total - counted

    smc = (counts.m11 + m00) / total

    if not return_matrix:
        return smc
    return smc, counts._replace(m00=m00)