# Preprocessing

## Preparation

In [6]:
%matplotlib inline
import os
import re
import pyproj
import shapely
import rasterio
import threading
import numpy as np
import pandas as pd
import geopandas as gpd
from collections import namedtuple
from src.decorators import benchmark
from src.structures import DefaultOrderedDict

In [3]:
# Map every directory below 'data' (including 'data' itself) to its path,
# keyed by the directory's own name. Directories sharing a name overwrite
# each other; the one visited last by os.walk wins.
cache_folders = {
    os.path.basename(root): root
    for root, _subdirs, _files in os.walk('data')
}

# Expose the folders as attributes, e.g. dirs.gfc or dirs.masks.
Directories = namedtuple('Directories', list(cache_folders))
dirs = Directories(**cache_folders)

print(dirs)

Directories(gfc='data/core/gfc', data='data', core='data/core', gc='data/core/gc', gl_10='data/core/gl30/gl_10', auxiliary='data/auxiliary', gl30='data/core/gl30', gl_00='data/core/gl30/gl_00', masks='data/auxiliary/masks')


# GFC mask

In [4]:
# The sorted listing is cut into three consecutive runs of equal length,
# named gain, lossyear and treecover (assumes the file names sort into
# exactly these three groups — TODO confirm against the GFC naming scheme).
gfc_files = sorted(os.listdir(dirs.gfc))
sub = len(gfc_files) // 3
gain, lossyear, treecover = (
    gfc_files[part * sub:(part + 1) * sub] for part in range(3)
)
# One (gain, lossyear, treecover) triple per position in the three runs.
gfc_files = list(zip(gain, lossyear, treecover))

In [8]:
def get_raster_meta(path: str) -> tuple:
    """
    Reads the bounds and the coordinate reference system of a raster file.

    :param path: str
        Path of the raster, opened read-only with rasterio.
    :return: namedtuple
        A 'Meta' tuple with the attributes bounds and crs as reported by
        the opened dataset.
    """
    with rasterio.open(path, 'r') as dataset:
        extent, reference = dataset.bounds, dataset.crs

    meta_type = namedtuple('Meta', ['bounds', 'crs'])
    return meta_type(bounds=extent, crs=reference)


def bounds_to_polygon(bounds: tuple) -> shapely.geometry.Polygon:
    """
    Converts a bounding box into a rectangular polygon.

    :param bounds: object with the attributes left, bottom, right and top
    :return: shapely.geometry.Polygon
        Built from the corners in the order top-left, bottom-left,
        bottom-right, top-right.
    """
    corners = [
        (bounds.left, bounds.top),
        (bounds.left, bounds.bottom),
        (bounds.right, bounds.bottom),
        (bounds.right, bounds.top),
    ]
    return shapely.geometry.Polygon(corners)


def reproject_bounds(bounds: tuple, source_crs: dict, target_crs: dict) -> tuple:
    """
    Transforms a bounding box from one coordinate reference system to another.

    Only the (left, bottom) and the (right, top) corner are transformed; the
    result is assembled from these two points.

    :param bounds: object with the attributes left, bottom, right and top
    :param source_crs: dict
        Keyword arguments for pyproj.Proj describing the system of bounds.
    :param target_crs: dict
        Keyword arguments for pyproj.Proj describing the requested system.
    :return: namedtuple
        A 'BoundingBox' tuple (left, bottom, right, top) in the target system.
    """
    origin = pyproj.Proj(**source_crs)
    destination = pyproj.Proj(**target_crs)

    west, south = pyproj.transform(origin, destination, bounds.left, bounds.bottom)
    east, north = pyproj.transform(origin, destination, bounds.right, bounds.top)

    box_type = namedtuple('BoundingBox', ['left', 'bottom', 'right', 'top'])
    return box_type(left=west, bottom=south, right=east, top=north)


def make_properties_table(raster_files) -> pd.DataFrame:
    """
    Arranges raster file names in a table with one row per item of raster_files.

    :param raster_files: iterable of str OR of tuples/lists
        A plain string is stored in the column 'prop1'. The elements of a
        tuple or list are spread over the columns 'prop0', 'prop1', ...
        according to their position.
    :return: pandas.DataFrame
        Row i describes item i of raster_files. Columns appear in the order
        in which they are first encountered. Cells for which an item supplies
        no value (items of different length, strings mixed with tuples) are
        NaN.
    """
    column_name = 'prop'
    columns = []
    rows = []

    for item in raster_files:
        if isinstance(item, str):
            cells = [(column_name + '1', item)]
        else:
            cells = [
                (column_name + str(idx), value)
                for idx, value in enumerate(item)
            ]

        # Remember the columns in order of first appearance.
        for key, _ in cells:
            if key not in columns:
                columns.append(key)

        # Building the table row by row keeps every value on the row of its
        # item; filling the columns independently shifts values upwards as
        # soon as one item lacks a column.
        rows.append(dict(cells))

    return pd.DataFrame(rows, columns=columns)


def tile_index(path_to: str, raster_files: list, target_crs: dict) -> gpd.GeoDataFrame:
    """
    Builds a vector layer holding the footprint of every raster in raster_files.

    :param path_to: str
        Directory the raster file names are joined to.
    :param raster_files: list
        File names, or tuples/lists of file names. For a tuple or list only
        the first element is opened to derive the footprint.
    :param target_crs: dict
        Coordinate reference system of the layer. Bounds of rasters whose crs
        compares unequal to it are reprojected.
    :return: geopandas.GeoDataFrame
        One footprint polygon per item, joined with the table produced by
        make_properties_table.
    """
    # REFACTOR put raster_files to args and properties should be provided as a extra list
    footprints = []

    for entry in raster_files:
        file_name = entry if isinstance(entry, str) else entry[0]
        extent, raster_crs = get_raster_meta(os.path.join(path_to, file_name))

        if raster_crs != target_crs:
            extent = reproject_bounds(extent, raster_crs, target_crs)

        footprints.append(bounds_to_polygon(extent))

    layer = gpd.GeoDataFrame(
        make_properties_table(raster_files),
        geometry=gpd.GeoSeries(footprints),
    )
    layer.crs = target_crs
    return layer


# Footprints of the GFC tiles in geographic coordinates, stored as a shapefile
# in the masks folder.
hansen_mask = tile_index(
    path_to=dirs.gfc,
    raster_files=gfc_files,
    target_crs={'init': 'epsg:4326'},
)
hansen_mask.to_file(os.path.join(dirs.masks, 'gfc_mask.shp'))

# GL30 mask

In [5]:
# Alphabetically ordered content of the GL30 folder.
gl30_files = os.listdir(dirs.gl30)
gl30_files.sort()

In [6]:
# Footprints of the GL30 tiles in geographic coordinates. The shapefile is
# named after its dataset and stored in the masks folder next to the GFC
# mask, so it cannot be mistaken for 'gfc_mask.shp'.
layer = tile_index(dirs.gl30, gl30_files, {'init': 'epsg:4326'})
layer.to_file(os.path.join(dirs.masks, 'gl30_mask.shp'))

Edge tiles have coordinate system issues: their x (longitude) coordinates exceed the bounding box of the coordinate system they are assigned to.

In [50]:
# Reproject the extents of two GL30 tiles to geographic coordinates and print
# every intermediate result for inspection.
BoundingBox = namedtuple('BoundingBox', ['left', 'bottom', 'right', 'top'])
meta = get_raster_meta(os.path.join(dirs.gl30, 'n01_00_2010lc030.tif'))
meta2 = get_raster_meta(os.path.join(dirs.gl30, 'n02_15_2010lc030.tif'))
# Extent of the first tile with a hand-set left edge (presumably moved inward
# so that the corner can be reprojected — TODO confirm how it was derived).
tmp = BoundingBox(
    left=203394.629525,
    bottom=meta.bounds.bottom,
    right=meta.bounds.right,
    top=meta.bounds.top,
)
bounds = reproject_bounds(tmp, meta.crs, {'init': 'epsg:4326'})
bounds2 = reproject_bounds(meta2.bounds, meta2.crs, {'init': 'epsg:4326'})
print(meta, tmp, bounds, meta2, bounds2, sep='\n')

Meta(bounds=BoundingBox(left=165406.4430837062, bottom=-601.0131174263079, right=834586.4430837199, top=554038.9868832752), crs=CRS({'init': 'epsg:32601'}))
BoundingBox(left=203394.629525, bottom=-601.0131174263079, right=834586.4430837199, top=554038.9868832752)
BoundingBox(left=-179.6645501420702, bottom=-0.005431635641659461, right=-173.9831051386183, top=5.005478418984218)
Meta(bounds=BoundingBox(left=176734.03821238442, bottom=1657719.0263254964, right=823264.038233795, top=2214909.026326404), crs=CRS({'init': 'epsg:32602'}))
BoundingBox(left=-174.0053601744084, bottom=14.974692376549765, right=-167.91075246245728, top=20.00401663536249)


# Spatial harmonization
Workflow
- initial
    - select forest (class value 20) from dataset gl30 - 2000
    - recode values to binary format 20 = 1, 0 = 0
    - select forest (class value 0 - 100) from hansen tree cover 2000
    - recode values to binary format 1 - 100 = 1, 0 = 0
    - calculate Jaccard Index with chen and hansen
- looping
    - select forest (0 + 10) - 100 from hansen tree cover 2000
    - recode values to binary format (0 + 10) - 100 = 1, 0 = 0
    - calculate Jaccard Index with chen and hansen
    - repeat until 30 is reached or the Jaccard Index is at its maximum
Potential Images
- world agreement map with different 
    - compare chen and hansen tree cover in one image
    - sum of both dataset
    - 2 = agreement, 1 = disagreement

In [1]:
# TODO should it work inplace? Benchmark memory occupation!
def binary_jaccard(arr1, arr2, return_matrix=False):
    """
    Calculates the Jaccard Index (JI) of two equal sized binary arrays or vectors.
    If return_matrix is set to true the method provides the JI and the necessary
    calculation matrix as a named tuple.

    :param arr1, arr2: numpy.ndarray, list, tuple
        Both array alike objects sized in equal dimensions should contain exclusively
        binary data (1,0).
    :param return_matrix: boolean
        Optional, a boolean value determining the return of the calculation matrix.
    :return: float OR (float, namedtuple)
        Defaultly, the method returns only the JI if, return_matrix is set to true the
        method returns the JI and the computation matrix.
        The JI is nan if neither array contains a 1 (0 / 0).
        The Matrix contains the following attributes:
        m11 = total number of attributes where arr1 == 1 and arr2 == 1
        m10 = total number of attributes where arr1 == 1 and arr2 == 0
        m01 = total number of attributes where arr1 == 0 and arr2 == 1
        m00 = not required, set to 0
    :raises ValueError:
        If the arrays differ in shape or contain values other than 0 and 1.
    """
    first, second = np.asarray(arr1), np.asarray(arr2)

    # Refuse unequal shapes instead of letting numpy broadcast them silently.
    if first.shape != second.shape:
        raise ValueError('Attributes should have equal dimensions')

    # Check the values as they were passed in. Casting to a small integer
    # type first would wrap 256 to 0 or truncate 0.5 to 0, so that non-binary
    # data passes the check.
    for values in (first, second):
        if not np.all(np.logical_or(values == 0, values == 1)):
            raise ValueError('Attributes should contain only binary values')

    A, B = first.astype(bool), second.astype(bool)

    # Total number of attributes where A == 1 and B == 1
    m11 = np.sum(A & B)
    # Total number of attributes where A == 1 and B == 0
    m10 = np.sum(A & ~B)
    # Total number of attributes where A == 0 and B == 1
    m01 = np.sum(~A & B)

    # Without any 1 in both arrays the index is 0 / 0; return nan without
    # emitting a numpy runtime warning.
    with np.errstate(divide='ignore', invalid='ignore'):
        jaccard = m11 / (m10 + m01 + m11)

    if return_matrix:
        Matrix = namedtuple('Matrix', 'm11 m10 m01 m00')
        return jaccard, Matrix(m11, m10, m01, 0)
    return jaccard

# TODO should it work inplace? Benchmark memory occupation! 
def simple_matching_coefficient(arr1, arr2, return_matrix=False):
    """
    Calculates the Simple Matching Coefficient (SMC) of two equal sized binary
    arrays or vectors, i.e. the share of attributes on which both agree.

    :param arr1, arr2: numpy.ndarray, list, tuple
        Array alike objects of equal dimensions that contain exclusively
        binary data (1,0).
    :param return_matrix: boolean
        Optional, if true the calculation matrix is returned along with the SMC.
    :return: float OR (float, namedtuple)
        The SMC alone by default; the SMC and the calculation matrix if
        return_matrix is set to true.
        The Matrix contains the following attributes:
        m11 = total number of attributes where arr1 == 1 and arr2 == 1
        m10 = total number of attributes where arr1 == 1 and arr2 == 0
        m01 = total number of attributes where arr1 == 0 and arr2 == 1
        m00 = total number of attributes where arr1 == 0 and arr2 == 0
    """
    # binary_jaccard delivers m11, m10 and m01; its m00 entry is a placeholder.
    matrix = binary_jaccard(arr1, arr2, True)[1]
    n_attributes = np.array(arr1, dtype=np.int8).size

    # Whatever is not counted in the matrix is 0 in both arrays.
    both_absent = n_attributes - sum(matrix)

    smc = (matrix.m11 + both_absent) / n_attributes

    if not return_matrix:
        return smc
    return smc, matrix._replace(m00=both_absent)

In [9]:
# NOTE(review): `.cx[,]` is not valid Python. The coordinate indexer expects
# two slices, e.g. `.cx[xmin:xmax, ymin:ymax]`. The bounds meant for Latin
# America are not given in this notebook, so the statement is left unfinished.
latin_america = hansen_mask.cx[,]