# Preprocessing

## Preparation

In [53]:
%matplotlib inline
import os
import pyproj
import shapely
import rasterio
import numpy as np
import pandas as pd
import geopandas as gpd
import affine
from pathlib import Path
from src.utils import get_data_dir
from collections import namedtuple
from src.structures import DefaultOrderedDict
from rasterio import warp, merge

In [3]:
# Resolve the data folder relative to the working directory and collect its
# directories through get_data_dir; the bare name displays the result.
data_root = Path('data').resolve()
dirs = get_data_dir(str(data_root))
dirs

Directories(gl_10=PosixPath('/home/tobi/Documents/Master/code/python/Master/data/core/gl30/gl_10'), core=PosixPath('/home/tobi/Documents/Master/code/python/Master/data/core'), america2=PosixPath('/home/tobi/Documents/Master/code/python/Master/data/tmp/meeting_06_11_2017/america2'), africa2=PosixPath('/home/tobi/Documents/Master/code/python/Master/data/tmp/meeting_06_11_2017/africa2'), meeting_06_11_2017=PosixPath('/home/tobi/Documents/Master/code/python/Master/data/tmp/meeting_06_11_2017'), masks=PosixPath('/home/tobi/Documents/Master/code/python/Master/data/auxiliary/masks'), data=PosixPath('/home/tobi/Documents/Master/code/python/Master/data'), esvd=PosixPath('/home/tobi/Documents/Master/code/python/Master/data/core/esvd'), gl_00=PosixPath('/home/tobi/Documents/Master/code/python/Master/data/core/gl30/gl_00'), auxiliary=PosixPath('/home/tobi/Documents/Master/code/python/Master/data/auxiliary'), gl30=PosixPath('/home/tobi/Documents/Master/code/python/Master/data/core/gl30'), tmp=Posix

## GFC mask

In [16]:
# Gather every *.tif file of the GFC directory in ascending path order.
gfc_files = list(dirs.gfc.glob('*.tif'))
gfc_files.sort()

In [20]:
def fetch_metadata(path: str)-> namedtuple:
    """
    Reads the bounding box and the coordinate reference system of a raster file.

    :param path: str
        Path of a raster dataset that rasterio is able to open.
    :return: namedtuple
        A Meta tuple holding the fields bounds and crs, in that order.
    """
    # The dataset is only needed while both properties are read.
    with rasterio.open(path, 'r') as dataset:
        extent, reference = dataset.bounds, dataset.crs
    meta_type = namedtuple('Meta', 'bounds crs')
    return meta_type(extent, reference)


def bounds_to_polygon(bounds: namedtuple) -> shapely.geometry.Polygon:
    """
    Builds a rectangular polygon from a bounding box.

    :param bounds: namedtuple
        Any object exposing the attributes left, bottom, right and top.
    :return: shapely.geometry.Polygon
        Rectangle whose ring runs top-left, bottom-left, bottom-right, top-right.
    """
    # Attribute names of each corner as (x, y), listed in ring order.
    corners = (
        ('left', 'top'),
        ('left', 'bottom'),
        ('right', 'bottom'),
        ('right', 'top'),
    )
    # getattr is the public way to look up an attribute by name; calling the
    # __getattribute__ dunder directly bypasses the normal lookup protocol.
    ring = [(getattr(bounds, x), getattr(bounds, y)) for x, y in corners]
    return shapely.geometry.Polygon(ring)


def reproject_bounds(bounds: namedtuple, source_crs: dict, target_crs: dict) -> namedtuple:
    """
    Projects a bounding box into another coordinate reference system.

    All four corners are projected and the axis-aligned envelope of the results
    is returned. Projecting only two opposite corners can miss parts of the
    footprint, because a rectangle is in general rotated or skewed by the
    projection.

    :param bounds: namedtuple
        Object exposing the attributes left, bottom, right and top.
    :param source_crs: dict
        Keyword arguments describing the CRS of bounds, passed to pyproj.Proj.
    :param target_crs: dict
        Keyword arguments describing the desired CRS, passed to pyproj.Proj.
    :return: namedtuple
        BoundingBox(left, bottom, right, top) expressed in the target CRS.
    """
    BoundingBox = namedtuple('BoundingBox', 'left bottom right top')
    p1 = pyproj.Proj(**source_crs)
    p2 = pyproj.Proj(**target_crs)

    corners = (
        (bounds.left, bounds.bottom),
        (bounds.left, bounds.top),
        (bounds.right, bounds.top),
        (bounds.right, bounds.bottom),
    )
    # Each corner is projected with a scalar call to pyproj.transform.
    projected = [pyproj.transform(p1, p2, x, y) for x, y in corners]
    xs = [x for x, _ in projected]
    ys = [y for _, y in projected]

    return BoundingBox(min(xs), min(ys), max(xs), max(ys))


def geoseries(rasters: list, target_crs: dict) -> gpd.GeoSeries:
    """
    Creates a GeoSeries holding one footprint polygon per raster file.

    :param rasters: list
        Paths of the raster files, each handed to fetch_metadata.
    :param target_crs: dict
        CRS assigned to the resulting series; bounds of rasters whose CRS
        differs from it are reprojected first.
    :return: geopandas.GeoSeries
        Bounding box polygons in the order of rasters.
    """
    def footprint(raster):
        # Bounding box polygon of a single raster, expressed in target_crs.
        extent, crs = fetch_metadata(raster)
        if crs != target_crs:
            extent = reproject_bounds(extent, crs, target_crs)
        return bounds_to_polygon(extent)

    series = gpd.GeoSeries([footprint(raster) for raster in rasters])
    series.crs = target_crs
    return series


def attributes(**kwargs):
    """
    Placeholder without an implementation so far.

    :param kwargs: arbitrary keyword arguments, currently ignored
    :return: None
    """
    return None
    
# Build the footprint mask from the first third of the GFC files only.
rasters = [str(gfc_file) for gfc_file in gfc_files]
subset_size = int(len(rasters) / 3)
hansen_mask = geoseries(rasters[:subset_size], {'init': 'epsg:4326'})
hansen_mask
#hansen_mask.to_file(os.path.join(dirs.masks, 'gfc_mask.shp'))

0      POLYGON ((-0.0001388888888982365 0.00013888888...
1      POLYGON ((9.999861111111102 0.0001388888888840...
2      POLYGON ((-10.0001388888889 0.0001388888888840...
3      POLYGON ((19.9998611111111 0.00013888888888402...
4      POLYGON ((-20.0001388888889 0.0001388888888840...
5      POLYGON ((29.9998611111111 0.00013888888888402...
6      POLYGON ((-30.0001388888889 0.0001388888888840...
7      POLYGON ((39.9998611111111 0.00013888888888402...
8      POLYGON ((-40.0001388888889 0.0001388888888840...
9      POLYGON ((49.9998611111111 0.00013888888888402...
10     POLYGON ((-50.0001388888889 0.0001388888888840...
11     POLYGON ((59.9998611111111 0.00013888888888402...
12     POLYGON ((-60.0001388888889 0.0001388888888840...
13     POLYGON ((69.9998611111111 0.00013888888888402...
14     POLYGON ((-70.0001388888889 0.0001388888888840...
15     POLYGON ((79.9998611111111 0.00013888888888402...
16     POLYGON ((-80.0001388888889 0.0001388888888840...
17     POLYGON ((89.99986111111

## GL30 mask

- edge tiles of gl30 have coordinate system issues: their x coordinates overflow the boundaries of the applied coordinate system
- result: in WGS84 the polygon of such a tile covers the entire globe

## Raster alignment
- store the resulting files in a folder named `processed`
- reproject all files to wgs84 epsg4326 for convenience (entire gl30 dataset must be reprojected)
- intersect gl30 mask with gfc mask
- find gfc datasets covering a gl30 tile 
- merge them and crop them to the extent of gl30 tile

In [71]:
# Print the transform stored in one GL30 tile next to the default transform
# (plus width and height) rasterio computes for a reprojection to EPSG:4326.
gl30 = dirs.gl_10.joinpath('s24_00_2010lc030.tif')
with rasterio.open(str(gl30), 'r') as src:
    aff = rasterio.warp.calculate_default_transform(
        src.crs,
        {'init': 'epsg:4326'},
        src.width,
        src.height,
        *src.bounds
    )
    for item in (src.transform, aff):
        print(item)

| 30.00, 0.00, 165406.44|
| 0.00,-30.00, 10000615.00|
| 0.00, 0.00, 1.00|
(Affine(0.00027041676765106055, 0.0, -42.01695836575989,
       0.0, -0.00027041676765106055, 0.00556409792285255), 22313, 18556)


## Spatial harmonization
Workflow
- consider to use additional classes from gl30 wetlands or tundra
- initial
    - select forest (class value 20) from dataset gl30 - 2000
    - recode values to binary format 20 = 1, 0 = 0
    - select forest (class value 0 - 100) from hansen tree cover 2000
    - recode values to binary format 1 - 100 = 1, 0 = 0
    - calculate Jaccard Index with chen and hansen
- looping
    - select forest (0 + 10) - 100 from hansen tree cover 2000
    - recode values to binary format (0 + 10) - 100 = 1, 0 = 0
    - calculate Jaccard Index with chen and hansen
    - repeat until the threshold reaches 30 or the Jaccard Index reaches its maximum
Potential Images
- world agreement map with different 
    - compare chen and hansen tree cover in one image
    - sum of both dataset
    - 2 = agreement, 1 = disagreement

In [14]:
# TODO should it work inplace? Benchmark memory occupation!
def binary_jaccard(arr1, arr2, return_matrix=False):
    """
    Calculates the Jaccard Index (JI) of two equal sized binary arrays or vectors.
    If return_matrix is set to true the method provides the JI and the necessary
    calculation matrix as a named tuple.

    :param arr1, arr2: numpy.ndarray, list, tuple
        Both array alike objects sized in equal dimensions should contain exclusively
        binary data (1,0).
    :param return_matrix: boolean
        Optional, a boolean value determining the return of the calculation matrix.
    :return: float OR (float, namedtuple)
        By default the method returns only the JI; if return_matrix is set to true
        the method returns the JI and the computation matrix. The JI is nan if
        neither input contains a 1, because the index is undefined in that case.
        The Matrix contains the following attributes:
        m11 = total number of attributes where arr1 == 1 and arr2 == 1
        m10 = total number of attributes where arr1 == 1 and arr2 == 0
        m01 = total number of attributes where arr1 == 0 and arr2 == 1
        m00 = not required, set to 0
    :raises ValueError:
        If the inputs differ in shape or contain values other than 0 and 1.
    """
    first, second = np.asarray(arr1), np.asarray(arr2)

    # Without this check differently shaped inputs would be broadcast silently.
    if first.shape != second.shape:
        raise ValueError('Attributes should be sized in equal dimensions')

    # Validate the raw values. Casting to a small integer type beforehand would
    # let values such as 256 or 0.5 wrap or truncate to 0 and pass unnoticed.
    A, B = first == 1, second == 1
    valid_first = np.all(np.logical_or(A, first == 0))
    valid_second = np.all(np.logical_or(B, second == 0))
    if not (valid_first and valid_second):
        raise ValueError('Attributes should contain only binary values')

    # Total number of attributes where A == 1 and B == 1
    m11 = int(np.count_nonzero(np.logical_and(A, B)))
    # Total number of attributes where A == 1 and B == 0
    m10 = int(np.count_nonzero(np.logical_and(A, np.logical_not(B))))
    # Total number of attributes where A == 0 and B == 1
    m01 = int(np.count_nonzero(np.logical_and(np.logical_not(A), B)))

    union = m11 + m10 + m01
    jaccard = m11 / union if union else float('nan')

    if return_matrix:
        Matrix = namedtuple('Matrix', 'm11 m10 m01 m00')
        return jaccard, Matrix(m11, m10, m01, 0)
    return jaccard

# TODO should it work inplace? Benchmark memory occupation! 
def simple_matching_coefficient(arr1, arr2, return_matrix=False):
    """
    Calculates the Simple Matching Coefficient (SMC) of two equal sized arrays or vectors.
    If return_matrix is set to true the method provides the SMC and the necessary calculation
    matrix as a named tuple.

    :param arr1, arr2: numpy.ndarray, list, tuple
        Both array alike objects sized in equal dimensions should contain exclusively
        binary data (1,0).
    :param return_matrix: boolean
        Optional, a boolean value determining the return of the calculation matrix.
    :return: float OR (float, namedtuple)
        By default the method returns only the SMC; if return_matrix is
        set to true the method returns the SMC and the computation matrix.
        The SMC is nan for empty input.
        The Matrix contains the following attributes:
        m11 = total number of attributes where arr1 == 1 and arr2 == 1
        m10 = total number of attributes where arr1 == 1 and arr2 == 0
        m01 = total number of attributes where arr1 == 0 and arr2 == 1
        m00 = total number of attributes where arr1 == 0 and arr2 == 0
    """
    _, matrix = binary_jaccard(arr1, arr2, True)

    # Only the element count is required, so arr1 is not copied or cast again.
    total = np.asarray(arr1).size

    # Total number of attributes where A == 0 and B == 0
    m00 = total - (matrix.m11 + matrix.m10 + matrix.m01)

    smc = (matrix.m11 + m00) / total if total else float('nan')

    if return_matrix:
        matrix = matrix._replace(m00=m00)
        return smc, matrix
    return smc

In [21]:
# find tile with the most deforested pixels
# NOTE(review): prop1 / prop2 are read from the mask below, while the mask built
# earlier in this notebook is a plain GeoSeries -- confirm where they come from.
latin_america = hansen_mask.cx[-120:-40,:]
africa = hansen_mask.cx[-30:50,:]
asia = hansen_mask.cx[60:180,:]

maxele, relevant = 0, ''
for loss, cover in zip(latin_america.prop1, latin_america.prop2):
    with rasterio.open(os.path.join(dirs.gfc, loss), 'r') as src:
        band = src.read(1)
    # number of pixels whose value lies between 1 and 10
    loss_pixels = np.sum((band > 0) & (band < 11))
    if loss_pixels <= maxele:
        continue
    # new maximum: remember the file pair and report it
    maxele, relevant = loss_pixels, (loss, cover)
    print(relevant)
    print(loss_pixels)

('Hansen_GFC2013_lossyear_00N_040W.tif', 'Hansen_GFC2013_treecover2000_00N_040W.tif')
6459322
('Hansen_GFC2013_lossyear_00N_050W.tif', 'Hansen_GFC2013_treecover2000_00N_050W.tif')
58532941
('Hansen_GFC2013_lossyear_00N_060W.tif', 'Hansen_GFC2013_treecover2000_00N_060W.tif')
66095218
('Hansen_GFC2013_lossyear_10S_060W.tif', 'Hansen_GFC2013_treecover2000_10S_060W.tif')
76557728


Asia = ('Hansen_GFC2013_lossyear_10N_100E.tif', 'Hansen_GFC2013_treecover2000_10N_100E.tif') 41781778, N48_0<br>
Africa = ('Hansen_GFC2013_lossyear_00N_020E.tif', 'Hansen_GFC2013_treecover2000_00N_020E.tif') 26827468, S35_5, S35_0<br>
Latin america = ('Hansen_GFC2013_lossyear_10S_060W.tif', 'Hansen_GFC2013_treecover2000_10S_060W.tif') 76557728, S21_15, S21_10

In [19]:
# agreement map
def agreement_map(arr1, arr2):
    """
    Combines two inputs with the + operator.

    :param arr1, arr2: operands supporting +, e.g. two equally shaped arrays
    :return: the result of arr1 + arr2
    """
    combined = arr1 + arr2
    return combined

# Compare every binary GL30 (chen) raster with its binary GFC counterparts,
# print both similarity measures and write the summed agreement raster.
chen_files = 'gl30_america_1_binary.tif gl30_america_2_binary.tif gl30_africa_1_binary.tif gl30_africa_2_binary.tif'.split()
gfc_ids = ['all', '10', '20']
for chen_handle in zip([dirs.america1, dirs.america2, dirs.africa1, dirs.africa2], chen_files):
    path, chen_name = chen_handle
    # The directory entries may be pathlib paths; '/'.join() accepts strings
    # only, so the file path is built from an explicit string instead.
    path = str(path)
    # 'gl30_<region>_<tile>_binary.tif' -> region and tile number
    region, tile = chen_name.split('_')[1:3]
    with rasterio.open(os.path.join(path, chen_name), 'r') as chen:
        chen_data = chen.read(1)
    for gfc_id in gfc_ids:
        gfc_name = 'gfc_%s_%s_binary_%s.tif' % (region, tile, gfc_id)
        with rasterio.open(os.path.join(path, gfc_name), 'r') as gfc:
            gfc_data = gfc.read(1)
            ji = binary_jaccard(chen_data, gfc_data)
            smc = simple_matching_coefficient(chen_data, gfc_data)
            agree_name = 'agree_%s_%s_%s.tif' % (region, tile, gfc_id)
            print('{} <-> {}: JI = {:.3f}, SMC = {:.3f}'.format(chen_name, gfc_name, ji, smc))
            agree_data = agreement_map(gfc_data, chen_data)
            # The output raster reuses the profile of the GFC input.
            with rasterio.open(os.path.join(path, agree_name), 'w', **gfc.profile) as dst:
                dst.write(agree_data, 1)

gl30_america_1_binary.tif <-> gfc_america_1_binary_all.tif: JI = 0.678, SMC = 0.772
gl30_america_1_binary.tif <-> gfc_america_1_binary_10.tif: JI = 0.690, SMC = 0.784
gl30_america_1_binary.tif <-> gfc_america_1_binary_20.tif: JI = 0.701, SMC = 0.795
gl30_america_2_binary.tif <-> gfc_america_2_binary_all.tif: JI = 0.387, SMC = 0.632
gl30_america_2_binary.tif <-> gfc_america_2_binary_10.tif: JI = 0.402, SMC = 0.656
gl30_america_2_binary.tif <-> gfc_america_2_binary_20.tif: JI = 0.417, SMC = 0.677
gl30_africa_1_binary.tif <-> gfc_africa_1_binary_all.tif: JI = 0.737, SMC = 0.748
gl30_africa_1_binary.tif <-> gfc_africa_1_binary_10.tif: JI = 0.744, SMC = 0.757
gl30_africa_1_binary.tif <-> gfc_africa_1_binary_20.tif: JI = 0.797, SMC = 0.821
gl30_africa_2_binary.tif <-> gfc_africa_2_binary_all.tif: JI = 0.426, SMC = 0.507
gl30_africa_2_binary.tif <-> gfc_africa_2_binary_10.tif: JI = 0.427, SMC = 0.510
gl30_africa_2_binary.tif <-> gfc_africa_2_binary_20.tif: JI = 0.461, SMC = 0.581


![America](img/america.png)
![Africa](img/africa.png)