##### Prerequisites

##### Abstract

# Introduction

# Data and methods

In [1]:
import os
import re
import queue
import zipfile
import logging
import threading
import multiprocessing

import numpy as np
import pandas as pd
import geopandas as gpd

from tropicly import *
from tropicly.utils import *
from pathlib import Path
# from bokeh.transform import jitter
# from bokeh.layouts import gridplot
from collections import defaultdict
from IPython.display import clear_output
# from bokeh.models import HoverTool, FactorRange
# from bokeh.plotting import output_notebook, show, figure, ColumnDataSource


directories = """
data
data.log
data.ana
data.driv
data.proc
data.core
data.core.ifl
data.core.gfc
data.core.soil
data.core.gl30
data.core.esvd
data.core.biomass
data.auxiliary
data.auxiliary.masks
"""

for item in directories.split():
    path = os.sep.join(item.split('.'))
    try:
        os.mkdir(path)
    except OSError:
        pass

# Convenient access to data directory, is a namedtuple with folder names as attributes
DIRS = get_data_dir(str(Path('data').resolve()))
    
# init Logging
formater = logging.Formatter('%(asctime)s %(levelname)s: %(message)s', '%d/%m/%y %H:%M:%S')
handler = logging.FileHandler(str(DIRS.log / 'utils.log'), 'a+')
handler.setLevel(logging.DEBUG)
handler.setFormatter(formater)

# LOGGER.setLevel(logging.DEBUG)
# LOGGER.addHandler(handler)

# Many functions of the processing pipeline are multi-threaded this attribute controls
# max number of threads
THREADLIMIT = 12

SPOOF_AGENT = {'headers': {'User-Agent': "Mozilla/5.0 (X11; U; Linux i686) Gecko/20071127 Firefox/2.0.0.11"}}

WGS84 = {'init': 'epsg:4326'}

# force bokeh plot output to jupyter notebook
# output_notebook()

def callback(msg, ratio):
    clear_output()
    print(msg.format(ratio))

## Data

### Core data

#### Global Forest Change
[**Global Forest Change 2000-2012 Version 1.0**](https://earthenginepartners.appspot.com/science-2013-global-forest/download_v1.0.html) (GFC) is the first high resolution dataset that provides a comprehensive view on the annual global forest cover change between 2000 and 2012 \cite{Hansen2013, Li2017}. The initial GFC dataset released by Hansen et al. is extended by recent releases which encompass the annual forest cover changes between [2000-2013 (Version 1.1)](https://earthenginepartners.appspot.com/science-2013-global-forest/download_v1.1.html), [2000-2014 (Version 1.2)](https://earthenginepartners.appspot.com/science-2013-global-forest/download_v1.2.html), [2000-2015 (Version 1.3)](https://earthenginepartners.appspot.com/science-2013-global-forest/download_v1.3.html) and [2000-2016 (Version 1.4)](https://earthenginepartners.appspot.com/science-2013-global-forest/download_v1.4.html) respectively. All versions of this dataset has in common, that they are derived from growing season imagery captured by the remote sensing satellite Landsat 7 Enhanced Thematic Mapper Plus (ETM+) at a spatial resolution of 30 meters per pixel \cite{Hansen2013a}. On the satellite imagery a time-series spectral metrics analysis is applied to gather the global forest extent at 2000 as well as the annual forest loss and gain. Hence, GFC comprises three independent data layers  tree cover, annually forest loss and  forest gain divided into 10x10 degree tiles by the geodetic coordinate system *World Geodetic System 1984* (EPSG:4326). Furthermore, across the provided layers the pixel data is coded in unsigned 8 bit integers. Hansen et al. defined trees as all vegetation taller than 5 meters for their study. Forest loss is defined as a stand displacement disturbance leading from a forest state to a non forest-state. To compute this losses 

[Global Forest Watch](http://www.globalforestwatch.org/) interactive map

- Flow general what is gfc then detailed info monitoring method, details of the different layers, how certain is the info
- trees defined as all vegetation higher than 5 meters Hansen2013, Hansen2013a
- forest loss defined as a stand displacement disturbance (> x% crown cover to 0% crown cover)  Hansen2013, Hansen2013a
- monitored by a reference percent tree cover stratum Hansen2013, Hansen2013a
- forest degeneration for example selective removals btw. all impacts on forest which are not lead to a non forest state are not considered Hansen2013a
- term forest refer to tree cover Hansen2013a
- gain is the inverse of loss e.g. the change of a non forest state to forest (crown cover densities >50%)
- Forest loss detection is less uncertain then gain detection (loss is more reliable) Li2017
- Gain is a more gradual and ecological complex process, signal is more difficult to detect Li2017
- Li2017 compares 4 different forest cover change products on their performance to estimate loss and gain patterns in china
- at the end show a example picture of the data


\cite{Hansen2013}
\cite{Hansen2013a}
\cite{Tropek2014}
\cite{Bellot2014}
\cite{Li2017}
\cite{Li2017a}

![Hansen preview](img/hansen_preview.png)

In [None]:
# data source URL
head = 'http://commondatastorage.googleapis.com/earthenginepartners-hansen/GFC2013/'
# files to download from source url
tails = 'treecover2000.txt gain.txt lossyear.txt'.split()

data_urls = []
for tail in tails:
    content = download(head + tail)
    data_urls += content.decode('utf-8').splitlines()

threads = []
for url in data_urls:
    lat_lon = re.search(r'(\d{2}\w_\d{3}\w)(?=\.tif)', url).groups()[0]
    lat = orientation_to_int(lat_lon.split('_')[0])
    if -20 <= lat <= 30:
        path = str(DIRS.gfc / url.split('/')[-1])
        threads.append(threading.Thread(target=download_worker, args=(url, path)))

execute_concurrent(threads, THREADLIMIT, 'Downloaded {} of 100 %', callback=callback)

#### GlobalLand30
[GlobLand30](http://www.globallandcover.com/GLC30Download/index.aspx) (GL30)

![Chen preview](img/chen_preview.png)

#### Aboveground live woody biomass density
[Aboveground Live Woody Biomass Density](http://data.globalforestwatch.org/datasets/8f93a6f94a414f9588ce4657a39c59ff_1) (LWBD)

In [None]:
url = 'http://data.globalforestwatch.org/datasets/8f93a6f94a414f9588ce4657a39c59ff_1.geojson'
path = str(DIRS.masks / 'biomass.geojson')

content = download(url)
write_binary(content, path)

biomass_mask = gpd.read_file(path)
to_download = list(biomass_mask.download) + list(biomass_mask.confidence) 

threads = []
for url in to_download:
    path = str(DIRS.biomass / url.split('/')[-1])
    threads.append(
        threading.Thread(target=download_worker, args=(url, path), kwargs=SPOOF_AGENT)
    )

execute_concurrent(threads, THREADLIMIT, msg='Donwloaded {} % of 100 %', callback=callback)

#### Global soil organic carbon map
[The Global Soil Organic Carbon Map](http://www.fao.org/world-soil-day/global-soil-organic-carbon-map/en/) (GSOCmap)

In [None]:
# Download broken fix it please
url = 'https://unfao-my.sharepoint.com/personal/guillermo_olmedo_fao_org/_layouts/15/guestaccess.aspx?docid=059b0b724d08a42e4931c35cff99a15c1&authkey=AcRgiPkRQvm_kuJb-K0-e2o&e=e70096969c4e4ce084d2ee1d1ca8bc44'

content = download(url, **SPOOF_AGENT)
write_binary(content, str(DIRS.soil / 'GSOCmap.tif'))

#### Ecosystem service valuation database
[Ecosystem Service Valuation Database](https://www.es-partnership.org/services/data-knowledge-sharing/ecosystem-service-valuation-database/) (ESVD)

In [None]:
url = 'https://www.es-partnership.org/wp-content/uploads/2016/06/ESVD-TEEB-database.xls'

content = download(url, **SPOOF_AGENT)
write_binary(content, str(DIRS.esvd / url.split('/')[-1]))

#### Intact Forest Landscapes
[Intact Forest Landscapes](http://intactforests.org/index.html) (IFL2000)

In [None]:
url = 'http://intactforests.org/shp/IFL_2000.zip'

content = download(url, **SPOOF_AGENT)
write_binary(content, str(DIRS.ifl / url.split('/')[-1]))

zipfile.ZipFile(str(DIRS.ifl / url.split('/')[-1])).extractall(str(DIRS.ifl))
os.remove(str(DIRS.ifl / url.split('/')[-1]))

### Auxiliary data

[Natural Earth Data](http://www.naturalearthdata.com/)

In [None]:
# departement map needed
url = 'http://www.naturalearthdata.com/http//www.naturalearthdata.com/download/10m/cultural/ne_10m_populated_places.zip'

content = download(url, **SPOOF_AGENT)
write_binary(content, str(DIRS.masks / url.split('/')[-1]))

zipfile.ZipFile(str(DIRS.masks / url.split('/')[-1])).extractall(str(DIRS.masks))
os.remove(str(DIRS.masks / url.split('/')[-1]))

url = 'http://www.naturalearthdata.com/http//www.naturalearthdata.com/download/10m/cultural/ne_10m_admin_0_countries.zip'

content = download(url, **SPOOF_AGENT)
write_binary(content, str(DIRS.masks / url.split('/')[-1]))

zipfile.ZipFile(str(DIRS.masks / url.split('/')[-1])).extractall(str(DIRS.masks))
os.remove(str(DIRS.masks / url.split('/')[-1]))

## Methods

### Preprocessing

#### Masking

In [None]:
# GFC mask
gfc = sorted(DIRS.gfc.glob('*.tif'))

data_len = int(len(gfc)/3)

kwargs = {
    'gain': [i.name for i in gfc[:data_len]],
    'loss': [i.name for i in gfc[data_len:2*data_len]],
    'cover': [i.name for i in gfc[2*data_len:]],
}

gfc_mask = tile_index(gfc[:data_len], WGS84, **kwargs)
gfc_mask.to_file(str(DIRS.masks / 'gfc_mask.shp'))
gfc_mask.head()

In [2]:
# GL30 mask
gl30 = sorted(DIRS.gl30.glob('*.tif'), key=lambda key: (key.name[7:11], key.name[0:6]))

exclude = 'n01_00 s01_00 s01_10 s01_15 s01_20 s60_00 s60_05 s60_10 s60_15 n53_00'.split()
gl30 = [item for item in gl30 if item.name[0:6] not in exclude]
data_len = int(len(gl30)/2)

kwargs = {
    'gl30_00': [i.name for i in gl30[:data_len]],
    'gl30_10': [i.name for i in gl30[data_len:]],
    'key': [i.name[0:6] for i in gl30[:data_len]]
}

gl30_mask = tile_index(gl30[data_len:], WGS84, **kwargs)
gl30_mask.to_file(str(DIRS.masks / 'gl30_mask.shp'))
gl30_mask.head()

Unnamed: 0,gl30_00,gl30_10,key,geometry
0,n02_15_2000lc030.tif,n02_15_2010lc030.tif,n02_15,POLYGON ((-174.0053601744084 20.00401663536249...
1,n03_05_2000lc030.tif,n03_05_2010lc030.tif,n03_05,POLYGON ((-168.0054833302891 10.00519024901941...
2,n03_20_2000lc030.tif,n03_20_2010lc030.tif,n03_20,POLYGON ((-168.0051433812486 25.00312959291788...
3,n04_00_2000lc030.tif,n04_00_2010lc030.tif,n04_00,POLYGON ((-162.0055192236557 5.005478418984219...
4,n04_05_2000lc030.tif,n04_05_2010lc030.tif,n04_05,"POLYGON ((-162.0054833302891 10.0051902490194,..."


In [None]:
# Biomass mask
biomass = gpd.read_file(str(DIRS.masks / 'biomass.geojson'))
biomass_mask = biomass.drop(biomass.columns[[0, 1, 4, 5]], axis=1)
biomass_mask.rename(columns={'download': 'biomass'}, inplace=True)

for idx, row in biomass_mask.iterrows():
    biomass = row.biomass.split('/')[-1]
    confidence = row.confidence.split('/')[-1]
    
    biomass_mask.at[idx, 'biomass'] = biomass
    biomass_mask.at[idx, 'confidence'] = confidence

biomass_mask.to_file(str(DIRS.masks / 'biomass_mask.shp'))
biomass_mask.head()

#### Raster alignment

In [None]:
gl30_mask = gpd.read_file(str(DIRS.masks / 'gl30_mask.shp'))
gfc_mask = gpd.read_file(str(DIRS.masks / 'gfc_mask.shp'))
biomass_mask = gpd.read_file(str(DIRS.masks / 'biomass_mask.shp'))

intersect = gpd.overlay(gfc_mask, gl30_mask, how='intersection')
intersect = gpd.overlay(intersect, biomass_mask, how='intersection')

threads = []
for key, values in intersect.groupby(by='key', sort=False):
    to_reproject = [
        str(DIRS.gl30 / name)
        for name in list(*zip(set(values.gl30_10), set(values.gl30_00)))
    ]
    to_reproject.append(str(DIRS.soil / 'GSOCmapV1.1.tif'))
    to_merge = [
        [str(DIRS.gfc / name) for name in set(values.cover)],
        [str(DIRS.gfc / name) for name in set(values.loss)],
        [str(DIRS.gfc / name) for name in set(values.gain)],
        [str(DIRS.biomass / name) for name in set(values.biomass)],
        [str(DIRS.biomass / name) for name in set(values.confidence)],
    ]
    generic_name = '{}.tif'.format(key)
    
    threads.append(
        threading.Thread(target=alignment_worker,
                         args=(to_reproject, WGS84, to_merge, str(DIRS.proc), generic_name))
    )
    
execute_concurrent(threads, THREADLIMIT, msg='Aligned {} % of 100 %', callback=callback)

#### Cropping and masking

In [None]:
files = defaultdict(list)
regex = re.compile(r'.*(?P<key>(?:n|s)\d{2}_\d{2}).*', re.I)

for path in DIRS.proc.glob('*.tif'):
    match = regex.match(str(path))
    files[match.group('key')].append(path)
    files[match.group('key')] = sorted(files[match.group('key')])

threads = []
features = []
polygons = []
for key, values in files.items():   
    bounds, profile = fetch_metadata(values[0], 'bounds', 'profile')
    bounds = round_bounds(bounds)

    threads.append(
        threading.Thread(target=clip_worker, args=(values, bounds, profile, str(DIRS.proc),))
    )
    
    key = int_to_orient(bounds.left, bounds.top)
    feature = dict([dispatch_name('{0[0]}_{0[1]}'.format(item.name.split('_')), key, idx)
                    for idx, item in enumerate(values)])
    feature['key'] = key
    features.append(feature)
    polygons.append(polygon_from(bounds))
    
execute_concurrent(threads, THREADLIMIT, msg='Cropped {} % of 100 %', callback=callback)

geometry = gpd.GeoSeries(polygons)
df = pd.DataFrame(features)
layer = gpd.GeoDataFrame(df, geometry=geometry)
layer.crs = WGS84

layer.to_file(str(DIRS.masks / 'final_mask.shp'))

In [7]:
countries = gpd.read_file(str(DIRS.masks / 'ne_10m_admin_0_countries.shp'))
tiles = gpd.read_file(str(DIRS.masks / 'final_mask.shp'))
tiles.crs = WGS84

aoi = countries.cx[:,-23:23]
aoi = aoi[['REGION_UN', 'geometry']]
continents = aoi.dissolve(by='REGION_UN')

layer = gpd.sjoin(tiles, continents, how='left', op='intersects')
layer.columns = 'biomass confidence cover gain gl30_00 gl30_10 key loss soil geometry region'.split()
layer.to_file(str(DIRS.masks / 'final_region_mask.shp'))

#### Class harmonization

In [3]:
mask = gpd.read_file(str(DIRS.masks / 'final_region_mask.shp'))
records = []

threads = [
    threading.Thread(target=hworker,
                     args=(str(DIRS.proc/row.gl30_00), str(DIRS.proc/row.cover), 
                           records, row.key, row.region))
    for idx, row in mask.iterrows()
]

execute_concurrent(threads, THREADLIMIT, msg='Harmonization {} % of 100 %', callback=callback)

df = pd.DataFrame(records)
df.to_csv(str(DIRS.ana/'harmonization.csv'), index=False)

Harmonization 100.0 % of 100 %


### Processing

#### Deforestation drivers

In [3]:
mask = gpd.read_file(str(DIRS.masks/'final_region_mask.shp'))

processes = []
for idx, row in mask.iterrows():
    treecover = str(DIRS.proc/row.cover)
    landcover = str(DIRS.proc/row.gl30_10)
    loss = str(DIRS.proc/row.loss)
    gain = str(DIRS.proc/row.gain)
    
    name = 'driver_{}.tif'.format(row.key)
    path = str(DIRS.driv / name)
    
    process = multiprocessing.Process(target=cworker, args=(landcover, treecover, gain, loss, path))
    processes.append(process)
    
execute_concurrent(processes, THREADLIMIT, msg='Finished {} % of 100 %', callback=callback)

Finished 100.0 % of 100 %


In [9]:
# Update mask
mask = gpd.read_file(str(DIRS.masks/'final_region_mask.shp'))
regex = re.compile(r'.+(?P<key>\d{2}(?:N|S).\d{3}(?:W|E))\..+')

records = [
    [regex.match(str(item)).group('key'), item.name]
    for item in DIRS.driv.glob('driver*.tif')
]

df = pd.DataFrame(records, columns=['key', 'driver_2'])

product = mask.merge(df, on='key')
product.to_file(str(DIRS.masks/'final_region_mask.shp'))

#### Accuracy assessment

In [11]:
mask = gpd.read_file(str(DIRS.masks/'final_region_mask.shp'))
records = []

threads = [
    threading.Thread(target=fworker, args=(str(DIRS.driv / row.driver_2), records, row.key, row.region)) 
    for idx, row in mask.iterrows()
]

execute_concurrent(threads, THREADLIMIT, msg='Frequenc {} % of 100 %', callback=callback)

Frequenc 100.0 % of 100 %


In [45]:
df = pd.DataFrame.from_records(records)
df.fillna(0, inplace=True)
df['total_px'] = df.apply(lambda row: sum(row[:9]) + sum(row[11:]) , axis=1)
df['data_px'] = df.apply(lambda row: sum(row[1:9]) + sum(row[11:13]), axis=1)
df['relevant_px'] = df.apply(lambda row: sum(row[2:9]) + sum(row[11:13]), axis=1)

df.to_csv(str(DIRS.ana / 'driver.csv'), index=False)

In [49]:
random = np.random.RandomState(42)
mask = gpd.read_file(str(DIRS.masks/'final_region_mask.shp'))

tiles = []
for region, df in mask.groupby(by='region'):
    if region in ('Africa', 'Americas', 'Asia'):
        tiles += list(random.choice(df.driver_2, size=10))

In [56]:
records = []

processes = [
    threading.Thread(target=sworker, args=(str(DIRS.driv/f), records),
                            kwargs=({'samples': 200, 'seed': 42}))
    for f in tiles
]

execute_concurrent(processes, THREADLIMIT, msg='Sampling {} % from 100 %', callback=callback)

for idx, samples in enumerate(records):
    name = 'sample_{}.csv'.format(idx + 1)
    df = df.from_dict(samples)
    df.to_csv(str(DIRS.ana/name), index=False)

Sampling 100.0 % from 100 %


In [3]:
reference = []
prediction = []

for v in DIRS.ana.glob('sample_*.csv'):
    df = pd.read_csv(str(v))
    prediction += list(df.label)
    reference += list(df.validation)

cm = ConfusionMatrix.from_lists(reference, prediction)

In [8]:
print(cm.label)
print(cm)
print(cm.label)
print(cm.normalize(method='com'))

[10, 20, 25, 30, 40, 50, 60, 80, 90]
[[ 688   38   62   10   15    2    3    5    0  823]
 [  42  728   56  187   31   12    0   17    2 1075]
 [  24  167 1117  168   16   10    5    9    4 1520]
 [  15  176   32 1339   70   21    0   17    0 1670]
 [   7   12    4   34  324    0    1    2    0  384]
 [   0    2    3    8    2   42    0    1    0   58]
 [   2    1    0    3    0    2   18    2    0   28]
 [   3    3    0    1    1    1    0   50    0   59]
 [   0    0    0    1    0    0    0    3    5    9]
 [ 781 1127 1274 1751  459   90   27  106   11 5626]]
[10, 20, 25, 30, 40, 50, 60, 80, 90]
[[0.88 0.03 0.05 0.01 0.03 0.02 0.11 0.05 0.   0.  ]
 [0.05 0.65 0.04 0.11 0.07 0.13 0.   0.16 0.18 0.  ]
 [0.03 0.15 0.88 0.1  0.03 0.11 0.19 0.08 0.36 0.  ]
 [0.02 0.16 0.03 0.76 0.15 0.23 0.   0.16 0.   0.  ]
 [0.01 0.01 0.   0.02 0.71 0.   0.04 0.02 0.   0.  ]
 [0.   0.   0.   0.   0.   0.47 0.   0.01 0.   0.  ]
 [0.   0.   0.   0.   0.   0.02 0.67 0.02 0.   0.  ]
 [0.   0.   0.   0.   0.

#### Emissions

In [2]:
def callback(*args, **kwargs):
    ratio = (kwargs['pending'] / kwargs['total']) * 100
    ratio = round(ratio, 2)
    print('{} % of 100 %'.format(ratio))

In [3]:
mask = gpd.read_file(str(DIRS.masks/'final_region_mask.shp'))
regex = re.compile(r'.+(?P<key>\d{2}(?:N|S).\d{3}(?:W|E))\..+')

sheduler = TaskSheduler('soc', 4)
sheduler.on_progress.connect(callback)

for idx, row in mask.iterrows():
    driver = str(DIRS.driv/row.driver_2)
    soc = str(DIRS.proc/row.soil)
    
    key = regex.match(row.driver_2).group('key')
    name = 'soc_min_{}.tif'.format(key)
    namee = 'soc_max_{}.tif'.format(key)
    path_min = str(DIRS.driv/name)
    path_max = str(DIRS.driv/namee)
    
    sheduler.add_task(
        threading.Thread(target=socworker, args=(driver, soc, path_min), kwargs=({'method': 'min'}))
    )
    sheduler.add_task(
        threading.Thread(target=socworker, args=(driver, soc, path_max), kwargs=({'method': 'max'}))
    )

99.83 % of 100 %
99.66 % of 100 %
99.31 % of 100 %
99.14 % of 100 %
98.97 % of 100 %
98.79 % of 100 %
98.62 % of 100 %
98.28 % of 100 %
98.1 % of 100 %
97.93 % of 100 %
97.76 % of 100 %
97.59 % of 100 %
97.41 % of 100 %
97.24 % of 100 %
97.07 % of 100 %
96.9 % of 100 %
96.72 % of 100 %
96.55 % of 100 %
96.38 % of 100 %
96.21 % of 100 %
96.03 % of 100 %
95.86 % of 100 %
95.69 % of 100 %
95.52 % of 100 %
95.34 % of 100 %
95.17 % of 100 %
95.0 % of 100 %
94.83 % of 100 %
94.66 % of 100 %
94.48 % of 100 %
94.31 % of 100 %
94.14 % of 100 %
93.97 % of 100 %
93.79 % of 100 %
93.62 % of 100 %
93.45 % of 100 %
93.28 % of 100 %
93.1 % of 100 %
92.93 % of 100 %
92.76 % of 100 %
92.59 % of 100 %
92.41 % of 100 %
92.24 % of 100 %
92.07 % of 100 %
91.9 % of 100 %
91.72 % of 100 %
91.55 % of 100 %
91.38 % of 100 %
91.21 % of 100 %
91.03 % of 100 %
90.86 % of 100 %
90.69 % of 100 %
90.52 % of 100 %
90.34 % of 100 %
90.17 % of 100 %
90.0 % of 100 %
89.83 % of 100 %
89.66 % of 100 %
89.48 % of 100 %
89.

In [22]:
regex = re.compile(r'.+(?P<key>\d{2}(?:N|S).\d{3}(?:W|E))\..+')
agb = [str(obj) for obj in DIRS.driv.glob('agbe*.tif')]
smin = [str(obj) for obj in DIRS.driv.glob('soc_min*.tif')]
smax = [str(obj) for obj in DIRS.driv.glob('soc_max*.tif')]

agb = sorted(agb, key=lambda name: regex.match(name).group('key'))
smin = sorted(smin, key=lambda name: regex.match(name).group('key'))
smax = sorted(smax, key=lambda name: regex.match(name).group('key'))

In [23]:
from collections import defaultdict
import rasterio as rio
bib = defaultdict(list)

In [24]:
for items in [agb, smin, smax]:
    for path in items:
        key = regex.match(path).group('key')
        bib[key].append(path)

In [25]:
def create_record(key, vals, records):
    record = {'tile': key}
    
    for idx, val in enumerate(vals):
        total = 0
        name = 'arg{}'.format(idx)

        with rio.open(val, 'r') as src:
            total = np.sum(src.read(1))
        record[name] = total
    
    records.append(record)

In [26]:
sheduler = TaskSheduler('sum', 8)
sheduler.on_progress.connect(callback)

records = []
for key, vals in bib.items():
    sheduler.add_task(
        threading.Thread(target=create_record, args=(key, vals, records))
    )

99.63 % of 100 %
99.26 % of 100 %
98.88 % of 100 %
98.51 % of 100 %
98.14 % of 100 %
97.4 % of 100 %
97.03 % of 100 %
96.65 % of 100 %
96.28 % of 100 %
95.91 % of 100 %
95.54 % of 100 %
95.17 % of 100 %
94.05 % of 100 %
93.68 % of 100 %
93.31 % of 100 %
92.94 % of 100 %
92.57 % of 100 %
91.82 % of 100 %
91.45 % of 100 %
91.08 % of 100 %
90.71 % of 100 %
89.96 % of 100 %
89.59 % of 100 %
89.22 % of 100 %
88.85 % of 100 %
88.1 % of 100 %
87.73 % of 100 %
87.36 % of 100 %
86.99 % of 100 %
86.62 % of 100 %
86.25 % of 100 %
85.5 % of 100 %
85.13 % of 100 %
84.76 % of 100 %


Exception in thread Thread-633:
Traceback (most recent call last):
  File "/usr/lib/python3.5/threading.py", line 914, in _bootstrap_inner
    self.run()
  File "/usr/lib/python3.5/threading.py", line 862, in run
    self._target(*self._args, **self._kwargs)
  File "<ipython-input-25-8efd01477452>", line 8, in create_record
    with rio.open(val, 'r') as src:
  File "/home/tobi/.local/lib/python3.5/site-packages/rasterio/__init__.py", line 263, in open
    s = DatasetReader(fp)
  File "rasterio/_base.pyx", line 125, in rasterio._base.DatasetBase.__init__
  File "rasterio/_err.pyx", line 188, in rasterio._err.exc_wrap_pointer
rasterio._err.CPLE_AppDefinedError: MissingRequired:TIFF directory is missing required "ImageLength" field



84.39 % of 100 %
84.01 % of 100 %
83.64 % of 100 %
83.27 % of 100 %
82.9 % of 100 %
82.53 % of 100 %
82.16 % of 100 %
81.78 % of 100 %
81.41 % of 100 %
81.04 % of 100 %
80.67 % of 100 %
80.3 % of 100 %
79.93 % of 100 %
79.55 % of 100 %
79.18 % of 100 %
78.81 % of 100 %
78.44 % of 100 %
78.07 % of 100 %
77.7 % of 100 %
77.32 % of 100 %
76.95 % of 100 %
76.58 % of 100 %
76.21 % of 100 %
75.84 % of 100 %
75.46 % of 100 %
75.09 % of 100 %
74.72 % of 100 %
74.35 % of 100 %
73.98 % of 100 %
73.61 % of 100 %
73.23 % of 100 %
72.86 % of 100 %
72.49 % of 100 %
72.12 % of 100 %
71.75 % of 100 %
71.38 % of 100 %
71.0 % of 100 %
70.63 % of 100 %
70.26 % of 100 %
69.89 % of 100 %
69.52 % of 100 %
69.14 % of 100 %
68.77 % of 100 %
68.4 % of 100 %
68.03 % of 100 %
67.66 % of 100 %
67.29 % of 100 %
66.91 % of 100 %
66.54 % of 100 %
66.17 % of 100 %
65.8 % of 100 %
65.43 % of 100 %
65.06 % of 100 %
64.68 % of 100 %
64.31 % of 100 %
63.94 % of 100 %
63.57 % of 100 %
63.2 % of 100 %
62.83 % of 100 %
62.4

In [27]:
records

[{'arg0': 0.0, 'arg1': 0.0, 'arg2': 0.0, 'tile': '05S_048E'},
 {'arg0': 0.0, 'arg1': 0.0, 'arg2': 0.0, 'tile': '20N_144E'},
 {'arg0': 0.0, 'arg1': 0.0, 'arg2': 0.0, 'tile': '05N_150E'},
 {'arg0': 462560420.0,
  'arg1': 3898321.2,
  'arg2': 7352931.0,
  'tile': '05N_078W'},
 {'arg0': 0.0, 'arg1': 0.0, 'arg2': 0.0, 'tile': '00N_168E'},
 {'arg0': 4933264.5, 'arg1': 9020.769, 'arg2': 20207.379, 'tile': '05N_006W'},
 {'arg0': 78655224.0,
  'arg1': 1734525.9,
  'arg2': 2620261.2,
  'tile': '10N_102E'},
 {'arg0': 47728484.0,
  'arg1': 1408329.5,
  'arg2': 2641836.5,
  'tile': '10N_030E'},
 {'arg0': 887230400.0,
  'arg1': 6499906.5,
  'arg2': 10099889.0,
  'tile': '00N_114E'},
 {'arg0': 1535400.6, 'arg1': 42917.918, 'arg2': 65059.13, 'tile': '15N_078E'},
 {'arg0': 4780143.0, 'arg1': 116889.65, 'arg2': 210246.92, 'tile': '20N_108W'},
 {'arg0': 39784.383, 'arg1': 2426.4197, 'arg2': 5538.7495, 'tile': '25N_072W'},
 {'arg0': 78140270.0, 'arg1': 34916.5, 'arg2': 79286.26, 'tile': '00N_126E'},
 {'ar

In [29]:
df = pd.DataFrame.from_records(records)

In [33]:
df.to_csv('/home/tobi/Documents/test.csv', index=False)

# Results

## Preprocessing

#### Class harmonization

In [22]:
src = pd.read_csv(str(DIRS.ana / 'harmonization.csv'))

# initial data clean up
src.rename(columns=lambda x: x.upper() if x[:2] == 'jc' else x, inplace=True)
src.dropna(axis=0, how='any', inplace=True)
src.columns = 'JC0 JC10 JC20 JC30 tile region'.split()

# scatterplot data prep
melted = src.melt(id_vars='tile region'.split(), var_name='jc_class', value_name='score')
melted['colors'] = '#ffffff'
melted.loc[melted['jc_class'] == 'JC0', 'colors'] = '#e66101'
melted.loc[melted['jc_class'] == 'JC10', 'colors'] = '#fdb863'
melted.loc[melted['jc_class'] == 'JC20', 'colors'] = '#b2abd2'
melted.loc[melted['jc_class'] == 'JC30', 'colors'] = '#5e3c99'
melted.sort_values(by=['region', 'jc_class'], ascending=[True, True], inplace=True)

# boxplot data prep
frames = []
for key, df in src.groupby('region'):
    boxplot = df.quantile(q=(0.25, 0.5, 0.75)).T
    boxplot.columns = ['q1', 'q2', 'q3']
    boxplot['iqr'] = boxplot.q3 - boxplot.q1
    boxplot['tukey_lower_whisker'] = boxplot.q1 - 1.5 * boxplot.iqr
    boxplot['tukey_upper_whisker'] = boxplot.q3 + 1.5 * boxplot.iqr
    boxplot['q_lower_whisker'] = df.quantile(q=0.025)
    boxplot['q_upper_whisker'] = df.quantile(q=0.975)
    boxplot['min_whisker'] = df.min()
    boxplot['max_whisker'] = df.max()
    boxplot['means'] = df.mean()
    boxplot['region'] = pd.unique(df.region)[0]

    frames.append(boxplot)

box = pd.concat(frames)

In [23]:
# titel and histogram
# scatterplot
source = ColumnDataSource({'x': list(zip(melted.region, melted.jc_class)),
                           'y': melted.score,
                           'id': melted.tile,
                           'colors': melted.colors})
hover = HoverTool(tooltips=[('Region/Class', '@x'),
                            ('Tile', '@id'),
                            ('JC-Score', '@y'),])
factors = [(reg, cls) 
           for reg in pd.unique(melted.region) 
           for cls in pd.unique(melted.jc_class)]

scatter = figure(x_range=FactorRange(*factors), plot_width=950, plot_height=600,
              tools=[hover, 'pan', 'wheel_zoom', 'save', 'reset', 'box_zoom'],
              title="Jaccard score per forest cover class")

scatter.x(x=jitter('x', width=0.6, range=scatter.x_range), y='y', color='colors', source=source)

scatter.xgrid.grid_line_color = None
scatter.xaxis.axis_label = "Region/Class"
scatter.yaxis.axis_label = "Jaccard score"
scatter.y_range.start = -0.01

# boxplot
source = ColumnDataSource({'x': list(zip(box.region, box.index)),
                           'q1': box.q1,
                           'q2': box.q2,
                           'q3': box.q3,
                           'iqr': box.iqr,
                           'lw': box.min_whisker,
                           'uw': box.max_whisker,
                           'means': box.means})
hover = HoverTool(tooltips=[("Region/Class", "@x"),
                            ("Q1", "@q1"),
                            ("Q2", "@q2"),
                            ("Q3", "@q3"),
                            ("IQR", "@iqr"),
                            ("lWhisker", "@lw"),
                            ("uWhisker", "@uw"),
                            ("Mean", "@means"),])

plot = figure(x_range=scatter.x_range, y_range=scatter.y_range,
              plot_width=950, plot_height=300,
              tools=[hover, 'pan', 'wheel_zoom', 'save', 'reset', 'box_zoom'])

# box
plot.vbar(x='x', width=0.7, bottom='q1', top='q2',
          line_color='black', fill_color='#f7f7f7', fill_alpha=0.7, source=source)
plot.vbar(x='x', width=0.7, bottom='q2', top='q3',
          line_color='black', fill_color='#67a9cf', fill_alpha=0.7, source=source)

# whiskers
plot.rect(x='x', y='lw', width=0.2, height=0.001,
          line_color="black", source=source)
plot.rect(x='x', y='uw', width=0.2, height=0.001,
          line_color="black", source=source)

# stems
plot.segment(x0='x', y0='lw', x1='x', y1='q1',
             line_color='black', source=source)
plot.segment(x0='x', y0='q3', x1='x', y1='uw',
             color='black', source=source)

# mean cross
plot.x(x='x', y='means', color='#ef8a62', size=10, source=source)

plot.xgrid.grid_line_color = None
plot.xaxis.axis_label = "Region/Class"
plot.yaxis.axis_label = "Jaccard score"
plot.y_range.start = -0.01

# display plots
show(gridplot([[scatter],[plot]]))

# Discussion

# References

[<a id="cit-Hansen2013" href="#call-Hansen2013">1</a>] C. M., V. P., Moore R. <em>et al.</em>, ``_High-Resolution Global Maps of 21st-Century Forest Cover Change_'', Science, vol. 342, number 6160, pp. 850--853, November 2013.

[<a id="cit-Li2017" href="#call-Li2017">2</a>] Li Yan, Sulla-Menashe Damien, Motesharrei Safa <em>et al.</em>, ``_Inconsistent estimates of forest cover change in China between 2000 and 2013 from multiple datasets: differences in parameters, spatial resolution, and definitions_'', Scientific Reports, vol. 7, number 8748, pp. , August 2017.

[<a id="cit-Hansen2013a" href="#call-Hansen2013a">3</a>] C. M., V. P., Moore R. <em>et al.</em>, ``_Supplementary Materials for: High-Resolution Global Maps of 21st-Century Forest Cover Change_'', Sciene, vol. 342, number 6160, pp. 1--32, November 2013.  [online](http://science.sciencemag.org/content/suppl/2013/11/14/342.6160.850.DC1)

[<a id="cit-Tropek2014" href="#call-Tropek2014">4</a>] Tropek Robert, Sedl{\'{a}}{\v{c}}ek Ond{\v{r}}ej, Beck Jan <em>et al.</em>, ``_Comment on High-resolution global maps of 21st-century forest cover change_'', Science, vol. 344, number 981, pp. ,  2014.

[<a id="cit-Bellot2014" href="#call-Bellot2014">5</a>] Bellot Franz-Fabian, Bertram Mathias, Navratilb Peter <em>et al.</em>, ``_The high-resolution global map of 21st-century forest cover change from the University of Maryland (Hansen Map) is hugely overestimating deforestation in Indonesia_'', FORCLIME Press release, vol. , number , pp. ,  2014.  [online](http://www.forclime.org/documents/press_release/FORCLIME_Overestimation%20of%20Deforestation.pdf)

[<a id="cit-Li2017a" href="#call-Li2017a">6</a>] Li Yan, Sulla-Menashe Damien, Motesharrei Safa <em>et al.</em>, ``_Supplementary Information for Inconsistent estimates of forest cover change in China between 2000 and 2013 from multiple datasets_'', Scientific reports, vol. 7, number 8748, pp. , August 2017.

