In [1]:
import json
import os
import pickle
from pathlib import Path
from joblib import Parallel, delayed
from statistics import mode

import geopandas as gpd
import shapely
import matplotlib.pyplot as plt
import numpy as np
import optuna
import pandas as pd
import rioxarray
import cupy

from xrspatial import hillshade
from xrspatial import convolution
from datashader.colors import Set1
from datashader.transfer_functions import shade
from datashader.transfer_functions import stack
from datashader.transfer_functions import dynspread
from datashader.transfer_functions import set_background
from datashader.colors import Elevation

from xrspatial import focal, slope
import seaborn as sns
from tqdm import tqdm
from joblib_progress import joblib_progress
from xrspatial.multispectral import ndvi, savi
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (confusion_matrix, ConfusionMatrixDisplay)
from sklearn.model_selection import train_test_split, cross_val_score

from sklearn.model_selection import RandomizedSearchCV as RSCV
from sklearn.svm import SVC
from sklearn.metrics import roc_auc_score, accuracy_score, log_loss

In [2]:
# paths
high_high_path = '/home/michael/TreeMortality/data/helena/treatment_polys/code15_n5.gpkg'
high_un_path = '/home/michael/TreeMortality/data/helena/treatment_polys/code12_n5.gpkg'
un_high_path = '/home/michael/TreeMortality/data/helena/treatment_polys/code3_n5.gpkg'
un_un_path = '/home/michael/TreeMortality/data/helena/treatment_polys/code0_n5.gpkg'
poly_paths = [high_high_path, high_un_path, un_high_path, un_un_path]


helena_path = Path.cwd().parent / 'data' / 'helena'
geomorph_dir = helena_path / 'geomorphons'
crown_path = helena_path / 'crowns'
crown_path_list = [
    c for c
    in crown_path.iterdir()
    if c.suffix == '.gpkg'
    ]



# open treatment polygons
df = pd.concat([gpd.read_file(p) for p in poly_paths])
df = df.drop('area_', axis=1)

We will first check each crown to see if it falls completely within one of the treatment class areas.  If so it will be appended to a datframe of crowns.


In [3]:
# jobs to run in ||
n_jobs = 23

def is_in_treatment(crown_df, row, buf):
    '''Returns only crowns with buffer completely within polygon'''
    crown_df.loc[
        buf.within(row.geometry),
        'treatment'] = row.attribute
    
    return crown_df[crown_df.treatment >= 0 ]


def label_treatment(f):
    crown_df = gpd.read_file(f)
    crown_df = crown_df[crown_df.geometry.area > 10]
    
    # get total bounds of tile as polygon
    bounds = crown_df.total_bounds
    bbox = shapely.geometry.box(*bounds)

    # use only treatment geometries which touch the tile
    sub_df = df[df.geometry.intersects(bbox)]
    if len(sub_df) > 0:
        # add treatment column
        crown_df['treatment'] = -99
        #buffer crowns
        buf = crown_df.geometry.buffer(10)
        # label treatments of crowns lying completely within poly
        return Parallel(n_jobs=n_jobs)(
            delayed(is_in_treatment)(crown_df, row, buf)
            for _, row in sub_df.iterrows()
            )
    else:
        # return empty df, but add treatment column first
        cols = list(crown_df.columns) + ['treatment']
        empty_df = pd.DataFrame(columns=cols)
        return [empty_df]


In [5]:
with joblib_progress('', total=len(crown_path_list)):
    results =  Parallel(n_jobs=n_jobs)(delayed(label_treatment)(f) for f in crown_path_list)
    

Output()

In [7]:
# results is a list of lists of dfs, so we must flatten to concat
crown_df = pd.concat([item for sublist in results for item in sublist])

In [None]:
    
results = [label_treatment(f) for f in tqdm(crown_path_list)]
crown_df = pd.concat(results)

At this point we will also add a unique identifier.  Then save `crowns_df` so in case ware interrupted,  we will be able to resume without running the 5 hour block of code above again.

In [8]:
def make_unique_ID(crowns, utm_zone):
    '''
    returns copy of dataframe with new uniqueID column
    with entries of form 'utm_zone_x_y where x and y 
    are rounded to the nearest meter.
    TODO: make it round to nearest even meter to lower precision
    '''
    crowns['UniqueID'] = crowns.geometry.centroid.apply(
        lambda p: f'{utm_zone}_{p.x:.0f}_{p.y:.0f}')
    
    return crowns

# add unique ID
crown_df_ = make_unique_ID(crown_df, '10N')
crown_df_.head()

Unnamed: 0,IDdalponte,zmax,zmean,zsd,zskew,zkurt,zentropy,pzabovezmean,pzabove2,zq5,...,p2th,p3th,p4th,p5th,pground,n,area,geometry,treatment,UniqueID
0,2.0,5.29,4.97,0.549181,-1.123796,2.307702,0.313845,75.0,100.0,4.303,...,0.0,0.0,0.0,0.0,0.0,4,0.0672,"POLYGON ((496566.730 4511249.660, 496566.620 4...",12,10N_496567_4511250
1,3.0,5.93,5.684,0.387337,-1.405252,3.124655,0.0,80.0,100.0,5.156,...,20.0,0.0,0.0,0.0,0.0,5,0.076,"POLYGON ((496570.930 4511249.740, 496570.640 4...",12,10N_496571_4511250
2,4.0,9.26,7.12,2.037727,-0.419361,1.899452,0.60206,50.0,100.0,4.8005,...,25.0,0.0,0.0,0.0,0.0,4,0.0638,"POLYGON ((496589.250 4511249.710, 496589.000 4...",12,10N_496589_4511250
3,10.0,13.39,11.101111,1.725648,0.357462,1.321602,0.435405,44.444444,100.0,9.444,...,44.444444,0.0,0.0,0.0,0.0,9,0.1551,"POLYGON ((496743.460 4511249.750, 496743.330 4...",12,10N_496743_4511250
4,11.0,9.49,6.605,2.987673,-0.013348,1.025824,0.439247,50.0,100.0,3.63,...,50.0,16.666667,0.0,0.0,0.0,6,0.1148,"POLYGON ((496775.400 4511249.590, 496775.330 4...",12,10N_496775_4511250


In [9]:
# save
crown_df_.to_file(helena_path / 'crowns_with_treatment_label.gpkg')

In [11]:
# make sure the number of treatments is reasonable
crown_df_.treatment.value_counts()

treatment
0     2794929
3      601844
12     468077
15      11980
Name: count, dtype: int64

In [4]:
# now finally we know it is safe to make crown_df = crown_df_ 
#crown_df = crown_df_

# or load it from file if you were interrupted
crown_df = gpd.read_file(helena_path / 'crowns_with_treatment_label.gpkg')

In [6]:
# OMG, I forgot to do this at the begining, probably wasted a lot of time on tiny polys
# fixed above for next time
crown_df = crown_df[crown_df.geometry.area > 10]

In [10]:
# also save fixed version, lets use geoparquet
crown_df.to_parquet(helena_path / 'crowns_with_treatment_label.parquet')

## Geomorphons
In order to look at the effects of slope position on tree mortality we will use the geomorphons algorithm  as implemented in Whitebox Tools.  We will use the geoporphons rasters that were calculated in `src/helena_geomorphon.ipynb`. 

In [1]:
import json
import os
import pickle
from pathlib import Path
from joblib import Parallel, delayed
from statistics import mode

import geopandas as gpd
import shapely
import matplotlib.pyplot as plt
import numpy as np
import optuna
import pandas as pd
import rioxarray
import cupy

from xrspatial import hillshade
from xrspatial import convolution
from datashader.colors import Set1
from datashader.transfer_functions import shade
from datashader.transfer_functions import stack
from datashader.transfer_functions import dynspread
from datashader.transfer_functions import set_background
from datashader.colors import Elevation

from xrspatial import focal, slope
import seaborn as sns
from tqdm import tqdm
from joblib_progress import joblib_progress
from xrspatial.multispectral import ndvi, savi
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (confusion_matrix, ConfusionMatrixDisplay)
from sklearn.model_selection import train_test_split, cross_val_score

from sklearn.model_selection import RandomizedSearchCV as RSCV
from sklearn.svm import SVC
from sklearn.metrics import roc_auc_score, accuracy_score, log_loss

# paths
high_high_path = '/home/michael/TreeMortality/data/helena/treatment_polys/code15_n5.gpkg'
high_un_path = '/home/michael/TreeMortality/data/helena/treatment_polys/code12_n5.gpkg'
un_high_path = '/home/michael/TreeMortality/data/helena/treatment_polys/code3_n5.gpkg'
un_un_path = '/home/michael/TreeMortality/data/helena/treatment_polys/code0_n5.gpkg'
poly_paths = [high_high_path, high_un_path, un_high_path, un_un_path]


helena_path = Path.cwd().parent / 'data' / 'helena'
geomorph_dir = helena_path / 'geomorphons'
crown_path = helena_path / 'crowns'
crown_path_list = [
    c for c
    in crown_path.iterdir()
    if c.suffix == '.gpkg'
    ]


In [2]:
# or open the parquet,  way faster
crown_df = gpd.read_parquet(helena_path / 'crowns_with_treatment_label.parquet')
crown_df.treatment.value_counts()

treatment
0     2666168
3      527050
12     415553
15       6409
Name: count, dtype: int64

In [3]:
samp_size = 15_000
crown_df = pd.concat(
    [
        crown_df[crown_df.treatment == 15],
        crown_df[crown_df.treatment == 12].sample(samp_size, random_state=1),
        crown_df[crown_df.treatment == 3].sample(samp_size, random_state=1),
        crown_df[crown_df.treatment == 0].sample(samp_size, random_state=1)
        ]
    )

print(f'total: {len(crown_df)}\nValue counts:')
print(crown_df.treatment.value_counts())

total: 51409
Value counts:
treatment
12    15000
3     15000
0     15000
15     6409
Name: count, dtype: int64


In [4]:
for r in tqdm([100, 250, 500, 1000, 2000]):
    # open geomorphon tif
    tif = geomorph_dir / f'geomorph_{r}.tif' 
    gmorph = rioxarray.open_rasterio(tif)
      
    # attach landform to crowns
    centroids = [(c.x, c.y) for c in crown_df.geometry.centroid.to_list()]
    crown_df[f'geomorph_{r}'] = [gmorph.sel(x=x, y=y, method='nearest').item() for x, y in centroids]
    

100%|██████████| 5/5 [03:45<00:00, 45.14s/it]


Check to make sure all the scales have the same landforms.

In [11]:
for r in [100, 250, 500, 1000, 2000]:
    landforms = list(crown_df[f'geomorph_{r}'].unique())
    landforms.sort()
    print(f'{landforms} for {r}')

[2, 3, 5, 6, 7, 9, 10] for 100
[2, 3, 5, 6, 7, 9, 10] for 250
[2, 3, 5, 6, 7, 9, 10] for 500
[2, 3, 5, 6, 7, 9, 10] for 1000
[2, 3, 5, 6, 7, 9, 10] for 2000


Save the crowns now that they have treatment and geomorphon attached.

In [13]:
spectral_crowns_path = helena_path / 'spectral_crowns'
os.makedirs(spectral_crowns_path, exist_ok=True)

for r in [100, 250, 500, 1000, 2000]:
    crown_df.to_parquet(spectral_crowns_path / f'crowns.parquet')

AttributeError: 'Series' object has no attribute 'to_parquet'

# ignore below here

In [6]:
def make_samples(r):
    # dict to hold samples
    dict_of_samples = {}

    # split into groups based on treatment
    for tr in [0, 3, 12, 15]:
        df_x = crown_df[crown_df.treatment == tr]

        # for each treatment, split based on landform.
        sub_dict = {}
        for pos in landforms:
            sub_dict[f'geomorph_{r}'] = df_x[df_x[f'geomorph_{r}'] == pos]
        dict_of_samples[f'treatment_{tr}'] = sub_dict

    # find the size of the smallest smallest treatment/landform population
    n = np.inf
    for key1, sub_dict in dict_of_samples.items():
        
        lengths = [len(d) for d in dict_of_samples[key1]]
        n = min(n, min(lengths))
    '''         
    print(f'For {r}, ')    
    print(f'the smallest treatment/landform population is {n}.')
    print(f'It is treatment {tr}')
    '''
    return(dict_of_samples)

    
# dict to hold dicts of samples for each geomorphon scale
dict_of_scales = {}
for r in [100, 250, 500, 1000, 2000]:
    dict_of_scales[f'geomorph_{r}'] = make_samples(r)

In [7]:
rows = []
for r in [100, 250, 500, 1000, 2000]:
    for tr in [0, 3, 12, 15]:
        length = len(dict_of_scales[f'geomorph_{r}'][f'treatment_{tr}'][f'geomorph_{r}'])
        rows.append((r, tr, length))

sample_stats = pd.DataFrame(rows, columns=['scale', 'treatment_code', 'n_samples'])
sample_stats.pivot(index='scale', columns='treatment_code', values='n_samples')

treatment_code,0,3,12,15
scale,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
100,100,33,31,0
250,255,82,51,8
500,443,163,113,0
1000,831,290,236,0
2000,1256,551,417,0


In [8]:
samples = {}

for r in [100, 250, 500, 1000, 2000]:
    samps = []
    for tr in [0, 3, 12, 15]:
        # just hardcoded the tr value for sample size based on sample_stats
        sample_size = len(dict_of_scales[f'geomorph_{r}'][f'treatment_{3}'][f'geomorph_{r}'])
        # get desired dataframe
        df_ = dict_of_scales[f'geomorph_{r}'][f'treatment_{tr}'][f'geomorph_{r}']
        # if df_ is larger than sample_size, reduce it to sample_size
        if len(df_) >= sample_size:
            df_ = df_.sample(sample_size, random_state=123)
            
        # stick in list
        samps.append(df_)
    # concat dfs
    samps = pd.concat(samps)
        
    # put samps into the samples dict
    samples[f'geomorph_{r}'] = samps


In [9]:
rows = []
for r in [100, 250, 500, 1000, 2000]:
    df_ = samples[f'geomorph_{r}']
    for tr in [0, 3, 12, 15]:
        length = len(df_[df_.treatment == tr])
        rows.append((r, tr, length))

sample_stats = pd.DataFrame(rows, columns=['scale', 'treatment_code', 'n_samples'])
sample_stats.pivot(index='scale', columns='treatment_code', values='n_samples')

treatment_code,0,3,12,15
scale,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
100,33,33,31,0
250,82,82,51,8
500,163,163,113,0
1000,290,290,236,0
2000,551,551,417,0


In [None]:
spectral_crowns_path = helena_path / 'spectral_crowns'
os.makedirs(spectral_crowns_path, exist_ok=True)
for r in [100, 250, 500, 1000, 2000]:
    df_ = samples[f'geomorph_{r}']
    df_.to_parquet(spectral_crowns_path / f'crowns_{r}.parquet')