In [1]:
import json
import os
import pickle
from pathlib import Path
from joblib import Parallel, delayed
from statistics import mode

import geopandas as gpd
import shapely
import matplotlib.pyplot as plt
import numpy as np
import optuna
import pandas as pd
import rioxarray

from xrspatial import hillshade
from xrspatial import convolution
from datashader.colors import Set1
from datashader.transfer_functions import shade
from datashader.transfer_functions import stack
from datashader.transfer_functions import dynspread
from datashader.transfer_functions import set_background
from datashader.colors import Elevation

from xrspatial import focal, slope
import seaborn as sns
from tqdm import tqdm
from joblib_progress import joblib_progress
from xrspatial.multispectral import ndvi, savi
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (confusion_matrix, ConfusionMatrixDisplay)
from sklearn.model_selection import train_test_split, cross_val_score

from sklearn.model_selection import RandomizedSearchCV as RSCV
from sklearn.svm import SVC
from sklearn.metrics import roc_auc_score, accuracy_score, log_loss

# paths
# Every location is derived from the project data directory (one level above
# the notebook's working directory) so the notebook does not depend on a
# user-specific absolute path.
helena_path = Path.cwd().parent / 'data' / 'helena'

# treatment polygon files, kept as plain strings as before
treatment_polys_dir = helena_path / 'treatment_polys'
high_high_path = str(treatment_polys_dir / 'code15_n5.gpkg')
high_un_path = str(treatment_polys_dir / 'code12_n5.gpkg')
un_high_path = str(treatment_polys_dir / 'code3_n5.gpkg')
un_un_path = str(treatment_polys_dir / 'code0_n5.gpkg')
poly_paths = [high_high_path, high_un_path, un_high_path, un_un_path]


spectral_crowns_dir = helena_path / 'spectral_crowns'
naip_dir = helena_path / 'NAIP'
save_path = helena_path / 'features'
geomorph_dir = helena_path / 'geomorphons'
crown_path = helena_path / 'crowns'
# every GeoPackage found directly inside the crowns directory
crown_path_list = [
    c for c
    in crown_path.iterdir()
    if c.suffix == '.gpkg'
    ]


Next we need to engineer the features for our model, as done in `src/mortality_classification.ipynb`.  Because the NAIP imagery for the area is so large, the dtype of the imagery has been changed to `np.float32` so that this can run on my machine.  This should not make a difference in the results, but is worth noting.  In `mortality_classification_geographic_holdouts.ipynb` feature creation was run in parallel, but that is not possible with such a large image.

In [2]:
def _bin_fractions(values, bins, total):
    '''
    Returns one value per upper edge f in bins: the number of entries of
    values falling in [f - 0.1, f), divided by total.
    '''
    return [((values < f).sum() - (values < f - 0.1).sum()) / total for f in bins]


def _clip_valid(da, geometry):
    '''
    Clips da to geometry and returns the remaining non-NaN pixels as a flat array.
    '''
    pixels = da.rio.clip([geometry]).to_numpy().flatten()
    return pixels[~np.isnan(pixels)]


def make_model_inputs(crowns, xa, save_path, y, gk, label=None, IDcolumn=None):
    '''
    Builds per-crown features for the classification model and writes them
    to save_path / 'features_{y}_{gk}.parquet'.  Nothing is returned.
    The saved table has one row per crown, an ID column (named by IDcolumn)
    matching crowns, and a 'label' column, see params for more detail.
    Derived layers (rgi, red_, green_, blue_, nir_, NDVI, SAVI, luminosity)
    are added to xa in place.

    params:
        crowns    - GeoDataFrame - tree crowns; rows are iterated and each
                          row's geometry is used to clip the image.
        xa        - xr dataset - image used in producing features, already
                          read with rioxarray, with a 'band_data' variable.
        save_path - Path - directory the parquet file is written into.
        y         - used only to build the output file name and the final
                          log line (callers in this notebook pass the year).
        gk        - used only to build the output file name and the final
                          log line.
        label     - str - specifies column containing labels.  If specified,
                          the 'label' column in the saved table holds
                          (value + 1) / 2 of that column.  Otherwise the
                          'label' column contains -99.
        IDcolumn  - str - column to use as matching ID with crowns
    '''

    # normalize the band_data to the 0-255 range
    print(f'\t\t--normalizing (step 1/7)...')
    band_data = xa.band_data.to_numpy().astype(np.float16)
    band_data = (band_data - np.nanmin(band_data)) * (255 / (np.nanmax(band_data) - np.nanmin(band_data)))

    # calculate relative greenness
    print(f'\t\t--calculating RGI (step 2/7)...')
    red = band_data[0]
    green = band_data[1]
    blue = band_data[2]
    nir = band_data[3]
    xa['rgi'] = (('y', 'x'), green / (red + green + blue))

    # calculate pixel by pixel normalized R, G, B, and NIR
    print(f'\t\t--pix norming (step 3/7)...')
    rgbn_tot = red + green + blue + nir
    xa['red_'] = (('y', 'x'), red  / rgbn_tot)
    xa['blue_'] = (('y', 'x'), blue  / rgbn_tot)
    xa['green_'] = (('y', 'x'), green  / rgbn_tot)
    xa['nir_'] = (('y', 'x'), nir  / rgbn_tot)

    # calculate NDVI and SAVI
    # NOTE(review): red_agg is read from band index 2, which this function
    # treats as blue above (red is index 0).  Left unchanged so the features
    # stay comparable with ones already generated - confirm the intended band.
    print(f'\t\t--NDVI and SAVI (step 4/7)...')
    nir_agg = xa.band_data[3].astype(np.float32)
    red_agg = xa.band_data[2].astype(np.float32)
    xa['NDVI'] = ndvi(nir_agg, red_agg).astype(np.float16)
    xa['SAVI'] = savi(nir_agg, red_agg).astype(np.float16)

    del nir_agg, red_agg

    # calculate RGB luminosity
    print(f'\t\t--luminosity (step 5/7)...')
    luminosity = band_data[:3].mean(axis=0) / 255
    xa['luminosity'] = (('y', 'x'), luminosity)

    # mask out shadows and soil for RGI, NDVI, and normed pix colors
    print(f'\t\t--masking (step 6/7)...')
    mask = (luminosity > 0.176) & (luminosity < 0.569)
    masked_rgi = xa.rgi.where(mask)
    masked_ndvi = xa.NDVI.where(mask)
    r_ = xa.red_.where(mask)
    g_ = xa.green_.where(mask)
    b_ = xa.blue_.where(mask)
    n_ = xa.nir_.where(mask)

    print(f'\t\t--adding index data (step 7/7)...')
    records = []
    bins = np.arange(0.1, 1.1, 0.1)

    # NDVI / SAVI statistics are reductions over the whole raster, not over
    # the clipped crown, so they are identical for every crown.  They are
    # computed once, on first need, instead of once per crown.
    # NOTE(review): per-crown statistics may have been intended - confirm
    # against the notebook that trained the model before changing this.
    scene_stats = None

    for _, row in tqdm(crowns.iterrows(), total=len(crowns)):
        geometry = row.geometry

        # luminosity fractions; the denominator counts every pixel of the
        # clipped window, NaN included
        lum = xa.luminosity.rio.clip([geometry]).to_numpy().flatten()
        lum_fracs = _bin_fractions(lum, bins, lum.shape[0])

        # rgi fractions over unmasked pixels only
        rgi_pix = _clip_valid(masked_rgi, geometry)
        rgi_tot = len(rgi_pix)
        if rgi_tot == 0:
            rgi_fracs = [-99] * 10
        else:
            rgi_fracs = _bin_fractions(rgi_pix, bins, rgi_tot)

        # normed pixel colour fractions; all four share the red pixel count
        r = _clip_valid(r_, geometry)
        g = _clip_valid(g_, geometry)
        b = _clip_valid(b_, geometry)
        n = _clip_valid(n_, geometry)
        c_tot = len(r)
        if c_tot == 0:
            r_fracs = [-99] * 10
            g_fracs = [-99] * 10
            b_fracs = [-99] * 10
            n_fracs = [-99] * 10
        else:
            r_fracs = _bin_fractions(r, bins, c_tot)
            g_fracs = _bin_fractions(g, bins, c_tot)
            b_fracs = _bin_fractions(b, bins, c_tot)
            n_fracs = _bin_fractions(n, bins, c_tot)

        # means and stdevs; -99 marks crowns with no unmasked pixels
        if rgi_tot == 0:
            index_stats = [-99] * 6
            colour_stats = [-99] * 8
        else:
            if scene_stats is None:
                #NOTE: .values * 1 casts 1 item DataArray to float
                scene_stats = {
                    'ndvi_mean': masked_ndvi.mean(skipna=True).values * 1,
                    'ndvi_std': masked_ndvi.std(skipna=True).values * 1,
                    'savi_mean': xa.SAVI.mean(skipna=True).values * 1,
                    'savi_std': xa.SAVI.std(skipna=True).values * 1,
                    }
            index_stats = [
                scene_stats['ndvi_mean'], scene_stats['ndvi_std'],
                rgi_pix.mean(), rgi_pix.std(),
                scene_stats['savi_mean'], scene_stats['savi_std'],
                ]
            colour_stats = [
                r.mean(), r.std(),
                g.mean(), g.std(),
                b.mean(), b.std(),
                n.mean(), n.std(),
                ]

        # unlabelled crowns get the documented -99 sentinel as-is; real
        # labels are rescaled with (label + 1) / 2
        label_value = -99 if label is None else (row[label] + 1) / 2

        records.append(
            [row[IDcolumn], label_value] +
            lum_fracs +
            rgi_fracs +
            r_fracs +
            g_fracs +
            b_fracs +
            n_fracs +
            index_stats +
            colour_stats
            )

    # column order mirrors the order in which each record was assembled
    frac_cols = [
        f'{prefix}{pct}'
        for prefix in ('lum', 'rgi', 'r', 'g', 'b', 'n')
        for pct in range(10, 101, 10)
        ]
    cols = (
        [IDcolumn, 'label'] +
        frac_cols +
        ['ndvi_mean', 'ndvi_std', 'rgi_mean', 'rgi_std', 'savi_mean', 'savi_std',
         'r_mean', 'r_std', 'g_mean', 'g_std', 'b_mean', 'b_std', 'n_mean', 'n_std']
        )

    out_file = save_path / f'features_{y}_{gk}.parquet'
    pd.DataFrame(records, columns=cols).to_parquet(out_file)
    print(y, gk, 'saved to ', str(out_file))


In [3]:
# make sure the feature output directory (and any missing parents) exists
save_path.mkdir(parents=True, exist_ok=True)

def treatment_keys(gk):
    '''
    Returns the list of treatment codes for gk:
    [0, 3, 12, 15] when gk equals 250, otherwise [3, 12, 15].
    '''
    codes = [3, 12, 15]
    return [0] + codes if gk == 250 else codes


In [4]:
# Report how many crowns each spectral-crown file holds.  Only *.parquet
# entries are read, so a stray file or folder in the directory cannot break
# the cell.
for crown_file in spectral_crowns_dir.glob('*.parquet'):
    print(f'{crown_file.name} has {len(gpd.read_parquet(crown_file))} crowns')

crowns_2000.parquet has 1519 crowns
crowns_100.parquet has 97 crowns
crowns_500.parquet has 439 crowns
crowns_1000.parquet has 816 crowns
crowns_250.parquet has 223 crowns


In [5]:
y = 2022 #2020 2018
r = 100

tif_path = naip_dir / str(y) / f'{y}.vrt'

# read crowns
crown_path = spectral_crowns_dir / f'crowns_{r}.parquet'
crowns = gpd.read_parquet(crown_path)
# get the extent of the crowns
xmin, ymin, xmax, ymax = crowns.total_bounds

# clip the naip image
# The window is cut out BEFORE the dtype cast so that only the pixels inside
# the crown extent are converted and held in memory, rather than casting the
# whole mosaic first and discarding most of it.
print('\t--clipping image to total bounds ...')
xa = (
    rioxarray.open_rasterio(tif_path)
    .rio.clip_box(
        minx=xmin,
        miny=ymin,
        maxx=xmax,
        maxy=ymax
        )
    .astype(np.float16)
    .to_dataset(name='band_data')
    )

# for file labeling
gk = f'geomorph_{r}'

# make inputs
make_model_inputs(
    crowns,
    xa,
    save_path, y, gk,
    label=None,
    IDcolumn='UniqueID'
    )

	--clipping image to total bounds ...
		--normalizing (step 1/7)...
		--calculating RGI (step 2/7)...
		--pix norming (step 3/7)...
		--NDVI and SAVI (step 4/7)...
		--luminosity (step 5/7)...
		--masking (step 6/7)...
		--adding index data (step 7/7)...


  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwarg

2022 geomorph_100 saved to  /home/michael/TreeMortality/data/helena/features/features_2022_geomorph_100.parquet





In [5]:
scales = [100, 250, 500, 1000, 2000]
years = [2018, 2020, 2022]

for i, y in enumerate(years):
    print(f'--------------- {y} - ({i+1}/{len(years)}) ----------------')
    tif_path = naip_dir / str(y) / f'{y}.vrt'
    # open the year's mosaic once and reuse it for every scale
    naip = rioxarray.open_rasterio(tif_path)
    for r in scales:
        # read crowns
        crown_path = spectral_crowns_dir / f'crowns_{r}.parquet'
        crowns = gpd.read_parquet(crown_path)
        # get the extent of the crowns
        xmin, ymin, xmax, ymax = crowns.total_bounds

        # clip the naip image
        # The window is cut out BEFORE the dtype cast so that only the pixels
        # inside the crown extent are converted and held in memory.
        print('\t--clipping image to total bounds ...')
        xa = (
            naip
            .rio.clip_box(
                minx=xmin,
                miny=ymin,
                maxx=xmax,
                maxy=ymax
                )
            .astype(np.float16)
            .to_dataset(name='band_data')
            )

        # for file labeling
        gk = f'geomorph_{r}'

        # make inputs
        make_model_inputs(
            crowns,
            xa,
            save_path, y, gk,
            label=None,
            IDcolumn='UniqueID'
            )



--------------- 2018 - (1/3) ----------------
	--clipping image to total bounds ...
		--normalizing (step 1/7)...
		--calculating RGI (step 2/7)...
		--pix norming (step 3/7)...
		--NDVI and SAVI (step 4/7)...
		--luminosity (step 5/7)...
		--masking (step 6/7)...
		--adding index data (step 7/7)...


  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwarg

2018 geomorph_100 saved to  /home/michael/TreeMortality/data/helena/features_2018_geomorph_100.parquet
	--clipping image to total bounds ...
		--normalizing (step 1/7)...
		--calculating RGI (step 2/7)...
		--pix norming (step 3/7)...
		--NDVI and SAVI (step 4/7)...
		--luminosity (step 5/7)...
		--masking (step 6/7)...
		--adding index data (step 7/7)...


  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwarg

: 

: 

We will use the model which was tuned and trained in `src/mortality_classification.ipynb`.  It was pickled.

In [None]:
# load model created in src/mortality_classification.ipynb
# NOTE: unpickling can execute arbitrary code - only load a model file that
# was produced by this project.
pickle_path = Path.cwd() / 'RF_model.sav'
# the context manager closes the file handle once the model is loaded
with open(pickle_path, 'rb') as model_file:
    model = pickle.load(model_file)

Now we will make model predictions for the samples and create a timeseries of survival probabilities for each sample over the years for which we have NAIP data.

In [None]:
# Keys to iterate over when building per-sample predictions.
# NOTE(review): `dict_of_samples` is not defined anywhere in the visible part
# of this notebook - confirm it exists before running this cell.
# NOTE(review): this assignment rebinds `treatment_keys`, which names a
# function earlier in the notebook; that function is unreachable afterwards.
treatment_keys = dict_of_samples.keys()
geomorphon_keys = range(1,11)
years = [2018, 2020, 2022]

# Placeholder loop: the body is a bare Ellipsis, so nothing is computed yet.
for tk in treatment_keys:
    for gk in geomorphon_keys:
        ...
def sample_mortality_timeseries(sample_dict, years, clf=None):
    '''
    Takes a dict of years for a given sample,
    returns a df of probabilities of being alive
    by year.

    params:
        sample_dict - dict - maps each year to a feature DataFrame holding a
                             'UniqueID' column plus the model features; 'y'
                             and 'label' columns are dropped if present.
        years       - iterable - years to predict, in output column order.
        clf         - fitted classifier exposing predict_proba.  Defaults to
                             the notebook-level `model`.

    returns:
        DataFrame with a 'UniqueID' column and one 'pred_<year>' column per
        year holding the second column of predict_proba.  Years are combined
        with an outer merge on 'UniqueID', so an ID missing from a year gets
        NaN for that year.  An empty `years` yields an empty frame with only
        the 'UniqueID' column.
    '''
    # fall back to the notebook-level model when no classifier is passed
    if clf is None:
        clf = model

    t_series = None
    for y in years:
        features = sample_dict[y]
        # everything except bookkeeping columns is a model input
        X = features.drop(['y', 'label', 'UniqueID'], axis=1, errors='ignore')
        yearly = pd.DataFrame({
            'UniqueID': features['UniqueID'].to_numpy(),
            f'pred_{y}': clf.predict_proba(X)[:, 1],
            })
        if t_series is None:
            t_series = yearly
        else:
            # match rows on the ID values themselves, keeping every year
            t_series = t_series.merge(yearly, on='UniqueID', how='outer')

    if t_series is None:
        return pd.DataFrame(columns=['UniqueID'])

    return t_series


