In this notebook we make predictions for all of the crowns which have treatment and geomorphon information attached.  We will then attach the predictions to the crown df and save to `/TreeMortality/data/helena/predictions.parquet`.

In [17]:
import json
import os
import pickle
from pathlib import Path
from joblib import Parallel, delayed
from statistics import mode

import geopandas as gpd
import matplotlib.pyplot as plt
import numpy as np
import optuna
import pandas as pd
import rioxarray

from xrspatial import focal, slope
import seaborn as sns
from tqdm import tqdm
from joblib_progress import joblib_progress
from xrspatial.multispectral import ndvi, savi
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (confusion_matrix, ConfusionMatrixDisplay)
from sklearn.model_selection import train_test_split, cross_val_score

from sklearn.model_selection import RandomizedSearchCV as RSCV
from sklearn.svm import SVC
from sklearn.metrics import roc_auc_score, accuracy_score, log_loss


In [18]:
# Input locations, all resolved relative to the `data` directory that sits
# beside the current working directory.
helena_path = Path.cwd().parent.joinpath('data', 'helena')
feature_dir = helena_path.joinpath('features')
crowns_path = helena_path.joinpath('spectral_crowns', 'crowns.parquet')

In [19]:
# years we will be looking at
years = [2018, 2020, 2022]

# Map each year to its chunked feature files.  os.listdir returns names in
# arbitrary order, so the listing is sorted once up front: this keeps the file
# order (and therefore which copy of a duplicated row is encountered first
# when the chunks are concatenated) reproducible across machines and runs.
feature_files = sorted(os.listdir(feature_dir))
parquets = {
    y: [feature_dir / p for p in feature_files if f'features_{y}_' in p]
    for y in years
}

# preview the files found for 2018
parquets[2018]

[PosixPath('/home/michael/TreeMortality/data/helena/features/features_2018_1.parquet'),
 PosixPath('/home/michael/TreeMortality/data/helena/features/features_2018_2.parquet'),
 PosixPath('/home/michael/TreeMortality/data/helena/features/features_2018_4.parquet'),
 PosixPath('/home/michael/TreeMortality/data/helena/features/features_2018_17.parquet'),
 PosixPath('/home/michael/TreeMortality/data/helena/features/features_2018_10.parquet'),
 PosixPath('/home/michael/TreeMortality/data/helena/features/features_2018_13.parquet'),
 PosixPath('/home/michael/TreeMortality/data/helena/features/features_2018_8.parquet'),
 PosixPath('/home/michael/TreeMortality/data/helena/features/features_2018_16.parquet'),
 PosixPath('/home/michael/TreeMortality/data/helena/features/features_2018_3.parquet'),
 PosixPath('/home/michael/TreeMortality/data/helena/features/features_2018_21.parquet'),
 PosixPath('/home/michael/TreeMortality/data/helena/features/features_2018_18.parquet'),
 PosixPath('/home/michael/

We will use the pickled model that was tuned and trained in `src/mortality_classification.ipynb`.

In [20]:
# load model created in src/mortality_classification.ipynb
# NOTE(review): unpickling can execute arbitrary code; only load a model file
# that was produced by a trusted run of that notebook.
pickle_path = Path.cwd() / 'RF_model.sav'

# open via a context manager so the file handle is closed once loading is done
with open(pickle_path, 'rb') as model_file:
    model = pickle.load(model_file)

Now we will make model predictions for the samples and create a timeseries of survival probabilities for each sample over the years for which we have NAIP data.

In [45]:
# Build one single-column prediction frame per year, indexed by UniqueID.
# The feature columns the model expects do not change between years, so they
# are looked up once before the loop.
feature_cols = list(model.feature_names_in_)

pred_list = []
for y in years:

    # read every feature chunk for this year and stack them into one frame
    chunks = [pd.read_parquet(parquet_file) for parquet_file in parquets[y]]
    features = pd.concat(chunks)

    # chunked feature creation can emit the same crown more than once;
    # keep only the first row seen for each UniqueID
    features = features.drop_duplicates(subset='UniqueID')

    # second column of predict_proba, i.e. the probability of the model's
    # second class -- presumably survival; TODO confirm against model.classes_
    proba = model.predict_proba(features[feature_cols])[:, 1]

    # one column named after the year, indexed by UniqueID
    pred_list.append(
        pd.DataFrame(
            {f'pred_{y}': proba},
            index=pd.Index(features['UniqueID'], name='UniqueID'),
        )
    )

In [46]:
# Sanity checks on the per-year prediction frames: every year should cover the
# same set of UniqueIDs, and no UniqueID should repeat within a year.
# Each year is compared against the first one so the check works for any
# number of years instead of being tied to exactly three.
ii = [set(d.index) for d in pred_list]
print('indices match:', all(ids == ii[0] for ids in ii))

# one flag per year: True when that year's index has no repeated UniqueID
is_unique = [d.index.is_unique for d in pred_list]
is_unique

indices match: True


[True, True, True]

In [59]:
# join all years into one df
pred_df = pd.concat(pred_list, axis=1)
len(pred_df)

46685

In [61]:

# columns of the crowns file that are carried through to the output
crown_cols = [
    'UniqueID',
    'area',
    'zq95',
    'treatment',
    'geomorph_100',
    'geomorph_250',
    'geomorph_500',
    'geomorph_1000',
    'geomorph_2000',
    'geometry',
]

# read the crowns, keep only those columns and index them by UniqueID
crowns = gpd.read_parquet(crowns_path)[crown_cols].set_index('UniqueID')

# Inner join of crowns and predictions on the UniqueID index, then move
# UniqueID back to an ordinary column.
# NOTE(review): the recorded outputs show 46685 rows in pred_df but 51200 rows
# after this join, which suggests some UniqueIDs occur more than once in the
# crowns file; duplicates are dropped in a later cell.
crowns = crowns.merge(
    pred_df,
    how='inner',
    left_index=True,
    right_index=True,
)
crowns = crowns.reset_index()

print(len(crowns), ' crowns.')
crowns.head()

51200  crowns.


Unnamed: 0,UniqueID,area,zq95,treatment,geomorph_100,geomorph_250,geomorph_500,geomorph_1000,geomorph_2000,geometry,pred_2018,pred_2020,pred_2022
0,10N_483827_4524602,21.5787,25.335,3,9,6,6,3,5,"POLYGON ((483828.120 4524603.000, 483828.200 4...",0.072072,0.746866,0.198198
1,10N_483828_4512551,56.173,30.962,3,5,9,7,7,7,"POLYGON ((483831.730 4512550.380, 483831.830 4...",0.804042,0.395577,0.099099
2,10N_483828_4514248,40.9836,10.44,3,3,2,2,2,3,"POLYGON ((483831.960 4514248.890, 483831.950 4...",0.927923,0.963029,0.873504
3,10N_483829_4514263,42.712,7.738,3,3,2,2,2,3,"POLYGON ((483831.490 4514263.700, 483831.870 4...",0.799634,0.909905,0.818944
4,10N_483830_4510126,57.1359,8.5665,3,6,3,3,3,6,"POLYGON ((483834.480 4510126.440, 483834.470 4...",0.745946,0.936843,0.162162


In [72]:
# True when no UniqueID appears more than once in crowns
crown_ids = crowns.UniqueID
len(set(crown_ids)) == len(crown_ids)

True

In [73]:
# keep only the first row for each UniqueID
crowns = crowns.drop_duplicates(subset='UniqueID')

In [74]:
# re-check: True when every UniqueID in crowns occurs exactly once
crown_ids = crowns.UniqueID
len(set(crown_ids)) == len(crown_ids)

True

In [70]:
# write the crowns together with their per-year predictions to disk
# NOTE(review): the recorded execution counts are out of order (this cell is
# In [70], the de-duplication cell is In [73]); restart and run all cells to
# confirm the saved file reflects the de-duplicated crowns.
predictions_path = helena_path / 'predictions.parquet'
crowns.to_parquet(predictions_path)