In [2]:
import json
import os
import pickle
from pathlib import Path
from joblib import Parallel, delayed
from statistics import mode

import geopandas as gpd
import matplotlib.pyplot as plt
import numpy as np
import optuna
import pandas as pd
import rioxarray

from xrspatial import focal, slope
import seaborn as sns
from tqdm import tqdm
from joblib_progress import joblib_progress
from xrspatial.multispectral import ndvi, savi
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (confusion_matrix, ConfusionMatrixDisplay)
from sklearn.model_selection import train_test_split, cross_val_score

from sklearn.model_selection import RandomizedSearchCV as RSCV
from sklearn.svm import SVC
from sklearn.metrics import roc_auc_score, accuracy_score, log_loss


In [3]:
# Data locations, resolved relative to the parent of the current working directory.
helena_path = Path.cwd().parent.joinpath('data', 'helena')
feature_dir = helena_path.joinpath('features')

In [5]:
# Collect the file names (not full paths) of the feature tables built with
# 100 m geomorphons, in sorted order.
parquets = sorted(
    fname for fname in os.listdir(feature_dir) if fname.endswith('_100.parquet')
)

parquets

['features_2018_geomorph_100.parquet',
 'features_2020_geomorph_100.parquet',
 'features_2022_geomorph_100.parquet']

We will use the model that was tuned and trained in `src/mortality_classification.ipynb` and then saved to disk as a pickle file.

In [7]:
# load model created in src/mortality_classification.ipynb
# NOTE: unpickling can execute arbitrary code, so only load a file that you
# created yourself and trust.
pickle_path = Path.cwd() / 'RF_model.sav'
# Open the file in a context manager so the handle is closed as soon as the
# model has been loaded, instead of being left open for the kernel's lifetime.
with open(pickle_path, 'rb') as model_file:
    model = pickle.load(model_file)

Now we will make model predictions for each sample and assemble a time series of survival probabilities per sample across the years for which we have NAIP data.

In [12]:
# Build one prediction table per feature file (one file per year), then merge
# them into a single table keyed on UniqueID with one `pred_<year>` column per
# year.

# Feature columns the model was fitted on. These do not change between files,
# so look them up once instead of on every iteration.
cols = list(model.feature_names_in_)

yearly_predictions = []
for f in parquets:
    print(f'-------{f}----------')

    # The year is the second underscore-separated token of the file name,
    # e.g. 'features_2018_geomorph_100.parquet' -> '2018'.
    year = f.split('_')[1]

    # read parquet, make input feature df (X) restricted to the model's columns
    df = pd.read_parquet(feature_dir / f)
    X = df[cols]

    # Keep the year in its own variable: reusing one name for both the year
    # and the output frame would put a DataFrame repr into the column name.
    yearly_predictions.append(
        pd.DataFrame({
            'UniqueID': df['UniqueID'],
            # Column 1 of predict_proba is the probability of the model's
            # second class; presumably "survival" per the surrounding text --
            # TODO confirm against model.classes_.
            f'pred_{year}': model.predict_proba(X)[:, 1],
        })
    )

# Fold every yearly table into one. merge(on='UniqueID') matches on the
# UniqueID column of both frames (DataFrame.join(on=...) would match against
# the other frame's index instead), and looping over all tables keeps every
# year rather than only the first pair. An outer merge keeps samples that are
# missing in some years; their predictions for those years are NaN.
predictions = yearly_predictions[0]
for year_df in yearly_predictions[1:]:
    predictions = predictions.merge(year_df, on='UniqueID', how='outer')


-------features_2018_geomorph_100.parquet----------


ValueError: The feature names should match those that were passed during fit.
Feature names unseen at fit time:
- b100
- b70
- b80
- b90
- g10
- ...
