In [1]:
import glob
import os
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns
import rasterio as rio

import sklearn
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.metrics import classification_report 
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix

import imblearn as imb
from imblearn.under_sampling import RandomUnderSampler

import scipy
import itertools
import random
import ast

## Data prep

In [2]:
def img_to_df(img_path, cols):
    with rio.open(img_path) as src:
        as_array = src.read()
        band_arrs = []
        
        for i in np.arange(src.count):
            band_arr = as_array[i].flatten()
            band_arr = band_arr/10000
            
            band_arrs.append(band_arr)
        
        df = pd.DataFrame(band_arrs,cols).T
        df['Date'] = img_path.split('/')[-1].split('_')[0]
        
        # Optional: drops entries with no severity rating. Comment out if unnecessary.
#         df = df.dropna(subset=['total_dis_sev']) 
        
        return df

In [3]:
# def add_vis(df):
#     df['pri']= (df['green']- df['greenI'])/(df['green']+df['greenI'])
#     df['ndre'] = (df['nir'] - df['rede']) / (df['nir'] + df['rede'])
#     df['tcari'] = 3 * (( (df['rede'] - df['red']) - 0.2 * (df['rede'] - df['green']) ) * (df['rede'] / df['red']))
#     df['ndvi']= (df['nir']-df['red'])/(df['nir']+df['red'])
#     df['evi']= 2.5*(df['nir']-df['red'])/(df['nir']+6*df['red']-7.5*df['blue']+1)
#     df['savi']= 1.5*(df['nir']-df['red'])/(df['nir']+df['red']+0.5)
#     df['arvi']=(df['nir']-(2*df['red']-df['blue']))/(df['nir']+(2*df['red']-df['blue']))
#     df['grvi'] = (df['green']-df['red'])/(df['green']+df['red'])

#     return df

In [4]:
data_dir = '/Users/kathleenkanaley/Desktop/grapes_from_space/data/'
dis_ras_20 = glob.glob(data_dir +'images/2020/PScope/dis_band_*/20*.tif')
dis_ras_21 = glob.glob(data_dir +'images/2021/PScope/dis_band_*/20*.tif')
dis_ras_22 = glob.glob(data_dir +'images/2022/PScope/dis_band_*/20*.tif')

In [5]:
# Get dataframes for all years
def data_prep_ps(rasters, cols):
    dfs = [img_to_df(rasters[i], cols) for i in np.arange(len(rasters))]
    df = pd.concat(dfs)
    df['total_dis_sev'] = df['total_dis_sev']*10000
    df['dm_sev'] = df['dm_sev']*10000
    df['dm_inc'] = df['dm_inc']*10000
    df['inc_class'] = 0
    df.loc[df['dm_inc']>25,'inc_class']= 1
    df['severity_class'] = 0
    df.loc[df['dm_sev']>10,'severity_class']= 1
    #df = add_vis(df)
    return df 
    
    
# dfs_2020 = [img_to_df(dis_ras_20[i], cols) for i in np.arange(len(dis_ras_20))]
# df_2020 = pd.concat(dfs_2020)

# df_2020['total_dis_sev'] = df_2020['total_dis_sev']*10000
# df_2020['dm_sev'] = df_2020['dm_sev']*10000
# df_2020['dm_inc'] = df_2020['dm_inc']*10000

# df_2020['inc_class'] = 'low'
# df_2020.loc[df_2020['dm_inc']>25,'inc_class']= 'high'
    
# df_2020['severity_class'] = 'low'
# df_2020.loc[df_2020['dm_sev']>10,'severity_class']= 'high'

# vis_2020 = add_vis(df_2020)

# vis_2020.columns

In [6]:
cols = ['c_blue','blue','greenI', 'green', 'yellow', 'red', 'rede', 'nir','total_dis_sev','dm_sev', 'dm_inc']
ps_dfs = []

for ras in [dis_ras_20, dis_ras_21, dis_ras_22]:
    ps_df = data_prep_ps(ras,cols)
    ps_dfs.append(ps_df)
    
len(ps_dfs)

3

In [7]:
ps_dfs[0].Date.unique()

array(['20200801', '20200713', '20200616'], dtype=object)

In [8]:
ps_dfs[1].Date.unique()

array(['20210808', '20210804', '20210726', '20210715'], dtype=object)

In [9]:
ps_dfs[1] = ps_dfs[1][ps_dfs[1].Date !='20210804']

In [10]:
ps_dfs[2].Date.unique()

array(['20220726', '20220801', '20220828', '20220630', '20220704',
       '20220624'], dtype=object)

In [11]:
ps_dfs[2] = ps_dfs[2][ps_dfs[2].Date!='20220828']

In [12]:
for date in (ps_dfs[2].Date.unique()):
    print(date)

20220726
20220801
20220630
20220704
20220624


In [13]:
ps_dfs[0].columns

Index(['c_blue', 'blue', 'greenI', 'green', 'yellow', 'red', 'rede', 'nir',
       'total_dis_sev', 'dm_sev', 'dm_inc', 'Date', 'inc_class',
       'severity_class'],
      dtype='object')

In [14]:
# set up variables
data_2022 = ps_dfs[2]
data_2021 = ps_dfs[1]
data_2020 = ps_dfs[0]
#data_all = pd.concat([data_2020,data_2021,data_2022])
sev = 'severity_class'
#inc = 'inc_class'
sbs = ['c_blue', 'blue', 'greenI', 'green', 'yellow', 'red', 'rede', 'nir']
# vis = ['pri', 'ndre', 'tcari', 'ndvi', 'evi', 'savi', 'arvi','grvi']

In [15]:
def data_prep(data, expl_vars, resp_var, state, seed):
    X = data[expl_vars]
    y = data[resp_var]

    rus = RandomUnderSampler(random_state=state)
    X_rus, y_rus = rus.fit_resample(X, y)

    mm = MinMaxScaler()
    X_scaled = mm.fit_transform(X_rus)
    
    return X_scaled, y_rus #features (scaled) and labels (unscaled)

In [16]:
# def bal_split_norm(data, expl_vars, resp_var, state, seed):
#     X = data[expl_vars]
#     y = data[resp_var]

#     rus = RandomUnderSampler(random_state=state)
#     X_rus, y_rus = rus.fit_resample(X, y)

#     # Split dataset into training set and test set
#     X_train, X_test, y_train, y_test = train_test_split(X_rus, y_rus,
#                                                         test_size=0.3, 
#                                                         random_state=seed,
#                                                         stratify = y_rus)  # 70% training and 30% test
#     mm = MinMaxScaler()
#     X_train_scaled = mm.fit_transform(X_train)
#     X_test_scaled = mm.transform(X_test)
    
#     return X_train_scaled, y_train, X_test_scaled, y_test

In [17]:
# training data

Xtrain_2020,ytrain_2020 = data_prep(data_2020,sbs,sev,2020,14)


Xtrain_2021,ytrain_2021 = data_prep(data_2021,sbs,sev,2021,14)


Xtrain_2022,ytrain_2022 = data_prep(data_2022,sbs,sev,2022,14)

In [19]:
# testing data

mm=MinMaxScaler()

Xtest_2020all = mm.fit_transform(data_2020[sbs])
ytest_2020all = data_2020[sev].to_numpy().reshape(-1, 1)

Xtest_2021all = mm.fit_transform(data_2021[sbs])
ytest_2021all = data_2021[sev].to_numpy().reshape(-1, 1)

Xtest_2022all = mm.fit_transform(data_2022[sbs])
ytest_2022all = data_2022[sev].to_numpy().reshape(-1, 1)

In [22]:
# hyperparams

hp_2020 = {'bootstrap': True,
 'max_depth': 3,
 'max_features': 'sqrt',
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'n_estimators': 200}

hp_2021 = {'bootstrap': True,
 'max_depth': 7,
 'max_features': 'log2',
 'min_samples_leaf': 1,
 'min_samples_split': 12,
 'n_estimators': 100}

hp_2022 = {'bootstrap': False,
 'max_depth': 7,
 'max_features': 'log2',
 'min_samples_leaf': 37,
 'min_samples_split': 23,
 'n_estimators': 900}

hp_all = {'bootstrap': False,
 'max_depth': 8,
 'max_features': 'sqrt',
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'n_estimators': 700}

In [23]:
# Create RF classifiers
rfc_2020 = RandomForestClassifier().set_params(**hp_2020)
rfc_2021 = RandomForestClassifier().set_params(**hp_2021)
rfc_2022 = RandomForestClassifier().set_params(**hp_2022)

In [24]:
# Fit RF classifiers
rfc_2020.fit(Xtrain_2020, ytrain_2020)
rfc_2021.fit(Xtrain_2021, ytrain_2021)
rfc_2022.fit(Xtrain_2022, ytrain_2022)

In [25]:
# 2020 applied to 2021
ypredict_20on21 = rfc_2020.predict(Xtest_2021all)
cf_matrix = confusion_matrix(ytest_2021all, ypredict_20on21)
print('Classification metrics\n train = 2020, test=2021\n')
print(cf_matrix)
report = classification_report(ytest_2021all, ypredict_20on21)
print(report)

# 2020 applied to 2022
ypredict_20on22 = rfc_2020.predict(Xtest_2022all)
cf_matrix = confusion_matrix(ytest_2022all, ypredict_20on22)
print('Classification metrics\n train = 2020, test=2022\n')
print(cf_matrix)
report = classification_report(ytest_2022all, ypredict_20on22)
print(report)

Classification metrics
 train = 2020, test=2021

[[3759   29]
 [ 115    3]]
              precision    recall  f1-score   support

           0       0.97      0.99      0.98      3788
           1       0.09      0.03      0.04       118

    accuracy                           0.96      3906
   macro avg       0.53      0.51      0.51      3906
weighted avg       0.94      0.96      0.95      3906

Classification metrics
 train = 2020, test=2022

[[6315   31]
 [ 164    0]]
              precision    recall  f1-score   support

           0       0.97      1.00      0.98      6346
           1       0.00      0.00      0.00       164

    accuracy                           0.97      6510
   macro avg       0.49      0.50      0.49      6510
weighted avg       0.95      0.97      0.96      6510



In [26]:
## 2021 applied to 2020
ypredict_21on20 = rfc_2021.predict(Xtest_2020all)
cf_matrix = confusion_matrix(ytest_2020all, ypredict_21on20)
print('Classification metrics\n train = 2021, test=2020\n')
print(cf_matrix)
report = classification_report(ytest_2020all, ypredict_21on20)
print(report)

## 2021 applied to 2022
ypredict_21on22 = rfc_2021.predict(Xtest_2022all)
cf_matrix = confusion_matrix(ytest_2022all, ypredict_21on22)
print('Classification metrics\n train = 2021, test=2022\n')
print(cf_matrix)
report = classification_report(ytest_2022all, ypredict_21on22)
print(report)

Classification metrics
 train = 2021, test=2020

[[2668   18]
 [  20    0]]
              precision    recall  f1-score   support

           0       0.99      0.99      0.99      2686
           1       0.00      0.00      0.00        20

    accuracy                           0.99      2706
   macro avg       0.50      0.50      0.50      2706
weighted avg       0.99      0.99      0.99      2706

Classification metrics
 train = 2021, test=2022

[[6249   97]
 [ 164    0]]
              precision    recall  f1-score   support

           0       0.97      0.98      0.98      6346
           1       0.00      0.00      0.00       164

    accuracy                           0.96      6510
   macro avg       0.49      0.49      0.49      6510
weighted avg       0.95      0.96      0.95      6510



In [27]:
## 2022 applied to 2020
ypredict_22on20 = rfc_2022.predict(Xtest_2020all)
cf_matrix = confusion_matrix(ytest_2020all, ypredict_22on20)
print('Classification metrics\n train = 2022, test=2020\n')
print(cf_matrix)
report = classification_report(ytest_2020all, ypredict_22on20)
print(report)

## 2022 applied to 2021
ypredict_22on21 = rfc_2022.predict(Xtest_2021all)
cf_matrix = confusion_matrix(ytest_2021all, ypredict_22on21)
print('Classification metrics\n train = 2022, test=2021\n')
print(cf_matrix)
report = classification_report(ytest_2021all, ypredict_22on21)
print(report)

Classification metrics
 train = 2022, test=2020

[[2311  375]
 [   0   20]]
              precision    recall  f1-score   support

           0       1.00      0.86      0.92      2686
           1       0.05      1.00      0.10        20

    accuracy                           0.86      2706
   macro avg       0.53      0.93      0.51      2706
weighted avg       0.99      0.86      0.92      2706

Classification metrics
 train = 2022, test=2021

[[3199  589]
 [ 107   11]]
              precision    recall  f1-score   support

           0       0.97      0.84      0.90      3788
           1       0.02      0.09      0.03       118

    accuracy                           0.82      3906
   macro avg       0.49      0.47      0.47      3906
weighted avg       0.94      0.82      0.88      3906

