In [1]:
import glob
import os
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns
import rasterio as rio

import sklearn
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.metrics import classification_report 
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix

import imblearn as imb
from imblearn.under_sampling import RandomUnderSampler

import scipy
import itertools
import random
import ast

## Data prep

In [2]:
# Open img-scout dfs

data_dir = '/Users/kathleenkanaley/Desktop/grapes_from_space/data/'
data_20 = pd.read_csv(data_dir +'img_scout_dfs/2020/smr_vis_skysat_scout_2020.csv').dropna()
data_21 = pd.read_csv(data_dir +'img_scout_dfs/2021/smr_vis_skysat_scout_2021.csv').dropna()
data_22 = pd.read_csv(data_dir +'img_scout_dfs/2022/smr_vis_skysat_scout_2022.csv').dropna()

data_21 = data_21[data_21['Date']!='2021-08-03']
data_20 = data_20[data_20['Date']!='2020-09-09']

img_dfs = [data_20, data_21, data_22]
#all_years = pd.concat(img_dfs)

In [3]:
prepped_dfs = []
for df in img_dfs:
    df = df.rename(columns={"green_red": "grvi"})
    
    # Create binary numeric label
    # 0 = low GDM
    # 1 = high GDM

    df['GDM_sev']=0
    df.loc[df['DM_severity'] > 10, 'GDM_sev'] =1
    
    df['GDM_inc']=0
    df.loc[df['DM_inc'] > 25, 'GDM_inc'] = 1
    prepped_dfs.append(df)

In [8]:
# set up variables
data_2022 = prepped_dfs[2]
data_2021 = prepped_dfs[1]
data_2020 = prepped_dfs[0]
#data_all = pd.concat([data_2020,data_2021,data_2022])
sev = 'GDM_sev'
#inc = 'GDM_inc'
sbs = ['blue','green','red','nir']


In [9]:
def data_prep(data, expl_vars, resp_var, state, seed):
    X = data[expl_vars]
    y = data[resp_var]

    rus = RandomUnderSampler(random_state=state)
    X_rus, y_rus = rus.fit_resample(X, y)

    mm = MinMaxScaler()
    X_scaled = mm.fit_transform(X_rus)
    
    return X_scaled, y_rus #features (scaled) and labels (unscaled)

In [10]:
# def bal_split_norm(data, expl_vars, resp_var, state, seed):
#     X = data[expl_vars]
#     y = data[resp_var]

#     rus = RandomUnderSampler(random_state=state)
#     X_rus, y_rus = rus.fit_resample(X, y)

#     # Split dataset into training set and test set
#     X_train, X_test, y_train, y_test = train_test_split(X_rus, y_rus,
#                                                         test_size=0.3, 
#                                                         random_state=seed,
#                                                         stratify = y_rus)  # 70% training and 30% test
#     mm = MinMaxScaler()
#     X_train_scaled = mm.fit_transform(X_train)
#     X_test_scaled = mm.transform(X_test)
    
#     return X_train_scaled, y_train, X_test_scaled, y_test

In [11]:
# training data

Xtrain_2020,ytrain_2020 = data_prep(data_2020,sbs,sev,2020,14)


Xtrain_2021,ytrain_2021 = data_prep(data_2021,sbs,sev,2021,14)


Xtrain_2022,ytrain_2022 = data_prep(data_2022,sbs,sev,2022,14)

In [12]:
# testing data

mm=MinMaxScaler()

Xtest_2020all = mm.fit_transform(data_2020[sbs])
ytest_2020all = data_2020[sev].to_numpy().reshape(-1, 1)

Xtest_2021all = mm.fit_transform(data_2021[sbs])
ytest_2021all = data_2021[sev].to_numpy().reshape(-1, 1)

Xtest_2022all = mm.fit_transform(data_2022[sbs])
ytest_2022all = data_2022[sev].to_numpy().reshape(-1, 1)

In [15]:
# hyperparams

hp_2020 = {'bootstrap': True,
 'max_depth': 13,
 'max_features': 'sqrt',
 'min_samples_leaf': 1,
 'min_samples_split': 12,
 'n_estimators': 100}

hp_2021 = {'bootstrap': True,
 'max_depth': 7,
 'max_features': 'log2',
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'n_estimators': 700}

hp_2022 = {'bootstrap': False,
 'max_depth': 3,
 'max_features': 'sqrt',
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'n_estimators': 100}

hp_all = {'bootstrap': True,
 'max_depth': 12,
 'max_features': 'sqrt',
 'min_samples_leaf': 1,
 'min_samples_split': 12,
 'n_estimators': 100}

In [16]:
# Create RF classifiers
rfc_2020 = RandomForestClassifier().set_params(**hp_2020)
rfc_2021 = RandomForestClassifier().set_params(**hp_2021)
rfc_2022 = RandomForestClassifier().set_params(**hp_2022)

In [17]:
# Fit RF classifiers
rfc_2020.fit(Xtrain_2020, ytrain_2020)
rfc_2021.fit(Xtrain_2021, ytrain_2021)
rfc_2022.fit(Xtrain_2022, ytrain_2022)

In [18]:
# 2020 applied to 2021
ypredict_20on21 = rfc_2020.predict(Xtest_2021all)
cf_matrix = confusion_matrix(ytest_2021all, ypredict_20on21)
print('Classification metrics\n train = 2020, test=2021\n')
print(cf_matrix)
report = classification_report(ytest_2021all, ypredict_20on21)
print(report)

# 2020 applied to 2022
ypredict_20on22 = rfc_2020.predict(Xtest_2022all)
cf_matrix = confusion_matrix(ytest_2022all, ypredict_20on22)
print('Classification metrics\n train = 2020, test=2022\n')
print(cf_matrix)
report = classification_report(ytest_2022all, ypredict_20on22)
print(report)

Classification metrics
 train = 2020, test=2021

[[800  60]
 [ 89   7]]
              precision    recall  f1-score   support

           0       0.90      0.93      0.91       860
           1       0.10      0.07      0.09        96

    accuracy                           0.84       956
   macro avg       0.50      0.50      0.50       956
weighted avg       0.82      0.84      0.83       956

Classification metrics
 train = 2020, test=2022

[[447 263]
 [  6   4]]
              precision    recall  f1-score   support

           0       0.99      0.63      0.77       710
           1       0.01      0.40      0.03        10

    accuracy                           0.63       720
   macro avg       0.50      0.51      0.40       720
weighted avg       0.97      0.63      0.76       720



In [19]:
## 2021 applied to 2020
ypredict_21on20 = rfc_2021.predict(Xtest_2020all)
cf_matrix = confusion_matrix(ytest_2020all, ypredict_21on20)
print('Classification metrics\n train = 2021, test=2020\n')
print(cf_matrix)
report = classification_report(ytest_2020all, ypredict_21on20)
print(report)

## 2021 applied to 2022
ypredict_21on22 = rfc_2021.predict(Xtest_2022all)
cf_matrix = confusion_matrix(ytest_2022all, ypredict_21on22)
print('Classification metrics\n train = 2021, test=2022\n')
print(cf_matrix)
report = classification_report(ytest_2022all, ypredict_21on22)
print(report)

Classification metrics
 train = 2021, test=2020

[[337 110]
 [ 16  11]]
              precision    recall  f1-score   support

           0       0.95      0.75      0.84       447
           1       0.09      0.41      0.15        27

    accuracy                           0.73       474
   macro avg       0.52      0.58      0.50       474
weighted avg       0.91      0.73      0.80       474

Classification metrics
 train = 2021, test=2022

[[653  57]
 [ 10   0]]
              precision    recall  f1-score   support

           0       0.98      0.92      0.95       710
           1       0.00      0.00      0.00        10

    accuracy                           0.91       720
   macro avg       0.49      0.46      0.48       720
weighted avg       0.97      0.91      0.94       720



In [20]:
## 2022 applied to 2020
ypredict_22on20 = rfc_2022.predict(Xtest_2020all)
cf_matrix = confusion_matrix(ytest_2020all, ypredict_22on20)
print('Classification metrics\n train = 2022, test=2020\n')
print(cf_matrix)
report = classification_report(ytest_2020all, ypredict_22on20)
print(report)

## 2022 applied to 2021
ypredict_22on21 = rfc_2022.predict(Xtest_2021all)
cf_matrix = confusion_matrix(ytest_2021all, ypredict_22on21)
print('Classification metrics\n train = 2022, test=2021\n')
print(cf_matrix)
report = classification_report(ytest_2021all, ypredict_22on21)
print(report)

Classification metrics
 train = 2022, test=2020

[[286 161]
 [ 11  16]]
              precision    recall  f1-score   support

           0       0.96      0.64      0.77       447
           1       0.09      0.59      0.16        27

    accuracy                           0.64       474
   macro avg       0.53      0.62      0.46       474
weighted avg       0.91      0.64      0.73       474

Classification metrics
 train = 2022, test=2021

[[463 397]
 [ 15  81]]
              precision    recall  f1-score   support

           0       0.97      0.54      0.69       860
           1       0.17      0.84      0.28        96

    accuracy                           0.57       956
   macro avg       0.57      0.69      0.49       956
weighted avg       0.89      0.57      0.65       956

