In [1]:
import os
import glob

import pandas as pd
import numpy as np
import rasterio as rio

import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import seaborn as sns

from scipy import stats
#import statannotations
from statannot import add_stat_annotation

import sklearn
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split 
from sklearn.metrics import classification_report 
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_recall_fscore_support
import imblearn as imb
from imblearn.under_sampling import RandomUnderSampler

import scipy

In [2]:
def img_to_df(img_path, cols):
    with rio.open(img_path) as src:
        as_array = src.read()
        band_arrs = []
        
        for i in np.arange(src.count):
            band_arr = as_array[i].flatten()
            band_arr = band_arr/10000
            
            band_arrs.append(band_arr)
        
        df = pd.DataFrame(band_arrs,cols).T
        df['Date'] = img_path.split('/')[-1].split('_')[0]
        
        # Optional: drops entries with no severity rating. Comment out if unnecessary.
#         no_nans = df.dropna(subset=['total_dis_sev']) 
        
        #return no_nans
        return df

In [3]:
def add_vis(df):
    df['pri']= (df['green']- df['greenI'])/(df['green']+df['greenI'])
    df['ndre'] = (df['nir'] - df['rede']) / (df['nir'] + df['rede'])
    df['tcari'] = 3 * (( (df['rede'] - df['red']) - 0.2 * (df['rede'] - df['green']) ) * (df['rede'] / df['red']))
    df['ndvi']= (df['nir']-df['red'])/(df['nir']+df['red'])
    df['evi']= 2.5*(df['nir']-df['red'])/(df['nir']+6*df['red']-7.5*df['blue']+1)
    df['savi']= 1.5*(df['nir']-df['red'])/(df['nir']+df['red']+0.5)
    df['arvi']=(df['nir']-(2*df['red']-df['blue']))/(df['nir']+(2*df['red']-df['blue']))
    df['green_red'] = (df['green']-df['red'])/(df['green']+df['red'])

    return df

In [6]:
data_dir = '/Users/kathleenkanaley/Desktop/grapes_from_space/data/'
dis_ras_20 = glob.glob(data_dir +'images/2020/PScope/dis_band_*/20*.tif')
dis_ras_21 = glob.glob(data_dir +'images/2021/PScope/dis_band_*/20*.tif')
dis_ras_22 = glob.glob(data_dir +'images/2022/PScope/dis_band_*/20*.tif')

In [7]:
dis_ras_20

['/Users/kathleenkanaley/Desktop/grapes_from_space/data/images/2020/PScope/dis_band_2020/20200801_151354_03_2212_3B_AnalyticMS_SR_8b_harmonized_clip_clipped.tif_disease.tif',
 '/Users/kathleenkanaley/Desktop/grapes_from_space/data/images/2020/PScope/dis_band_2020/20200713_151457_44_2278_3B_AnalyticMS_SR_8b_harmonized_clip_clipped.tif_disease.tif',
 '/Users/kathleenkanaley/Desktop/grapes_from_space/data/images/2020/PScope/dis_band_2020/20200616_151605_21_2304_3B_AnalyticMS_SR_8b_harmonized_clip_clipped.tif_disease.tif']

In [10]:
cols = ['c_blue','blue','greenI', 'green', 'yellow', 'red', 'rede', 'nir','total_dis_sev','dm_sev', 'dm_inc']

# 2020

dfs_2020 = [img_to_df(dis_ras_20[i], cols) for i in np.arange(len(dis_ras_20))]
df_2020 = pd.concat(dfs_2020)

df_2020['total_dis_sev'] = df_2020['total_dis_sev']*10000
df_2020['dm_sev'] = df_2020['dm_sev']*10000
df_2020['dm_inc'] = df_2020['dm_inc']*10000

df_2020['inc_class'] = 'low'
df_2020.loc[df_2020['dm_inc']>25,'inc_class']= 'high'
    
df_2020['severity_class'] = 'low'
df_2020.loc[df_2020['dm_sev']>10,'severity_class']= 'high'

vis_2020 = add_vis(df_2020)


#2021
dfs_2021 = [img_to_df(dis_ras_21[i], cols) for i in np.arange(len(dis_ras_21))]
df_2021 = pd.concat(dfs_2021)

df_2021['total_dis_sev'] = df_2021['total_dis_sev']*10000
df_2021['dm_sev'] = df_2021['dm_sev']*10000
df_2021['dm_inc'] = df_2021['dm_inc']*10000

df_2021['inc_class'] = 'low'
df_2021.loc[df_2021['dm_inc']>25,'inc_class']= 'high'
    
df_2021['severity_class'] = 'low'
df_2021.loc[df_2021['dm_sev']>10,'severity_class']= 'high'

vis_2021 = add_vis(df_2021)
vis_2021 = vis_2021[vis_2021['Date']!='20210804']

#2022
dfs_2022 = [img_to_df(dis_ras_22[i], cols) for i in np.arange(len(dis_ras_22))]
df_2022 = pd.concat(dfs_2022)

df_2022['total_dis_sev'] = df_2022['total_dis_sev']*10000
df_2022['dm_sev'] = df_2022['dm_sev']*10000
df_2022['dm_inc'] = df_2022['dm_inc']*10000

df_2022['inc_class'] = 'low'
df_2022.loc[df_2022['dm_inc']>25,'inc_class']= 'high'
    
df_2022['severity_class'] = 'low'
df_2022.loc[df_2022['dm_sev']>10,'severity_class']= 'high'

vis_2022 = add_vis(df_2022)
vis_2022 = vis_2022[vis_2022['Date']!='20220828']

vis_2022.columns

Index(['c_blue', 'blue', 'greenI', 'green', 'yellow', 'red', 'rede', 'nir',
       'total_dis_sev', 'dm_sev', 'dm_inc', 'Date', 'inc_class',
       'severity_class', 'pri', 'ndre', 'tcari', 'ndvi', 'evi', 'savi', 'arvi',
       'green_red'],
      dtype='object')

In [11]:
print(vis_2022.Date.unique())
print(vis_2021.Date.unique())
print(vis_2020.Date.unique())


['20220726' '20220801' '20220630' '20220704' '20220624']
['20210808' '20210726' '20210715']
['20200801' '20200713' '20200616']


In [12]:
vis_allyears = pd.concat([vis_2020,vis_2021,vis_2022])

# RF for GDM Inc

In [14]:
# SB 
expl_vars = ['c_blue', 'blue', 'greenI', 'green', 'yellow', 'red', 'rede', 'nir']#,
             #'pri', 'ndre', 'tcari', 'ndvi', 'evi', 'savi', 'arvi','green_red']
resp_var = 'inc_class'
df = vis_allyears

In [15]:
X = df[expl_vars]
y = df[resp_var]
    
rus = RandomUnderSampler(random_state=2024)
X_rus, y_rus = rus.fit_resample(X, y)

# Split dataset into training set and test set

X_train, X_test, y_train, y_test = train_test_split(X_rus, y_rus,
                                                    test_size=0.3, 
                                                    random_state=123,
                                                    stratify = y_rus)  # 70% training and 30% test
mm = MinMaxScaler()
X_train_scaled = mm.fit_transform(X_train)
X_test_scaled = mm.transform(X_test)
    
    
from sklearn.ensemble import RandomForestClassifier
    
rf = RandomForestClassifier(random_state=456)

rf_model = rf.fit(X_train_scaled, y_train)
    
y_pred = rf_model.predict(X_test_scaled)

from sklearn.metrics import classification_report

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

        high       0.89      0.90      0.89       192
         low       0.90      0.88      0.89       191

    accuracy                           0.89       383
   macro avg       0.89      0.89      0.89       383
weighted avg       0.89      0.89      0.89       383



In [16]:
# Now run model with just an August 2021 image

df.Date.unique()
#test_set['predicted'] = y_pred.tolist()
    


array(['20200801', '20200713', '20200616', '20210808', '20210726',
       '20210715', '20220726', '20220801', '20220630', '20220704',
       '20220624'], dtype=object)

In [17]:
# # Filter to just Aug 08 2021
# aug08 = df[df['Date']=='20210808']

# # Separate to expl. and resp.

# X_aug08 = aug08[expl_vars]
# y_aug08 = aug08[resp_var]

# mm = MinMaxScaler()
# X_aug08_scaled = mm.fit_transform(X_aug08)

# y_aug08_pred = rf_model.predict(X_aug08_scaled)

# aug08['GDMInc_pred'] = y_aug08_pred.tolist()
# print(classification_report(y_aug08, y_aug08_pred))

              precision    recall  f1-score   support

        high       0.20      0.98      0.33       148
         low       0.99      0.49      0.66      1154

    accuracy                           0.55      1302
   macro avg       0.60      0.74      0.49      1302
weighted avg       0.90      0.55      0.62      1302



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  aug08['GDMInc_pred'] = y_aug08_pred.tolist()


In [20]:
# aug08.Date.unique()
# aug08.shape

(1302, 23)

In [21]:
# aug08

Unnamed: 0,c_blue,blue,greenI,green,yellow,red,rede,nir,total_dis_sev,dm_sev,...,severity_class,pri,ndre,tcari,ndvi,evi,savi,arvi,green_red,GDMInc_pred
0,0.0000,0.000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,,,...,low,,,,,0.000000,0.000000,,,low
1,0.0652,0.046,0.0795,0.0655,0.0723,0.0505,0.1090,0.3717,,,...,low,-0.096552,0.546495,0.322467,0.760777,0.603896,0.522446,0.742208,0.129310,high
2,0.0000,0.000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,,,...,low,,,,,0.000000,0.000000,,,low
3,0.0000,0.000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,,,...,low,,,,,0.000000,0.000000,,,low
4,0.0000,0.000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,,,...,low,,,,,0.000000,0.000000,,,low
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1297,0.0000,0.000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,,,...,low,,,,,0.000000,0.000000,,,low
1298,0.0000,0.000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,,,...,low,,,,,0.000000,0.000000,,,low
1299,0.0650,0.053,0.0862,0.0737,0.0855,0.0659,0.1197,0.3446,,,...,low,-0.078174,0.484385,0.243033,0.678928,0.518994,0.459143,0.627775,0.055874,high
1300,0.0000,0.000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,,,...,low,,,,,0.000000,0.000000,,,low


In [37]:
# Recode GDMInc_pred as 0 for low, 1 for high

# aug08['GDMInc_pred_num'] = 0.1
# aug08.loc[aug08['GDMInc_pred']=='high','GDMInc_pred_num']= 1
# aug08

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  aug08['GDMInc_pred_num'] = 0.1


Unnamed: 0,c_blue,blue,greenI,green,yellow,red,rede,nir,total_dis_sev,dm_sev,...,pri,ndre,tcari,ndvi,evi,savi,arvi,green_red,GDMInc_pred,GDMInc_pred_num
0,0.0000,0.000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,,,...,,,,,0.000000,0.000000,,,low,0.1
1,0.0652,0.046,0.0795,0.0655,0.0723,0.0505,0.1090,0.3717,,,...,-0.096552,0.546495,0.322467,0.760777,0.603896,0.522446,0.742208,0.129310,high,1.0
2,0.0000,0.000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,,,...,,,,,0.000000,0.000000,,,low,0.1
3,0.0000,0.000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,,,...,,,,,0.000000,0.000000,,,low,0.1
4,0.0000,0.000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,,,...,,,,,0.000000,0.000000,,,low,0.1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1297,0.0000,0.000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,,,...,,,,,0.000000,0.000000,,,low,0.1
1298,0.0000,0.000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,,,...,,,,,0.000000,0.000000,,,low,0.1
1299,0.0650,0.053,0.0862,0.0737,0.0855,0.0659,0.1197,0.3446,,,...,-0.078174,0.484385,0.243033,0.678928,0.518994,0.459143,0.627775,0.055874,high,1.0
1300,0.0000,0.000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,,,...,,,,,0.000000,0.000000,,,low,0.1


In [38]:
# aug08['GDMInc_pred_num'].shape

(1302,)

In [39]:
# aug08['GDMInc_pred_num']

0       0.1
1       1.0
2       0.1
3       0.1
4       0.1
       ... 
1297    0.1
1298    0.1
1299    1.0
1300    0.1
1301    0.1
Name: GDMInc_pred_num, Length: 1302, dtype: float64

In [41]:
# Add the predictions as a band to the Aug 18 image

# image= '/Users/kathleenkanaley/Desktop/grapes_from_space/data/images/2021/PScope/dis_band_2021/inc_sev_class_20210808_150651_57_2460_3B_AnalyticMS_SR_8b_harmonized_clip_clipped.tif_disease.tif'

# preds = aug08['GDMInc_pred_num']

In [42]:
# preds.values.reshape(1,31,42)

array([[[0.1, 1. , 0.1, ..., 0.1, 0.1, 0.1],
        [0.1, 1. , 1. , ..., 0.1, 0.1, 0.1],
        [0.1, 1. , 1. , ..., 0.1, 0.1, 0.1],
        ...,
        [0.1, 0.1, 0.1, ..., 0.1, 0.1, 0.1],
        [0.1, 0.1, 0.1, ..., 0.1, 0.1, 0.1],
        [0.1, 0.1, 0.1, ..., 1. , 0.1, 0.1]]])

In [43]:
# with rio.open(image) as src:
#     raster_arr = np.array(src.read())
#     print(raster_arr.shape)

(13, 31, 42)


In [44]:
# with rio.open(image) as src:
#     raster_arr = np.array(src.read())
#     stacked_arr = np.vstack([raster_arr,
#                              preds.values.reshape(1,raster_arr.shape[1],raster_arr.shape[2])])    
#     # Save stacked array as raster
#     with rio.open(image) as src:
#         kwargs = src.meta
#         band_ct = stacked_arr.shape[0]
#         kwargs.update(dtype=rio.float32, count=band_ct)
        
#         with rio.open(str(os.path.split(image)[0])+'/preds_incclass_'+str(os.path.basename(image)), 'w', **kwargs) as dst:
#             for b in range(stacked_arr.shape[0]):
#                 dst.write_band(b+1, stacked_arr[b].astype(rio.float32))

In [36]:
# Try writing just the prediction band to a raster

# with rio.open(image) as src:
#     raster_arr = np.array(src.read())
#     preds_arr = preds.values.reshape(raster_arr.shape[1],raster_arr.shape[2])
    
#     kwargs = src.meta
#     band_ct = 1
#     #kwargs.update(dtype=rio.float32, count=band_ct)
    
#     with rio.open(str(os.path.split(image)[0])+'/predsonly_incclass_'+str(os.path.basename(image)), 'w', **kwargs) as dst:
#         dst.write_band(1, preds_arr)#.astype(rio.float32))

In [46]:
# Filter to just Aug 01 2022
aug01 = df[df['Date']=='20220801']

# Separate to expl. and resp.

X_aug01 = aug01[expl_vars]
y_aug01 = aug01[resp_var]

mm = MinMaxScaler()
X_aug01_scaled = mm.fit_transform(X_aug01)

y_aug01_pred = rf_model.predict(X_aug01_scaled)

aug01['GDMInc_pred'] = y_aug01_pred.tolist()
print(classification_report(y_aug01, y_aug01_pred))

              precision    recall  f1-score   support

        high       0.21      0.33      0.26       126
         low       0.92      0.87      0.90      1176

    accuracy                           0.82      1302
   macro avg       0.57      0.60      0.58      1302
weighted avg       0.85      0.82      0.83      1302



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  aug01['GDMInc_pred'] = y_aug01_pred.tolist()


In [47]:
aug01

Unnamed: 0,c_blue,blue,greenI,green,yellow,red,rede,nir,total_dis_sev,dm_sev,...,severity_class,pri,ndre,tcari,ndvi,evi,savi,arvi,green_red,GDMInc_pred
0,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,,,...,low,,,,,0.000000,0.000000,,,low
1,0.0414,0.0453,0.0727,0.0770,0.0668,0.0540,0.1295,0.5222,,,...,low,0.028724,0.602578,0.467639,0.812565,0.776992,0.652574,0.785604,0.175573,low
2,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,,,...,low,,,,,0.000000,0.000000,,,low
3,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,,,...,low,,,,,0.000000,0.000000,,,low
4,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,,,...,low,,,,,0.000000,0.000000,,,low
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1297,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,,,...,low,,,,,0.000000,0.000000,,,low
1298,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,,,...,low,,,,,0.000000,0.000000,,,low
1299,0.0578,0.0624,0.0855,0.0937,0.0965,0.0972,0.1565,0.4047,,,...,low,0.045759,0.442267,0.225766,0.612672,0.505790,0.460375,0.508105,-0.018334,low
1300,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,,,...,low,,,,,0.000000,0.000000,,,low


In [48]:
# Recode GDMInc_pred as 0 for low, 1 for high

aug01['GDMInc_pred_num'] = 0
aug01.loc[aug01['GDMInc_pred']=='high','GDMInc_pred_num']= 1
aug01

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  aug01['GDMInc_pred_num'] = 0


Unnamed: 0,c_blue,blue,greenI,green,yellow,red,rede,nir,total_dis_sev,dm_sev,...,pri,ndre,tcari,ndvi,evi,savi,arvi,green_red,GDMInc_pred,GDMInc_pred_num
0,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,,,...,,,,,0.000000,0.000000,,,low,0
1,0.0414,0.0453,0.0727,0.0770,0.0668,0.0540,0.1295,0.5222,,,...,0.028724,0.602578,0.467639,0.812565,0.776992,0.652574,0.785604,0.175573,low,0
2,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,,,...,,,,,0.000000,0.000000,,,low,0
3,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,,,...,,,,,0.000000,0.000000,,,low,0
4,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,,,...,,,,,0.000000,0.000000,,,low,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1297,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,,,...,,,,,0.000000,0.000000,,,low,0
1298,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,,,...,,,,,0.000000,0.000000,,,low,0
1299,0.0578,0.0624,0.0855,0.0937,0.0965,0.0972,0.1565,0.4047,,,...,0.045759,0.442267,0.225766,0.612672,0.505790,0.460375,0.508105,-0.018334,low,0
1300,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,,,...,,,,,0.000000,0.000000,,,low,0


In [49]:
aug01['GDMInc_pred_num'].unique()

array([0, 1])

In [52]:
# Add the predictions as a band to the Aug 01 image

image_aug01= '/Users/kathleenkanaley/Desktop/grapes_from_space/data/images/2022/PScope/dis_band_2022/20220801_150200_56_245c_3B_AnalyticMS_SR_8b_harmonized_clip_clipped.tif_disease.tif'
preds_aug01 = aug01['GDMInc_pred_num']

In [56]:
with rio.open(image_aug01) as src:
    raster_arr = np.array(src.read())
    print(raster_arr.shape)

(11, 31, 42)


In [57]:
with rio.open(image_aug01) as src:
    raster_arr = np.array(src.read())
    stacked_arr = np.vstack([raster_arr,
                             preds_aug01.values.reshape(1,raster_arr.shape[1],raster_arr.shape[2])])    
    # Save stacked array as raster
    with rio.open(image_aug01) as src:
        kwargs = src.meta
        band_ct = stacked_arr.shape[0]
        kwargs.update(dtype=rio.float32, count=band_ct)
        
        with rio.open(str(os.path.split(image_aug01)[0])+'/preds_incclass_'+str(os.path.basename(image_aug01)), 'w', **kwargs) as dst:
            for b in range(stacked_arr.shape[0]):
                dst.write_band(b+1, stacked_arr[b].astype(rio.float32))