In [1]:
import glob
import os
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns

import sklearn
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split 
from sklearn.metrics import classification_report 
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_recall_fscore_support
import imblearn as imb
from imblearn.under_sampling import RandomUnderSampler

import scipy


In [2]:
# Open img-scout dfs

data_dir = '/Users/kathleenkanaley/Desktop/grapes_from_space/data/'
data_20 = pd.read_csv(data_dir +'img_scout_dfs/2020/smr_vis_skysat_scout_2020.csv').dropna()
data_21 = pd.read_csv(data_dir +'img_scout_dfs/2021/smr_vis_skysat_scout_2021.csv').dropna()
data_22 = pd.read_csv(data_dir +'img_scout_dfs/2022/smr_vis_skysat_scout_2022.csv').dropna()

data_21 = data_21[data_21['Date']!='2021-08-03']
data_20 = data_20[data_20['Date']!='2020-09-09']

img_dfs = [data_20, data_21, data_22]
all_years = pd.concat(img_dfs)

In [3]:
all_years['GDM_sev']='low'
all_years.loc[all_years['DM_severity'] > 10, 'GDM_sev'] = 'high'
    
all_years['GDM_inc']='low'
all_years.loc[all_years['DM_inc'] > 25, 'GDM_inc'] = 'high'

# RF for GDM incidence

In [4]:
# SB - Inc  
expl_vars = ['blue', 'green', 'red', 'nir']
resp_var = 'GDM_inc'
df = all_years

In [7]:
X = df[expl_vars]
y = df[resp_var]
    
rus = RandomUnderSampler(random_state=2024)
X_rus, y_rus = rus.fit_resample(X, y)

# Split dataset into training set and test set

X_train, X_test, y_train, y_test = train_test_split(X_rus, y_rus,
                                                    test_size=0.3, 
                                                    random_state=123,
                                                    stratify = y_rus)  # 70% training and 30% test
mm = MinMaxScaler()
X_train_scaled = mm.fit_transform(X_train)
X_test_scaled = mm.transform(X_test)
    
    
from sklearn.ensemble import RandomForestClassifier
    
rf = RandomForestClassifier(random_state=456)

rf_model = rf.fit(X_train_scaled, y_train)
    
y_pred = rf_model.predict(X_test_scaled)

from sklearn.metrics import classification_report

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

        high       0.83      0.84      0.83        93
         low       0.84      0.83      0.83        93

    accuracy                           0.83       186
   macro avg       0.83      0.83      0.83       186
weighted avg       0.83      0.83      0.83       186



In [8]:
# Now run model with just an August 2021 image

df.Date.unique()
#test_set['predicted'] = y_pred.tolist()
    


array(['2020-06-18', '2020-06-25', '2020-07-09', '2020-08-06',
       '2020-08-13', '2021-07-07', '2021-07-27', '2021-08-10',
       '2021-08-17', '2022-06-22', '2022-07-06', '2022-07-20'],
      dtype=object)

In [10]:
# Filter to just Aug 10 2021
aug10 = df[df['Date']=='2021-08-10']

# Separate to expl. and resp.

X_aug10 = aug10[expl_vars]
y_aug10 = aug10[resp_var]

mm = MinMaxScaler()
X_aug10_scaled = mm.fit_transform(X_aug10)

y_aug10_pred = rf_model.predict(X_aug10_scaled)

aug10['GDMInc_pred'] = y_aug10_pred.tolist()
print(classification_report(y_aug10, y_aug10_pred))

              precision    recall  f1-score   support

        high       0.23      0.23      0.23        62
         low       0.73      0.73      0.73       178

    accuracy                           0.60       240
   macro avg       0.48      0.48      0.48       240
weighted avg       0.60      0.60      0.60       240



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  aug10['GDMInc_pred'] = y_aug10_pred.tolist()


In [11]:
# Filter to just Aug 17 2021
aug17 = df[df['Date']=='2021-08-17']

# Separate to expl. and resp.

X_aug17 = aug17[expl_vars]
y_aug17 = aug17[resp_var]

mm = MinMaxScaler()
X_aug17_scaled = mm.fit_transform(X_aug17)

y_aug17_pred = rf_model.predict(X_aug17_scaled)

aug17['GDMInc_pred'] = y_aug17_pred.tolist()
print(classification_report(y_aug17, y_aug17_pred))

              precision    recall  f1-score   support

        high       0.36      0.36      0.36        69
         low       0.73      0.73      0.73       167

    accuracy                           0.62       236
   macro avg       0.55      0.55      0.55       236
weighted avg       0.62      0.62      0.62       236



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  aug17['GDMInc_pred'] = y_aug17_pred.tolist()


In [13]:
aug17

Unnamed: 0,acquired,plot,blue,green,red,nir,veg_smr,savi_exp,savi,msavi,...,PM_severity,DM_severity,total_dis,geometry,centroid,PM_inc,DM_inc,GDM_sev,GDM_inc,GDMInc_pred
959,20210816,1,0.019269,0.072103,0.049700,0.285644,0.550201,0.437747,0.423261,0.226167,...,0.00,0.90,0.90,POLYGON ((-77.0152645742787 42.878300949250196...,POINT (-77.0153084903025 42.8783140936145),0.0,20.0,low,low,low
960,20210816,2,0.022597,0.074333,0.055543,0.261363,0.471446,0.372733,0.377996,0.170108,...,0.00,3.20,3.25,POLYGON ((-77.01535545427274 42.87831851925121...,POINT (-77.0153965503027 42.87833067861654),0.0,30.0,low,high,low
961,20210816,3,0.020103,0.073520,0.054523,0.265283,0.487099,0.383551,0.385627,0.179103,...,0.15,2.55,2.75,POLYGON ((-77.01544069426711 42.87833411925217...,POINT (-77.01548247530272 42.87834674861851),10.0,20.0,low,low,low
962,20210816,4,0.017704,0.070221,0.049225,0.269561,0.517422,0.408057,0.403726,0.196129,...,0.10,3.65,3.75,POLYGON ((-77.01552730426143 42.87835065925314...,POINT (-77.01557007030289 42.87836309362055),10.0,35.0,low,high,high
963,20210816,5,0.015972,0.069834,0.047469,0.279248,0.547258,0.432382,0.420573,0.217456,...,0.00,13.70,13.70,POLYGON ((-77.01561588425558 42.87836680925413...,POINT (-77.01565649030296 42.878378758622546),0.0,75.0,high,high,high
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1190,20210816,303,0.009360,0.059870,0.038270,0.248173,0.516014,0.404720,0.400398,0.175200,...,13.20,0.00,13.20,POLYGON ((-77.01536426393243 42.87876078918534...,POINT (-77.01540822030267 42.878773523616815),90.0,5.0,low,low,high
1191,20210816,305,0.015918,0.065997,0.046033,0.257685,0.504230,0.396905,0.394950,0.179668,...,0.90,0.00,0.90,POLYGON ((-77.01658187329902 42.87901510909438...,POINT (-77.01662629530372 42.8790284536449),20.0,0.0,low,low,low
1192,20210816,307,0.012376,0.061769,0.041824,0.254628,0.516814,0.405452,0.400873,0.181121,...,16.60,0.25,17.05,POLYGON ((-77.0164079733105 42.878982759092416...,POINT (-77.01645016030366 42.87899534864085),85.0,0.0,low,low,high
1193,20210816,309,0.016079,0.064390,0.045383,0.246759,0.481452,0.378002,0.381393,0.160394,...,0.00,0.00,0.05,POLYGON ((-77.01623108332213 42.87894828909042...,POINT (-77.01627422530342 42.878961353636775),0.0,0.0,low,low,low


In [14]:
#aug17.to_csv('/Users/kathleenkanaley/Desktop/predicted_inc_20210816.csv', index=False)