In [1]:
import glob
import os
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns

import sklearn
from sklearn.model_selection import train_test_split 
from sklearn.metrics import classification_report 
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_recall_fscore_support

import imblearn as imb
from imblearn.under_sampling import RandomUnderSampler

import scipy


In [2]:
data_2020 = pd.read_csv('../data/img_scout_dfs/2020/smr_vis_skysat_scout_2020.csv')
data_2021 = pd.read_csv('../data/img_scout_dfs/2021/smr_vis_skysat_scout_2021.csv')
data_2022 = pd.read_csv('../data/img_scout_dfs/2022/smr_vis_skysat_scout_2022.csv')

df_list = [data_2020, data_2021, data_2022]

In [3]:
# Add column for total duration of DMCast events (in hours) over prior 2 weeks

#2022
data_2022['DMCast_events'] = 0
data_2022.loc[data_2022['Date'] == '2022-06-22', 'DMCast_events'] = 9
data_2022.loc[data_2022['Date'] == '2022-07-06', 'DMCast_events'] = 24
data_2022.loc[data_2022['Date'] == '2022-07-20', 'DMCast_events'] = 25

# 2021
data_2021['DMCast_events'] = 0
data_2021.loc[data_2021['Date'] == '2021-07-07', 'DMCast_events'] = 45
data_2021.loc[data_2021['Date'] == '2021-07-27', 'DMCast_events'] = 78
data_2021.loc[data_2021['Date'] == '2021-08-03', 'DMCast_events'] = 49
data_2021.loc[data_2021['Date'] == '2021-08-10', 'DMCast_events'] = 78
data_2021.loc[data_2021['Date'] == '2021-08-17', 'DMCast_events'] = 74

#2020
data_2020['DMCast_events'] = 0
data_2020.loc[data_2020['Date'] == '2020-06-18', 'DMCast_events'] = 5
data_2020.loc[data_2020['Date'] == '2020-06-25', 'DMCast_events'] = 25
data_2020.loc[data_2020['Date'] == '2020-07-09', 'DMCast_events'] = 12
data_2020.loc[data_2020['Date'] == '2020-08-06', 'DMCast_events'] = 42
data_2020.loc[data_2020['Date'] == '2020-08-13', 'DMCast_events'] = 38


In [4]:
# Add columns for GDM incidence and GDM severity to each df 
for d in df_list:
    
    d['GDM_sev']='low'
    d.loc[d['DM_severity'] > 10, 'GDM_sev'] = 'high'
    
    d['GDM_inc']='low'
    d.loc[d['DM_inc'] > 25, 'GDM_inc'] = 'high'

In [5]:
# All high severity panels are clustered on later image dates
# leaving one of these images out will make test set too small

# Better idea: random forest for 2020, report accuracy metrics and kappa score

# Later test global model on hold-out image from 2020

master_df = pd.concat(df_list)
training_set = master_df[master_df['Date']!='2021-08-17']

In [6]:
training_set.columns

Index(['acquired', 'plot', 'blue', 'green', 'red', 'nir', 'veg_smr',
       'savi_exp', 'savi', 'msavi', 'evi', 'arvi', 'gri', 'ndvi', 'nir_blue',
       'nir_green', 'red_blue', 'green_red', 'green_blue', 'row', 'panel',
       'Date', 'Treatment', 'Block', 'PM_severity', 'DM_severity', 'total_dis',
       'geometry', 'centroid', 'PM_inc', 'DM_inc', 'DMCast_events', 'GDM_sev',
       'GDM_inc'],
      dtype='object')

In [7]:
# First with no DMCast
expl_vars = ['blue', 'green', 'red', 'nir','savi', 'evi', 'arvi', 'gri', 'ndvi']
resp_var = 'GDM_sev'

In [8]:
def run_rf_us(df, expl_vars, resp_var, SPLIT_RND_SEED, rand_state):
    X = df[expl_vars]
    y = df[resp_var]
    
    rus = RandomUnderSampler(random_state=2020)
    X_rus, y_rus = rus.fit_resample(X, y)

# Split dataset into training set and test set

    X_train, X_test, y_train, y_test = train_test_split(X_rus, y_rus,
                                                    test_size=0.3, 
                                                    random_state=SPLIT_RND_SEED,
                                                    stratify = y_rus)  # 70% training and 30% test
    from sklearn.ensemble import RandomForestClassifier
    # Create a Gaussian Classifier with 500 trees 
    rf = RandomForestClassifier(n_estimators = 500, 
                                oob_score=True, 
                                random_state=rand_state, # random number to be used, needed to reproduce the same result
                                verbose=False)
    
    rf_model = rf.fit(X_train, y_train)
    
    y_pred = rf.predict(X_test)
    
    from sklearn.metrics import accuracy_score
    accuracy = accuracy_score(y_test, y_pred)
    
    from sklearn.metrics import precision_recall_fscore_support
    
    prec, recall, fscore, support = precision_recall_fscore_support(y_test, y_pred, average='weighted')
  
    
    from sklearn.metrics import cohen_kappa_score
        
    cohen_score = cohen_kappa_score(y_test, y_pred)    

    
    metrics = {
        'cohen_kappa': [cohen_score], 
        'accuracy_score': [accuracy],
        'precision': [prec],
        'recall': [recall]
    }
    metrics_df = pd.DataFrame(data=metrics)
    
    return metrics_df


In [10]:
import random
 
rand_list=[]
n=100
for i in range(n):
    rand_list.append(random.randint(100,999))

In [11]:
results_all = []
for r in rand_list:
    res_df = run_rf_us(master_df, expl_vars, resp_var, r, 5890)
    
    results_all.append(res_df)

In [12]:
results_df = pd.concat(results_all)

results_df.mean()

cohen_kappa       0.469143
accuracy_score    0.734526
precision         0.739744
recall            0.734526
dtype: float64

In [13]:
# For incidence
 
rand_list=[]
n=100
for i in range(n):
    rand_list.append(random.randint(100,999))

In [14]:
results_inc = []
for r in rand_list:
    res_df = run_rf_us(master_df, expl_vars, 'GDM_inc', r, 5050)
    
    results_inc.append(res_df)

In [15]:
results_inc_df = pd.concat(results_inc)

results_inc_df.mean()

cohen_kappa       0.628695
accuracy_score    0.814347
precision         0.815253
recall            0.814347
dtype: float64

In [16]:
# Modify function to test on hold-out date

def rf_holdout(df, holdout_date, expl_vars, resp_var, rus_rand_state, rf_rand_state):
    
    # Split dataset into training set and test set
    
    training_set = df[df['Date']!=holdout_date]
    test_set = df[df['Date']==holdout_date]
    
    X = training_set[expl_vars]
    y = training_set[resp_var]
    
    rus = RandomUnderSampler(random_state=rus_rand_state)
    X_train, y_train = rus.fit_resample(X, y)
    
    X_test = test_set[expl_vars]
    y_test = test_set[resp_var]


    from sklearn.ensemble import RandomForestClassifier
    # Create a Gaussian Classifier with 500 trees 
    rf = RandomForestClassifier(n_estimators = 500, 
                                oob_score=True, 
                                random_state=rf_rand_state, # random number to be used, needed to reproduce the same result
                                verbose=False)
    
    rf_model = rf.fit(X_train, y_train)
    
    y_pred = rf.predict(X_test)
    
    from sklearn.metrics import accuracy_score
    accuracy = accuracy_score(y_test, y_pred)
  
    
    from sklearn.metrics import precision_recall_fscore_support
    
    prec, recall, fscore, support = precision_recall_fscore_support(y_test, y_pred, average='weighted')
  
    
    from sklearn.metrics import cohen_kappa_score
        
    cohen_score = cohen_kappa_score(y_test, y_pred)    

    
    metrics = {
        'cohen_kappa': [cohen_score], 
        'accuracy_score': [accuracy],
        'precision': [prec],
        'recall': [recall]
    }
    metrics_df = pd.DataFrame(data=metrics)
    
    test_set['predicted'] = y_pred.tolist()
    
#     return metrics_df

    return test_set


In [17]:
predicted_df = rf_holdout(master_df, '2021-08-17', expl_vars, 'GDM_inc', 123, 456)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_set['predicted'] = y_pred.tolist()


In [18]:
predicted_df.shape

(236, 35)

In [19]:
predicted_df.columns

Index(['acquired', 'plot', 'blue', 'green', 'red', 'nir', 'veg_smr',
       'savi_exp', 'savi', 'msavi', 'evi', 'arvi', 'gri', 'ndvi', 'nir_blue',
       'nir_green', 'red_blue', 'green_red', 'green_blue', 'row', 'panel',
       'Date', 'Treatment', 'Block', 'PM_severity', 'DM_severity', 'total_dis',
       'geometry', 'centroid', 'PM_inc', 'DM_inc', 'DMCast_events', 'GDM_sev',
       'GDM_inc', 'predicted'],
      dtype='object')

In [20]:
predicted_df.iloc[0,33]

'low'

In [21]:
predicted_df['predict_correct'] = 'correct'

predicted_df.loc[(predicted_df['GDM_inc'] == 'high') & (predicted_df['predicted']=='low'), 
                 'predict_correct'] = 'incorrect'

predicted_df.loc[(predicted_df['GDM_inc'] == 'low') & (predicted_df['predicted']=='high'), 
                 'predict_correct'] = 'incorrect'

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  predicted_df['predict_correct'] = 'correct'


In [22]:
predicted_df.predict_correct.unique()

array(['correct', 'incorrect'], dtype=object)

In [23]:
#predicted_df.to_csv('predicted_inc_20210816.csv', index=False) 

In [52]:
def rf_holdout(df, holdout_df, expl_vars, resp_var, SPLIT_RND_SEED, rand_state):
    
    X = df[expl_vars]
    y = df[resp_var]
    
    rus = RandomUnderSampler(random_state=2022)
    X_rus, y_rus = rus.fit_resample(X, y)

# Split dataset into training set and test set

    X_train, X_test, y_train, y_test = train_test_split(X_rus, y_rus,
                                                    test_size=0.3, 
                                                    random_state=SPLIT_RND_SEED,
                                                   stratify = y_rus)
    
    from sklearn.ensemble import RandomForestClassifier
    # Create a Gaussian Classifier with 500 trees 
    rf = RandomForestClassifier(n_estimators = 500, 
                                oob_score=True, 
                                random_state=12345, # random number to be used, needed to reproduce the same result
                                verbose=False)
    rf_model = rf.fit(X_train, y_train)
    
    y_pred = rf.predict(X_test)
    
    from sklearn.metrics import accuracy_score
    accuracy = accuracy_score(y_test, y_pred)
  
    
    from sklearn.metrics import cohen_kappa_score
    
    #def kappa_statistics(X_test, y_test):
    y_pred = rf.predict(X_test)    
    cohen_score = cohen_kappa_score(y_test, y_pred)    
    #print("Kappa score:", cohen_score)
    
    # Test the model on the holdout image
    
    holdoutX_test = holdout_df[expl_vars]
    holdouty_test = holdout_df[resp_var]
    
    holdout_pred = rf.predict(holdoutX_test)
    
    holdout_accuracy = accuracy_score(holdouty_test, holdout_pred)
    
    metrics = {'cohen_kappa': [cohen_score], 'accuracy_score': [accuracy], 
               'accuracy_score_holdout':[holdout_accuracy]}
    
    metrics_df = pd.DataFrame(data=metrics)
    
    return metrics_df


In [54]:
training = master_df[master_df['Date']!='2021-08-17']
holdout_df = master_df[master_df['Date']=='2021-08-17']

In [55]:
rand_list=[]
n=100
for i in range(n):
    rand_list.append(random.randint(100,999))

results_plus_ho = []
for r in rand_list:
    rf_us_holdout(training, holdout_df, expl_vars, resp_var, r)
    
    results_plus_ho.append(res_df)

In [56]:
results_df = pd.concat(results_plus_ho)

results_df['cohen_kappa'].mean()

0.5588235294117647

In [57]:
results_df['accuracy_score'].mean()

0.7794117647058825

In [59]:
results_df.head()

Unnamed: 0,cohen_kappa,accuracy_score
0,0.558824,0.779412
0,0.558824,0.779412
0,0.558824,0.779412
0,0.558824,0.779412
0,0.558824,0.779412


In [58]:
results_df['accuracy_score_holdout'].mean()

KeyError: 'accuracy_score_holdout'