In [1]:
import numpy as np
import pandas as pd

from datetime import date
import time

import random
from random import seed
from random import random

import os, os.path
import shutil

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report

import matplotlib
import matplotlib.pyplot as plt
from pylab import imshow
import pickle
import h5py
import sys

In [2]:
sys.path.append('/Users/hn/Documents/00_GitHub/Ag/NASA/Python_codes/')
import NASA_core as nc
# import NASA_plot_core as rcp

# Metadata

In [3]:
meta_dir = "/Users/hn/Documents/01_research_data/NASA/parameters/"
meta = pd.read_csv(meta_dir+"evaluation_set.csv")
meta_moreThan10Acr=meta[meta.ExctAcr>10]
print (meta.shape)
print (meta_moreThan10Acr.shape)
meta.head(2)

(6340, 8)
(3539, 8)


Unnamed: 0,ID,CropTyp,Irrigtn,DataSrc,Acres,ExctAcr,LstSrvD,county
0,100010_WSDA_SF_2017,alfalfa hay,center pivot,wsda,34,34.310305,2017/09/12,Grant
1,100204_WSDA_SF_2017,alfalfa hay,center pivot,wsda,62,61.826535,2017/08/09,Grant


In [4]:
# print (len(meta.ID.unique()))
# meta_lessThan10Acr=meta[meta.ExctAcr<10]
# print (meta_lessThan10Acr.shape)

# Read Training Set Labels

In [5]:
training_set_dir = "/Users/hn/Documents/01_research_data/NASA/ML_data/"

ground_truth_labels = pd.read_csv(training_set_dir+"train_labels.csv")
print ("Unique Votes: ", ground_truth_labels.Vote.unique())
print (len(ground_truth_labels.ID.unique()))
ground_truth_labels.head(2)

Unique Votes:  [2 1]
1849


Unnamed: 0,ID,Vote
0,99837_WSDA_SF_2017,2
1,114615_WSDA_SF_2017,1


### Detect how many fields are less than 10 acres and report in the paper

In [6]:
print (len(meta[meta.ID.isin(list(ground_truth_labels.ID))].ID.unique()))
meta.head(2)

1849


Unnamed: 0,ID,CropTyp,Irrigtn,DataSrc,Acres,ExctAcr,LstSrvD,county
0,100010_WSDA_SF_2017,alfalfa hay,center pivot,wsda,34,34.310305,2017/09/12,Grant
1,100204_WSDA_SF_2017,alfalfa hay,center pivot,wsda,62,61.826535,2017/08/09,Grant


# Read the data

In [7]:
VI_idx = "NDVI"
data_dir = "/Users/hn/Documents/01_research_data/NASA/VI_TS/04_regularized_TS/"

In [8]:
file_names = ["regular_Walla2015_" + VI_idx + "_JFD.csv", 
              "regular_AdamBenton2016_" + VI_idx + "_JFD.csv", 
              "regular_Grant2017_" + VI_idx + "_JFD.csv", 
              "regular_FranklinYakima2018_" + VI_idx + "_JFD.csv"]

data=pd.DataFrame()

for file in file_names:
    curr_file=pd.read_csv(data_dir + file)
    curr_file['human_system_start_time'] = pd.to_datetime(curr_file['human_system_start_time'])
    
    # These data are for 3 years. The middle one is the correct one
    all_years = sorted(curr_file.human_system_start_time.dt.year.unique())
    if len(all_years)==3 or len(all_years)==2:
        proper_year = all_years[1]
    elif len(all_years)==1:
        proper_year = all_years[0]

    curr_file = curr_file[curr_file.human_system_start_time.dt.year==proper_year]
    data=pd.concat([data, curr_file])

data.reset_index(drop=True, inplace=True)
data.head(2)

Unnamed: 0,ID,human_system_start_time,NDVI
0,135073_WSDA_SF_2015,2015-01-10,0.163569
1,135073_WSDA_SF_2015,2015-01-20,0.028382


In [9]:
ground_truth = data[data.ID.isin(list(ground_truth_labels.ID.unique()))].copy()
len(ground_truth.ID.unique())

1849

In [10]:
ground_truth.head(2)

Unnamed: 0,ID,human_system_start_time,NDVI
2598,145288_WSDA_SF_2015,2015-01-10,0.20899
2599,145288_WSDA_SF_2015,2015-01-20,0.249083


# Toss Smalls

In [11]:
ground_truth_labels_extended = pd.merge(ground_truth_labels, meta, on=['ID'], how='left')
ground_truth_labels = ground_truth_labels_extended[ground_truth_labels_extended.ExctAcr>=10].copy()
ground_truth_labels.reset_index(drop=True, inplace=True)

print ("There are [{:.0f}] fields in total whose area"+ \
       "adds up to [{:.2f}].".format(len(ground_truth_labels_extended), \
                                                                     ground_truth_labels_extended.ExctAcr.sum()))

print ("There are [{:.0f}] fields larger than 10 acres"+ \
        "whose area adds up to [{:.2f}].".format(len(ground_truth_labels), \
                                                                    ground_truth_labels.ExctAcr.sum()))


There are [{:.0f}] fields in total whose areaadds up to [1849.00].
There are [{:.0f}] fields larger than 10 acreswhose area adds up to [1342.00].


In [12]:
ground_truth = ground_truth[ground_truth.ID.isin((list(meta_moreThan10Acr.ID)))].copy()
ground_truth_labels = ground_truth_labels[ground_truth_labels.ID.isin((list(meta_moreThan10Acr.ID)))].copy()

ground_truth.reset_index(drop=True, inplace=True)
ground_truth_labels.reset_index(drop=True, inplace=True)

# Sort

In [13]:
ground_truth.sort_values(by=["ID", 'human_system_start_time'], inplace=True)
ground_truth_labels.sort_values(by=["ID"], inplace=True)

ground_truth.reset_index(drop=True, inplace=True)
ground_truth_labels.reset_index(drop=True, inplace=True)

assert (len(ground_truth.ID.unique()) == len(ground_truth_labels.ID.unique()))

print (list(ground_truth.ID)[0])
print (list(ground_truth_labels.ID)[0])
print ("____________________________________")
print (list(ground_truth.ID)[-1])
print (list(ground_truth_labels.ID)[-1])
print ("____________________________________")
print (list(ground_truth.ID.unique())==list(ground_truth_labels.ID.unique()))

100048_WSDA_SF_2017
100048_WSDA_SF_2017
____________________________________
99909_WSDA_SF_2017
99909_WSDA_SF_2017
____________________________________
True


# Widen

In [14]:
NDVI_colnames = [VI_idx + "_" + str(ii) for ii in range(1, 37) ]
columnNames = ["ID"] + NDVI_colnames
ground_truth_wide = pd.DataFrame(columns=columnNames, 
                                index=range(len(ground_truth.ID.unique())))
ground_truth_wide["ID"] = ground_truth.ID.unique()

for an_ID in ground_truth.ID.unique():
    curr_df = ground_truth[ground_truth.ID==an_ID]
    
    ground_truth_wide_indx = ground_truth_wide[ground_truth_wide.ID==an_ID].index
    ground_truth_wide.loc[ground_truth_wide_indx, "NDVI_1":"NDVI_36"] = curr_df.NDVI.values[:36]

In [15]:
print (len(ground_truth_wide.ID.unique()))
ground_truth_wide.head(2)

1342


Unnamed: 0,ID,NDVI_1,NDVI_2,NDVI_3,NDVI_4,NDVI_5,NDVI_6,NDVI_7,NDVI_8,NDVI_9,...,NDVI_27,NDVI_28,NDVI_29,NDVI_30,NDVI_31,NDVI_32,NDVI_33,NDVI_34,NDVI_35,NDVI_36
0,100048_WSDA_SF_2017,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.140785,0.157916,...,0.157398,0.176086,0.19269,0.142555,0.136615,0.130675,0.124734,0.118794,0.131257,0.14372
1,100081_WSDA_SF_2017,0.005831,0.003887,0.001944,0.0,0.131036,0.262071,0.419314,0.521772,0.656894,...,0.140417,0.197382,0.196319,0.186397,0.093199,0.0,0.093574,0.187148,0.144835,0.102522


# Split Train and Test Set

#### Make sure rows of ```ground_truth_allBands``` and ```ground_truth_labels``` are in the same order

In [16]:
print (ground_truth_labels.CropTyp.unique())
ground_truth_labels.head(2)

['bean, green' 'wheat' 'onion' 'pea, green' 'corn, field' 'corn, sweet'
 'bean, dry' 'yellow mustard' 'potato' 'canola' 'mint' 'grass seed'
 'carrot' 'buckwheat' 'bluegrass seed' 'grass hay' 'corn seed' 'pea, dry'
 'pea seed' 'barley hay' 'market crops' 'triticale' 'carrot seed'
 'wheat fallow' 'barley' 'alfalfa seed' 'oat hay' 'triticale hay']


Unnamed: 0,ID,Vote,CropTyp,Irrigtn,DataSrc,Acres,ExctAcr,LstSrvD,county
0,100048_WSDA_SF_2017,1,"bean, green",rill,wsda,18,18.03324,2017/05/14,Grant
1,100081_WSDA_SF_2017,1,wheat,rill,wsda,16,15.959744,2017/08/09,Grant


In [17]:
ground_truth.head(2)

Unnamed: 0,ID,human_system_start_time,NDVI
0,100048_WSDA_SF_2017,2017-01-06,0.0
1,100048_WSDA_SF_2017,2017-01-16,0.0


In [77]:
ground_truth_labels = ground_truth_labels.set_index('ID')
ground_truth_labels = ground_truth_labels.reindex(index=ground_truth_wide['ID'])
ground_truth_labels = ground_truth_labels.reset_index()

print (ground_truth_labels.ExctAcr.min().round(2))
ground_truth_labels.head(2)

AttributeError: 'DataFrame' object has no attribute 'ExctAcr'

In [19]:
ground_truth_labels=ground_truth_labels[["ID", "Vote"]]

In [20]:
x_train_df, x_test_df, y_train_df, y_test_df = train_test_split(ground_truth_wide, 
                                                                ground_truth_labels, 
                                                                test_size=0.2, 
                                                                random_state=0,
                                                                shuffle=True,
                                                                stratify=ground_truth_labels.Vote.values)
x_test_df.shape

(269, 37)

# Start Random Forest

# Definitions

  - **Precision** Of all instances we predict $\hat y = 1$, what fraction is actually 1.
     \begin{equation}\label{eq:precision}
        \text{Precision} = \frac{TP}{TP + FP}
     \end{equation}

  - **Recall** Of all instances that are actually $y = 1$, what fraction we predict 1.
     \begin{equation}\label{eq:recall}
         \text{Recall} = \text{TPR} = \frac{TP}{TP + FN}
     \end{equation}
     
  - **Specifity** Fraction of all negative instances that are incorrectly predicted positive.
     \begin{equation}\label{eq:specifity}
        \text{Specifity} = \text{FPR} = \frac{FP}{TN + FP}\\
     \end{equation}
     
  - **F-Score** Adjust $\beta$ for trade off between  precision and recall. For precision oriented task $\beta = 0.5$.
     \begin{equation}\label{eq:Fscore}
        F_\beta = \frac{(1+\beta^2) TP}{ (1+\beta^2) TP + \beta^2 FN + FP}
     \end{equation}



In [21]:
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import RandomForestClassifier

In [22]:
%%time
regular_forest_1_default = RandomForestClassifier(n_estimators=100, 
                                                  criterion='gini', max_depth=None, 
                                                  min_samples_split=2, min_samples_leaf=1, 
                                                  min_weight_fraction_leaf=0.0,
                                                  max_features='sqrt', max_leaf_nodes=None, 
                                                  min_impurity_decrease=0.0, 
                                                  bootstrap=True, oob_score=False, n_jobs=None, 
                                                  random_state=1, verbose=0, 
                                                  warm_start=False, class_weight=None, 
                                                  ccp_alpha=0.0, max_samples=None)

regular_forest_1_default.fit(x_train_df.iloc[:, 1:], y_train_df.iloc[:, 1:].values.ravel())

CPU times: user 262 ms, sys: 2.96 ms, total: 265 ms
Wall time: 264 ms


RandomForestClassifier(max_features='sqrt', random_state=1)

In [23]:
regular_forest_1_default_predictions = regular_forest_1_default.predict(x_test_df.iloc[:, 1:])
regular_forest_1_default_y_test_df = y_test_df.copy()
regular_forest_1_default_y_test_df["prediction"]=list(regular_forest_1_default_predictions)
regular_forest_1_default_y_test_df.head(2)

Unnamed: 0,ID,Vote,prediction
1221,7667_WSDA_SF_2016,1,1
1334,99748_WSDA_SF_2017,1,1


In [24]:
true_single_predicted_single=0
true_single_predicted_double=0

true_double_predicted_single=0
true_double_predicted_double=0

for index_ in regular_forest_1_default_y_test_df.index:
    curr_vote=list(regular_forest_1_default_y_test_df[regular_forest_1_default_y_test_df.index==index_].Vote)[0]
    curr_predict=list(regular_forest_1_default_y_test_df[\
                                            regular_forest_1_default_y_test_df.index==index_].prediction)[0]
    if curr_vote==curr_predict:
        if curr_vote==1: 
            true_single_predicted_single+=1
        else:
            true_double_predicted_double+=1
    else:
        if curr_vote==1:
            true_single_predicted_double+=1
        else:
            true_double_predicted_single+=1
            
regular_forest_default_confus_tbl_test = pd.DataFrame(columns=['None', 'Predict_Single', 'Predict_Double'], 
                                               index=range(2))
regular_forest_default_confus_tbl_test.loc[0, 'None'] = 'Actual_Single'
regular_forest_default_confus_tbl_test.loc[1, 'None'] = 'Actual_Double'
regular_forest_default_confus_tbl_test['Predict_Single']=0
regular_forest_default_confus_tbl_test['Predict_Double']=0

regular_forest_default_confus_tbl_test.loc[0, "Predict_Single"]=true_single_predicted_single
regular_forest_default_confus_tbl_test.loc[0, "Predict_Double"]=true_single_predicted_double
regular_forest_default_confus_tbl_test.loc[1, "Predict_Single"]=true_double_predicted_single
regular_forest_default_confus_tbl_test.loc[1, "Predict_Double"]=true_double_predicted_double
regular_forest_default_confus_tbl_test

Unnamed: 0,None,Predict_Single,Predict_Double
0,Actual_Single,218,1
1,Actual_Double,13,37


In [25]:
FD1_y_test_df_act_1_pred_2=regular_forest_1_default_y_test_df[regular_forest_1_default_y_test_df.Vote==1].copy()
FD1_y_test_df_act_2_pred_1=regular_forest_1_default_y_test_df[regular_forest_1_default_y_test_df.Vote==2].copy()

FD1_y_test_df_act_1_pred_2=FD1_y_test_df_act_1_pred_2[FD1_y_test_df_act_1_pred_2.prediction==2].copy()
FD1_y_test_df_act_2_pred_1=FD1_y_test_df_act_2_pred_1[FD1_y_test_df_act_2_pred_1.prediction==1].copy()

FD1_y_test_df_act_2_pred_1 = pd.merge(FD1_y_test_df_act_2_pred_1, \
                                           ground_truth_labels_extended, on=['ID'], how='left')
FD1_y_test_df_act_1_pred_2 = pd.merge(FD1_y_test_df_act_1_pred_2, \
                                      ground_truth_labels_extended, on=['ID'], how='left')

print (FD1_y_test_df_act_2_pred_1.ExctAcr.sum())
print (FD1_y_test_df_act_1_pred_2.ExctAcr.sum())

print (FD1_y_test_df_act_2_pred_1.ExctAcr.sum()-FD1_y_test_df_act_1_pred_2.ExctAcr.sum())

1097.790529331336
36.938837280005
1060.8516920513312


In [27]:
model_dir = "/Users/hn/Documents/01_research_data/NASA/ML_Models/"
filename = model_dir + "regular" + VI_idx + "_forest_default.sav"
filename
pickle.dump(regular_forest_1_default, open(filename, 'wb'))

In [28]:
# parameters = {'n_jobs':[4],
#               'criterion': ["gini", "entropy"], # log_loss
#               'max_depth':[1, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20],
#               'min_samples_split':[2, 3, 4, 5],
#               'max_features': ["sqrt", "log2", None],
#               # 'min_impurity_decreasefloat':[0, 1, 2],
#               'class_weight':['balanced', 'balanced_subsample', None],
#               'ccp_alpha':[0.0, 1, 2, 3], 
#               'max_samples':[None, 1, 2, 3, 4, 5]} # , 
# forest_classifier_grid = GridSearchCV(RandomForestClassifier(random_state=0), 
#                                       parameters, cv=5, verbose=1,
#                                       error_score='raise')

# forest_classifier_grid.fit(x_train_df.iloc[:, 1:], y_train_df.Vote.values.ravel())

In [29]:
# RandomForestClassifier.get_params().keys()

In [30]:
# (n_estimators=100, 
# criterion='gini', 
# max_depth=None, 
# min_samples_split=2, 
# min_samples_leaf=1, 
# min_weight_fraction_leaf=0.0,
# max_features='sqrt', 
# max_leaf_nodes=None, 
# min_impurity_decrease=0.0, 
# bootstrap=True, 
# oob_score=False, 
# n_jobs=None, 
# random_state=1,
# warm_start=False, 
# class_weight=None, 
# ccp_alpha=0.0,
# max_samples=None)

In [31]:
# %%time
# parameters = {'n_jobs':[4],
#               'criterion': ["gini", "entropy"], # log_loss 
#               'max_depth':[2, 4, 6, 8, 9, 10, 11, 12, 14, 16, 18, 20],
#               'min_samples_split':[2, 3, 4, 5],
#               'max_features': ["sqrt", "log2", None],
#               'class_weight':['balanced', 'balanced_subsample', None],
#               'ccp_alpha':[0.0, 1, 2, 3], 
#              # 'min_impurity_decreasefloat':[0, 1, 2], # roblem with sqrt stuff?
#               'max_samples':[None, 1, 2, 3, 4, 5]
#              } # , 
# forest_grid_1 = GridSearchCV(RandomForestClassifier(random_state=0), 
#                              parameters, cv=5, verbose=1,
#                              error_score='raise')

# forest_grid_1.fit(x_train_df.iloc[:, 1:], y_train_df.Vote.values.ravel())

# print (forest_grid_1.best_params_)
# print (forest_grid_1.best_score_)


# model_dir = "/Users/hn/Documents/01_research_data/NASA/ML_Models/"
# filename = model_dir + 'forest_grid_1.sav'
# pickle.dump(forest_1_default, open(filename, 'wb')) <- why it is saving default as grid_1?

In [32]:
%%time
parameters = {'n_jobs':[6],
              'criterion': ["gini", "entropy"], # log_loss 
              'max_depth':[1, 2, 4, 6, 8, 10, 11, 12, 13, 14, 15, 16, 17],
              'min_samples_split':[4],
              'max_features': ["log2"],
              'class_weight':[None],
              'ccp_alpha':[0.0], 
             # 'min_impurity_decreasefloat':[0, 1, 2], # roblem with sqrt stuff?
              'max_samples':[None]
             } # , 
regular_forest_grid_1 = GridSearchCV(RandomForestClassifier(random_state=0), 
                                     parameters, cv=5, verbose=1,
                                     error_score='raise')

regular_forest_grid_1.fit(x_train_df.iloc[:, 1:], y_train_df.Vote.values.ravel())

print (regular_forest_grid_1.best_params_)
print (regular_forest_grid_1.best_score_)

Fitting 5 folds for each of 26 candidates, totalling 130 fits
{'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'entropy', 'max_depth': 11, 'max_features': 'log2', 'max_samples': None, 'min_samples_split': 4, 'n_jobs': 6}
0.955266246468159
CPU times: user 10.3 s, sys: 1.15 s, total: 11.4 s
Wall time: 14.3 s


In [33]:
regular_forest_grid_1_predictions = regular_forest_grid_1.predict(x_test_df.iloc[:, 1:])
regular_forest_grid_1_y_test_df=y_test_df.copy()
regular_forest_grid_1_y_test_df["prediction"]=list(regular_forest_grid_1_predictions)
regular_forest_grid_1_y_test_df.head(2)

Unnamed: 0,ID,Vote,prediction
1221,7667_WSDA_SF_2016,1,1
1334,99748_WSDA_SF_2017,1,1


In [34]:
true_single_predicted_single=0
true_single_predicted_double=0

true_double_predicted_single=0
true_double_predicted_double=0

for index_ in regular_forest_grid_1_y_test_df.index:
    curr_vote=list(regular_forest_grid_1_y_test_df[regular_forest_grid_1_y_test_df.index==index_].Vote)[0]
    curr_predict=list(regular_forest_grid_1_y_test_df[regular_forest_grid_1_y_test_df.index==index_].prediction)[0]
    if curr_vote==curr_predict:
        if curr_vote==1: 
            true_single_predicted_single+=1
        else:
            true_double_predicted_double+=1
    else:
        if curr_vote==1:
            true_single_predicted_double+=1
        else:
            true_double_predicted_single+=1
            
regular_forest_grid_1_confus_tbl_test = pd.DataFrame(columns=['None', 'Predict_Single', 'Predict_Double'], 
                               index=range(2))
regular_forest_grid_1_confus_tbl_test.loc[0, 'None'] = 'Actual_Single'
regular_forest_grid_1_confus_tbl_test.loc[1, 'None'] = 'Actual_Double'
regular_forest_grid_1_confus_tbl_test['Predict_Single']=0
regular_forest_grid_1_confus_tbl_test['Predict_Double']=0

regular_forest_grid_1_confus_tbl_test.loc[0, "Predict_Single"]=true_single_predicted_single
regular_forest_grid_1_confus_tbl_test.loc[0, "Predict_Double"]=true_single_predicted_double
regular_forest_grid_1_confus_tbl_test.loc[1, "Predict_Single"]=true_double_predicted_single
regular_forest_grid_1_confus_tbl_test.loc[1, "Predict_Double"]=true_double_predicted_double
regular_forest_grid_1_confus_tbl_test

Unnamed: 0,None,Predict_Single,Predict_Double
0,Actual_Single,218,1
1,Actual_Double,13,37


In [78]:
FG1_y_test_df_act_1_pred_2=regular_forest_grid_1_y_test_df[regular_forest_grid_1_y_test_df.Vote==1].copy()
FG1_y_test_df_act_2_pred_1=regular_forest_grid_1_y_test_df[regular_forest_grid_1_y_test_df.Vote==2].copy()

FG1_y_test_df_act_1_pred_2=FG1_y_test_df_act_1_pred_2[FG1_y_test_df_act_1_pred_2.prediction==2].copy()
FG1_y_test_df_act_2_pred_1=FG1_y_test_df_act_2_pred_1[FG1_y_test_df_act_2_pred_1.prediction==1].copy()

FG1_y_test_df_act_2_pred_1 = pd.merge(FG1_y_test_df_act_2_pred_1, ground_truth_labels_extended, on=['ID'], how='left')
FG1_y_test_df_act_1_pred_2 = pd.merge(FG1_y_test_df_act_1_pred_2, ground_truth_labels_extended, on=['ID'], how='left')

print (FG1_y_test_df_act_2_pred_1.ExctAcr.sum().round(2))
print (FG1_y_test_df_act_1_pred_2.ExctAcr.sum().round(2))
print ((FG1_y_test_df_act_2_pred_1.ExctAcr.sum()-FG1_y_test_df_act_1_pred_2.ExctAcr.sum()).round(2))

1144.29
36.94
1107.35


In [36]:
filename = model_dir + "regular"+ VI_idx +"_forest_grid_1.sav"
print (filename)
pickle.dump(regular_forest_grid_1, open(filename, 'wb'))

/Users/hn/Documents/01_research_data/NASA/ML_Models/regularNDVI_forest_grid_1.sav


### Regular More parameters

In [37]:
%%time

# parameters = {'n_jobs':[6],
#               'criterion': ["gini", "entropy"], # log_loss 
#               'max_depth':[1, 2, 4, 6, 8, 10, 11, 12, 13, 14, 15, 16, 17],
#               'min_samples_split':[4],
#               'max_features': ["log2"],
#               'class_weight':[None],
#               'ccp_alpha':[0.0],
#               'max_samples':[None]
#              }

parameters = {'n_jobs':[5],
              'criterion': ["gini", "entropy"], # log_loss 
              'max_depth':[2, 4, 6, 8, 9, 10, 11, 12, 13, 14, 15, 16, 18, 20],
              'min_samples_split':[2, 3, 4, 5],
              'max_features': ["sqrt", "log2", None],
              'class_weight':['balanced', 'balanced_subsample', None],
              'ccp_alpha':[0.0, 1, 2, 3], 
             # 'min_impurity_decreasefloat':[0, 1, 2], # roblem with sqrt stuff?
              'max_samples':[None, 1, 2, 3, 4, 5]
             }

regular_forest_grid_2 = GridSearchCV(RandomForestClassifier(random_state=0), 
                             parameters, cv=5, verbose=1,
                             error_score='raise')

regular_forest_grid_2.fit(x_train_df.iloc[:, 1:], y_train_df.Vote.values.ravel())

print (regular_forest_grid_2.best_params_)
print (regular_forest_grid_2.best_score_)


regular_forest_grid_2_predictions = regular_forest_grid_2.predict(x_test_df.iloc[:, 1:])
regular_forest_grid_2_y_test_df=y_test_df.copy()
regular_forest_grid_2_y_test_df["prediction"]=list(regular_forest_grid_2_predictions)
regular_forest_grid_2_y_test_df.head(2)

Fitting 5 folds for each of 24192 candidates, totalling 120960 fits
{'ccp_alpha': 0.0, 'class_weight': 'balanced', 'criterion': 'entropy', 'max_depth': 9, 'max_features': 'log2', 'max_samples': None, 'min_samples_split': 4, 'n_jobs': 5}
0.9627298413388393
CPU times: user 2h 38min 55s, sys: 16min 56s, total: 2h 55min 51s
Wall time: 3h 16min 15s


Unnamed: 0,ID,Vote,prediction
1221,7667_WSDA_SF_2016,1,1
1334,99748_WSDA_SF_2017,1,1


In [38]:
true_single_predicted_single=0
true_single_predicted_double=0

true_double_predicted_single=0
true_double_predicted_double=0

for index_ in regular_forest_grid_2_y_test_df.index:
    curr_vote=list(regular_forest_grid_2_y_test_df[regular_forest_grid_2_y_test_df.index==index_].Vote)[0]
    curr_predict=list(regular_forest_grid_2_y_test_df[regular_forest_grid_2_y_test_df.index==index_].prediction)[0]
    if curr_vote==curr_predict:
        if curr_vote==1: 
            true_single_predicted_single+=1
        else:
            true_double_predicted_double+=1
    else:
        if curr_vote==1:
            true_single_predicted_double+=1
        else:
            true_double_predicted_single+=1
            
regular_forest_grid_2_confus_tbl_test = pd.DataFrame(columns=['None', 'Predict_Single', 'Predict_Double'], 
                               index=range(2))
regular_forest_grid_2_confus_tbl_test.loc[0, 'None'] = 'Actual_Single'
regular_forest_grid_2_confus_tbl_test.loc[1, 'None'] = 'Actual_Double'
regular_forest_grid_2_confus_tbl_test['Predict_Single']=0
regular_forest_grid_2_confus_tbl_test['Predict_Double']=0

regular_forest_grid_2_confus_tbl_test.loc[0, "Predict_Single"]=true_single_predicted_single
regular_forest_grid_2_confus_tbl_test.loc[0, "Predict_Double"]=true_single_predicted_double
regular_forest_grid_2_confus_tbl_test.loc[1, "Predict_Single"]=true_double_predicted_single
regular_forest_grid_2_confus_tbl_test.loc[1, "Predict_Double"]=true_double_predicted_double
regular_forest_grid_2_confus_tbl_test

Unnamed: 0,None,Predict_Single,Predict_Double
0,Actual_Single,217,2
1,Actual_Double,14,36


In [79]:
FG2_y_test_df_act_1_pred_2=regular_forest_grid_2_y_test_df[regular_forest_grid_2_y_test_df.Vote==1].copy()
FG2_y_test_df_act_2_pred_1=regular_forest_grid_2_y_test_df[regular_forest_grid_2_y_test_df.Vote==2].copy()

FG2_y_test_df_act_1_pred_2=FG2_y_test_df_act_1_pred_2[FG2_y_test_df_act_1_pred_2.prediction==2].copy()
FG2_y_test_df_act_2_pred_1=FG2_y_test_df_act_2_pred_1[FG2_y_test_df_act_2_pred_1.prediction==1].copy()

FG2_y_test_df_act_2_pred_1 = pd.merge(FG2_y_test_df_act_2_pred_1, ground_truth_labels_extended, on=['ID'], how='left')
FG2_y_test_df_act_1_pred_2 = pd.merge(FG2_y_test_df_act_1_pred_2, ground_truth_labels_extended, on=['ID'], how='left')

print (FG2_y_test_df_act_2_pred_1.ExctAcr.sum().round(2))
print (FG2_y_test_df_act_1_pred_2.ExctAcr.sum().round(2))
print ((FG2_y_test_df_act_2_pred_1.ExctAcr.sum()-FG2_y_test_df_act_1_pred_2.ExctAcr.sum()).round(2))

1259.47
54.82
1204.65


In [40]:
filename = model_dir + "regular" + VI_idx + "_forest_grid_2.sav"
pickle.dump(regular_forest_grid_2, open(filename, 'wb'))

# Why?

We gave more options to the grid search, and the result in terms of accuracy is lowerd. Why?

In [65]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

print ("Confusion")
print(confusion_matrix(list(regular_forest_grid_1_y_test_df.Vote), 
                       list((regular_forest_grid_1_y_test_df.prediction))))
print ("_________________________________________________________________________")
print ("classification_report")
print(classification_report(list(regular_forest_grid_1_y_test_df.Vote), 
                            list((regular_forest_grid_1_y_test_df.prediction))))
print ("_________________________________________________________________________")
print ("accuracy_score")
print(accuracy_score(list(regular_forest_grid_1_y_test_df.Vote), 
                     list((regular_forest_grid_1_y_test_df.prediction))).round(3))

Confusion
[[218   1]
 [ 13  37]]
_________________________________________________________________________
classification_report
              precision    recall  f1-score   support

           1       0.94      1.00      0.97       219
           2       0.97      0.74      0.84        50

    accuracy                           0.95       269
   macro avg       0.96      0.87      0.90       269
weighted avg       0.95      0.95      0.95       269

_________________________________________________________________________
accuracy_score
0.948


In [66]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

print ("Confusion")
print(confusion_matrix(list(regular_forest_grid_2_y_test_df.Vote), 
                       list((regular_forest_grid_2_y_test_df.prediction))))
print ("_________________________________________________________________________")
print ("classification_report")
print(classification_report(list(regular_forest_grid_2_y_test_df.Vote), 
                            list((regular_forest_grid_2_y_test_df.prediction))))
print ("_________________________________________________________________________")
print ("accuracy_score")
print(accuracy_score(list(regular_forest_grid_2_y_test_df.Vote), 
                     list((regular_forest_grid_2_y_test_df.prediction))).round(3))

Confusion
[[217   2]
 [ 14  36]]
_________________________________________________________________________
classification_report
              precision    recall  f1-score   support

           1       0.94      0.99      0.96       219
           2       0.95      0.72      0.82        50

    accuracy                           0.94       269
   macro avg       0.94      0.86      0.89       269
weighted avg       0.94      0.94      0.94       269

_________________________________________________________________________
accuracy_score
0.941


# SG

In [41]:
data_dir = "/Users/hn/Documents/01_research_data/NASA/VI_TS/05_SG_TS/"

file_names = ["SG_Walla2015_" + VI_idx + "_JFD.csv", "SG_AdamBenton2016_" + VI_idx + "_JFD.csv", 
              "SG_Grant2017_" + VI_idx + "_JFD.csv", "SG_FranklinYakima2018_"+ VI_idx +"_JFD.csv"]

data=pd.DataFrame()

for file in file_names:
    curr_file=pd.read_csv(data_dir + file)
    curr_file['human_system_start_time'] = pd.to_datetime(curr_file['human_system_start_time'])
    
    # These data are for 3 years. The middle one is the correct one
    all_years = sorted(curr_file.human_system_start_time.dt.year.unique())
    if len(all_years)==3 or len(all_years)==2:
        proper_year = all_years[1]
    elif len(all_years)==1:
        proper_year = all_years[0]

    curr_file = curr_file[curr_file.human_system_start_time.dt.year==proper_year]
    data=pd.concat([data, curr_file])

data.reset_index(drop=True, inplace=True)
data.head(2)

Unnamed: 0,ID,human_system_start_time,NDVI
0,135073_WSDA_SF_2015,2015-01-10,0.115126
1,135073_WSDA_SF_2015,2015-01-20,0.111097


In [84]:
ground_truth = data[data.ID.isin(list(ground_truth_labels.ID.unique()))].copy()

print (len(meta_moreThan10Acr.ID.unique()))
ground_truth_labels_extended = pd.merge(ground_truth_labels, meta, on=['ID'], how='left')
ground_truth_labels = ground_truth_labels_extended[ground_truth_labels_extended.ExctAcr>=10].copy()
ground_truth_labels.reset_index(drop=True, inplace=True)

print ("There are [{:.0f}] fields in total whose"+ \
       "area adds up to [{:.2f}].".format(len(ground_truth_labels_extended),\
                                                            ground_truth_labels_extended.ExctAcr.sum()))


print ("There are [{:.0f}] fields larger than 10 "+ \
        "acres whose area adds up to [{:.2f}].".format(len(ground_truth_labels), \
                                                                    ground_truth_labels.ExctAcr.sum()))


3539


AttributeError: 'DataFrame' object has no attribute 'ExctAcr'

In [43]:
ground_truth = ground_truth[ground_truth.ID.isin((list(meta_moreThan10Acr.ID)))].copy()
ground_truth_labels = ground_truth_labels[ground_truth_labels.ID.isin((list(meta_moreThan10Acr.ID)))].copy()

ground_truth.reset_index(drop=True, inplace=True)
ground_truth_labels.reset_index(drop=True, inplace=True)

In [44]:
ground_truth.sort_values(by=["ID", 'human_system_start_time'], inplace=True)
ground_truth_labels.sort_values(by=["ID"], inplace=True)

ground_truth.reset_index(drop=True, inplace=True)
ground_truth_labels.reset_index(drop=True, inplace=True)

assert (len(ground_truth.ID.unique()) == len(ground_truth_labels.ID.unique()))

print (list(ground_truth.ID)[0])
print (list(ground_truth_labels.ID)[0])
print ("____________________________________")
print (list(ground_truth.ID)[-1])
print (list(ground_truth_labels.ID)[-1])
print ("____________________________________")
print (list(ground_truth.ID.unique())==list(ground_truth_labels.ID.unique()))

100048_WSDA_SF_2017
100048_WSDA_SF_2017
____________________________________
99909_WSDA_SF_2017
99909_WSDA_SF_2017
____________________________________
True


# Widen

In [45]:
NDVI_colnames = [VI_idx + "_" + str(ii) for ii in range(1, 37) ]
columnNames = ["ID"] + NDVI_colnames
ground_truth_wide = pd.DataFrame(columns=columnNames, 
                                index=range(len(ground_truth.ID.unique())))
ground_truth_wide["ID"] = ground_truth.ID.unique()

for an_ID in ground_truth.ID.unique():
    curr_df = ground_truth[ground_truth.ID==an_ID]
    
    ground_truth_wide_indx = ground_truth_wide[ground_truth_wide.ID==an_ID].index
    ground_truth_wide.loc[ground_truth_wide_indx, "NDVI_1":"NDVI_36"] = curr_df.NDVI.values[:36]

In [46]:
ground_truth_labels = ground_truth_labels.set_index('ID')
ground_truth_labels = ground_truth_labels.reindex(index=ground_truth_wide['ID'])
ground_truth_labels = ground_truth_labels.reset_index()

In [47]:
ground_truth_labels=ground_truth_labels[["ID", "Vote"]]
ground_truth_labels.head(2)

Unnamed: 0,ID,Vote
0,100048_WSDA_SF_2017,1
1,100081_WSDA_SF_2017,1


In [48]:
x_train_df, x_test_df, y_train_df, y_test_df = train_test_split(ground_truth_wide, 
                                                                ground_truth_labels, 
                                                                test_size=0.2, 
                                                                random_state=0,
                                                                shuffle=True,
                                                                stratify=ground_truth_labels.Vote.values)

In [49]:
%%time
forest_1_default_SG = RandomForestClassifier(n_estimators=100, 
                                             criterion='gini', max_depth=None, 
                                             min_samples_split=2, min_samples_leaf=1, 
                                             min_weight_fraction_leaf=0.0,
                                             max_features='sqrt', max_leaf_nodes=None, 
                                             min_impurity_decrease=0.0, 
                                             bootstrap=True, oob_score=False, n_jobs=None, 
                                             random_state=1, verbose=0, 
                                             warm_start=False, class_weight=None, 
                                             ccp_alpha=0.0, max_samples=None)

forest_1_default_SG.fit(x_train_df.iloc[:, 1:], y_train_df.iloc[:, 1:].values.ravel())

CPU times: user 271 ms, sys: 6.27 ms, total: 278 ms
Wall time: 277 ms


RandomForestClassifier(max_features='sqrt', random_state=1)

In [50]:
forest_1_default_SG_predictions = forest_1_default_SG.predict(x_test_df.iloc[:, 1:])
forest_1_default_SG_y_test_df=y_test_df.copy()
forest_1_default_SG_y_test_df["prediction"]=list(forest_1_default_SG_predictions)
forest_1_default_SG_y_test_df.head(2)

Unnamed: 0,ID,Vote,prediction
1221,7667_WSDA_SF_2016,1,1
1334,99748_WSDA_SF_2017,1,1


In [51]:
true_single_predicted_single=0
true_single_predicted_double=0

true_double_predicted_single=0
true_double_predicted_double=0

for index_ in forest_1_default_SG_y_test_df.index:
    curr_vote=list(forest_1_default_SG_y_test_df[forest_1_default_SG_y_test_df.index==index_].Vote)[0]
    curr_predict=list(forest_1_default_SG_y_test_df[forest_1_default_SG_y_test_df.index==index_].prediction)[0]
    if curr_vote==curr_predict:
        if curr_vote==1: 
            true_single_predicted_single+=1
        else:
            true_double_predicted_double+=1
    else:
        if curr_vote==1:
            true_single_predicted_double+=1
        else:
            true_double_predicted_single+=1
            
forest_default_SG_confus_tbl_test = pd.DataFrame(columns=['None', 'Predict_Single', 'Predict_Double'], 
                               index=range(2))
forest_default_SG_confus_tbl_test.loc[0, 'None'] = 'Actual_Single'
forest_default_SG_confus_tbl_test.loc[1, 'None'] = 'Actual_Double'
forest_default_SG_confus_tbl_test['Predict_Single']=0
forest_default_SG_confus_tbl_test['Predict_Double']=0

forest_default_SG_confus_tbl_test.loc[0, "Predict_Single"]=true_single_predicted_single
forest_default_SG_confus_tbl_test.loc[0, "Predict_Double"]=true_single_predicted_double
forest_default_SG_confus_tbl_test.loc[1, "Predict_Single"]=true_double_predicted_single
forest_default_SG_confus_tbl_test.loc[1, "Predict_Double"]=true_double_predicted_double
forest_default_SG_confus_tbl_test

Unnamed: 0,None,Predict_Single,Predict_Double
0,Actual_Single,217,2
1,Actual_Double,11,39


In [72]:
FD1_yTest_df_act_1_pred_2=forest_1_default_SG_y_test_df[forest_1_default_SG_y_test_df.Vote==1].copy()
FD1_yTest_df_act_2_pred_1=forest_1_default_SG_y_test_df[forest_1_default_SG_y_test_df.Vote==2].copy()

FD1_yTest_df_act_1_pred_2=FD1_yTest_df_act_1_pred_2[FD1_yTest_df_act_1_pred_2.prediction==2].copy()
FD1_yTest_df_act_2_pred_1=FD1_yTest_df_act_2_pred_1[FD1_yTest_df_act_2_pred_1.prediction==1].copy()

FD1_yTest_df_act_2_pred_1 = pd.merge(FD1_yTest_df_act_2_pred_1, ground_truth_labels_extended, on=['ID'], how='left')
FD1_yTest_df_act_1_pred_2 = pd.merge(FD1_yTest_df_act_1_pred_2, ground_truth_labels_extended, on=['ID'], how='left')

aa=FD1_yTest_df_act_2_pred_1.ExctAcr.sum()
bb=FD1_yTest_df_act_1_pred_2.ExctAcr.sum()
print ("FD1_yTest_df_act_2_pred_1.ExctAcr.sum():", aa.round(2))
print ("FD1_yTest_df_act_1_pred_2.ExctAcr.sum():", bb.round(2))
print (aa-bb)

FD1_yTest_df_act_2_pred_1.ExctAcr.sum(): 919.96
FD1_yTest_df_act_1_pred_2.ExctAcr.sum(): 54.82
865.1432841821751


In [67]:
%%time
parameters = {'n_jobs':[6],
              'criterion': ["gini", "entropy"], # log_loss 
              'max_depth':[2, 4, 6, 8, 9, 10, 11, 12, 14, 16, 18, 20],
              'min_samples_split':[2, 3, 4, 5],
              'max_features': ["sqrt", "log2", None],
              'class_weight':['balanced', 'balanced_subsample', None],
              'ccp_alpha':[0.0, 1, 2, 3], 
             # 'min_impurity_decreasefloat':[0, 1, 2], # roblem with sqrt stuff?
              'max_samples':[None, 1, 2, 3, 4, 5]
             } # , 
forest_grid_1_SG = GridSearchCV(RandomForestClassifier(random_state=0), 
                             parameters, cv=5, verbose=1,
                             error_score='raise')

forest_grid_1_SG.fit(x_train_df.iloc[:, 1:], y_train_df.Vote.values.ravel())

print (forest_grid_1_SG.best_params_)
print (forest_grid_1_SG.best_score_)


model_dir = "/Users/hn/Documents/01_research_data/NASA/ML_Models/"
filename = model_dir + "SG" + VI_idx + "_forest_grid_1.sav"
pickle.dump(forest_grid_1_SG, open(filename, 'wb'))

Fitting 5 folds for each of 20736 candidates, totalling 103680 fits
{'ccp_alpha': 0.0, 'class_weight': 'balanced_subsample', 'criterion': 'entropy', 'max_depth': 10, 'max_features': 'sqrt', 'max_samples': None, 'min_samples_split': 4, 'n_jobs': 6}
0.9636644207780918
CPU times: user 2h 13min 15s, sys: 14min 35s, total: 2h 27min 50s
Wall time: 2h 45min 10s


In [68]:
forest_grid_1_SG_predictions = forest_grid_1_SG.predict(x_test_df.iloc[:, 1:])
forest_grid_1_SG_y_test_df=y_test_df.copy()
forest_grid_1_SG_y_test_df["prediction"]=list(forest_grid_1_SG_predictions)
forest_grid_1_SG_y_test_df.head(2)

Unnamed: 0,ID,Vote,prediction
1221,7667_WSDA_SF_2016,1,1
1334,99748_WSDA_SF_2017,1,1


In [69]:
true_single_predicted_single=0
true_single_predicted_double=0

true_double_predicted_single=0
true_double_predicted_double=0

for index_ in forest_grid_1_SG_y_test_df.index:
    curr_vote=list(forest_grid_1_SG_y_test_df[forest_grid_1_SG_y_test_df.index==index_].Vote)[0]
    curr_predict=list(forest_grid_1_SG_y_test_df[forest_grid_1_SG_y_test_df.index==index_].prediction)[0]
    if curr_vote==curr_predict:
        if curr_vote==1: 
            true_single_predicted_single+=1
        else:
            true_double_predicted_double+=1
    else:
        if curr_vote==1:
            true_single_predicted_double+=1
        else:
            true_double_predicted_single+=1
            
forest_default_SG_confus_tbl_test = pd.DataFrame(columns=['None', 'Predict_Single', 'Predict_Double'], 
                               index=range(2))
forest_default_SG_confus_tbl_test.loc[0, 'None'] = 'Actual_Single'
forest_default_SG_confus_tbl_test.loc[1, 'None'] = 'Actual_Double'
forest_default_SG_confus_tbl_test['Predict_Single']=0
forest_default_SG_confus_tbl_test['Predict_Double']=0

forest_default_SG_confus_tbl_test.loc[0, "Predict_Single"]=true_single_predicted_single
forest_default_SG_confus_tbl_test.loc[0, "Predict_Double"]=true_single_predicted_double
forest_default_SG_confus_tbl_test.loc[1, "Predict_Single"]=true_double_predicted_single
forest_default_SG_confus_tbl_test.loc[1, "Predict_Double"]=true_double_predicted_double
forest_default_SG_confus_tbl_test

Unnamed: 0,None,Predict_Single,Predict_Double
0,Actual_Single,217,2
1,Actual_Double,14,36


In [76]:
FG1_yTest_df_act_1_pred_2=forest_grid_1_SG_y_test_df[forest_grid_1_SG_y_test_df.Vote==1].copy()
FG1_yTest_df_act_2_pred_1=forest_grid_1_SG_y_test_df[forest_grid_1_SG_y_test_df.Vote==2].copy()

FG1_yTest_df_act_1_pred_2=FG1_yTest_df_act_1_pred_2[FG1_yTest_df_act_1_pred_2.prediction==2].copy()
FG1_yTest_df_act_2_pred_1=FG1_yTest_df_act_2_pred_1[FG1_yTest_df_act_2_pred_1.prediction==1].copy()

FG1_yTest_df_act_2_pred_1 = pd.merge(FG1_yTest_df_act_2_pred_1, ground_truth_labels_extended, on=['ID'], how='left')
FG1_yTest_df_act_1_pred_2 = pd.merge(FG1_yTest_df_act_1_pred_2, ground_truth_labels_extended, on=['ID'], how='left')

print (FG1_yTest_df_act_2_pred_1.ExctAcr.sum().round(2))
print (FG1_yTest_df_act_1_pred_2.ExctAcr.sum().round(2))
print ((np.abs(FG1_yTest_df_act_1_pred_2.ExctAcr.sum() - FG1_yTest_df_act_2_pred_1.ExctAcr.sum())).round(2))

1259.47
54.82
1204.65
