In [3]:
import numpy as np
import pandas as pd
import scipy, scipy.signal

from datetime import date
import time

from random import seed
from random import random

import os, os.path
import shutil

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

import matplotlib
import matplotlib.pyplot as plt
from pylab import imshow

import h5py
import sys
sys.path.append('/Users/hn/Documents/00_GitHub/Ag/NASA/Python_codes/')
import NASA_core as nc
# import NASA_plot_core.py as rcp

In [2]:
# Folder of season-count CSV tables: a later cell lists it with os.listdir and
# reads the matching files from it.
# NOTE: the Sentinel section further down rebinds data_dir to a different folder.
data_dir = "/Users/hn/Documents/01_research_data/NASA/06_SOS_tables/"

In [4]:
# Read the per-field metadata table and compare its size with the subset of
# fields strictly larger than 10 acres, then preview the first two rows.
# NOTE(review): this subset uses "> 10" while the label filter in a later
# cell uses "ExctAcr >= 10".
meta_dir = "/Users/hn/Documents/01_research_data/NASA/parameters/"
meta = pd.read_csv(f"{meta_dir}evaluation_set.csv")
meta_moreThan10Acr = meta.loc[meta["ExctAcr"] > 10]
print(meta.shape, meta_moreThan10Acr.shape, sep="\n")
meta.head(2)

(6340, 8)
(3539, 8)


Unnamed: 0,ID,CropTyp,Irrigtn,DataSrc,Acres,ExctAcr,LstSrvD,county
0,100010_WSDA_SF_2017,alfalfa hay,center pivot,wsda,34,34.310305,2017/09/12,Grant
1,100204_WSDA_SF_2017,alfalfa hay,center pivot,wsda,62,61.826535,2017/08/09,Grant


In [4]:
# Sorted list of the distinct county names present in the metadata table.
sorted(meta.county.unique())

['Adams', 'Benton', 'Franklin', 'Grant', 'Walla Walla', 'Yakima']

In [5]:
# Pull out the metadata rows of Walla Walla county and show their field IDs.
walla = meta.loc[meta["county"] == "Walla Walla"]
walla["ID"]

244     150030_WSDA_SF_2015
247     151525_WSDA_SF_2015
248     154028_WSDA_SF_2015
249     157039_WSDA_SF_2015
250     157040_WSDA_SF_2015
               ...         
6219    144551_WSDA_SF_2015
6220    150442_WSDA_SF_2015
6221    162851_WSDA_SF_2015
6222    173974_WSDA_SF_2015
6223    180631_WSDA_SF_2015
Name: ID, Length: 649, dtype: object

In [5]:
# Load the expert labels (one Vote per field ID), report which vote values
# occur and how many distinct fields are labelled, then preview two rows.
training_set_dir = "/Users/hn/Documents/01_research_data/NASA/ML_data/"
ground_truth_labels = pd.read_csv(f"{training_set_dir}train_labels.csv")
print("Unique Votes: ", ground_truth_labels["Vote"].unique())
print(len(pd.unique(ground_truth_labels["ID"])))
ground_truth_labels.head(2)

Unique Votes:  [2 1]
1849


Unnamed: 0,ID,Vote
0,99837_WSDA_SF_2017,2
1,114615_WSDA_SF_2017,1


In [7]:
# Attach the field metadata to every label row, then keep only fields of at
# least 10 acres, renumbering the rows from zero.
# NOTE: ground_truth_labels is rebound here, so re-running this cell without
# reloading the labels first merges the metadata a second time.
ground_truth_labels_extended = ground_truth_labels.merge(meta, on="ID", how="left")
ground_truth_labels = (
    ground_truth_labels_extended
    .loc[ground_truth_labels_extended["ExctAcr"] >= 10]
    .reset_index(drop=True)
)

In [8]:
# Rows/columns of the label table after the ExctAcr >= 10 filter.
ground_truth_labels.shape

(1342, 9)

# Test set (the 20%)

In [9]:
# Load the IDs of the 20% test split and keep only the label rows whose ID
# belongs to that split; the last line displays the size of that subset.
test_set_dir = "/Users/hn/Documents/01_research_data/NASA/ML_data/"
testset = pd.read_csv(f"{test_set_dir}test20_split_expertLabels_2Bconsistent.csv")
ground_truth_labels_test = ground_truth_labels.loc[
    ground_truth_labels["ID"].isin(testset["ID"].tolist())
]
ground_truth_labels_test.shape

(269, 9)

In [10]:
# List the season-count folder and keep the files whose name carries the
# wanted filter tag.
NamePattern = "irr_NoNASS_SurvCorrect"

all_files = os.listdir(data_dir)
files_with_3_filters = [file_name for file_name in all_files if NamePattern in file_name]

In [11]:
# Confusion tables of expert vote vs. predicted season count on the test
# fields, one table per vegetation-index / threshold combination.
VI_indeksss = ["NDVI", "EVI"]
NDVI_thresholds = [3, 4, 5]

# Every test field is expected to receive exactly one prediction, so the number
# of label rows is the row count each prediction table should end up with.
test_IDs = list(ground_truth_labels_test.ID.unique())
expected_count = len(ground_truth_labels_test)

for VI_indeks in VI_indeksss:
    for NDVI_threshold in NDVI_thresholds:
        # Collect the (ID, season_count) pairs of every file of this combination
        # in a list and concatenate once, instead of growing a frame per file.
        season_tables = []
        for fileName in files_with_3_filters:
            if VI_indeks+str(NDVI_threshold) not in fileName:
                continue
            a = pd.read_csv(data_dir + fileName)
            a['human_system_start_time'] = pd.to_datetime(a['human_system_start_time'])
            # The year is taken from the last four characters of the third
            # "_"-separated token of the file name; only rows of that year are kept.
            curr_year = int(fileName.split("_")[2][-4:])
            a = a.loc[a['human_system_start_time'].dt.year == curr_year, ["ID", "season_count"]]
            season_tables.append(a)

        print ("==================================================================================================")
        if len(season_tables) == 0:
            # Without this guard the attribute access on an empty frame below
            # fails with an AttributeError that hides the real cause.
            print ("VI_indeks: " + VI_indeks + ", NDVI_threshold: " + str(NDVI_threshold))
            print ("no matching files found in " + data_dir)
            print ()
            continue

        all_data = pd.concat(season_tables)
        all_data = all_data[all_data.ID.isin(test_IDs)].drop_duplicates().reset_index(drop=True)

        single_season = all_data[all_data.season_count<2]
        two_seasons = all_data[all_data.season_count>=2]
        # Sanity check: one prediction row per test field.
        print ("this must be " + str(expected_count) + ":", str(len(single_season)+len(two_seasons)))

        evalHelp = pd.merge(ground_truth_labels_test, all_data, on=['ID'], how='left')

        # Rows without a season_count match none of the comparisons below and
        # would vanish from the table silently, so report them when present.
        n_unscored = int(evalHelp.season_count.isna().sum())
        if n_unscored > 0:
            print ("WARNING: " + str(n_unscored) + " test rows have no season_count and are left out of the table")

        # The same definition of "double" (vote of 2 or more) is used for both
        # cells of the Actual_Double row.
        actual_single = evalHelp.Vote == 1
        actual_double = evalHelp.Vote >= 2
        predicted_single = evalHelp.season_count < 2
        predicted_double = evalHelp.season_count >= 2

        true_single_predicted_single = evalHelp[actual_single & predicted_single]
        true_double_predicted_double = evalHelp[actual_double & predicted_double]
        true_double_predicted_single = evalHelp[actual_double & predicted_single]
        true_single_predicted_double = evalHelp[actual_single & predicted_double]

        balanced_confus_tbl_test = pd.DataFrame({
            'None': ['Actual_Single', 'Actual_Double'],
            'Predict_Single': [len(true_single_predicted_single), len(true_double_predicted_single)],
            'Predict_Double': [len(true_single_predicted_double), len(true_double_predicted_double)],
        })

        print ("VI_indeks: " + VI_indeks + ", NDVI_threshold: " + str(NDVI_threshold))
        print (balanced_confus_tbl_test)
        print ("")
        # Imbalance between the two kinds of error, first in field counts and
        # then in summed ExctAcr.
        _dc = np.abs(balanced_confus_tbl_test.loc[0, "Predict_Double"]-balanced_confus_tbl_test.loc[1, "Predict_Single"])
        print ("count difference is " + str(_dc))
        acr_diff = np.abs(true_double_predicted_single.ExctAcr.sum()-true_single_predicted_double.ExctAcr.sum())
        print ("acr difference is "+str(acr_diff))
        print ("true_double_predicted_single.ExctAcr", true_double_predicted_single.ExctAcr.sum())
        print ("true_single_predicted_double.ExctAcr", true_single_predicted_double.ExctAcr.sum())
        print ()

this must be 269: 269
VI_indeks: NDVI, NDVI_threshold: 3
            None  Predict_Single  Predict_Double
0  Actual_Single             175              44
1  Actual_Double              19              31

count difference is 25
acr difference is 1026.7778183940616
true_double_predicted_single.ExctAcr 1754.0358052667268
true_single_predicted_double.ExctAcr 2780.8136236607884

this must be 269: 269
VI_indeks: NDVI, NDVI_threshold: 4
            None  Predict_Single  Predict_Double
0  Actual_Single             174              45
1  Actual_Double              11              39

count difference is 34
acr difference is 1897.496052101365
true_double_predicted_single.ExctAcr 1127.890721287638
true_single_predicted_double.ExctAcr 3025.386773389003

this must be 269: 269
VI_indeks: NDVI, NDVI_threshold: 5
            None  Predict_Single  Predict_Double
0  Actual_Single             189              30
1  Actual_Double              10              40

count difference is 20
acr difference is 1

# Drop Walla Walla

In [12]:
# Reload the per-field metadata for the run without Walla Walla and print the
# table size next to the size of the strictly-larger-than-10-acre subset.
meta_dir = "/Users/hn/Documents/01_research_data/NASA/parameters/"
meta = pd.read_csv(f"{meta_dir}evaluation_set.csv")
meta_moreThan10Acr = meta.loc[meta["ExctAcr"] > 10]
print(meta.shape, meta_moreThan10Acr.shape, sep="\n")
meta.head(2)

(6340, 8)
(3539, 8)


Unnamed: 0,ID,CropTyp,Irrigtn,DataSrc,Acres,ExctAcr,LstSrvD,county
0,100010_WSDA_SF_2017,alfalfa hay,center pivot,wsda,34,34.310305,2017/09/12,Grant
1,100204_WSDA_SF_2017,alfalfa hay,center pivot,wsda,62,61.826535,2017/08/09,Grant


In [13]:
# Reload the expert labels from disk (the name was rebound to a filtered table
# earlier), report the vote values and the number of distinct field IDs.
training_set_dir = "/Users/hn/Documents/01_research_data/NASA/ML_data/"
ground_truth_labels = pd.read_csv(f"{training_set_dir}train_labels.csv")
print("Unique Votes: ", ground_truth_labels["Vote"].unique())
print(len(pd.unique(ground_truth_labels["ID"])))
ground_truth_labels.head(2)

Unique Votes:  [2 1]
1849


Unnamed: 0,ID,Vote
0,99837_WSDA_SF_2017,2
1,114615_WSDA_SF_2017,1


In [14]:
# Join the metadata onto the freshly loaded labels and keep fields of at least
# 10 acres, with a clean 0..n-1 index.
ground_truth_labels_extended = ground_truth_labels.merge(meta, on="ID", how="left")
ground_truth_labels = (
    ground_truth_labels_extended
    .loc[ground_truth_labels_extended["ExctAcr"] >= 10]
    .reset_index(drop=True)
)

In [15]:
# Exclude every labelled field located in Walla Walla county.
ground_truth_labels = ground_truth_labels.loc[ground_truth_labels["county"] != "Walla Walla"]

# Test Set (20%)

In [16]:
# Re-read the test-split IDs and restrict the (Walla Walla-free) labels to
# them; the last line displays the size of the resulting test table.
test_set_dir = "/Users/hn/Documents/01_research_data/NASA/ML_data/"
testset = pd.read_csv(f"{test_set_dir}test20_split_expertLabels_2Bconsistent.csv")

ground_truth_labels_test = ground_truth_labels.loc[
    ground_truth_labels["ID"].isin(testset["ID"].tolist())
]
ground_truth_labels_test.shape

(248, 9)

In [17]:
# Re-list the season-count folder and keep the file names that contain the
# filter tag.
NamePattern = "irr_NoNASS_SurvCorrect"

all_files = os.listdir(data_dir)
files_with_3_filters = [file_name for file_name in all_files if NamePattern in file_name]

In [18]:
# Confusion tables of expert vote vs. predicted season count on the test
# fields that remain after dropping Walla Walla, one table per
# vegetation-index / threshold combination.
VI_indeksss = ["NDVI", "EVI"]
NDVI_thresholds = [3, 4, 5]

# The sanity-check message used to hard-code 269, which is the size of the
# test set *before* Walla Walla was dropped. Derive the expected row count from
# the label table actually in use so the message cannot go stale.
test_IDs = list(ground_truth_labels_test.ID.unique())
expected_count = len(ground_truth_labels_test)

for VI_indeks in VI_indeksss:
    for NDVI_threshold in NDVI_thresholds:
        # Collect the (ID, season_count) pairs of every file of this combination
        # in a list and concatenate once, instead of growing a frame per file.
        season_tables = []
        for fileName in files_with_3_filters:
            if VI_indeks+str(NDVI_threshold) not in fileName:
                continue
            a = pd.read_csv(data_dir + fileName)
            a['human_system_start_time'] = pd.to_datetime(a['human_system_start_time'])
            # The year is taken from the last four characters of the third
            # "_"-separated token of the file name; only rows of that year are kept.
            curr_year = int(fileName.split("_")[2][-4:])
            a = a.loc[a['human_system_start_time'].dt.year == curr_year, ["ID", "season_count"]]
            season_tables.append(a)

        print ("==================================================================================================")
        if len(season_tables) == 0:
            # Without this guard the attribute access on an empty frame below
            # fails with an AttributeError that hides the real cause.
            print ("VI_indeks: " + VI_indeks + ", NDVI_threshold: " + str(NDVI_threshold))
            print ("no matching files found in " + data_dir)
            print ()
            continue

        all_data = pd.concat(season_tables)
        all_data = all_data[all_data.ID.isin(test_IDs)].drop_duplicates().reset_index(drop=True)

        single_season = all_data[all_data.season_count<2]
        two_seasons = all_data[all_data.season_count>=2]
        # Sanity check: one prediction row per test field.
        print ("this must be " + str(expected_count) + ":", str(len(single_season)+len(two_seasons)))

        evalHelp = pd.merge(ground_truth_labels_test, all_data, on=['ID'], how='left')

        # Rows without a season_count match none of the comparisons below and
        # would vanish from the table silently, so report them when present.
        n_unscored = int(evalHelp.season_count.isna().sum())
        if n_unscored > 0:
            print ("WARNING: " + str(n_unscored) + " test rows have no season_count and are left out of the table")

        # The same definition of "double" (vote of 2 or more) is used for both
        # cells of the Actual_Double row.
        actual_single = evalHelp.Vote == 1
        actual_double = evalHelp.Vote >= 2
        predicted_single = evalHelp.season_count < 2
        predicted_double = evalHelp.season_count >= 2

        true_single_predicted_single = evalHelp[actual_single & predicted_single]
        true_double_predicted_double = evalHelp[actual_double & predicted_double]
        true_double_predicted_single = evalHelp[actual_double & predicted_single]
        true_single_predicted_double = evalHelp[actual_single & predicted_double]

        balanced_confus_tbl_test = pd.DataFrame({
            'None': ['Actual_Single', 'Actual_Double'],
            'Predict_Single': [len(true_single_predicted_single), len(true_double_predicted_single)],
            'Predict_Double': [len(true_single_predicted_double), len(true_double_predicted_double)],
        })

        print ("VI_indeks: " + VI_indeks + ", NDVI_threshold: " + str(NDVI_threshold))
        print (balanced_confus_tbl_test)
        print ("")
        # Imbalance between the two kinds of error, first in field counts and
        # then in summed ExctAcr.
        _dc = np.abs(balanced_confus_tbl_test.loc[0, "Predict_Double"]-balanced_confus_tbl_test.loc[1, "Predict_Single"])
        print ("count difference is " + str(_dc))
        acr_diff = np.abs(true_double_predicted_single.ExctAcr.sum()-true_single_predicted_double.ExctAcr.sum())
        print ("acr difference is "+str(acr_diff))
        print ("true_double_predicted_single.ExctAcr", true_double_predicted_single.ExctAcr.sum())
        print ("true_single_predicted_double.ExctAcr", true_single_predicted_double.ExctAcr.sum())
        print ()

this must be 269: 248
VI_indeks: NDVI, NDVI_threshold: 3
            None  Predict_Single  Predict_Double
0  Actual_Single             159              41
1  Actual_Double              17              31

count difference is 24
acr difference is 962.3965821776367
true_double_predicted_single.ExctAcr 1622.880387351114
true_single_predicted_double.ExctAcr 2585.2769695287507

this must be 269: 248
VI_indeks: NDVI, NDVI_threshold: 4
            None  Predict_Single  Predict_Double
0  Actual_Single             157              43
1  Actual_Double               9              39

count difference is 34
acr difference is 1973.9842729535007
true_double_predicted_single.ExctAcr 996.7353033720248
true_single_predicted_double.ExctAcr 2970.7195763255254

this must be 269: 248
VI_indeks: NDVI, NDVI_threshold: 5
            None  Predict_Single  Predict_Double
0  Actual_Single             171              29
1  Actual_Double               8              40

count difference is 21
acr difference is 1

# Sentinel

In [22]:
# Root folder of the Sentinel tables; the loop further down appends a
# per-threshold sub-folder name to it.
sent_dir = "/Users/hn/Documents/01_research_data/NASA/Sentinel/"

In [23]:
# Rebuild the Walla Walla-free label table and its test-split subset for the
# Sentinel comparison; the last line displays the subset's size.
ground_truth_labels = ground_truth_labels.loc[ground_truth_labels["county"] != "Walla Walla"]
ground_truth_labels_test = ground_truth_labels.loc[
    ground_truth_labels["ID"].isin(testset["ID"].tolist())
]
ground_truth_labels_test.shape

(248, 9)

In [24]:
# DISABLED CODE (nothing in this cell executes): an alternative loader that
# read three "extended_all_fields_seasonCounts_noFilter_SEOS<k>" tables and
# kept the rows with SG_params == 73.
# fileNames = ["extended_all_fields_seasonCounts_noFilter_SEOS3", "extended_all_fields_seasonCounts_noFilter_SEOS4",
#              "extended_all_fields_seasonCounts_noFilter_SEOS5"]
# sentinel_data=pd.DataFrame()
# for a_file in fileNames:
#     a=pd.read_csv(sent_dir+"05_01_allFields_SeasonCounts/" + a_file+".csv")
#     a=a[a.SG_params==73]
#     sentinel_data=pd.concat([sentinel_data, a])

In [25]:
# Settings consumed by the Sentinel evaluation loop below.
VI_indeksss = ["NDVI", "EVI"]  # substrings matched against file names to pick an index
years=[2016, 2017, 2018]  # each year selects a fixed set of county files in the loop
NDVI_thresholds = [3, 4, 5]  # appended twice to the folder name (SOS<k>_EOS<k>)
Name_pattern = "win7_Order3"  # only file names containing this substring are read
folder_prePattern = "2Yrs_tbl_reg_fineGranular_SOS"  # folder-name prefix under sent_dir

In [39]:
# Presumably scratch values for stepping through the loop body by hand.
# NOTE(review): the loop cell below reassigns all three names, so in a
# top-to-bottom run these values are never used — confirm and consider removing.
VI_indeks="NDVI"
NDVI_threshold=3
all_data=pd.DataFrame()

In [38]:
# Confusion tables of expert vote vs. Sentinel-derived season count on the
# test fields, one table per vegetation-index / threshold combination.

# File-name tags (county_year) that are read for each year. An unknown year
# now fails with a KeyError instead of silently reusing the previous year's
# file list, which is what the former if/elif chain did.
county_tags_by_year = {
    2016: ["Adams_2016", "Benton_2016"],
    2017: ["Grant_2017"],
    2018: ["Franklin_2018", "Yakima_2018"],
}

# Every test field is expected to receive exactly one prediction, so the number
# of label rows is the row count each prediction table should end up with.
test_IDs = list(ground_truth_labels_test.ID.unique())
expected_count = len(ground_truth_labels_test)

for VI_indeks in VI_indeksss:
    for NDVI_threshold in NDVI_thresholds:
        # The folder and its listing depend only on the threshold and the
        # index, not on the year, so they are resolved once per combination.
        folder_name = folder_prePattern+str(NDVI_threshold)+"_EOS"+str(NDVI_threshold)
        # NOTE: this rebinds the notebook-level data_dir on purpose; cells
        # further down read files relative to it.
        data_dir = sent_dir+folder_name+"/"
        file_list = os.listdir(data_dir)
        file_list_SG73 = [x for x in file_list if Name_pattern in x and VI_indeks in x]

        season_tables = []
        for year in years:
            finalFileNames = []
            for county_tag in county_tags_by_year[year]:
                finalFileNames += [x for x in file_list_SG73 if county_tag in x]

            # print(VI_indeks, year, NDVI_threshold, finalFileNames)
            for a_file in finalFileNames:
                a = pd.read_csv(data_dir+a_file)
                # NOTE(review): the timestamp is parsed but not used afterwards in
                # this cell; the earlier evaluation cells filter rows to the file's
                # year at this point — confirm whether the same filter is intended.
                a['human_system_start_time'] = pd.to_datetime(a['human_system_start_time'])
                season_tables.append(a[["ID", "season_count"]])

        print ("==================================================================================================")
        if len(season_tables) == 0:
            # Without this guard the attribute access on an empty frame below
            # fails with an AttributeError that hides the real cause.
            print ("VI_indeks: " + VI_indeks + ", NDVI_threshold: " + str(NDVI_threshold))
            print ("no matching files found in " + data_dir)
            print ()
            continue

        # One concat/filter/dedupe after all years gives the same rows in the
        # same order as repeating those steps after every year.
        all_data = pd.concat(season_tables)
        all_data = all_data[all_data.ID.isin(test_IDs)].drop_duplicates().reset_index(drop=True)

        single_season = all_data[all_data.season_count<2]
        two_seasons = all_data[all_data.season_count>=2]
        # Sanity check: one prediction row per test field.
        print ("this must be " + str(expected_count) + ":", str(len(single_season)+len(two_seasons)))

        evalHelp = pd.merge(ground_truth_labels_test, all_data, on=['ID'], how='left')

        # Rows without a season_count match none of the comparisons below and
        # would vanish from the table silently, so report them when present.
        n_unscored = int(evalHelp.season_count.isna().sum())
        if n_unscored > 0:
            print ("WARNING: " + str(n_unscored) + " test rows have no season_count and are left out of the table")

        # The same definition of "double" (vote of 2 or more) is used for both
        # cells of the Actual_Double row.
        actual_single = evalHelp.Vote == 1
        actual_double = evalHelp.Vote >= 2
        predicted_single = evalHelp.season_count < 2
        predicted_double = evalHelp.season_count >= 2

        true_single_predicted_single = evalHelp[actual_single & predicted_single]
        true_double_predicted_double = evalHelp[actual_double & predicted_double]
        true_double_predicted_single = evalHelp[actual_double & predicted_single]
        true_single_predicted_double = evalHelp[actual_single & predicted_double]

        balanced_confus_tbl_test = pd.DataFrame({
            'None': ['Actual_Single', 'Actual_Double'],
            'Predict_Single': [len(true_single_predicted_single), len(true_double_predicted_single)],
            'Predict_Double': [len(true_single_predicted_double), len(true_double_predicted_double)],
        })

        print ("VI_indeks: " + VI_indeks + ", NDVI_threshold: " + str(NDVI_threshold))
        print (balanced_confus_tbl_test)
        print ("")
        # Imbalance between the two kinds of error, first in field counts and
        # then in summed ExctAcr.
        _dc = np.abs(balanced_confus_tbl_test.loc[0, "Predict_Double"]-balanced_confus_tbl_test.loc[1, "Predict_Single"])
        print ("count difference is " + str(_dc))
        acr_diff = np.abs(true_double_predicted_single.ExctAcr.sum()-true_single_predicted_double.ExctAcr.sum())
        print ("acr difference is "+str(acr_diff))
        print ("true_double_predicted_single.ExctAcr", true_double_predicted_single.ExctAcr.sum())
        print ("true_single_predicted_double.ExctAcr", true_single_predicted_double.ExctAcr.sum())
        print ()


this must be 248: 248
VI_indeks: NDVI, NDVI_threshold: 3
            None  Predict_Single  Predict_Double
0  Actual_Single             176              24
1  Actual_Double              18              30

count difference is 6
acr difference is 278.1679991898775
true_double_predicted_single.ExctAcr 1777.304625509689
true_single_predicted_double.ExctAcr 1499.1366263198115

this must be 248: 248
VI_indeks: NDVI, NDVI_threshold: 4
            None  Predict_Single  Predict_Double
0  Actual_Single             183              17
1  Actual_Double              11              37

count difference is 6
acr difference is 26.924727617746385
true_double_predicted_single.ExctAcr 1022.8747940444512
true_single_predicted_double.ExctAcr 995.9500664267048

this must be 248: 248
VI_indeks: NDVI, NDVI_threshold: 5
            None  Predict_Single  Predict_Double
0  Actual_Single             182              18
1  Actual_Double              12              36

count difference is 6
acr difference is 156.

In [42]:
# NOTE(review): the recorded output has 25 columns, whereas the evaluation loop
# leaves all_data with only ID and season_count — this output presumably comes
# from a different state of all_data; confirm before relying on it.
all_data.shape

(397, 25)

In [372]:
# Read the three yearly Adams EVI tables from whatever folder data_dir points
# to at this moment (the Sentinel loop rebinds it).
adam_2016, adam_2017, adam_2018 = (
    pd.read_csv(data_dir + "Adams_" + str(table_year) + "_regular_EVI_SG_win7_Order3.csv")
    for table_year in (2016, 2017, 2018)
)

In [401]:
# Show every row currently held in all_data for one example field.
all_data[all_data.ID=="102173_WSDA_SF_2018"]

Unnamed: 0,ID,Acres,county,CropGrp,CropTyp,DataSrc,ExctAcr,IntlSrD,Irrigtn,LstSrvD,...,image_year,SF_year,doy,EVI,human_system_start_time,Date,EVI_ratio,SOS,EOS,season_count
1762,102173_WSDA_SF_2018,4,Franklin,Hay/Silage,alfalfa hay,wsda,3.887527,2005/08/15 00:00:00,wheel line,2018/04/11 00:00:00,...,2018,2018,75,0.627292,2018-03-16,2018-03-16 00:00:00,0.531956,0.627292,0.0,3
1763,102173_WSDA_SF_2018,4,Franklin,Hay/Silage,alfalfa hay,wsda,3.887527,2005/08/15 00:00:00,wheel line,2018/04/11 00:00:00,...,2018,2018,124,0.605709,2018-05-04,2018-05-04 00:00:00,0.486745,0.0,0.605709,3
1764,102173_WSDA_SF_2018,4,Franklin,Hay/Silage,alfalfa hay,wsda,3.887527,2005/08/15 00:00:00,wheel line,2018/04/11 00:00:00,...,2018,2018,136,0.623922,2018-05-16,2018-05-16 00:00:00,0.524896,0.623922,0.0,3
1765,102173_WSDA_SF_2018,4,Franklin,Hay/Silage,alfalfa hay,wsda,3.887527,2005/08/15 00:00:00,wheel line,2018/04/11 00:00:00,...,2018,2018,253,0.610363,2018-09-10,2018-09-10 00:00:00,0.496493,0.0,0.610363,3
1766,102173_WSDA_SF_2018,4,Franklin,Hay/Silage,alfalfa hay,wsda,3.887527,2005/08/15 00:00:00,wheel line,2018/04/11 00:00:00,...,2018,2018,278,0.612874,2018-10-05,2018-10-05 00:00:00,0.501753,0.612874,0.0,3
1767,102173_WSDA_SF_2018,4,Franklin,Hay/Silage,alfalfa hay,wsda,3.887527,2005/08/15 00:00:00,wheel line,2018/04/11 00:00:00,...,2018,2018,324,0.608222,2018-11-20,2018-11-20 00:00:00,0.492008,0.0,0.608222,3


In [27]:
# Distinct counties present in all_data.
# NOTE(review): needs all_data to still carry a `county` column; the evaluation
# loop reduces all_data to ID/season_count, so this presumably ran against a
# wider version of the frame — confirm.
all_data.county.unique()

array(['Adams', 'Benton', 'Grant', 'Franklin', 'Yakima'], dtype=object)

In [408]:
# Restrict all_data to the test fields and count how many distinct IDs remain.
test_field_mask = all_data["ID"].isin(ground_truth_labels_test["ID"].unique().tolist())
all_data = all_data.loc[test_field_mask].copy()
len(pd.unique(all_data["ID"]))

248

In [35]:
# NOTE(review): the frame is named Adams, but the filter selects the rows of
# Franklin county — rename or change the filter once the intent is confirmed.
Adams = all_data[all_data.county=="Franklin"]
# Distinct calendar years found in the selected rows' timestamps.
Adams['human_system_start_time'].dt.year.unique()

array([2018], dtype=int64)