In [1]:
import os
import sys
from pathlib import Path
import pandas as pd

%matplotlib inline

In [2]:
sys.path.append(r"../LUCinSA_helpers")
from RF1_CreateModel import *

In [3]:
LUT=pd.read_csv('../Class_LUT.csv')
LUT.drop(['Description'], axis=1, inplace=True)
print(LUT.sort_values('LC22'))

                 USE_NAME  LC22 LC5_name  LC5  LC17          LC17_name  LC3  \
0              NoVeg_Bare     1    NoVeg   10     1         NoVeg_Bare    0   
1             NoVeg_Built     3    NoVeg   10     3        NoVeg_Built    0   
2             NoVeg_Water     7    NoVeg   10     7        NoVeg_Water    0   
4       Grassland-Natural    12   LowVeg   20    12  Grassland-Natural    0   
5       Grassland-Planted    13   LowVeg   20    13  Grassland-Planted    0   
6   Grassland-Natural-wet    17   LowVeg   20    13  Grassland-Natural    0   
14         Crops-Soybeans    31   LowVeg   20    31     Crops-Soybeans    1   
15            Crops-Beans    32   LowVeg   20    35          Crops-mix    1   
16             Crops-Corn    33   LowVeg   20    33         Crops-Corn    1   
17         Crops-Mandioca    34   LowVeg   20    35          Crops-mix    1   
18              Crops-mix    35   LowVeg   20    35          Crops-mix    1   
19          TreePlant-new    36   LowVeg   20    99 

#### start with default models with pixel-only data:

In [4]:
pix_data = pd.read_csv('D:/NasaProject/Paraguay/sampling/samplePts_FINALdfs/RFdf.csv')
pix_data.drop(['LC17','LC2'], axis=1, inplace=True)
pixdf = pix_data.merge(LUT, left_on='Class', right_on='USE_NAME', how='left')
print('sample breakdown by LC17 class:')
print(pixdf['LC17_name'].value_counts())

print('default rf model with all sample pixels and pixel only data')
in_dir = 'D:/NasaProject/Paraguay/ClassificationModels/RF'
out_dir = 'D:/NasaProject/Paraguay/ClassificationModels/RF'
rfout17 = MulticlassRF(pixdf,out_dir,'crop_nocrop','Permutation',29)

sample breakdown by LC17 class:
Crops-Soybeans       1349
Crops-mix             454
TreePlant             315
Grassland-Natural     309
Trees-Forest          203
NoVeg_Built           202
Shrub                 193
NoVeg_Water           186
Crops-Corn            179
NoVeg_Bare            171
Crops-Rice            171
unknown               168
Crops-Yerba-Mate      160
Trees-disturbed       154
Grassland-Planted     113
Crops-Sugar           105
Crops-Orchard          91
Grass_tree-mix         73
Name: LC17_name, dtype: int64
default rf model with all sample pixels and pixel only data


TypeError: argument of type 'method' is not iterable

In [5]:
##Remove 3/4 of the soy points because they are far overrepresented
pixdf_lessSoy = pixdf[(pixdf['rand']>.9) | (pixdf['SampMethod'] != 'CAN - unverified in GE')]
print('New sample breakdown by LC17 class:')
print(pixdf_lessSoy['LC17_name'].value_counts())
out_dir = 'D:/NasaProject/Paraguay/ClassificationModels/RF/LessSoy'
rfout17 = MulticlassRF(pixdf_lessSoy,out_dir,'All','Permutation',29)

New sample breakdown by LC17 class:
Crops-mix            454
Crops-Soybeans       373
TreePlant            315
Grassland-Natural    309
Trees-Forest         203
NoVeg_Built          202
Shrub                193
NoVeg_Water          186
Crops-Corn           179
NoVeg_Bare           171
Crops-Rice           171
unknown              168
Crops-Yerba-Mate     160
Trees-disturbed      154
Grassland-Planted    113
Crops-Sugar          105
Crops-Orchard         91
Grass_tree-mix        73
Name: LC17_name, dtype: int64


TypeError: argument of type 'method' is not iterable

#### combine pixel and polygon dfs and create rf datasets for points with polygons and those without:

In [12]:
poly_data = pd.read_csv('D:/NasaProject/Paraguay/ClassificationModels/RF/pts_polyData.csv')
#rename column names that also occur in pixel df
poly_data.rename(columns={'area':'areaSeg'}, inplace=True)
all_data = pixdf.merge(poly_data, left_on='OID_', right_on='OID_', how='left')
pd.DataFrame.to_csv(all_data, 'D:/NasaProject/Paraguay/ClassificationModels/RF/pts_polyData_joinCheck.csv', sep=',', na_rep='NaN', index=True)

##### first create dataset for points outside of polygons (here we have no variables to add to the original model)

In [13]:
outsideSeg = all_data[all_data['areaSeg'].isna()]
print(f'of the {all_data.shape[0]} sample points in our dataset, {outsideSeg.shape[0]} are outside of our segmented polygons')
print(outsideSeg['LC17_name'].value_counts())
out_dir = 'D:/NasaProject/Paraguay/ClassificationModels/RF/outside'
rfout17 = MulticlassRF(outsideSeg,out_dir,'All','Permutation',29)

of the 4621 sample points in our dataset, 2843 are outside of our segmented polygons
Crops-mix            352
TreePlant            284
Grassland-Natural    266
Trees-Forest         202
NoVeg_Built          191
Shrub                187
NoVeg_Water          181
Crops-Soybeans       171
Trees-disturbed      153
NoVeg_Bare           146
Crops-Yerba-Mate     127
unknown              114
Grassland-Planted     89
Crops-Orchard         84
Crops-Rice            76
Grass_tree-mix        69
Crops-Sugar           67
Crops-Corn            65
Name: LC17_name, dtype: int64
there are 2843 pts in the full data set
there are 2587 pts in the training set
there are 2470 sample points after removing those without clear class
Out-of-bag score estimate: 0.59
Mean accuracy score: 0.612
Confusion Matrix: predicted  1.0  3.0  7.0  12.0  13.0  31.0  33.0  35.0  37.0  38.0  ...  60.0  \
observed                                                            ...         
1.0         17    4    1     2     0     1     

And model with more balanced soy representation (LessSoy):

In [15]:
all_data_lessSoy = pixdf_lessSoy.merge(poly_data, left_on='OID_', right_on='OID_', how='left')
outsideSeg_lessSoy = all_data_lessSoy[all_data_lessSoy['areaSeg'].isna()]
print(outsideSeg_lessSoy['LC17_name'].value_counts())
out_dir = 'D:/NasaProject/Paraguay/ClassificationModels/RF/outside/LessSoy'
rfout17_lessSoy = MulticlassRF(outsideSeg_lessSoy,out_dir,'All','Permutation',29)

Crops-mix            352
TreePlant            284
Grassland-Natural    266
Trees-Forest         202
NoVeg_Built          191
Shrub                187
NoVeg_Water          181
Trees-disturbed      153
NoVeg_Bare           146
Crops-Yerba-Mate     127
unknown              114
Grassland-Planted     89
Crops-Orchard         84
Crops-Rice            76
Grass_tree-mix        69
Crops-Sugar           67
Crops-Corn            65
Crops-Soybeans        51
Name: LC17_name, dtype: int64
there are 2723 pts in the full data set
there are 2481 pts in the training set
there are 2364 sample points after removing those without clear class
Out-of-bag score estimate: 0.585
Mean accuracy score: 0.592
Confusion Matrix: predicted  1.0  3.0  7.0  12.0  13.0  31.0  33.0  35.0  37.0  38.0  ...  60.0  \
observed                                                            ...         
1.0         17    7    1     2     0     0     0     6     0     0  ...     0   
3.0          2   38    1     1     0     0     0  

##### now create dataset for points inside of polygons (here we want to add some variables first)

In [17]:
#poly_data['AvgU'] = poly_data.apply(lambda x:count([x[c] for c in df.columns if c.endswith('U')]),axis=1)
#TODO: calculate these in pandas as above
poly_data.rename(columns={'areaSeg':'var_areaSeg','AVGU':'var_AVGU','AVGR':'var_AVGR','AVGSTD':'var_AVGSTD','MAXR':'var_MAXR','MINR':'var_MINR','STDU':'var_STDU','MINU':'var_MINU','MAXU':'var_MAXU','rU':'var_RU'}, inplace=True)
polyvars = pixdf.merge(poly_data, left_on='OID_', right_on='OID_', how='left')
withinSeg = polyvars[polyvars['var_areaSeg'] > 0]

print(withinSeg['LC17_name'].value_counts())
out_dir = 'D:/NasaProject/Paraguay/ClassificationModels/RF/within'
rfin17 = MulticlassRF(withinSeg,out_dir,'All','Permutation',29)

Crops-Soybeans       1179
Crops-Corn            114
Crops-mix             103
Crops-Rice             95
unknown                54
Grassland-Natural      43
Crops-Sugar            38
Crops-Yerba-Mate       33
TreePlant              31
NoVeg_Bare             25
Grassland-Planted      24
NoVeg_Built            11
Crops-Orchard           7
Shrub                   6
NoVeg_Water             5
Grass_tree-mix          4
Trees-disturbed         1
Trees-Forest            1
Name: LC17_name, dtype: int64
there are 1778 pts in the full data set
there are 1617 pts in the training set
there are 1568 sample points after removing those without clear class


ValueError: The least populated class in y has only 1 member, which is too few. The minimum number of groups for any class cannot be less than 2.

In [18]:
all_data_lessSoy = pixdf_lessSoy.merge(poly_data, left_on='OID_', right_on='OID_', how='left')
withinSeg_lessSoy = all_data_lessSoy[all_data_lessSoy['var_areaSeg']>0]
print(withinSeg_lessSoy['LC17_name'].value_counts())
out_dir = 'D:/NasaProject/Paraguay/ClassificationModels/RF/within/LessSoy'
rfin17_lessSoy = MulticlassRF(withinSeg_lessSoy,out_dir,'All','Permutation',29)

Crops-Soybeans       322
Crops-Corn           114
Crops-mix            103
Crops-Rice            95
unknown               54
Grassland-Natural     43
Crops-Sugar           38
Crops-Yerba-Mate      33
TreePlant             31
NoVeg_Bare            25
Grassland-Planted     24
NoVeg_Built           11
Crops-Orchard          7
Shrub                  6
NoVeg_Water            5
Grass_tree-mix         4
Trees-disturbed        1
Trees-Forest           1
Name: LC17_name, dtype: int64
there are 921 pts in the full data set
there are 836 pts in the training set
there are 787 sample points after removing those without clear class


ValueError: The least populated class in y has only 1 member, which is too few. The minimum number of groups for any class cannot be less than 2.

In [42]:
#pix_data = pd.read_csv('D:/NasaProject/Paraguay/sampling/samplePts_FINALdfs/RFdf.csv')
pix_data = pd.read_csv('D:/NasaProject/Paraguay/sampling/samplePts_FINALdfs/RFdf_augmented.csv')
#pix_data = pix_data[pix_data['numPix'] < 50]
#poly_data = pd.read_csv('D:/NasaProject/Paraguay/ClassificationModels/RF/pts_polyData.csv')
#rename column names that also occur in pixel df
#poly_data.rename(columns={'area':'areaSeg'}, inplace=True)
#poly_data.rename(columns={'Seg1Dist':'var_Seg1Dist','Seg2Edge':'var_Seg2Edge','Seg3Prob':'var_Seg2Prob'},inplace=True)
#poly_data.rename(columns={'Seg1Dist':'var_Seg1Dist','Seg2Edge':'var_Seg2Edge','Seg3Prob':'var_Seg2Prob','areaSeg':'var_areaSeg','AVGU':'var_AVGU','AVGR':'var_AVGR','AVGSTD':'var_AVGSTD','MAXR':'var_MAXR','MINR':'var_MINR','STDU':'var_STDU','MINU':'var_MINU','MAXU':'var_MAXU','rU':'var_RU', 'DISTS':'var_DISTS'}, inplace=True)
#all_data = pix_data.merge(poly_data, left_on='OID_', right_on='OID_', how='left')
all_data = pix_data.merge(poly_data, left_on='OID_orig', right_on='OID_', how='left')
#all_data.drop(['LC17','LC2'], axis=1, inplace=True)
df_in = pix_data.merge(LUT, left_on='Class', right_on='USE_NAME', how='left')
#df_in = all_data.merge(LUT, left_on='Class', right_on='USE_NAME', how='left')
#withinSeg = df_in[df_in['var_areaSeg'] > 0]
#HO_base0 = df_in[['OID_','areaSeg','Class','LC17','TESTSET20']]
#HO_base = HO_base0[(HO_base0['TESTSET20'] ==1)]
#print(HO_base)
#pd.DataFrame.to_csv(HO_base,'D:/NasaProject/Paraguay/ClassificationModels/RF/HO_base.csv', sep=',', na_rep='NaN', index=False)
out_dir = 'D:/NasaProject/Paraguay/ClassificationModels/RF'
PrepTestTrain(df_in, out_dir, 'All')

  pix_data = pd.read_csv('D:/NasaProject/Paraguay/sampling/samplePts_FINALdfs/RFdf_augmented.csv')


there are 133428 pts in the full data set
there are now 53856 pts in the training set after dropping CAN soy
there are 50673 sample points after removing those without clear class


('D:/NasaProject/Paraguay/ClassificationModels/RF\\All_TRAINING.csv',
 'D:/NasaProject/Paraguay/ClassificationModels/RF\\All_HOLDOUT.csv')

In [37]:
out_dir = 'D:/NasaProject/Paraguay/ClassificationModels/RF'
trainfeatures = 'D:/NasaProject/Paraguay/ClassificationModels/RF/All_TRAINING.csv'
RF = MulticlassRF(trainfeatures, out_dir, 'LC17',None,29)

  df_train = pd.read_csv(trainfeatures)


There are 11810 training features
Out-of-bag score estimate: 0.832
Mean accuracy score: 0.848
Confusion Matrix: predicted  1.0  3.0  7.0  12.0  13.0  31.0  33.0  35.0  37.0  38.0  ...  53.0  \
observed                                                            ...         
1.0          6    0    0     0     0     0     1     1     0     0  ...     0   
3.0          2    3    0     0     0     0     0     0     0     0  ...     0   
7.0          0    0    2     0     0     1     0     0     0     0  ...     0   
12.0         0    0    0     6     2     0     0     0     0     0  ...     0   
13.0         0    0    0     0    14     1     0     1     0     0  ...     0   
31.0         0    0    0     0     0    62     1     2     1     0  ...     0   
33.0         0    0    0     1     2     2    23     2     0     0  ...     0   
35.0         0    0    0     1     0     2     0    23     0     1  ...     0   
37.0         0    0    0     0     0     0     0     0    36     0  ...     0 

In [43]:
holdoutDF = 'D:/NasaProject/Paraguay/ClassificationModels/RF/RFSets/All_HOLDOUT.csv'
HO = get_holdout_scores(holdoutDF, RF[0], out_dir)

675


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  holdout_fields['pred']= holdout_fields_predicted
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  holdout_fields['label']= holdout_labels
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  holdout_fields['OID']=h_IDs


In [44]:
print(HO)

     var_evi2_Max  var_evi2_Min  var_evi2_Amp  var_evi2_Avg  var_evi2_CV  \
0            2870          2195           675          2540          204   
1            5441          3592          1849          4496          656   
2            4734          3366          1368          3998          463   
3            6928          1922          5006          4691         1870   
4            3907          2470          1437          3183          530   
..            ...           ...           ...           ...          ...   
670          4570          3797           773          4194          271   
671          5673          3494          2179          4440          612   
672          4322          3914           408          4153          138   
673          5512          4684           828          5154          309   
674          5066          4496           570          4797          192   

     var_evi2_Std  var_evi2_Jan  var_evi2_Feb  var_evi2_Mar  var_evi2_Apr  \
0         

In [45]:
HO['obs'] = HO['label'].apply(np.int64)
HO['pred'] = HO['pred'].apply(np.int64)
HO_clean = HO[['obs','pred','OID']]
print(HO_clean)

     obs  pred   OID
0     35     3  2222
1     53    35  2453
2     60    35  2463
3     38    33  2464
4     35    35  2472
..   ...   ...   ...
670   65    65  7912
671   65    31  7914
672   65    60  7916
673   80    80  7918
674   65    65  7926

[675 rows x 3 columns]


In [46]:
HO_clean
#HOfinal = HO_clean.merge(HO_base, left_on='OID', right_on='OID_', how='left')
pd.DataFrame.to_csv(HO_clean,'D:/NasaProject/Paraguay/ClassificationModels/RF/HO_Preds_8.csv', sep=',', na_rep='NaN', index=False)
