In [1]:
import os
import sys
from pathlib import Path
import pandas as pd

%matplotlib inline

In [2]:
# Add the helper directory (relative to the notebook's working directory) to the
# module search path so that the `rf` module can be imported.
sys.path.append(r"../LUCinSA_helpers")
# NOTE(review): the star import hides which names come from `rf`; `rf_model`
# (called further down) is presumably one of them -- confirm and consider an
# explicit import.
from rf import *

In [3]:
'''
PARAMETERS: modify in notebook_params notebook, then run that notebook and this cell to update here
DO NOT modify this cell
'''

# Restore the `basic_config` dict persisted with IPython's %store magic
# (per the note above, it is maintained in the notebook_params notebook).
%store -r basic_config
# Echo the restored directories and grid cell so the active settings are visible.
print("Basic Parameters: \n brdf_dir = {} \n gridCell = {} \n index_dir = {} \n out_dir = {}"
      .format(basic_config['brdf_dir'],basic_config['grid_cell'],basic_config['index_dir'],basic_config['out_dir']))
# Restore the classification settings the same way and echo the model type and
# the sample-point file that the cells below read.
%store -r classification_params
print("Classification_Params: \n model_type = {} \n samp_pt_file = {}".format(classification_params['model_type'],classification_params['samp_pts']))

Basic Parameters: 
 brdf_dir = /home/sandbox-cel/paraguay_lc/stac/grid/003045/brdf 
 gridCell = 3045 
 index_dir = /home/downspout-cel/paraguay_lc/stac/grids/003045/brdf_ts/ms/evi2 
 out_dir = /home/klwalker/data/tmp
Classification_Params: 
 model_type = RF 
 samp_pt_file = /home/downspout-cel/paraguay_lc/vector/sampleData/SamplePts_Mar2023_ALL.csv


In [4]:
# Load the class lookup table, discard its 'Description' column, and show the
# remaining columns ordered by the LC22 class code.
lut = pd.read_csv('../Class_LUT.csv').drop(columns=['Description'])
print(lut.sort_values(by='LC22'))

                 USE_NAME  LC22 LC5_name  LC5  LC17          LC17_name  LC3  \
0              NoVeg_Bare     1    NoVeg   10     1         NoVeg_Bare    0   
1             NoVeg_Built     3    NoVeg   10     3        NoVeg_Built    0   
2             NoVeg_Water     7    NoVeg   10     7        NoVeg_Water    0   
4       Grassland-Natural    12   LowVeg   20    12  Grassland-Natural    0   
5       Grassland-Planted    13   LowVeg   20    13  Grassland-Planted    0   
6   Grassland-Natural-wet    17   LowVeg   20    13  Grassland-Natural    0   
14         Crops-Soybeans    31   LowVeg   20    31     Crops-Soybeans    1   
15            Crops-Beans    32   LowVeg   20    35          Crops-mix    1   
16             Crops-Corn    33   LowVeg   20    33         Crops-Corn    1   
17         Crops-Mandioca    34   LowVeg   20    35          Crops-mix    1   
18              Crops-mix    35   LowVeg   20    35          Crops-mix    1   
19          TreePlant-new    36   LowVeg   20    99 

## start with default models with pixel-only data:

In [5]:
# Inputs: the sample-point table named in the classification parameters and the
# per-point pixel variable table.
samp_pts = pd.read_csv(classification_params['samp_pts'])
pix_vars = pd.read_csv('/home/downspout-cel/paraguay_lc/vector/ptsgdb_Sep21.csv')

# Left-join the sample-point attributes onto the pixel variables by point id
# ('OID_'), then discard the 'LC2' column.
pix_data = (
    pix_vars
    .merge(samp_pts, on='OID_', how='left')
    .drop(columns=['LC2'])
)

# Attach the lookup-table columns by matching each point's 'Class' label to the
# lookup table's 'USE_NAME'.
pixdf = pix_data.merge(lut, how='left', left_on='Class', right_on='USE_NAME')

print('sample breakdown by LC17 class:')
print(pixdf['LC17_name'].value_counts())

print('default rf model with all sample pixels and pixel only data')

sample breakdown by LC17 class:
Crops-Soybeans       2965
Crops-mix             459
TreePlant             342
Grassland-Natural     309
Trees-Forest          203
NoVeg_Built           202
Shrub                 193
NoVeg_Water           187
Crops-Rice            184
Crops-Corn            179
NoVeg_Bare            171
unknown               168
Crops-Yerba-Mate      160
Trees-disturbed       154
Grassland-Planted     113
Crops-Sugar           105
Crops-Orchard          91
Grass_tree-mix         73
Name: LC17_name, dtype: int64
default rf model with all sample pixels and pixel only data


## prep pixel datasets by reducing sample
### First by sampling method (reducing dominant CAN soy pts)
### can also do by field size, etc.

In [6]:
#### model_name = pixdf_25Soy
##   Thins out the over-represented soy sample: every point whose SampMethod is not
##   'CAN - unverified in GE' is kept, and a CAN-unverified point is kept only when
##   its 'rand' value is greater than 0.9.
##   NOTE(review): if 'rand' is uniform on [0, 1) this keeps roughly 10% of the CAN
##   points (not 25% as the model name suggests) -- confirm the intended threshold.
pixdf_25Soy = pixdf.loc[
    (pixdf['SampMethod'] != 'CAN - unverified in GE') | (pixdf['rand'] > .9)
]

print('25Soy sample breakdown by LC17 class:')
print(pixdf_25Soy['LC17_name'].value_counts())

# Save the reduced sample (index included) in the model directory.
pixdf_path = os.path.join(classification_params['model_dir'], 'pixdf_25Soy.csv')
pixdf_25Soy.to_csv(pixdf_path)

25Soy sample breakdown by LC17 class:
Crops-Soybeans       525
Crops-mix            459
TreePlant            342
Grassland-Natural    309
Trees-Forest         203
NoVeg_Built          202
Shrub                193
NoVeg_Water          187
Crops-Rice           184
Crops-Corn           179
NoVeg_Bare           171
unknown              168
Crops-Yerba-Mate     160
Trees-disturbed      154
Grassland-Planted    113
Crops-Sugar          105
Crops-Orchard         91
Grass_tree-mix        73
Name: LC17_name, dtype: int64


In [7]:
#### model_name = pixdf_lessSoy
# Drops every remaining point whose SampMethod is 'CAN - unverified in GE',
# i.e. all soy points that are not verified in Google Earth.
pixdf_lessSoy = pixdf_25Soy.loc[pixdf_25Soy['SampMethod'] != 'CAN - unverified in GE']

print('there are now {} pts in the training set after dropping CAN soy'.format(len(pixdf_lessSoy)))
print('LessSoy sample breakdown by LC17 class:')
print(pixdf_lessSoy['LC17_name'].value_counts())

# Save the reduced sample (index included) in the model directory.
pixdf_path = os.path.join(classification_params['model_dir'], 'pixdf_lessSoy.csv')
pixdf_lessSoy.to_csv(pixdf_path)

there are now 3573 pts in the training set after dropping CAN soy
LessSoy sample breakdown by LC17 class:
Crops-mix            459
TreePlant            342
Grassland-Natural    309
Crops-Soybeans       257
Trees-Forest         203
NoVeg_Built          202
Shrub                193
NoVeg_Water          187
Crops-Rice           184
Crops-Corn           179
NoVeg_Bare           171
unknown              168
Crops-Yerba-Mate     160
Trees-disturbed      154
Grassland-Planted    113
Crops-Sugar          105
Crops-Orchard         91
Grass_tree-mix        73
Name: LC17_name, dtype: int64


## If polygons are available, combine the pixel and polygon dataframes and create separate RF datasets for points inside polygons and points outside them:

In [None]:
# Read the polygon (segment) table; its 'area' column is renamed to 'areaSeg'
# because a column of that name also occurs in the pixel dataframe.
poly_data = pd.read_csv(classification_params['samp_poly']).rename(columns={'area': 'areaSeg'})

# Left-join polygon attributes onto every sample point; points without a
# matching polygon row get NaN in the polygon columns.
all_data = pixdf.merge(poly_data, on='OID_', how='left')

# Write the joined table out so the join can be checked by hand.
polypixdf_path = os.path.join(classification_params['model_dir'], 'pts_polyData_joinCheck.csv')
all_data.to_csv(polypixdf_path, sep=',', na_rep='NaN', index=True)

### first create dataset for points outside of polygons (here we have no variables to add to the original model)

In [None]:
# Points with no polygon match have a missing 'areaSeg' after the left join;
# those are the points treated as lying outside the segmented polygons.
outsideSeg = all_data.loc[all_data['areaSeg'].isna()]
print(f'of the {all_data.shape[0]} sample points in our dataset, {outsideSeg.shape[0]} are outside of our segmented polygons')
print(outsideSeg['LC17_name'].value_counts())

# Fit the model on the outside-polygon points, passing the 'outside'
# sub-directory of the model directory as the output location.
# NOTE(review): the meaning of 'All', 'Permutation' and 29 is defined by
# rf_model, which is not visible in this notebook -- confirm.
out_dir = os.path.join(classification_params['model_dir'], 'outside')
rfout17 = rf_model(outsideSeg, out_dir, 'All', 'Permutation', 29, 'Fullsamp')

And for model with more balanced soy representation (25Soy):

In [None]:
# Build the "outside polygons" dataset from the soy-thinned sample (pixdf_25Soy):
# left-join the polygon attributes and keep the points with no polygon match
# (missing 'areaSeg').
all_data_25Soy = pixdf_25Soy.merge(poly_data, left_on='OID_', right_on='OID_', how='left')
outsideSeg_25Soy = all_data_25Soy[all_data_25Soy['areaSeg'].isna()]
print(outsideSeg_25Soy['LC17_name'].value_counts())
out_dir = os.path.join(classification_params['model_dir'],'outside')
# Fix: the original call passed `outsideSeg_lessSoy`, which is never defined in
# this notebook (NameError on a fresh kernel). The dataset built in this cell is
# the 25Soy one, so it is passed here and the run is labelled accordingly.
# NOTE(review): the last argument is presumably a free-form model name -- confirm
# against rf_model before relying on the '25Soy' label.
rfout17_25Soy = rf_model(outsideSeg_25Soy,out_dir,'All','Permutation',29,'25Soy')
# Keep the original result name available for any later cell that refers to it.
rfout17_lessSoy = rfout17_25Soy

#### now create dataset for points inside of polygons (here we want to add some variables first)

In [None]:
#poly_data['AvgU'] = poly_data.apply(lambda x:count([x[c] for c in df.columns if c.endswith('U')]),axis=1)
#TODO: calculate these in pandas as above

# Prefix the polygon columns with 'var_'.
# The rename is done in place on purpose: the following LessSoy cell merges
# `poly_data` and expects the 'var_' column names.
poly_data.rename(
    columns={
        'areaSeg': 'var_areaSeg',
        'AVGU': 'var_AVGU',
        'AVGR': 'var_AVGR',
        'AVGSTD': 'var_AVGSTD',
        'MAXR': 'var_MAXR',
        'MINR': 'var_MINR',
        'STDU': 'var_STDU',
        'MINU': 'var_MINU',
        'MAXU': 'var_MAXU',
        'rU': 'var_RU',
    },
    inplace=True,
)

# Join the polygon variables onto the full sample and keep the points that fall
# in a polygon with positive area.
polyvars = pixdf.merge(poly_data, on='OID_', how='left')
withinSeg = polyvars.loc[polyvars['var_areaSeg'] > 0]

print(withinSeg['LC17_name'].value_counts())

# Fit the model on the inside-polygon points, passing the 'within'
# sub-directory of the model directory as the output location.
out_dir = os.path.join(classification_params['model_dir'], 'within')
rfin17 = rf_model(withinSeg, out_dir, 'All', 'Permutation', 29, 'Fullsamp')

In [None]:
# Same inside-polygon dataset, built from the LessSoy sample: join the
# ('var_'-prefixed) polygon variables and keep points in a polygon with
# positive area.
all_data_lessSoy = pixdf_lessSoy.merge(poly_data, on='OID_', how='left')
withinSeg_lessSoy = all_data_lessSoy.loc[all_data_lessSoy['var_areaSeg'] > 0]
print(withinSeg_lessSoy['LC17_name'].value_counts())

# Fit the model on these points, passing the 'within' sub-directory of the
# model directory as the output location and 'LessSoy' as the run label.
out_dir = os.path.join(classification_params['model_dir'], 'within')
rfin17_lessSoy = rf_model(withinSeg_lessSoy, out_dir, 'All', 'Permutation', 29, 'LessSoy')

# run test model

In [9]:
# Run the test model on the LessSoy sample CSV written by the pixdf_lessSoy cell.
# The path is built explicitly instead of reusing the shared `pixdf_path`
# variable, which is reassigned by more than one cell and therefore depends on
# which cell happened to run last. The dead expression statement
# `classification_params['model_dir']` (not the last line of the cell, so it
# displayed nothing) has been removed.
lessSoy_csv = os.path.join(classification_params['model_dir'],'pixdf_lessSoy.csv')
# NOTE(review): unlike the other cells, rf_model receives a CSV path here rather
# than a DataFrame; the recorded output suggests it accepts both -- confirm.
rf = rf_model(lessSoy_csv,classification_params['model_dir'],'All','Inference',29,'LessSoy_test')

there are 3573 pts in the full data set
there are 3382 sample points after removing those without clear class
There are 2697 training features
Out-of-bag score estimate: 0.617
Mean accuracy score: 0.704
Confusion Matrix: predicted  1.0  3.0  7.0  12.0  31.0  33.0  35.0  37.0  38.0  52.0  60.0  \
observed                                                                   
1.0          1    0    0     0     0     0     0     0     0     0     0   
3.0          0    2    0     0     0     0     0     0     0     0     0   
7.0          0    0    1     0     0     0     0     0     0     0     0   
12.0         0    0    0     0     0     0     2     0     0     0     0   
13.0         0    0    0     1     0     0     0     0     0     0     0   
31.0         0    0    0     0     2     0     0     0     0     0     0   
33.0         0    0    0     0     0     1     0     0     0     0     0   
35.0         0    0    0     0     0     0     4     0     0     0     0   
37.0         0    0

