In [1]:
import os
import sys
from pathlib import Path
import pandas as pd

%matplotlib inline

In [2]:
# Make the project helper package importable; the path is relative to this notebook's directory.
sys.path.append(r"../LUCinSA_helpers")
# Import only the helpers this notebook calls, so their origin is explicit
# (a star import hides where names come from and can silently shadow notebook variables).
from rf import get_class_col, get_confusion_matrix, rf_model

In [3]:
'''
PARAMETERS: modify in notebook_params notebook, then run that notebook and this cell to update here
DO NOT modify this cell
'''

# Restore the stored `basic_config` variable into this kernel (IPython %store magic),
# then echo the entries that are read from it here.
%store -r basic_config
print("Basic Parameters: \n brdf_dir = {} \n gridCell = {} \n index_dir = {} \n home_dir = {}".format(basic_config['brdf_dir'],basic_config['grid_cell'],basic_config['index_dir'],basic_config['home_dir']))
# Restore the stored `classification_params` variable and echo its entries.
# Note the printed labels do not all match the keys: 'samp_pt_file' shows the 'samp_pts' entry
# and 'lc_class' shows the 'lc_mod' entry.
# The 'model_dir' entry is also read by later cells but is not echoed here.
%store -r classification_params
print("Classification_Params: \n model_type = {} \n samp_pt_file = {} \n pix_vars = {} \n pixdf = {} \n model_name = {} \n lc_class = {} \n ranhold = {} \n impmeth = {}".format(classification_params['model_type'],classification_params['samp_pts'],classification_params['pix_vars'],classification_params['pixdf'],classification_params['model_name'],
classification_params['lc_mod'],classification_params['ranhold'],classification_params['impmeth']))
      

Basic Parameters: 
 brdf_dir = /home/sandbox-cel/paraguay_lc/stac/grid/003035/brdf 
 gridCell = 3035 
 index_dir = /home/downspout-cel/paraguay_lc/stac/grids/003035/brdf_ts/ms/evi2 
 home_dir = ~/data
Classification_Params: 
 model_type = RF 
 samp_pt_file = /home/downspout-cel/paraguay_lc/vector/sampleData/SamplePts_Nov2023_ALL.csv 
 pix_vars = /home/downspout-cel/paraguay_lc/vector/ptsgdb_Nov15.csv 
 pixdf = /home/downspout-cel/paraguay_lc/classification/RF/pixdf_lessSoy_Nov.csv 
 model_name = test0Nov 
 lc_class = All 
 ranhold = 29 
 impmeth = Impurity


## View the look up table
The look-up table defines the different LC models (class groupings) used in classification, and translates between numerical map categories and text labels.

In [4]:
# Load the class look-up table without its 'Description' column,
# then show it ordered by the unique class code.
lut = pd.read_csv('../Class_LUT.csv').drop(columns=['Description'])
print(lut.sort_values(by='LC_UNQ'))

    LC_UNQ               USE_NAME          Segmentation  LCTrans  \
0        1                  NoVeg                 NoVeg        1   
1        2             NoVeg_Bare                   NaN        2   
2        3            NoVeg_Built                   NaN        3   
3        7            NoVeg_Water                   NaN        7   
4        9             Mixed-path                   NaN        9   
5       10                Cleared                   NaN       10   
6       11          TreePlant-new                   NaN       11   
7       12      Grassland-Natural                   NaN       15   
8       13      Grassland-Planted  LowVeg_grass_managed       15   
9       15              Grassland          LowVeg_grass       15   
10      17  Grassland-Natural-wet                   NaN       15   
11      18        Mixed-GrassEdge                   NaN       19   
12      19        Mixed-FieldEdge                   NaN       19   
13      20                 LowVeg               

## get sample dataframe:
 pixdf is the combination of the sample point file and variable stack for those points (pix_vars).
 This is created in notebook 6a_RandomForest_VariableDataframe

In [5]:
# Read the sample dataframe and report how many samples fall in each class
# of the selected land-cover model.
pixdf = pd.read_csv(classification_params['pixdf'])
print('sample breakdown by {}:'.format(classification_params['lc_mod']))

# get_class_col gives the class column for this model plus the look-up table to join on.
label_col, new_lut = get_class_col(classification_params['lc_mod'], lut)
name_col = '{}_name'.format(label_col)

if name_col not in pixdf.columns:
    # Class names are not in the dataframe yet: attach them from the look-up table
    # by matching the sample 'Class' to the table's 'USE_NAME'.
    lut_cols = new_lut[['USE_NAME', label_col, name_col]]
    pixdf2 = pixdf.merge(lut_cols, left_on='Class', right_on='USE_NAME', how='left')
    print(pixdf2[name_col].value_counts())
else:
    print(pixdf[name_col].value_counts())

sample breakdown by All:
Mixed-path           3644
Crops-mix            2870
unknown              1645
Mixed-VegEdge        1626
Grassland-Planted     981
Trees-Forest          819
Trees-disturbed       788
TreePlant             743
Crops-Yerba-Mate      709
Grassland-Natural     517
Crops-Rice            496
Crops-Soybeans        325
Shrub                 297
Crops-Sugar           292
NoVeg_Water           264
Crops-Orchard         254
NoVeg_Bare            235
Crops-Corn            231
NoVeg_Built           224
Grass_tree-mix         90
Name: LC17_name, dtype: int64


## (optional): filter dataframe
(if using the same dataset repeatedly, use notebook 6a to filter and print the dataframe as a new pixdf)

by rejoining the pixel dataframe to ptdf, we can get attributes that can be used to filter cases.
e.g. "PurePixel"==X indicates fields that are on the edge and likely to be mixed with another class
"FieldWidth" gives an approximate field size (usually provided if fields are very small, so that small fields can be removed). "rand" is a random number from .00-.99 that can be used to reduce the dataset/stratify categories
"TESTSET10" and "TESTSET20" are flagged if a point is in a 10% or 20% (stratified) holdout group.
"SampMethod" has the sampling method (e.g. ground vs. Google Earth) 

In [None]:
### example ###########################
#pixdf = pixdf[(pixdf['rand']>.5)]
#print(pixdf['LC17_name'].value_counts())

## create rf model
this uses the multiclass RandomForestClassifier method from sklearn.ensemble (code is in ../LUCinSA_helpers/rf.py)

To use a different classification model, change 'lc_mod' in the parameters and rerun
current models = ('All' | 'trans_cats' | 'crop_nocrop' | 'crop_nocrop_medcrop' | 'crop_nocrop_medcrop_tree' | 'veg' | 'cropType' | 'single_X', where X is any unique string in the USE_NAME column, for binary classification of X vs. all else)

In [6]:
# Keys of classification_params, in the positional order rf_model is called with;
# the look-up table is passed as the final argument.
rf_arg_keys = ('pixdf', 'model_dir', 'lc_mod', 'impmeth', 'ranhold', 'model_name')
# rf0 is indexed later in the notebook: rf0[1] supplies the 'pred' and 'label' columns
# for the confusion matrices.
rf0 = rf_model(*(classification_params[key] for key in rf_arg_keys), lut)

class_col = LC17
there are 17092 pts in the full data set
there are 15405 sample points after removing those without clear class
There are 12296 training features
Out-of-bag score estimate: 0.699
Mean accuracy score: 0.78
getting confusion matrix based on LC17...
number of holdout pixels = 3109




## view confusion matrices
Note parameters: (pred_col, obs_col, lut, lc_mod_map, lc_mod_acc, print_cm=False, out_dir=None, model_name=None)
To print cm to csv file, change print_cm to True and provide an out_dir and model_name

In [7]:
# rf0[1] carries the predicted ('pred') and observed ('label') classes used for accuracy assessment.
holdout = rf0[1]


def _holdout_cm(lc_mod_acc):
    """Return the confusion matrix for rf0's predictions, assessed at grouping `lc_mod_acc`.

    print_cm is True, so the matrix is also written out using model_dir and the
    model name 'Nov_default'.
    """
    return get_confusion_matrix(holdout['pred'], holdout['label'], lut,
                                classification_params['lc_mod'], lc_mod_acc,
                                True, classification_params['model_dir'], 'Nov_default')


# Build all matrices first, then print them, so the progress messages from
# get_confusion_matrix appear before the tables.
cm_cropNoCrop = _holdout_cm('crop_nocrop')
cm_cropType = _holdout_cm('cropType')
cm_veg = _holdout_cm('veg')
cm_all = _holdout_cm('All')
#cm_single = get_confusion_matrix(rf0[1]['pred'],rf0[1]['label'],lut,classification_params['lc_mod'],classification_params['lc_mod'],False,None,None)

for cm in (cm_cropNoCrop, cm_cropType, cm_veg, cm_all):
    print(cm)
#print(cm_single)

getting confusion matrix based on LC2...
getting confusion matrix based on LC_crops...
getting confusion matrix based on LC5...
getting confusion matrix based on LC17...
pred_reclass  crop  nocrop   All  correct  sumcol        UA        PA
obs_reclass                                                          
crop           860     222  1082      860    1064  0.794824  0.808271
nocrop         204    1823  2027     1823    2045  0.899359  0.891443
All           1064    2045  3109     3109    3109  1.000000  1.000000
pred_reclass     Corn  Not crop  Rice  Shrub crops  Smallholder mix  Soybeans  \
obs_reclass                                                                     
Corn                6         9     0            1               17        11   
Not crop            0      1823    10           16              170         5   
Rice                0        19    74            0               12         1   
Shrub crops         0        47     0           57               54        

## view variable importance
This can be computed via the Impurity or Permutation method (see sklearn docs) by setting impmeth in rf_model.
The full list is stored in the model directory for further manipulation 

In [None]:
# The variable-importance file is looked up in the model directory, named after the model.
var_imp_file = 'VarImportance_{}.csv'.format(classification_params['model_name'])
var_imp_path = os.path.join(classification_params['model_dir'], var_imp_file)
# The file is read as headerless, so the two columns are named here.
var_imp = pd.read_csv(var_imp_path, header=None, names=['var', 'imp'])
## view 10 most important variables:
var_imp.sort_values(by='imp', ascending=False).head(10)

## To save an html copy of this notebook with all outputs:
(these two cells should be last in notebook)

In [None]:
%%script echo skipping
### comment out above line and run this cell to print output as html

# Output file name: country + notebook label + model name + a fixed 'Tests1' suffix.
# NOTE(review): 'RandomFoest' (here and in the .ipynb name below) looks like a typo for
# 'RandomForest'. The nbconvert argument must match this notebook's real file name on disk,
# so confirm the actual name before changing either string.
out_name = str(basic_config['country']+'6b_RandomFoest_ModelComparisons'+'_model'+str(classification_params['model_name'])+'_'+'Tests1')
# Convert this notebook to HTML in ./Outputs with code inputs hidden (--no-input);
# IPython substitutes $out_name with the Python variable defined above.
!jupyter nbconvert --output-dir='./Outputs' --to html --no-input --ExecutePreprocessor.store_widget_state=True --output=$out_name 6b_RandomFoest_ModelComparisons.ipynb