In [None]:
import os
import sys
from pathlib import Path
import pandas as pd

%matplotlib inline

In [None]:
sys.path.append(r"../LUCinSA_helpers")
from rf import *

In [None]:
'''
PARAMETERS: modify in notebook_params notebook, then run that notebook and this cell to update here
DO NOT modify this cell
'''

# Restore the `basic_config` dict that was saved with IPython's %store magic.
%store -r basic_config
# Echo the two basic settings shown here: the 'smooth_dir' and 'filter_yr' entries.
print("Basic Parameters: \n time-series data is in (smooth_dir): {} \n"
      " modelling year is (filter_year param): {} (this is first year if season spans two years)"
      .format(basic_config['smooth_dir'], basic_config['filter_yr']))
# Restore the `classification_params` dict the same way.
%store -r classification_params
# Echo the classification settings. Some printed labels differ from the keys they show:
#   samp_pt_file -> basic_config['ptfile']
#   pix_vars     -> classification_params['samp_pix_vars']
#   lc_class     -> classification_params['lc_mod']
print("Classification_Params: \n" 
      " temp output files are saved to (local_model_dir): {} \n" 
      " shared modelling files are in (main_model_dir): {} \n" 
      " feature_model = {} \n sample_model = {} \n model_name = {} \n" 
      " model_type = {} \n samp_pt_file = {} \n pix_vars = {} \n pixdf = {} \n lc_class = {} \n ranhold = {} \n impmeth = {}"
      .format(classification_params['local_model_dir'],classification_params['main_model_dir'],
              classification_params['feature_model'],classification_params['sample_model'],classification_params['model_name'],
              classification_params['model_type'],basic_config['ptfile'],
              classification_params['samp_pix_vars'],classification_params['pixdf'],classification_params['lc_mod'],
              classification_params['ranhold'],classification_params['impmeth']))
      

## View the look up table
These are the different LC_models used to group classes for classification and to translate between numerical map categories and text labels.

In [None]:
# Load the class look-up table without its 'Description' column,
# then show it ordered by the 'LC_UNQ' column.
lut = pd.read_csv('../Class_LUT.csv').drop(columns=['Description'])
print(lut.sort_values(by='LC_UNQ'))

## get sample dataframe:
 pixdf is the combination of the sample point file and variable stack for those points (pix_vars).
 This is created in notebooks 6a and 6b.

In [None]:
# Read the sample dataframe and print the number of samples per class name
# for the land-cover model selected by classification_params['lc_mod'].
pixdf = pd.read_csv(classification_params['pixdf'])
print('sample breakdown by {}:'.format(classification_params['lc_mod']))
label_col, new_lut = get_class_col(classification_params['lc_mod'], lut)
name_col = '{}_name'.format(label_col)
if name_col not in pixdf.columns:
    # The name column is missing from the samples: bring it in from the look-up
    # table by matching the sample 'Class' column to the table's 'USE_NAME' column.
    lut_cols = ['USE_NAME', '{}'.format(label_col), name_col]
    pixdf2 = pixdf.merge(new_lut[lut_cols], left_on='Class', right_on='USE_NAME', how='left')
    print(pixdf2[name_col].value_counts())
else:
    print(pixdf[name_col].value_counts())

## create rf model
this uses the multiclass RandomForestClassifier method from sklearn.ensemble (code is in ../LUCinSA_helpers/rf.py)

To use a different classification model, change 'lc_mod' in the parameters and rerun
Current models: 'All' | 'trans_cats' | 'crop_nocrop' | 'crop_nocrop_medcrop' | 'crop_nocrop_medcrop_tree' | 'veg' | 'cropType', or 'single_X' (where X is any unique string in the USE_NAME column) for binary classification of X vs. all else.

In [None]:
# Build the random forest model with rf_model (from ../LUCinSA_helpers/rf.py).
# The positional arguments are taken from classification_params in exactly this
# order, followed by the look-up table.
rf_arg_keys = ('pixdf', 'local_model_dir', 'lc_mod', 'impmeth', 'ranhold', 'model_name')
rf_args = [classification_params[key] for key in rf_arg_keys]
rf0 = rf_model(*rf_args, lut)

## view confusion matrices
Note parameters: (pred_col, obs_col, lut, lc_mod_map, lc_mod_acc, print_cm=False, out_dir=None, model_name=None)
To print the confusion matrix to a csv file, change print_cm to True and provide an out_dir and model_name.

In [None]:
# Confusion matrices for the model built above, scored at several class groupings.
# Any csv output is labelled with the configured model name, so it matches the
# name the model was trained under instead of a hard-coded label.
cm_model_name = classification_params['model_name']
cm_out_dir = classification_params['local_model_dir']


def _confusion_matrix_for(lc_mod_acc, print_cm=False):
    """Return the confusion matrix of rf0's predictions at one class grouping.

    Parameters
    ----------
    lc_mod_acc : str
        Class grouping used to assess accuracy (e.g. 'crop_nocrop', 'veg', 'All').
    print_cm : bool
        When True, the matrix is also printed to a csv file using cm_out_dir and
        cm_model_name.

    Returns
    -------
    Whatever get_confusion_matrix returns for these inputs.
    """
    return get_confusion_matrix(rf0[1]['pred'], rf0[1]['label'], lut,
                                classification_params['lc_mod'], lc_mod_acc,
                                print_cm, cm_out_dir, cm_model_name)


cm_cropNoCrop = _confusion_matrix_for('crop_nocrop', print_cm=True)
cm_cropType = _confusion_matrix_for('cropType', print_cm=True)
cm_veg = _confusion_matrix_for('veg')
cm_all = _confusion_matrix_for('All')
# For a 'single_X' model, assess at the model's own grouping:
#   cm_single = _confusion_matrix_for(classification_params['lc_mod'])

print(cm_cropNoCrop)
print(cm_cropType)
print(cm_veg)
print(cm_all)

## view variable importance
This can be computed via the Impurity or Permutation method (see sklearn docs) by setting impmeth in rf_model.
The full list is stored in the model directory for further manipulation.

In [None]:
# The importance table for this model is a csv named after the model,
# stored in the local model directory.
var_imp_path = os.path.join(
    classification_params['local_model_dir'],
    'VarImportance_{}.csv'.format(classification_params['model_name']),
)
# The file is read without a header row; the two columns are named here.
var_imp = pd.read_csv(var_imp_path, header=None, names=['var', 'imp'])
## view 10 most important variables:
var_imp.sort_values(by='imp', ascending=False).head(10)

## To save an html copy of this notebook with all outputs:
(these two cells should be last in notebook)

In [None]:
%%script echo skipping
### comment out above line and run this cell to print output as html

out_name = str(basic_config['country']+'6c_RandomFoest_ModelComparisons'+'_model'+str(classification_params['model_name'])+'basic_config['filter_yr'])
!jupyter nbconvert --output-dir='./Outputs' --to html --no-input --ExecutePreprocessor.store_widget_state=True --output=$out_name 6b_RandomFoest_ModelComparisons.ipynb