In [None]:
import os
import sys
from pathlib import Path
import pandas as pd

%matplotlib inline

In [None]:
# Make the helper package importable. The relative path is resolved against the
# kernel's current working directory.
sys.path.append(r"../LUCinSA_helpers")
# Wildcard import from rf.py. Presumably this is where rf_model, get_confusion_matrix
# and build_weighted_accuracy_table (called further down) come from -- TODO confirm
# against rf.py and consider importing those names explicitly.
from rf import *

In [None]:
'''
PARAMETERS: modify in notebook_params notebook, then run that notebook and this cell to update here
DO NOT modify this cell
'''

# Restore the `basic_config` variable previously saved with %store.
%store -r basic_config
# Echo the two basic settings used here. Note the label says "filter_year param"
# but the key that is actually read is 'filter_yr'.
print("Basic Parameters: \n time-series data is in (smooth_dir): {} \n"
      " modelling year is (filter_year param): {} (this is first year if season spans two years)"
      .format(basic_config['smooth_dir'], basic_config['filter_yr']))
# Restore the `classification_params` variable previously saved with %store.
%store -r classification_params

# Echo the classification settings. The 13 placeholders are filled positionally;
# the 8th value comes from basic_config['ptfile'], all others from classification_params.
# Note the "lc_class" label prints classification_params['lc_mod'].
print("Classification_Params: \n" 
      " modelling mode is {} \n"
      " model_type = {} \n"
      " output files are saved to (model_dir): {} \n" 
      " shared input files are in (main_model_dir): {} \n"
      " sample_model = {} \n feature_model = {} \n model_name = {} \n"
      " the full sample pt file: {} \n"
      " the full sample dataframe with the feature model applied: {} \n"
      " the subset pt file based on the sample model: {} \n"
      " lc_class = {} \n ranhold = {} \n impmeth = {}"
      .format(classification_params['model_mode'],classification_params['model_type'],classification_params['model_dir'],
              classification_params['main_model_dir'],classification_params['sample_model'],classification_params['feature_model'],
              classification_params['model_name'],basic_config['ptfile'],classification_params['samp_pix_vars'],classification_params['samp_pts'],
              classification_params['lc_mod'],classification_params['ranhold'],classification_params['impmeth']))
    

#### Set new variables here for temp model testing: -- SKIP if keeping original model

In [None]:
## Temporary overrides for model testing (skip this cell to keep the stored model).
feature_model = "base4NoPoly"
## Sample model options currently: bal400mix1 | bal400mix2 | bal400mix3
sample_model = 'base1000'
#sample_model = "bal400mix8"

## Everything below is derived from the two names chosen above.
## NOTE: this overwrites entries of the restored classification_params in place.
classification_params.update({
    'feature_model': feature_model,
    'sample_model': sample_model,
    'model_name': f'{feature_model}_{sample_model}',
    'samp_pix_vars': f"{classification_params['model_dir']}/ptsgdb_{feature_model}.csv",
    'samp_pts': f'/home/downspout-cel/paraguay_lc/classification/RF/sample_dfs/{sample_model}.csv',
})

status_msg = 'Now working with sample_model: {} \n New output model will be named: {}'
print(status_msg.format(classification_params['sample_model'],
                        classification_params['model_name']))

## Merge dataframes

In [None]:
# Look-up table read from the repository root.
lut = pd.read_csv('../Class_LUT.csv')
#print(lut.sort_values('LC_UNQ')[['LC_UNQ','USE_NAME','LC25','LC25_name']])

# Sample points for the current sample model.
samp_pts = pd.read_csv(classification_params['samp_pts'])
print(list(samp_pts.columns))
#if mod_name == "base1000":
#    samp_pts.rename(columns = {"Unnamed: 0": 'OID_'}, inplace = True)

# Per-point variables for the current feature model.
pix_vars = pd.read_csv(classification_params['samp_pix_vars'])
#print(pix_vars.columns.tolist())

# Inner join on OID_: only points present in both tables are kept.
pix_data = pd.merge(samp_pts, pix_vars, how='inner', left_on='OID_', right_on='OID_')
print('sample breakdown by LC25 class:')
print(pix_data['LC25_name'].value_counts())

# In production mode the merged table is written to disk and 'pixdf' holds its path;
# in every other mode 'pixdf' holds the DataFrame itself.
if classification_params['model_mode'] != 'production':
    classification_params["pixdf"] = pix_data
    #print(classification_params['pixdf'].columns.tolist())
else:
    #pixdf = pix_data.merge(lut, left_on='Class', right_on='USE_NAME', how='left')
    pixdf_path = os.path.join(
        classification_params['model_dir'],
        'pixdf_{}.csv'.format(classification_params['model_name']),
    )
    pix_data.to_csv(pixdf_path)
    classification_params["pixdf"] = pixdf_path

## View the look up table
These are the different LC_models to group things in classification and to translate between numerical map categories and text labels

In [None]:
# Re-read the look-up table and leave out the free-text Description column.
lut = pd.read_csv('../Class_LUT.csv').drop(columns=['Description'])

print(lut.sort_values(by='LC_UNQ'))

## get sample dataframe:
 pixdf is the combination of the sample point file and variable stack for those points (pix_vars).
 This is created in notebooks 6a and 6b.

In [None]:
# DISABLED CELL: the whole body below is a single string literal, so none of it runs;
# the trailing ';' suppresses the string from being echoed as cell output.
# The disabled code reads classification_params['pixdf'] with pd.read_csv, i.e. it
# expects a file path there.
"""print(classification_params['pixdf'])
pixdf = pd.read_csv(classification_params['pixdf'])
#pixdf = pixdf.dropna(inplace = True)
#print(pixdf)
#print(pixdf.isnull().any().any())
print('sample breakdown by {}:'.format(classification_params['lc_mod']))
label_col, new_lut = get_class_col(classification_params['lc_mod'], lut)
if '{}_name'.format(label_col) in pixdf.columns:
    print(pixdf['{}_name'.format(label_col)].value_counts())
else:
    pixdf2 = pixdf.merge(new_lut[['USE_NAME','{}'.format(label_col),'{}_name'.format(label_col)]], left_on='Class', right_on='USE_NAME', how='left')
    print(pixdf2['{}_name'.format(label_col)].value_counts())""";

## create rf model
this uses the multiclass RandomForestClassifier method from sklearn.ensemble (code is in ../LUCinSA_helpers/rf.py)

To use a different classification model, change 'lc_mod' in the parameters and rerun
Current models: 'All' | 'trans_cats' | 'crop_nocrop' | 'crop_nocrop_medcrop' | 'crop_nocrop_medcrop_tree' | 'veg' | 'cropType', or 'single_X' (where X is any unique string in the USE_NAME column) for binary classification of X vs. all else.

In [None]:
# Positional arguments for rf_model, in the order the call expects them.
rf_inputs = (
    classification_params['pixdf'],
    classification_params['model_dir'],
    classification_params['lc_mod'],
    classification_params['impmeth'],
    classification_params['ranhold'],
    classification_params['model_name'],
    lut,
    classification_params['feature_model'],
    classification_params['feature_mod_dict'],
)
# rf0 is indexed below; rf0[1] provides the 'pred' and 'label' columns.
rf0 = rf_model(*rf_inputs)

## view confusion matrices
Note parameters: (pred_col, obs_col, lut, lc_mod_map, lc_mod_acc, print_cm=False, out_dir=None, model_name=None)
To print cm to csv file, change print_cm to True and provide an out_dir and model_name

In [None]:
# Note: if running build_weighted_accuracy_table below, these will be printed to file within that.

def _confusion_matrix_for(lc_mod_acc):
    """Call get_confusion_matrix on the rf0 predictions for one `lc_mod_acc` grouping.

    Predicted and observed labels come from rf0[1]['pred'] and rf0[1]['label'];
    print_cm is False, so out_dir and model_name are passed through unchanged.
    """
    return get_confusion_matrix(
        rf0[1]['pred'],
        rf0[1]['label'],
        lut,
        classification_params['lc_mod'],
        lc_mod_acc,
        print_cm=False,
        out_dir=classification_params['model_dir'],
        model_name=classification_params['model_name'],
    )


cm_cropNoCrop = _confusion_matrix_for('cropNoCrop')
cm_cropType = _confusion_matrix_for('cropType')
cm_veg = _confusion_matrix_for('veg')
cm_all = _confusion_matrix_for('all')
#cm_single = get_confusion_matrix(rf0[1]['pred'],rf0[1]['label'],lut,classification_params['lc_mod'],classification_params['lc_mod'],False,classification_params['model_dir'],None)

print(cm_cropNoCrop)
#print(cm_cropType)
#print(cm_veg)
#print(cm_all)
#print(cm_single)


## view variable importance
This can be computed via the Impurity or Permutation method (see sklearn docs) by setting impmeth in rf_model.
The full list is stored in the model directory for further manipulation 

In [None]:
# The importance file is named after the model and stored in the model directory.
var_imp_file = 'VarImportance_{}.csv'.format(classification_params['model_name'])
var_imp_path = os.path.join(classification_params['model_dir'], var_imp_file)
# The file is read without a header row; the two columns are named here.
var_imp = pd.read_csv(var_imp_path, header=None, names=['var', 'imp'])
## view 10 most important variables:
var_imp.sort_values(by='imp', ascending=False).head(10)

## Build weighted accuracy matrix for model selection / optimization

In [None]:
# `model_name` and `out_dir` are reused by the table-building cell further down.
model_name = classification_params['model_name']
out_dir = os.path.join(classification_params['local_dir'], 'cms')

wacc = build_weighted_accuracy_table(
    out_dir,
    model_name,
    rf0[1],
    classification_params["pixdf"],
    lut,
)
print(wacc.head(10))

In [None]:
## use this to remove certain rows from the dfs that you do not want anymore
## DISABLED: the body below is a string literal, so nothing is executed. Because it is
## the last expression in the cell (no trailing ';'), the string itself is echoed as output.
## When enabled, it would drop the row with index label `delrow` from each
## <type>_metrics.csv and overwrite the file in place.
## NOTE(review): the paths are hard-coded to one user's home directory.

"""
types = ["cropNoCrop", "cropType", "veg", "all"]

delrow = 0
for i in types:
    mat = pd.read_csv("/home/ryanashraf/LUCinSA_helpers/testing/metrics/{}_metrics.csv".format(i), index_col = 0)
    
    mat = mat.drop([delrow])
    print(mat)
    mat.to_csv("/home/ryanashraf/LUCinSA_helpers/testing/metrics/{}_metrics.csv".format(i))
"""


In [None]:
## use this to remove certain rows from the df that you do not want anymore
## DISABLED: the body below is a string literal, so nothing is executed. Because it is
## the last expression in the cell (no trailing ';'), the string itself is echoed as output.
## When enabled, it would drop the row with index label `delrow` from
## overall_metrics.csv and overwrite the file in place.
## NOTE(review): the path is hard-coded to one user's home directory.

"""
delrow = 16
mat = pd.read_csv("/home/ryanashraf/LUCinSA_helpers/testing/metrics/overall_metrics.csv", index_col = 0)
mat = mat.drop([delrow])
print(mat)
mat.to_csv("/home/ryanashraf/LUCinSA_helpers/testing/metrics/overall_metrics.csv")
"""

## Make some tables that organize the data cleanly

In [None]:
## Organize all of the models that exclude poly_pred variables

key_word = "Max_no_pp_"
model_name = classification_params['model_name']

# NOTE: `out_dir` must already exist in the kernel; it is set in the
# weighted-accuracy cell earlier in this notebook.
stored = pd.read_csv(os.path.join(out_dir, 'metrics', 'overall_metrics.csv'), index_col=0)

# Compare on a string view of the Model column so missing / non-string entries
# simply fail to match instead of raising.
model_labels = stored["Model"].astype(str)
is_current_model = model_labels == "Max_{}".format(model_name)
is_no_pp_model = model_labels.str.startswith(key_word)

# A single boolean mask keeps each row at most once (a row matching both
# conditions is no longer duplicated) and preserves the original columns,
# dtypes and row order even when nothing matches.
tab = stored.loc[is_current_model | is_no_pp_model].copy()
print(tab)


In [None]:
## 




In [None]:
## some cool graphs of the data you could have compiled








## To save an html copy of this notebook with all outputs:
(these two cells should be last in notebook)

In [None]:
%%script echo skipping
### comment out above line and run this cell to print output as html

out_name = str(basic_config['country']+'6c_RandomFoest_ModelComparisons'+'_model'+str(classification_params['model_name'])+'basic_config['filter_yr'])
!jupyter nbconvert --output-dir='./Outputs' --to html --no-input --ExecutePreprocessor.store_widget_state=True --output=$out_name 6b_RandomFoest_ModelComparisons.ipynb