In [None]:
import json
import os
import sys
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

%matplotlib inline

In [None]:
sys.path.append(r"../LUCinSA_helpers")
from rf import *

In [None]:
'''
PARAMETERS: modify in notebook_params notebook, then run that notebook and this cell to update here
DO NOT modify this cell
'''

# Restore the `basic_config` dict that was persisted with IPython's %store.
%store -r basic_config
print("Basic Parameters: \n time-series data is in (smooth_dir): {} \n"
      " modelling year is (filter_year param): {} (this is first year if season spans two years)"
      .format(basic_config['smooth_dir'], basic_config['filter_yr']))
# Restore the `classification_params` dict the same way.
%store -r classification_params

# Echo the active classification settings. Note that the value printed as
# "lc_class" is classification_params['lc_mod'] and "the full sample pt file"
# is basic_config['ptfile'].
print("Classification_Params: \n" 
      " modelling mode is {} \n"
      " model_type = {} \n"
      " output files are saved to (model_dir): {} \n" 
      " shared input files are in (main_model_dir): {} \n"
      " sample_model = {} \n feature_model = {} \n model_name = {} \n"
      " the full sample pt file: {} \n"
      " the full sample dataframe with the feature model applied: {} \n"
      " the subset pt file based on the sample model: {} \n"
      " % of the sample heldout for the confusion matrices: {} \n"
      " lc_class = {} \n ranhold = {} \n impmeth = {}"
      .format(classification_params['model_mode'],classification_params['model_type'],classification_params['model_dir'],
              classification_params['main_model_dir'],classification_params['sample_model'],classification_params['feature_model'],
              classification_params['model_name'],basic_config['ptfile'],classification_params['samp_pix_vars'],
              classification_params['samp_pts'],classification_params['ho_thresh'],
              classification_params['lc_mod'],classification_params['ranhold'],classification_params['impmeth']))
    

In [None]:
def set_variables(feature_model, sample_model, year):
    """Re-point the shared ``classification_params`` dict at another model setup.

    A quicker alternative to editing the notebook parameters: the feature
    model, sample model, derived model name and the two input CSV paths are
    overwritten in place on the module-level ``classification_params``.

    Parameters
    ----------
    feature_model : str
        Feature-model name; used in the model name and the feature CSV path.
    sample_model : str
        Sample-model name; used in the model name and the sample-point CSV path.
    year : int or str
        Year suffix inserted into both input file names.

    Returns
    -------
    dict
        The same (mutated) ``classification_params`` object.
    """
    classification_params['feature_model'] = feature_model
    classification_params['sample_model'] = sample_model
    classification_params['model_name'] = f'{feature_model}_{sample_model}'
    classification_params['samp_pix_vars'] = f'/home/downspout-cel/paraguay_lc/vector/pts_training/features/ptsfeats_{feature_model}_{year}.csv'
    classification_params["samp_pts"] = f'/home/downspout-cel/paraguay_lc/vector/pts_training/pt_subsets/{sample_model}_{year}.csv'
    print(f"Now working with sample_model:{classification_params['sample_model']} \n New output model will be named:{classification_params['model_name']}")
    print('inputs are coming from: \n    samp_df:{} \n feature_df: {} \n'.format(classification_params["samp_pts"],classification_params['samp_pix_vars']))

    return classification_params

def apply_smalls(pixdf,lut,outpath=None):
    """Add the ``smlhld_1ha`` and ``smlhd_halfha`` 0/1 flag columns to ``pixdf``.

    A row is flagged when any of the following holds (evaluated in this order):
      * ``var_poly_area`` (only if that column exists) is below the size limit,
        ``LC2 == 30`` and ``LC25 < 40``;
      * ``Width`` is at most the size limit and ``LC2 == 30``;
      * ``LC25`` is one of the listed mixed/small crop labels.
    The size limit is 100 for ``smlhld_1ha`` and 50 for ``smlhd_halfha``.

    Parameters
    ----------
    pixdf : pandas.DataFrame
        Sample rows; needs ``Width``, ``LC25`` and either ``LC2`` or ``LC_UNQ``.
    lut : pandas.DataFrame
        Look-up table with ``LC_UNQ`` and ``LC2``; only used when ``pixdf``
        has no ``LC2`` column.
    outpath : str, optional
        If truthy, the resulting frame is also written there as CSV.

    Returns
    -------
    pandas.DataFrame
        The frame with both flag columns added.
    """
    small_crop_labels = ['Crops-mix','Crops-Mandioca','Crops-Horticulture','Crops-Sesame']

    if 'LC2' not in pixdf.columns:
        pixdf = pixdf.merge(lut[['LC_UNQ','LC2']],on='LC_UNQ',how='left')

    use_poly_area = 'var_poly_area' in pixdf.columns

    def _flag_row(row, limit):
        # The order of the tests (and their short-circuiting) is deliberate:
        # LC25 is only compared numerically once the area and LC2 tests passed.
        if use_poly_area:
            if (row['var_poly_area'] < limit) and (row['LC2'] == 30) and (row['LC25'] < 40):
                return 1
        if (row['Width'] <= limit) and (row['LC2'] == 30):
            return 1
        if row['LC25'] in small_crop_labels:
            return 1
        return 0

    ### <=1 hectare
    pixdf['smlhld_1ha'] = pixdf.apply(lambda row: _flag_row(row, 100), axis=1)
    count_1ha = pixdf['smlhld_1ha'].sum()
    print(f'{count_1ha} of the sample points are small fields < 100 m across')

    ### <= .5 hectare
    pixdf['smlhd_halfha'] = pixdf.apply(lambda row: _flag_row(row, 50), axis=1)
    count_halfha = pixdf['smlhd_halfha'].sum()
    print(f'{count_halfha} of the sample points are very small fields < 50 m across')

    if outpath:
        pixdf.to_csv(outpath)

    return pixdf

def append_vars_to_samp(lut,pt_path, var_path,out_path):
    """Join sample points with their per-pixel feature variables and save as CSV.

    Parameters
    ----------
    lut : str
        Path to the class look-up table CSV (needs ``LC_UNQ``, ``LC25``,
        ``LC25_name`` and ``LC2``).
    pt_path : str
        Path to the sample-point CSV.
    var_path : str
        Path to the CSV of feature variables, keyed by ``OID_``.
    out_path : str
        Destination CSV for the merged table.

    Notes
    -----
    Reads the point key file from the module-level ``basic_config['ptfile']``
    to translate ``OID_`` into ``PID``. The merged table is expected to carry a
    ``rand2`` column, from which the ``TESTSET10`` / ``TESTSET20`` flags are set.
    """
    lut_df = pd.read_csv(lut)
    pt_key = pd.read_csv(basic_config['ptfile'])

    samp_pts = pd.read_csv(pt_path)
    print(f'There are {samp_pts.shape[0]} sample points')
    # A CSV written with its index has the ids in "Unnamed: 0"; reuse that as OID_.
    if ('OID_' not in samp_pts.columns) and ('Unnamed: 0' in samp_pts.columns):
        samp_pts = samp_pts.rename(columns={"Unnamed: 0": 'OID_'})
    if 'LC_UNQ_y' in samp_pts.columns:
        samp_pts = samp_pts.drop(columns=['LC_UNQ_y'])
    ## dropping LC2 from sample points bc used different classification system in previous versions
    if 'LC2' in samp_pts.columns:
        samp_pts = samp_pts.drop(columns=['LC2'])
    if 'LC25' not in samp_pts.columns:
        lc25_lut = lut_df[['LC_UNQ','LC25','LC25_name']]
        if 'LC_UNQ' in samp_pts.columns:
            samp_pts = samp_pts.merge(lc25_lut,on='LC_UNQ',how='left')
        else:
            # Bug fix: the merge result used to be discarded here, so LC25 was
            # never attached when the points only carry an 'LC' column.
            samp_pts = samp_pts.merge(lc25_lut,left_on='LC', right_on='LC_UNQ',how='left')

    pix_vars = pd.read_csv(var_path)
    if 'var_poly_area' in pix_vars.columns:
        ## hacky fix for issue of numbers over signed 16-bit max being converted to negative in var dataframe
        pix_vars['var_poly_area'] = np.where(pix_vars['var_poly_area']<0,32767,pix_vars['var_poly_area'])
    pix_vars = pd.merge(pix_vars,pt_key[['OID_','PID']],on='OID_', how='left')
    pix_data = samp_pts.merge(pix_vars, on='PID', how='inner')

    # Tidy up the _x/_y suffixes and legacy column names left by the merges.
    if 'LC_UNQ_x' in pix_data.columns:
        pix_data = pix_data.rename(columns={'LC_UNQ_x': 'LC_UNQ'})
    if ('USE_NAME' in pix_data.columns) and ('Class' not in pix_data.columns):
        pix_data = pix_data.rename(columns={'USE_NAME':'Class'})
    if 'smlhld_1ha' not in pix_data.columns:
        pix_data = apply_smalls(pix_data,lut_df)
    if 'OID__x' in pix_data.columns:
        pix_data = pix_data.rename(columns={'OID__x': 'OID_'})
        pix_data = pix_data.drop(columns=['OID__y'])

    ## Note rand2 <= .2 has already been pulled off these datasets
    pix_data['TESTSET10'] = np.where(pix_data['rand2'] > .91, 1, 0)
    pix_data['TESTSET20'] = np.where(pix_data['rand2'] > .82, 1, 0)
    pix_data.to_csv(out_path)

def create_temp_rf_model(lut,fixed_ho_dir,mod_dir,class_mod,runnum):
    """Fit a throw-away random-forest model against the fixed holdout sets.

    Parameters
    ----------
    lut
        Class look-up table, passed through to ``get_class_col`` and ``rf_model``.
    fixed_ho_dir : str
        Directory holding the fixed holdout files.
    mod_dir : str
        Directory handed to ``rf_model`` as the model directory.
    class_mod : str
        Class model name, resolved through ``get_class_col``.
    runnum : int
        Run counter, used in the printed name and passed to ``rf_model``.

    Returns
    -------
    Whatever ``rf_model`` returns.

    Notes
    -----
    Sets ``classification_params['ho_thresh']`` to 0 as a side effect and
    reads the input table path from ``classification_params['pixdf']``.
    """
    classification_params['ho_thresh'] = 0
    class_mod_name = get_class_col(class_mod,lut)
    model_name = classification_params['model_name'] + '_' + class_mod_name[0]
    df_in = classification_params["pixdf"]
    print(f'making model for {model_name}_{runnum}')
    # Bug fix: this assignment was commented out, so the overwrite guard below
    # raised NameError instead of protecting the saved model.
    rfpath = os.path.join(mod_dir,f'{model_name}_RFmod{runnum}.joblib')

    # Note model needs to be created from command line to save properly for use in classification. models created here are for testing only
    if mod_dir == classification_params['main_model_dir']:
        if os.path.isfile(rfpath):
            print('STOP -- DO NOT OVERWRITE SAVED MODEL. Change model_mode or model_dir to proceed')
            sys.exit()

    rf0 = rf_model(df_in,
        mod_dir,
        class_mod,
        classification_params['impmeth'],
        classification_params['ranhold'],
        classification_params['model_name'],
        lut,
        classification_params['feature_model'],
        classification_params['ho_thresh'],
        classification_params['feature_mod_dict'],
        update_model_dict=False,
        fixed_ho=True,
        fixed_ho_dir=fixed_ho_dir,
        runnum=runnum)

    return rf0

def log_acc_results(scores_dict, model_name, these_scores,runnum):
    """Insert or replace one model's scores in a JSON score log.

    Parameters
    ----------
    scores_dict : str
        Path to the JSON file mapping model keys to score dicts. It is created
        if it does not exist yet.
    model_name : str
        Base key for this model.
    these_scores : dict
        JSON-serializable scores to store under the key.
    runnum : int or None
        If truthy, ``_{runnum}`` is appended to the key so repeated runs of one
        model are logged separately.
    """
    if runnum:
        model_name = f'{model_name}_{runnum}'

    try:
        with open(scores_dict, 'r') as full_dict:
            dic = json.load(full_dict)
    except FileNotFoundError:
        # Bug fix: this branch used to read an undefined global (rf0) and to
        # drop the run suffix; start from an empty log instead.
        print('File not found, will create a new one.')
        dic = {}

    dic.update({model_name : these_scores})

    new_scores = pd.DataFrame.from_dict(dic)
    print(new_scores.head())

    with open(scores_dict, 'w') as new_dict:
        json.dump(dic, new_dict)


## Get fixed holdout (if not already done)
All sample models share the same holdout (HO) set, so it only needs to be recreated when the feature model changes.

In [None]:
### the holdout was already taken out prior to creating the sample point files
### if GENERAL_HO files exist, this part should not need to be rerun
#fixed_ho_dir = '/home/downspout-cel/paraguay_lc/vector/pts_calval/fixed_HOs'
#ho, tr0 = get_stable_holdout(pix_data, fixed_ho_dir, 20, 'smallCrop', lut, overwrite=False) 
#ho1, tr1 = get_stable_holdout(tr0, fixed_ho_dir, 20, 'bigCrop', lut, overwrite=False) 
#ho2, tr2 = get_stable_holdout(tr1, fixed_ho_dir, 20, 'noCrop', lut, overwrite=False)
#pixdf = pd.read_csv('/home/downspout-cel/paraguay_lc/vector/pts_training/GENERAL_TRAINING.csv')

### run this part if using a new feature model
'''
var_path = classification_params['samp_pix_vars']
for ho in ['noCrop','smallCrop','medCrop', 'bigCrop']:
    pt_path = f'/home/downspout-cel/paraguay_lc/vector/pts_calval/fixed_HOs/GENERAL_HOLDOUT_{ho}.csv'
    out_path = f'/home/downspout-cel/paraguay_lc/vector/pts_calval/fixed_HOs/{feature_model}_HOLDOUT_{ho}.csv'
    append_vars_to_samp(lut, pt_path, var_path, out_path)
'''

### Set new variables here for temp model testing: -- SKIP if keeping original model

In [None]:
### MAKE SURE THIS IS USING POST HO INFO
# Year suffix used when building the input file names below.
year = 2021

# Feature model to test; uncomment exactly one.
#feature_model = "base4Poly6"
feature_model = "base4NoPoly"
#feature_model = "base4NoPolyLonly"
#feature_model = "base4NoPoly30m"

sample_model = 'bal300mix5'
# NOTE: here `lut` is the PATH of the look-up table CSV (append_vars_to_samp reads it
# with pd.read_csv); the "View the look up table" cell later rebinds `lut` to a DataFrame.
lut='../Class_LUT.csv'

# Directory holding the fixed holdout files.
#fixed_ho_dir = '/home/downspout-cel/paraguay_lc/vector/pts_calval/EPy_district_samp'
fixed_ho_dir = '/home/downspout-cel/paraguay_lc/vector/pts_calval/fixed_HOs'
class_mod = 'all'

# Output locations for model scores (JSON log and CSV table).
model_scores_dict = '/home/downspout-cel/paraguay_lc/classification/RF/model_stats/CEL_model_scores_dict_F.json'
model_scores_tab = '/home/downspout-cel/paraguay_lc/classification/RF/model_stats/CEL_model_scores_F.csv'

## For singular experimentation

In [None]:
'''
set_variables(feature_model, sample_model, year)
model_name = classification_params['model_name']
print(model_name)
out_path = f'/home/downspout-cel/paraguay_lc/classification/inputs/pixdf_{model_name}_{year}.csv'
classification_params["pixdf"] = out_path
append_vars_to_samp(lut,pt_path, var_path, out_path)

rf = create_temp_rf_model(lut,fixed_ho_dir,mod_dir,class_mod,runnum=0)
#print(rf[1])
'''

## To make multiple models

In [None]:
def iterate_multiple_models(score_dict, temp_dir, lut, fixed_ho_dir, numruns=10,
                            year=2021, balances=(0,), mixes=range(0, 11), first_run=6):
    """Build and score repeated temporary RF models over a grid of sample models.

    For every ``bal{m}mix{b}`` sample model the merged input table is created
    if it is missing, then runs ``first_run`` .. ``numruns`` are fitted with
    ``create_temp_rf_model`` and logged with ``log_acc_results``.

    Parameters
    ----------
    score_dict : str
        Path of the JSON score log.
    temp_dir : str
        Directory for the temporary models.
    lut : str
        Path to the class look-up table CSV.
    fixed_ho_dir : str
        Directory holding the fixed holdout files.
    numruns : int
        Last run number (inclusive).
    year : int
        Year used for both the input file names and the merged table name.
        Previously one was hard-coded to 2021 and the other came from the
        global ``year``, so they could silently disagree.
    balances : iterable of int
        ``m`` values of the ``bal{m}mix{b}`` sample models.
    mixes : iterable of int
        ``b`` values of the ``bal{m}mix{b}`` sample models.
    first_run : int
        First run number. The default of 6 keeps the original hard-coded start;
        pass 1 to run a full set.

    Notes
    -----
    Uses the module-level ``feature_model`` and ``class_mod``.
    """
    for m in balances:
        for b in mixes:
            sample_model = f'bal{m}mix{b}'
            params = set_variables(feature_model, sample_model, year)
            model_name = params['model_name']
            print(f'building {model_name}...')
            out_path = f'/home/downspout-cel/paraguay_lc/classification/inputs/pixdf_{model_name}_{year}.csv'
            params["pixdf"] = out_path
            var_path = params['samp_pix_vars']
            pt_path = params["samp_pts"]
            # The merged table is slow to build, so reuse it when it already exists.
            if not os.path.exists(out_path):
                append_vars_to_samp(lut,pt_path, var_path, out_path)
            for rn in range(first_run,(numruns + 1)):
                rf0 = create_temp_rf_model(lut,fixed_ho_dir,temp_dir,class_mod,rn)
                log_acc_results(score_dict, model_name, rf0[1],rn)

# JSON log that collects the scores of every run, and scratch directory for the temporary models.
model_score_dict = '/home/downspout-cel/paraguay_lc/classification/RF/model_stats/CEL_model_iterations_LC25_2021.json'
temp_mod_dir = '/home/scratch-cel/rf_mods'
# NOTE: this fits models for every sample model each time the cell is run.
iterate_multiple_models(model_score_dict, temp_mod_dir, lut, fixed_ho_dir, numruns=10)

In [None]:
def aggregate_run_scores(score_dict, agg_score_tab):
    """Summarize repeated-run scores by model configuration and save as CSV.

    Parameters
    ----------
    score_dict : str
        Path to the JSON score log (model key -> dict of scores). Each entry
        needs the grouping fields ``F``, ``S``, ``C``, ``A`` and the metrics
        ``F1_cnc``, ``OA_cnc``, ``recall_smallCrop``, ``recall_bigCrop`` and
        ``recall_noCrop``.
    agg_score_tab : str
        Destination CSV for the summary table.

    Returns
    -------
    pandas.DataFrame
        Mean and standard deviation of each metric per (F, S, C, A) group.
    """
    # Read-only access is enough here ('r+' would require write permission).
    with open(score_dict, 'r') as full_dict:
        dic = json.load(full_dict)
    new_scores = pd.DataFrame.from_dict(dic).T
    # The transposed frame is object-typed because it mixes text and numbers;
    # cast the metrics to float (as get_best_models does) before aggregating.
    metric_cols = ['F1_cnc', 'OA_cnc', 'recall_smallCrop', 'recall_bigCrop', 'recall_noCrop']
    new_scores[metric_cols] = new_scores[metric_cols].astype('float64')
    agg_scores = new_scores.groupby(['F','S','C','A']).agg(
        avgF1=('F1_cnc', 'mean'), stdF1=('F1_cnc', 'std'),
        avgOA=('OA_cnc', 'mean'), stdOA=('OA_cnc', 'std'),
        recallsc=('recall_smallCrop', 'mean'), stdsc=('recall_smallCrop', 'std'),
        recallbc=('recall_bigCrop', 'mean'), stdbc=('recall_bigCrop', 'std'),
        recallnc=('recall_noCrop', 'mean'), stdnc=('recall_noCrop', 'std'))
    agg_scores.to_csv(agg_score_tab)

    return agg_scores
    
def get_best_models(score_dict, final_models_tab):
    """Pick the run with the highest ``F1_cnc`` in each model configuration.

    Parameters
    ----------
    score_dict : str
        Path to the JSON score log (model key -> dict of scores). Each entry
        needs ``F``, ``S``, ``C``, ``A`` and ``F1_cnc``.
    final_models_tab : str
        Destination CSV for the selected rows.

    Returns
    -------
    pandas.DataFrame
        One row per (F, S, C, A) group, indexed by the winning model key.
    """
    # Read-only access is enough here ('r+' would require write permission).
    with open(score_dict, 'r') as full_dict:
        dic = json.load(full_dict)
    new_scores = pd.DataFrame.from_dict(dic).T
    # Object dtype after the transpose; idxmax needs a numeric column.
    new_scores['F1_cnc'] = new_scores['F1_cnc'].astype('float64')
    best_models = new_scores.groupby(['F','S','C','A'])['F1_cnc'].idxmax()
    keep_models = new_scores.loc[best_models]
    keep_models.to_csv(final_models_tab)

    return keep_models

# Summarize the logged runs per model configuration, then pick the best run of each.
model_score_dict = '/home/downspout-cel/paraguay_lc/classification/RF/model_stats/CEL_model_iterations_LC25_2021.json'
agg_score_tab = '/home/downspout-cel/paraguay_lc/classification/RF/model_stats/CEL_model_iteration_LC25_summary.csv'
aggregate_run_scores(model_score_dict,agg_score_tab)
final_models_tab = '/home/downspout-cel/paraguay_lc/classification/RF/model_stats/CEL_best_models_LC25.csv'
# keep_models is what save_best_models expects as its first argument.
keep_models = get_best_models(model_score_dict, final_models_tab)

In [None]:
import matplotlib.pyplot as plt
from matplotlib.ticker import ScalarFormatter


class MyScalarFormatter(ScalarFormatter):
    """ScalarFormatter whose tick labels always show two decimals."""

    # Override '_set_format' with your own
    def _set_format(self):
        self.format = '%.2f'  # Show 2 decimals


agg_score_tab = '/home/downspout-cel/paraguay_lc/classification/RF/model_stats/CEL_model_iteration_LC25_summary.csv'
agg_scores = pd.read_csv(agg_score_tab)
# Sample-model names look like 'bal<N>mix<M>': split them into the balance part and the mix percentage.
agg_scores[['bal', 'mix']] = agg_scores['S'].str.split('mix', n=1, expand=True)
agg_scores['mix'] = agg_scores['mix'].astype('int')
feature_mod = 'base4Poly6'

# One data series per minimum class size, restricted to the chosen feature model.
series = [
    (f'min class size = {size}',
     agg_scores[(agg_scores['bal'] == f'bal{size}') & (agg_scores['F'] == feature_mod)])
    for size in (0, 100, 200, 300)
]
bal0, bal100, bal200, bal300 = (subset for _, subset in series)

fig, axs = plt.subplots(2, 2, figsize=(10, 10))

# (axes, mean column, std column, y label, legend location) for each panel.
panels = [
    (axs[0, 0], 'recallnc', 'stdnc', "recall for no crop", 'lower left'),
    (axs[0, 1], 'recallsc', 'stdsc', "recall for small crop", 'lower right'),
    (axs[1, 0], 'recallbc', 'stdbc', "recall for big crop", 'lower right'),
    (axs[1, 1], 'avgF1', 'stdF1', "F1 score for crop no crop", 'lower right'),
]

for ax, mean_col, std_col, ylabel, legend_loc in panels:
    # A formatter keeps a reference to the axis it is attached to, so each
    # panel gets its own instance instead of sharing one across all four.
    ax.yaxis.set_major_formatter(MyScalarFormatter(useMathText=True))
    for label, subset in series:
        ax.errorbar(subset['mix'], subset[mean_col], yerr=subset[std_col], fmt='o', label=label)
    ax.set_xlabel("% mixed crop included in training sample")
    ax.set_ylabel(ylabel)
    ax.legend(loc=legend_loc)

fig.suptitle('sample model comparisons with 25 land cover classes',y=.94)
fig.tight_layout(pad=3)
plt.show()



In [None]:
import matplotlib.pyplot as plt
from matplotlib.ticker import ScalarFormatter


class MyScalarFormatter(ScalarFormatter):
    """ScalarFormatter whose tick labels always show two decimals."""

    # Override '_set_format' with your own
    def _set_format(self):
        self.format = '%.2f'  # Show 2 decimals


agg_score_tab = '/home/downspout-cel/paraguay_lc/classification/RF/model_stats/CEL_model_iteration_LC25_summary.csv'
agg_scores = pd.read_csv(agg_score_tab)
# Sample-model names look like 'bal<N>mix<M>': split them into the balance part and the mix percentage.
agg_scores[['bal', 'mix']] = agg_scores['S'].str.split('mix', n=1, expand=True)
agg_scores['mix'] = agg_scores['mix'].astype('int')
balance = 'bal200'
mod_noPoly = agg_scores[(agg_scores['bal'] == balance) & (agg_scores['F'] == 'base4NoPoly')]
mod_poly6 = agg_scores[(agg_scores['bal'] == balance) & (agg_scores['F'] == 'base4Poly6')]

# One data series per feature model, both at the same balance setting.
series = [('mod_noPoly', mod_noPoly), ('mod_poly6', mod_poly6)]

fig, axs = plt.subplots(2, 2, figsize=(10, 10))

# (axes, mean column, std column, y label, legend location) for each panel.
panels = [
    (axs[0, 0], 'recallnc', 'stdnc', "recall for no crop", 'lower left'),
    (axs[0, 1], 'recallsc', 'stdsc', "recall for small crop", 'lower right'),
    (axs[1, 0], 'recallbc', 'stdbc', "recall for big crop", 'lower right'),
    # Label typo fixed ("socre" -> "score").
    (axs[1, 1], 'avgF1', 'stdF1', "F1 score for crop no crop", 'lower right'),
]

for ax, mean_col, std_col, ylabel, legend_loc in panels:
    # A formatter keeps a reference to the axis it is attached to, so each
    # panel gets its own instance instead of sharing one across all four.
    ax.yaxis.set_major_formatter(MyScalarFormatter(useMathText=True))
    for label, subset in series:
        ax.errorbar(subset['mix'], subset[mean_col], yerr=subset[std_col], fmt='o', label=label)
    ax.set_xlabel("% mixed crop included in training sample")
    ax.set_ylabel(ylabel)
    ax.legend(loc=legend_loc)

fig.suptitle('sample model comparisons with 25 land cover classes',y=.94)
fig.tight_layout(pad=3)
plt.show()

In [None]:
def save_best_models(keep_models, temp_mod_dir, main_mod_dir, year_tag='21', class_tag='LC25'):
    """Copy the selected temporary models into the main model directory.

    Parameters
    ----------
    keep_models : pandas.DataFrame
        Table whose index holds model keys of the form
        ``<feature>_<sample>`` or ``<feature>_<sample>_<run>``.
    temp_mod_dir : str
        Directory containing ``<feature>_<sample>_<class_tag>_RFmod<run>.joblib``.
    main_mod_dir : str
        Destination directory; files are written as
        ``<feature>_<sample>_<year_tag>_<class_tag>_RFmod.joblib``.
    year_tag : str
        Year label inserted into the destination file name.
    class_tag : str
        Class-model label used in both file names.
    """
    import shutil
    keepers = list(keep_models.index.values)
    print(keepers)

    for k in keepers:
        print(f'copying model for {k} to permanent directory')
        parts = k.split('_')
        # A two-part key carries no run number, so the file name has no run suffix.
        run = '' if len(parts) == 2 else parts[2]
        model = f'{parts[0]}_{parts[1]}'
        current_file = os.path.join(temp_mod_dir,f'{model}_{class_tag}_RFmod{run}.joblib')
        final_file = os.path.join(main_mod_dir,f'{model}_{year_tag}_{class_tag}_RFmod.joblib')
        shutil.copy(current_file, final_file)

#save_best_models(keep_models, mod_dir, classification_params['main_model_dir'])

### For multi-year model: Merge dfs for multiple years

In [None]:
model_name = f'{feature_model}_{sample_model}'

# NOTE(review): range() excludes the end value, so this covers 2017..2023 — confirm 2024 is meant to be left out.
years = [2017,2024]
df_list = []
for y in range(years[0],years[1]):
    print(y)
    # Bug fix: the path used the global `year`, so the same file was read on every iteration.
    pixdf_path = f'/home/downspout-cel/paraguay_lc/classification/inputs/pixdf_{model_name}_{y}.csv'
    pixdf = pd.read_csv(pixdf_path)
    # Report any feature ('var_*') column that contains missing values.
    vardf = pixdf.filter(regex='var_')
    nancols = vardf.columns[vardf.isna().any()].tolist()
    # Bug fix: was the misspelled (undefined) name `nanocls`.
    if len(nancols) > 0:
        print('oops -- NaNs in:', nancols)
    df_list.append(pixdf)
allpix = pd.concat(df_list)
allpix_path = f'/home/downspout-cel/paraguay_lc/classification/inputs/pixdf_{model_name}_9999.csv'
#pd.DataFrame.to_csv(allpix, allpix_path)
# Bug fix: was the undefined name `all_pts`.
print(allpix.shape[0])

## Basic tasks

#### View the look up table
The look-up table defines the different land-cover (LC) class models used to group classes during classification, and translates between numerical map categories and text labels.

In [None]:
# Load the class look-up table and leave out the free-text 'Description' column.
# NOTE: from here on `lut` is a DataFrame; the configuration cell further up binds it to the CSV path.
lut = pd.read_csv('../Class_LUT.csv').drop(columns=['Description'])

print(lut.sort_values('LC_UNQ'))

## view confusion matrices
Note parameters: (pred_col, obs_col, lut, lc_mod_map, lc_mod_acc, print_cm=False, out_dir=None, model_name=None)
To print cm to csv file, change print_cm to True and provide an out_dir and model_name

In [None]:
'''
Note: if running build_weighted_accuracy_table below, these will be printed to file within that.
'''

# NOTE(review): `rf0` is not assigned at notebook top level by any visible cell (it is only a local
# inside create_temp_rf_model / iterate_multiple_models), so this cell needs a model result bound
# to `rf0` first. rf0[1] is indexed with 'pred' and 'label' here — confirm against rf_model's return value.

# One confusion matrix per accuracy grouping; the map-level class model is classification_params['lc_mod'].
cm_cropNoCrop = get_confusion_matrix(rf0[1]['pred'],rf0[1]['label'],lut,classification_params['lc_mod'],'cropNoCrop', 
                                     print_cm=False, out_dir=classification_params['model_dir'],
                                     model_name=classification_params['model_name'])
cm_cropType = get_confusion_matrix(rf0[1]['pred'],rf0[1]['label'],lut,classification_params['lc_mod'],'cropType', 
                                   print_cm=False, out_dir=classification_params['model_dir'],
                                   model_name=classification_params['model_name'])
cm_veg = get_confusion_matrix(rf0[1]['pred'],rf0[1]['label'],lut,classification_params['lc_mod'],'veg', 
                              print_cm=False, out_dir=classification_params['model_dir'],
                              model_name=classification_params['model_name'])
cm_all = get_confusion_matrix(rf0[1]['pred'],rf0[1]['label'],lut,classification_params['lc_mod'],'all', 
                              print_cm=False, out_dir=classification_params['model_dir'],
                              model_name=classification_params['model_name'])
# Same call with positional arguments: the accuracy grouping is the map class model itself,
# print_cm=False, out_dir=model_dir, model_name=None.
cm_single = get_confusion_matrix(rf0[1]['pred'],rf0[1]['label'],lut,classification_params['lc_mod'],classification_params['lc_mod'],False,classification_params['model_dir'],None)

print(cm_cropNoCrop)
print(cm_cropType)
print(cm_veg)
print(cm_all)
print(cm_single)


## view variable importance
Variable importance can be computed with either the Impurity or the Permutation method (see the sklearn docs) by setting `impmeth` in `rf_model`.
The full list is stored in the model directory for further manipulation.

In [None]:
# The importance table is a headerless two-column CSV named after the model.
var_imp_file = 'VarImportance_{}.csv'.format(classification_params['model_name'])
var_imp_path = os.path.join(classification_params['model_dir'], var_imp_file)
var_imp = pd.read_csv(var_imp_path, header=None, names=['var','imp'])
## view 10 most important variables:
var_imp.sort_values('imp', ascending=False).head(10)

In [None]:
# List the feature models registered in the feature-model dictionary.
# The file is only read, so open it read-only ('r+' would require write permission).
with open(classification_params['feature_mod_dict'], 'r') as feature_model_dict:
    dic = json.load(feature_model_dict)
print(dic.keys())

## Build weighted accuracy matrix for model selection / optimization
#### Note, this is incorporated within automated methods

In [None]:
#mc_holdout = os.path.join(classification_params['main_model_dir'],'{}_mixedCrop_HO20.csv'.format(classification_params['feature_model']))
#model_name = classification_params['model_name']
#model_name = model_name 
#out_dir = os.path.join(classification_params['local_dir'],'cmsbi')
                          
#wacc = build_weighted_accuracy_table(out_dir,model_name,rf0,classification_params["pixdf"],lut)
#wacc = build_weighted_accuracy_table(out_dir,model_name,rf0,classification_params["pixdf"],lut,binary=True, second_cm=False, ho_path=None)
#print(wacc.tail(n=10))

#### create noPoly datasets

In [None]:
# For every "base4Poly6" table in `vardir`, write a sibling "base4NoPoly" table
# that has all 'var_poly*' columns removed.
# Alternative directories / file prefixes used for the other dataset families:
#vardir = '/home/downspout-cel/paraguay_lc/vector/pts_training/features'   # prefix "ptsfeats_base4Poly6"
#vardir = '/home/downspout-cel/paraguay_lc/classification/inputs'          # prefix "pixdf_base4Poly6"
vardir = '/home/downspout-cel/paraguay_lc/vector/pts_calval/fixed_HOs'
poly6_files = [name for name in os.listdir(vardir) if name.startswith("base4Poly6")]
for poly6_name in poly6_files:
    src_path = os.path.join(vardir, poly6_name)
    dst_path = os.path.join(vardir, poly6_name.replace("Poly6","NoPoly"))
    table = pd.read_csv(src_path)
    poly_cols = [col for col in table.columns.values.tolist() if col.startswith('var_poly')]
    table = table.drop(columns=poly_cols)
    table.to_csv(dst_path)

## To save an html copy of this notebook with all outputs:
(these two cells should be last in notebook)

In [None]:
%%script echo skipping
### comment out above line and run this cell to print output as html

out_name = str(basic_config['country']+'6c_RandomFoest_ModelComparisons'+'_model'+str(classification_params['model_name'])+'basic_config['filter_yr'])
!jupyter nbconvert --output-dir='./Outputs' --to html --no-input --ExecutePreprocessor.store_widget_state=True --output=$out_name 6c_RandomFoest_ModelComparisons.ipynb