In [None]:
import os
import sys
from pathlib import Path
import pandas as pd
import math
#import json

%matplotlib inline

In [None]:
# Put the sibling helper directory on the import path so the `rf` module resolves.
sys.path.append(r"../LUCinSA_helpers")
# NOTE(review): star import; get_stable_holdout, balance_training_data and rf_model
# used further down are presumably provided by `rf` -- confirm.
from rf import *

In [None]:
'''
PARAMETERS: modify in notebook_params notebook, then run that notebook and this cell to update here
DO NOT modify this cell
'''

# Restore the two parameter dictionaries persisted with %store, then echo the
# entries this notebook reads so the active configuration is visible in the output.
%store -r basic_config
#print(" modelling year is (filter_year param): {} (this is first year if season spans two years)".format(basic_config['filter_yr']))
%store -r classification_params

# NOTE(review): the "(local_model_dir)" label is filled from classification_params['model_dir'],
# and the "sample_model_dict" label is filled from classification_params['feature_mod_dict'] --
# confirm these are the intended keys.
print("Classification_Params: \n" 
      " temp output files are saved to (local_model_dir): {} \n" 
      " shared modelling files are in (main_model_dir): {} \n" 
      " feature_model = {} \n sample_model = {} \n model_name = {} \n"
      " the full sample pt file: {} \n"
      " the full sample dataframe with the feature model applied: {} \n"
      " the subset pt file based on the sample model: {} \n"
      " sample_model_dict: {} \n lc_class = {}"
      .format(classification_params['model_dir'],classification_params['main_model_dir'],classification_params['feature_model'],
              classification_params['sample_model'],classification_params['model_name'],basic_config['ptfile'],
              classification_params['samp_pix_vars'],classification_params['samp_pts'],
              classification_params['feature_mod_dict'],classification_params['lc_mod']))

## define / alter sample pixels to participate in model training

### load in LUT to see class options

In [None]:
# `lut` holds the PATH to the class look-up table (not the table itself);
# later cells re-read it with pd.read_csv(lut).
lut = Path('../Class_LUT.csv')

# Show the class options, ordered by the unique land-cover code.
lut_columns = ['LC_UNQ','USE_NAME','LC25','LC25_name','LC3_name']
lut_sorted = pd.read_csv(lut).sort_values('LC_UNQ')
print(lut_sorted[lut_columns])

### start with default models with pixel-only data:

In [None]:
# Read the sample points and the per-pixel variable table named in the stored parameters.
samp_pts = pd.read_csv(basic_config['ptfile'])
pix_vars = pd.read_csv(classification_params['samp_pix_vars'])
#print(samp_pts)

# Left join: keep every row of the pixel-variable table and attach the point attributes by OID_.
pix_data = pix_vars.merge(samp_pts, how='left', left_on='OID_', right_on='OID_')

#pix_data.drop(['LC2'], axis=1, inplace=True)
# Attach the LUT columns by matching the point's Class to the LUT's USE_NAME.
lut_table = pd.read_csv(lut)
pixdf = pix_data.merge(lut_table, how='left', left_on='Class', right_on='USE_NAME')

print('sample breakdown by LC25 class:')
lc25_counts = pixdf['LC25_name'].value_counts()
print(lc25_counts)

print('default rf model with all sample pixels and pixel only data')

#### check for nan columns if desired
Note that NaN is a problem in columns that start with 'var_' (as these are used in the rf model). 
Columns that do not start with 'var_' are probably ok.

In [None]:
## Columns prefixed 'var_' are flagged separately below: per the warning text, NaNs in them break the rf model
has_nan = pixdf.isna().any()
nancols = pixdf.columns[has_nan].tolist()
print(f'columns with NaN: {nancols}')

# Keep only the NaN-bearing columns that carry the 'var_' prefix.
var_nans = list(filter(lambda col: col.startswith('var_'), nancols))
if var_nans:
    print (f'WARNING: The following variables have NaN and will not work in RF model: {var_nans}')
else:
    print('none of these are model variables, so can probably be ignored')

## Add smallholder flag to dataset

In [None]:
# already done at level of point file (might need to do again if creating new pt file)
def _smallholder_flag(pixdf, size_limit, small_classes):
    """Return a 0/1 integer Series marking rows that count as small fields for `size_limit`."""
    class_cols = [col for col in ('LC25', 'LC25_name') if col in pixdf.columns]
    if not class_cols:
        # The original row-wise lookup failed on a missing class column; keep failing loudly.
        raise KeyError('LC25')
    is_crop = pixdf['LC2'] == 1
    # Comparisons against NaN are False, so rows with a missing size are never flagged by size.
    small_by_size = (pixdf['var_poly_area'] < size_limit) | (pixdf['Width'] <= size_limit)
    # Class names may sit in LC25 or LC25_name depending on the table; accept either.
    in_small_class = pixdf[class_cols].isin(small_classes).any(axis=1)
    return ((small_by_size & is_crop) | in_small_class).astype(int)


def apply_smalls(pixdf,lut,outpath=None):
    """Add the smallholder flag columns 'smlhld_1ha' and 'smlhld_halfha' to a sample table.

    A row is flagged (1) when LC2 == 1 and either var_poly_area is below the
    limit or Width is at or below it, or when its class is one of the listed
    smallholder crop classes, regardless of size. The limits are 100 for
    'smlhld_1ha' and 50 for 'smlhld_halfha'.
    NOTE(review): the units of var_poly_area / Width are not visible here; the
    printed messages describe the limits as metres across -- confirm.

    Parameters
    ----------
    pixdf : pandas.DataFrame
        Needs 'var_poly_area', 'Width' and at least one of 'LC25' / 'LC25_name'.
        Needs 'LC_UNQ' as well when 'LC2' is absent, because 'LC2' is then
        joined in from `lut`.
    lut : pandas.DataFrame, str or os.PathLike
        Class look-up table with 'LC_UNQ' and 'LC2', or a path to its csv.
        Only read when `pixdf` has no 'LC2' column.
    outpath : str or path-like, optional
        When given, the flagged table is written there as csv.

    Returns
    -------
    pandas.DataFrame
        The table with both flag columns. When 'LC2' was already present this
        is the same object that was passed in (the columns are added in place).

    Raises
    ------
    KeyError
        If a required column is missing.
    """
    if 'LC2' not in pixdf.columns:
        # The rest of the notebook keeps `lut` as a path, so accept that form too.
        if isinstance(lut, (str, os.PathLike)):
            lut = pd.read_csv(lut)
        pixdf = pixdf.merge(lut[['LC_UNQ','LC2']],on='LC_UNQ',how='left')

    small_classes = ['Crops-mix','Crops-Mandioca','Crops-Horticulture','Crops-Sesame']

    ### <=1 hectare
    pixdf['smlhld_1ha'] = _smallholder_flag(pixdf, 100, small_classes)
    num_smlhld_1ha = pixdf['smlhld_1ha'].sum()
    print(f'{num_smlhld_1ha} of the sample points are small fields < 100 m across')

    ### <= .5 hectare
    pixdf['smlhld_halfha'] = _smallholder_flag(pixdf, 50, small_classes)
    num_smlhld_halfha = pixdf['smlhld_halfha'].sum()
    print(f'{num_smlhld_halfha} of the sample points are very small fields < 50 m across')

    if outpath:
        pixdf.to_csv(outpath)

    return pixdf

## Separate set of mixed crop pts to use as fixed holdout 

Note: This was already done. A fixed holdout was separated from the full pt file to avoid confusion

In [None]:
# Directory handed to get_stable_holdout for the fixed holdout sets.
fixed_ho_dir = '/home/downspout-cel/paraguay_lc/vector/pts_calval/fixed_HOs'

# The helper returns a pair, unpacked as the holdout (ho) and the remaining points (tr0).
# overwrite=False is passed through; NOTE(review): presumably this keeps an existing holdout -- confirm in rf.
small_crop_split = get_stable_holdout(pixdf, fixed_ho_dir, 20, 'smallCrop', lut, overwrite=False)
ho, tr0 = small_crop_split

# Optional further holdouts, chained on the remainder of the previous split:
#ho1, tr1 = get_stable_holdout(tr0, fixed_ho_dir, 20, 'bigCrop', lut, overwrite=False) 
#ho2, tr2 = get_stable_holdout(tr1, fixed_ho_dir, 20, 'noCrop', lut, overwrite=False)
#pixdf = pd.read_csv('/home/downspout-cel/paraguay_lc/vector/pts_training/GENERAL_TRAINING.csv')

## prep pixel datasets by reducing sample

### Option1: by sampling method (reducing dominant CAN soy pts that are not verified in GE)

In [None]:
#### Ground-verified soy points are comparatively few, so all of them should stay in the sample
is_soy = pixdf['LC25_name'] == 'Crops-Soybeans'
not_unverified_can = pixdf['SampMethod'] != 'CAN - unverified in GE'
soyground = pixdf[is_soy & not_unverified_can]
print(len(soyground))

In [None]:
#### model_name = pixdf_25Soy
##   thins the over-represented 'CAN - unverified in GE' points: such a point is kept only when rand > .8;
##   every point from any other sampling method is kept
passes_rand = pixdf['rand'] > .8
other_method = pixdf['SampMethod'] != 'CAN - unverified in GE'
pixdf_25Soy = pixdf[passes_rand | other_method]

print('25Soy sample breakdown by LC25 class:')
counts_25Soy = pixdf_25Soy['LC25_name'].value_counts()
print(counts_25Soy)

# Save the subset in the local model directory.
pixdf_path = os.path.join(classification_params['model_dir'],'pixdf_25Soy.csv')
pixdf_25Soy.to_csv(pixdf_path)

### Drop classes that have sample sizes too small to model
#### optional -- only useful if modeling all classes (LC25)

In [None]:
# Classes removed from the sample. Note that this overwrites pixdf for every later cell.
dropClass = ['Crops-Vineyard','NewPlant']
in_dropped_class = pixdf['LC25_name'].isin(dropClass)
pixdf = pixdf[~in_dropped_class]
#pixdf = pixdf.drop("Description", axis=1)

# Show every column when a frame is displayed.
pd.set_option('display.max_columns', None)
print(pixdf['LC25_name'].value_counts())
#print(pd.DataFrame(pixdf.isnull().any()))

### Option2: by balancing classes

##### First run this to make sure ground sample points are used first for soy (because sample is overwhelmed by unverified CAN pts)

In [None]:
#### model name = pixdf_
## othersoy is the rand cutoff: (1600 - number of soy points that are not 'CAN - unverified in GE') / all soy points.
## 'CAN - unverified in GE' points are kept only when rand < othersoy; all other points are always kept.
## NOTE(review): an earlier note here mentioned ~1374 soy points and 356 ground points, but the code uses 1600 -- confirm the target.
lc25_counts = pixdf['LC25_name'].value_counts()
allsoy = lc25_counts['Crops-Soybeans']

soy_rows = pixdf['LC25_name'] == 'Crops-Soybeans'
other_method = pixdf['SampMethod'] != 'CAN - unverified in GE'
soyground = len(pixdf[soy_rows & other_method])

othersoy = (1600 - soyground) / allsoy
below_cutoff = pixdf['rand'] < othersoy
pixdf_balsoy = pixdf[below_cutoff | other_method]

print('balsoy sample breakdown by LC25 class:')
print(pixdf_balsoy['LC25_name'].value_counts())
#pixdf_path = os.path.join(classification_params['model_dir'],'pixdf_balsoy.csv')

### Option2 (continued): balance classes with the balance_training_data function

In [None]:
 # run balance_training_data function
 #   balances class samples based on map proportion, relative to sample size for class with max map proportion
 #   (this estimated map proportion is a column named "perLC25E" in the LUT )
 #   allows a minimum threshold to be set {cutoff} so that sample sizes are not reduced below the minimum
 #   allows a factor to be set for mixed (heterogeneous) classes to sample them more heavily than main classes
 #       (the maximum value will depend on the available samples for these classes. Current max is ~4)
 #   prints 'pixdf_bal{cutoff}mix{mix_factor}.csv' in out_dir
    
# Settings for the balancing run; cutoff and mix_factor are reused below when naming the stripped csv.
cutoff = 300
mix_factor = 5
out_dir = '/home/downspout-cel/paraguay_lc/vector/pts_training/pt_subsets' 

# Balance the soy-thinned sample (pixdf_balsoy); `lut` is passed as defined earlier in the notebook.
balance_options = {'cutoff': cutoff, 'mix_factor': mix_factor}
pixdf_bal = balance_training_data(lut, pixdf_balsoy, out_dir, **balance_options)
## repeat with mix_factor = 2 - 10

## Strip excess columns from pixdf
to avoid name changes when joining with tables that already have these columns

In [None]:
# Start from the balanced sample produced by balance_training_data.
df = pixdf_bal
#df = pd.read_csv(os.path.join(classification_params['main_model_dir'],'pixdf_bal100mix0.csv'))
out_name = 'bal{}mix{}.csv'.format(cutoff,mix_factor)

# Drop every column whose name contains 'var' (case-sensitive).
df = df.loc[:,~df.columns.str.contains('var')] 
# Drop the right-hand duplicates left by earlier merges ...
df = df.loc[:,~df.columns.str.endswith('_y')] 
# ... and strip the '_x' merge suffix from the left-hand ones. Only a TRAILING '_x'
# is removed, so a column that merely contains '_x' mid-name keeps its name.
df = df.rename(columns=lambda x: x[:-2] if x.endswith('_x') else x)
# Reassign instead of inplace=True: df is a slice of pixdf_bal at this point.
# A missing column still raises KeyError, as before.
df = df.drop(columns=['Description', 'ratios','Segmentation','LCTrans','LCTrans_name'])
print(df.columns.tolist())

# NOTE(review): out_dir is whatever was last assigned; later cells reassign it,
# so run this straight after the balancing cell.
df.to_csv(os.path.join(out_dir,out_name))

In [None]:
## This is an old method used for the original bal1000 model. Better to use the balance_training_data function

#### sample model_name = bal1000
# Each step filters the result of the previous one. A row survives a step when its
# rand value is above that step's threshold OR it is outside the group being thinned.
pixdf1 = pixdf[(pixdf['PurePixel'] != 'No') | (pixdf['LC25_name'].str.contains('mix', na=False, case=False))]
# The mask is built from pixdf1 (the frame being filtered). The original built it from
# pixdf, whose index no longer matches pixdf1, so pandas had to re-align the boolean key.
pixdf2 = pixdf1[(pixdf1['rand']>.84) | (pixdf1['SampMethod'] != 'CAN - unverified in GE')]
pixdf3 = pixdf2[(pixdf2['rand']>.65) | (pixdf2['LC25_name'] != 'Mixed-VegEdge')]
pixdf4 = pixdf3[(pixdf3['rand']>.65) | (pixdf3['LC25_name'] != 'Crops-mix')]
pixdf5 = pixdf4[(pixdf4['rand']>.86) | (pixdf4['LC25_name'] != 'Mixed-path')]
pixdf6 = pixdf5[(pixdf5['rand']>.30) | (pixdf5['LC25_name'] != 'Crops-Yerba-Mate')]
# Managed grassland points sampled with 'GE_KW_sup' are always kept.
pixdf7 = pixdf6[(pixdf6['rand']>.39) | (pixdf6['SampMethod'] == 'GE_KW_sup') | (pixdf6['LC25_name'] != 'Grassland-Managed')]
#pixdf8 = pixdf7[(pixdf7['rand']>.36) | (pixdf7['LC25_name'] != 'Trees-Forest')]
print('pixdf_bal0 sample breakdown by LC25 class:')
print(pixdf7['LC25_name'].value_counts())

# Save the final subset in the shared model directory.
pixdf_path = os.path.join(classification_params['main_model_dir'],'pixdf_base1000.csv')
pixdf7.to_csv(pixdf_path)

## if polygons are available, can combine pixel and polygon dfs and create rf datasets for points with polygons and those without:

In [None]:
# Load the polygon table; its 'area' column is renamed because that name also occurs in the pixel df.
poly_data = pd.read_csv(classification_params['samp_poly']).rename(columns={'area':'areaSeg'})

# Left join on OID_: every pixel row is kept, and rows without a polygon match get NaN polygon columns.
all_data = pixdf.merge(poly_data, how='left', left_on='OID_', right_on='OID_')

# Write the joined table out so the join can be checked by hand.
polypixdf_path = os.path.join(classification_params['model_dir'],'pts_polyData_joinCheck.csv')
all_data.to_csv(polypixdf_path, sep=',', na_rep='NaN', index=True)

### first create dataset for points outside of polygons (here we have no variables to add to the original model)

In [None]:
# Rows with no 'areaSeg' value after the left join had no polygon match.
no_polygon = all_data['areaSeg'].isna()
outsideSeg = all_data[no_polygon]

n_all = all_data.shape[0]
n_outside = outsideSeg.shape[0]
print(f'of the {n_all} sample points in our dataset, {n_outside} are outside of our segmented polygons')
print(outsideSeg['LC17_name'].value_counts())

# Run the rf model on the outside-polygon points; outputs go to the 'outside' subfolder.
# Note: this reassigns out_dir for every later cell.
out_dir = os.path.join(classification_params['model_dir'],'outside')
rfout17 = rf_model(outsideSeg,out_dir,'All','Permutation',29,'Fullsamp')

And for model with more balanced soy representation (25Soy):

In [None]:
# Same outside-polygon split as the previous cell, starting from the reduced-soy sample (pixdf_25Soy).
all_data_25Soy = pixdf_25Soy.merge(poly_data, left_on='OID_', right_on='OID_', how='left')
# Rows with no 'areaSeg' value had no polygon match.
outsideSeg_25Soy = all_data_25Soy[all_data_25Soy['areaSeg'].isna()]
print(outsideSeg_25Soy['LC17_name'].value_counts())
out_dir = os.path.join(classification_params['model_dir'],'outside')
# Pass the frame built above; the original referenced `outsideSeg_lessSoy`, which is never defined (NameError).
rfout17_lessSoy = rf_model(outsideSeg_25Soy,out_dir,'All','Permutation',29,'LessSoy')

#### now create dataset for points inside of polygons (here we want to add some variables first)

In [None]:
#poly_data['AvgU'] = poly_data.apply(lambda x:count([x[c] for c in df.columns if c.endswith('U')]),axis=1)
#TODO: calculate these in pandas as above
# Give the polygon columns the 'var_' prefix used for model variables elsewhere in this notebook.
# This renames poly_data in place, so 'areaSeg' is 'var_areaSeg' from here on.
poly_var_names = {
    'areaSeg':'var_areaSeg',
    'AVGU':'var_AVGU',
    'AVGR':'var_AVGR',
    'AVGSTD':'var_AVGSTD',
    'MAXR':'var_MAXR',
    'MINR':'var_MINR',
    'STDU':'var_STDU',
    'MINU':'var_MINU',
    'MAXU':'var_MAXU',
    'rU':'var_RU',
}
poly_data.rename(columns=poly_var_names, inplace=True)

polyvars = pixdf.merge(poly_data, how='left', left_on='OID_', right_on='OID_')
# Points inside a polygon are the rows with a positive segment area (NaN compares False).
has_segment = polyvars['var_areaSeg'] > 0
withinSeg = polyvars[has_segment]

print(withinSeg['LC17_name'].value_counts())
out_dir = os.path.join(classification_params['model_dir'],'within')
#rfin17 = rf_model...

In [None]:
# Inside-polygon split for the reduced-soy sample. The frame is named pixdf_25Soy in this
# notebook; the original referenced `pixdf_lessSoy`, which is never defined (NameError).
all_data_lessSoy = pixdf_25Soy.merge(poly_data, left_on='OID_', right_on='OID_', how='left')
# Needs poly_data with the 'var_'-prefixed column names, i.e. after the renaming cell has run.
withinSeg_lessSoy = all_data_lessSoy[all_data_lessSoy['var_areaSeg']>0]
print(withinSeg_lessSoy['LC17_name'].value_counts())
out_dir = os.path.join(classification_params['model_dir'],'within')
#rfin17_lessSoy = rf_model(...

## To save an html copy of this notebook with all outputs:
(these two cells should be last in notebook)

In [None]:
%%script echo skipping
### comment out above line and run this cell to print output as html
# While the %%script line above is active, the whole cell body is handed to `echo` and none of it runs.
# NOTE(review): out_name is built with '6a_RandomFoest_VariableDataframe' but the notebook converted
# below is '6b_RandomFoest_ModelComparisons.ipynb' -- confirm both match this notebook's real file name.

out_name = str(basic_config['country']+'6a_RandomFoest_VariableDataframe'+'_model'+str(classification_params['model_name'])+'_'+'Tests1')
!jupyter nbconvert --output-dir='./Outputs' --to html --no-input --ExecutePreprocessor.store_widget_state=True --output=$out_name 6b_RandomFoest_ModelComparisons.ipynb