In [None]:
import os
import sys
from pathlib import Path
import pandas as pd
#import json

%matplotlib inline

In [None]:
# Make the project's helper modules importable. The append is guarded so that
# re-running this cell does not keep adding duplicate entries to sys.path.
_helpers_dir = r"../LUCinSA_helpers"
if _helpers_dir not in sys.path:
    sys.path.append(_helpers_dir)
# NOTE(review): the wildcard import is kept so every name other cells may rely on
# stays available; only `rf_model` is referenced in the cells visible here.
from rf import *

In [None]:
'''
PARAMETERS: modify in notebook_params notebook, then run that notebook and this cell to update here
DO NOT modify this cell
'''

# Restore the dictionaries previously saved with %store so this notebook reads the stored settings
# (presumably written by the notebook_params notebook mentioned above -- confirm).
%store -r basic_config
#print(" modelling year is (filter_year param): {} (this is first year if season spans two years)".format(basic_config['filter_yr']))
%store -r classification_params
# Echo the restored settings in the cell output. The adjacent string literals are joined into
# a single template before .format() is applied, so the arguments fill the {} slots in order.
print("Classification_Params: \n" 
      " temp output files are saved to (local_model_dir): {} \n" 
      " shared modelling files are in (main_model_dir): {} \n" 
      " feature_model = {} \n sample_model = {} \n model_name = {} \n"
      " sample_model_dict is: {} \n"
      " samp_pt_file = {} \n pix_vars = {} \n pixdf = {} \n lc_class = {}"
      .format(classification_params['local_model_dir'],classification_params['main_model_dir'],
              classification_params['feature_model'],classification_params['sample_model'],classification_params['model_name'],
              classification_params['sample_mod_dict'],basic_config['ptfile'], classification_params['samp_pix_vars'],classification_params['pixdf'],
              classification_params['lc_mod']))

## define / alter sample pixels to participate in model training

### load in LUT to see class options

In [None]:
# Read the class look-up table and show the class options, ordered by LC_UNQ.
lut = pd.read_csv('../Class_LUT.csv')
lut_view_cols = ['LC_UNQ', 'USE_NAME', 'LC25', 'LC25_name']
lut_sorted = lut.sort_values('LC_UNQ')
print(lut_sorted[lut_view_cols])

### start with default models with pixel-only data:

In [None]:
# Load the sample points and the pixel variables listed in the stored parameters.
samp_pts = pd.read_csv(basic_config['ptfile'])
pix_vars = pd.read_csv(classification_params['samp_pix_vars'])

# Left join on OID_: every pix_vars row is kept, with point attributes attached where matched.
pix_data = pd.merge(pix_vars, samp_pts, how='left', left_on='OID_', right_on='OID_')

# Remove LC2 before attaching the look-up table.
# NOTE(review): a later cell reads pixdf['LC2'], so presumably the LUT supplies its own
# LC2 column through the join below -- confirm.
pix_data = pix_data.drop(['LC2'], axis=1)
pixdf = pd.merge(pix_data, lut, how='left', left_on='Class', right_on='USE_NAME')

print('sample breakdown by LC25 class:')
lc25_counts = pixdf['LC25_name'].value_counts()
print(lc25_counts)

print('default rf model with all sample pixels and pixel only data')

## Add smallholder flag to dataset

In [None]:
def _smallholder_flag(df, max_size):
    """Return a 0/1 integer Series flagging the small-field rows of ``df``.

    A row is flagged 1 when ``LC2 == 1`` and either ``var_poly_pred_area`` is
    strictly below ``max_size`` or ``FieldWidth`` is at most ``max_size``.
    Comparisons against missing values are False, so such rows are flagged 0.
    This is the vectorised equivalent of the former row-wise ``apply``.
    """
    small_field = (df['var_poly_pred_area'] < max_size) | (df['FieldWidth'] <= max_size)
    return ((df['LC2'] == 1) & small_field).astype(int)


## <=1 hectare
## NOTE(review): the thresholds (100 / 50) are applied to both area and width columns -- confirm units
pixdf['smallholder1'] = _smallholder_flag(pixdf, 100)
print(pixdf['smallholder1'].value_counts())
## <= .5 hectare
pixdf['smallholder2'] = _smallholder_flag(pixdf, 50)
print(pixdf['smallholder2'].value_counts())

# Save alongside the other temporary outputs instead of a user-specific absolute
# path ('/home/klwalker/data/df_w_sh.csv'), which fails on any other machine.
sh_path = os.path.join(classification_params['local_model_dir'], 'df_w_sh.csv')
pixdf.to_csv(sh_path)

## Drop classes that have sample sizes too small to model

In [None]:
# Classes listed here are removed from the sample before modelling.
dropClass = ['Crops-Vineyard', 'NewPlant']
is_dropped = pixdf['LC25_name'].isin(dropClass)
pixdf = pixdf.loc[~is_dropped]
print(pixdf['LC25_name'].value_counts())

## prep pixel datasets by reducing sample
### First by sampling method (reducing dominant CAN soy pts)
### can also do by field size, etc.

In [None]:
#### model_name = pixdf_25Soy
##   thins the 'CAN - unverified in GE' points because they are far overrepresented:
##   such a point is kept only when its 'rand' value is > .9; all other points are kept
##   NOTE(review): the earlier note said 3/4 of the soy points are removed; if 'rand' is
##   uniform on [0, 1] this threshold removes about 90% of them -- confirm which is intended.
pixdf_25Soy = pixdf[(pixdf['rand']>.9) | (pixdf['SampMethod'] != 'CAN - unverified in GE')]
print('25Soy sample breakdown by LC25 class:')
# Count by LC25_name so the table matches its heading (it previously counted LC17_name).
print(pixdf_25Soy['LC25_name'].value_counts())
# NOTE(review): 'model_dir' is not among the keys echoed by the parameter cell
# (local_model_dir / main_model_dir) -- confirm the key exists.
pixdf_path = os.path.join(classification_params['model_dir'],'pixdf_25Soy.csv')
#pd.DataFrame.to_csv(pixdf_25Soy, pixdf_path)

In [None]:
#### model_name = pixdf_lessSoy
# removes all soy points that are not verified in Google Earth
is_unverified_can = pixdf['SampMethod'] == 'CAN - unverified in GE'
pixdf_lessSoy = pixdf.loc[~is_unverified_can]

n_remaining = len(pixdf_lessSoy)
print('there are now {} pts in the training set after dropping CAN soy'.format(n_remaining))
print('LessSoy sample breakdown by LC17 class:')
lc17_counts = pixdf_lessSoy['LC17_name'].value_counts()
print(lc17_counts)

# Target path for the subset; the write itself is left disabled.
pixdf_path = os.path.join(classification_params['model_dir'], 'pixdf_lessSoy_Nov.csv')
#pd.DataFrame.to_csv(pixdf_lessSoy, pixdf_path)

In [None]:
#### model_name = pixdf_bal1000
# Step 1: keep rows whose PurePixel is not 'No', plus any row whose LC25_name contains 'mix'
# (case-insensitive; missing names do not match).
pixdf1 = pixdf[(pixdf['PurePixel'] != 'No') | (pixdf['LC25_name'].str.contains('mix', na=False, case=False))]
# Steps 2-8: thin one group per step. A row of the named group is kept only when its
# 'rand' value exceeds the threshold; rows outside the group are always kept.
# The mask is now built from pixdf1 (it was built from the unfiltered pixdf, which
# does not share pixdf1's index).
pixdf2 = pixdf1[(pixdf1['rand']>.84) | (pixdf1['SampMethod'] != 'CAN - unverified in GE')]
pixdf3 = pixdf2[(pixdf2['rand']>.45) | (pixdf2['LC25_name'] != 'Mixed-VegEdge')]
pixdf4 = pixdf3[(pixdf3['rand']>.65) | (pixdf3['LC25_name'] != 'Crops-mix')]
pixdf5 = pixdf4[(pixdf4['rand']>.76) | (pixdf4['LC25_name'] != 'Mixed-path')]
pixdf6 = pixdf5[(pixdf5['rand']>.01) | (pixdf5['LC25_name'] != 'Crops-Yerba-Mate')]
pixdf7 = pixdf6[(pixdf6['rand']>.42) | (pixdf6['LC25_name'] != 'Grassland-Managed')]
pixdf8 = pixdf7[(pixdf7['rand']>.38) | (pixdf7['LC25_name'] != 'Trees-Forest')]
print('pixdf_bal0 sample breakdown by LC25 class:')
print(pixdf8['LC25_name'].value_counts())
pixdf_path = os.path.join(classification_params['local_model_dir'],'pixdf_base1000.csv')
# Save the fully thinned set (pixdf8) so the file matches the breakdown printed above;
# pixdf5 was written before, which skipped the last three thinning steps.
pixdf8.to_csv(pixdf_path)

In [None]:
### example ###########################
#pixdf = pixdf[(pixdf['rand']>.5) | (pixdf['LC17_name'] != 'Mixed-VegEdge')]
#print(pixdf['LC17_name'].value_counts())

## if polygons are available, can combine pixel and polygon dfs and create rf datasets for points with polygons and those without:

In [None]:
# Polygon attributes for the sample points.
poly_data = pd.read_csv(classification_params['samp_poly'])
# 'area' is renamed because that column name also occurs in the pixel dataframe.
poly_data = poly_data.rename(columns={'area': 'areaSeg'})

# Left join keeps every pixel row; rows without a polygon match get missing polygon values.
all_data = pd.merge(pixdf, poly_data, how='left', left_on='OID_', right_on='OID_')

# Write the joined table to disk so the join can be checked.
# NOTE(review): 'model_dir' is not among the keys echoed by the parameter cell -- confirm it exists.
polypixdf_path = os.path.join(classification_params['model_dir'], 'pts_polyData_joinCheck.csv')
all_data.to_csv(polypixdf_path, sep=',', na_rep='NaN', index=True)

### first create dataset for points outside of polygons (here we have no variables to add to the original model)

In [None]:
# Rows whose areaSeg is missing after the left join are treated as lying outside the polygons.
no_seg_area = all_data['areaSeg'].isna()
outsideSeg = all_data.loc[no_seg_area]
print(f'of the {all_data.shape[0]} sample points in our dataset, {outsideSeg.shape[0]} are outside of our segmented polygons')
print(outsideSeg['LC17_name'].value_counts())

out_dir = os.path.join(classification_params['model_dir'], 'outside')
# Arguments are passed positionally, unchanged; their meaning is defined by rf_model in the helpers module.
rfout17 = rf_model(outsideSeg, out_dir, 'All', 'Permutation', 29, 'Fullsamp')

And for model with more balanced soy representation (25Soy):

In [None]:
# Same outside-polygon dataset, built from the 25Soy subset.
# This cell needs the 'areaSeg' column, so it must run before the cell that renames
# areaSeg to var_areaSeg on poly_data.
all_data_25Soy = pixdf_25Soy.merge(poly_data, left_on='OID_', right_on='OID_', how='left')
outsideSeg_25Soy = all_data_25Soy[all_data_25Soy['areaSeg'].isna()]
print(outsideSeg_25Soy['LC17_name'].value_counts())
out_dir = os.path.join(classification_params['model_dir'],'outside')
# The call previously passed `outsideSeg_lessSoy`, a name that is never defined (NameError);
# the dataset built in this cell is outsideSeg_25Soy.
# NOTE(review): the run label was 'LessSoy'; changed to '25Soy' to match the data passed in --
# confirm how rf_model uses this label.
rfout17_25Soy = rf_model(outsideSeg_25Soy,out_dir,'All','Permutation',29,'25Soy')
# Keep the previous result name available for any cell that still refers to it.
rfout17_lessSoy = rfout17_25Soy

#### now create dataset for points inside of polygons (here we want to add some variables first)

In [None]:
#poly_data['AvgU'] = poly_data.apply(lambda x:count([x[c] for c in df.columns if c.endswith('U')]),axis=1)
#TODO: calculate these in pandas as above
# Prefix the polygon columns with 'var_'. poly_data itself is replaced by the renamed
# frame, so cells that run after this one see the new column names.
poly_var_names = {
    'areaSeg': 'var_areaSeg',
    'AVGU': 'var_AVGU',
    'AVGR': 'var_AVGR',
    'AVGSTD': 'var_AVGSTD',
    'MAXR': 'var_MAXR',
    'MINR': 'var_MINR',
    'STDU': 'var_STDU',
    'MINU': 'var_MINU',
    'MAXU': 'var_MAXU',
    'rU': 'var_RU',
}
poly_data = poly_data.rename(columns=poly_var_names)

polyvars = pd.merge(pixdf, poly_data, how='left', left_on='OID_', right_on='OID_')
# Only rows with a positive polygon area are kept (missing areas compare as False).
has_seg_area = polyvars['var_areaSeg'] > 0
withinSeg = polyvars.loc[has_seg_area]

print(withinSeg['LC17_name'].value_counts())
out_dir = os.path.join(classification_params['model_dir'], 'within')
#rfin17 = rf_model...

In [None]:
# Within-polygon dataset for the lessSoy subset; relies on poly_data already carrying
# the 'var_' column names.
all_data_lessSoy = pd.merge(pixdf_lessSoy, poly_data, how='left', left_on='OID_', right_on='OID_')
has_seg_area_lessSoy = all_data_lessSoy['var_areaSeg'] > 0
withinSeg_lessSoy = all_data_lessSoy.loc[has_seg_area_lessSoy]
print(withinSeg_lessSoy['LC17_name'].value_counts())
out_dir = os.path.join(classification_params['model_dir'], 'within')
#rfin17_lessSoy = rf_model(...

## To save an html copy of this notebook with all outputs:
(these two cells should be last in notebook)

In [None]:
%%script echo skipping
### comment out above line and run this cell to print output as html
# The %%script line must stay the first line of the cell; while it is active the lines
# below are handed to `echo` instead of being executed as Python.

# Build the output file name from the stored country and model name.
# NOTE(review): out_name starts with '6a_...VariableDataframe' while the notebook converted
# below is '6b_...ModelComparisons.ipynb' -- confirm both names (including the 'RandomFoest'
# spelling) match the actual files.
out_name = str(basic_config['country']+'6a_RandomFoest_VariableDataframe'+'_model'+str(classification_params['model_name'])+'_'+'Tests1')
# Convert the notebook to HTML without input cells, writing into ./Outputs under out_name.
!jupyter nbconvert --output-dir='./Outputs' --to html --no-input --ExecutePreprocessor.store_widget_state=True --output=$out_name 6b_RandomFoest_ModelComparisons.ipynb