In [1]:
import os
import sys
from pathlib import Path
import pandas as pd
import math
#import json

%matplotlib inline

In [2]:
sys.path.append(r"../LUCinSA_helpers")
from rf import *

In [3]:
'''
PARAMETERS: modify in notebook_params notebook, then run that notebook and this cell to update here
DO NOT modify this cell
'''

%store -r basic_config
#print(" modelling year is (filter_year param): {} (this is first year if season spans two years)".format(basic_config['filter_yr']))
%store -r classification_params
new_feature_mod = 'Max_nomon5-6'
classification_params['samp_pix_vars'] = '/home/downspout-cel/paraguay_lc/vector/tests/ptsgdb_{}.csv'.format(new_feature_mod)

print("Classification_Params: \n" 
      " temp output files are saved to (local_model_dir): {} \n" 
      " shared modelling files are in (main_model_dir): {} \n" 
      " feature_model = {} \n sample_model = {} \n model_name = {} \n"
      " sample_model_dict is: {} \n"
      " samp_pt_file = {} \n pix_vars = {} \n pixdf = {} \n lc_class = {}"
      .format(classification_params['local_model_dir'],classification_params['main_model_dir'],
              classification_params['feature_model'],classification_params['sample_model'],classification_params['model_name'],
              classification_params['sample_mod_dict'],basic_config['ptfile'], classification_params['samp_pix_vars'],classification_params['pixdf'],
              classification_params['lc_mod']))

Classification_Params: 
 temp output files are saved to (local_model_dir): /home/downspout-cel/paraguay_lc/classification/RF 
 shared modelling files are in (main_model_dir): /home/downspout-cel/paraguay_lc/classification 
 feature_model = Max 
 sample_model = testing 
 model_name = Max_testing_2021 
 sample_model_dict is: /home/downspout-cel/paraguay_lc/Sample_Models.json 
 samp_pt_file = /home/downspout-cel/paraguay_lc/vector/sampleData/SamplePts_Dec2023_ALL.csv 
 pix_vars = /home/downspout-cel/paraguay_lc/vector/tests/ptsgdb_Max_nomon5-6.csv 
 pixdf = /home/downspout-cel/paraguay_lc/classification/RF/pixdf_base1000.csv 
 lc_class = All


## define / alter sample pixels to participate in model training

### load in LUT to see class options

In [4]:
lut=pd.read_csv('../Class_LUT.csv')
print(lut.sort_values('LC_UNQ')[['LC_UNQ','USE_NAME','LC25','LC25_name']])

    LC_UNQ            USE_NAME  LC25          LC25_name
0        1               NoVeg    99            unknown
1        2          NoVeg_Bare     2         NoVeg_Bare
2        3         NoVeg_Built     3        NoVeg_Built
3        7         NoVeg_Water     7        NoVeg_Water
4        9          Mixed-path     9         Mixed-path
5       10             Cleared    10            Cleared
6       11       TreePlant-new    11           NewPlant
7       12   Grassland-Natural    12  Grassland-Natural
8       13   Grassland-Managed    13  Grassland-Managed
9       15           Grassland    99            unknown
10      17       Grassland-Wet    17      Grassland-Wet
11      18     Mixed-GrassEdge    19      Mixed-VegEdge
12      19     Mixed-FieldEdge    19      Mixed-VegEdge
13      20              LowVeg    99            unknown
14      23  Crops-horticulture    35          Crops-mix
15      30            Crop-Low    99            unknown
16      31      Crops-Soybeans    31     Crops-S

### start with default models with pixel-only data:

In [5]:
samp_pts = pd.read_csv(basic_config['ptfile'])
pix_vars = pd.read_csv(classification_params['samp_pix_vars'])
print(samp_pts)

pix_data = pix_vars.merge(samp_pts, left_on='OID_', right_on='OID_', how='left')

pix_data.drop(['LC2'], axis=1, inplace=True)
pixdf = pix_data.merge(lut, left_on='Class', right_on='USE_NAME', how='left')
print('sample breakdown by LC25 class:')
print(pixdf['LC25_name'].value_counts())

print('default rf model with all sample pixels and pixel only data')

        OID_ UNQ8858  OID1             Class  LC_UNQ  LC5  LC3  LC4  LC2  \
0          1    3278   NaN    Crops-Soybeans      31   20    1    1    1   
1          2    3278   NaN    Crops-Soybeans      31   20    1    1    1   
2          3    3278   NaN    Crops-Soybeans      31   20    1    1    1   
3          4    3278   NaN    Crops-Soybeans      31   20    1    1    1   
4          5    3278   NaN    Crops-Soybeans      31   20    1    1    1   
...      ...     ...   ...               ...     ...  ...  ...  ...  ...   
28006  28654    3769   NaN       Crops-Sugar      38   20    1    1    1   
28007  28655    3769   NaN       Crops-Sugar      38   20    1    1    1   
28008  28656    3804   NaN  TreePlant-mature      66   70    0    0    0   
28009  28657  <Null>   NaN           Cleared      10   10    0    0    0   
28010  28658    3769   NaN      Trees-Forest      80   70    0    0    0   

                                              NOTES  ...  Photo  surface  \
0          

## Add smallholder flag to dataset

In [6]:
## <=1 hectare
pixdf['smlhld_1ha'] = pixdf.apply(lambda x: 1 if (
    ((x['var_poly_pred_area'] < 100) and (x['LC2'] == 1)) or (
    (x['FieldWidth'] <= 100) and (x['LC2'] == 1))) else 0, axis=1)
print(pixdf['smlhld_1ha'].value_counts())
## <= .5 hectare
pixdf['smlhld_halfha'] = pixdf.apply(lambda x: 1 if (
    ((x['var_poly_pred_area'] < 50) and (x['LC2'] == 1)) or (
    (x['FieldWidth'] <= 50) and (x['LC2'] == 1))) else 0, axis=1)
print(pixdf['smlhld_halfha'].value_counts())
pd.DataFrame.to_csv(pixdf, '/home/klwalker/data/df_w_sh.csv')

0    23104
1     4806
Name: smlhld_1ha, dtype: int64
0    23653
1     4257
Name: smlhld_halfha, dtype: int64


## Drop classes that have sample sizes too small to model

In [11]:
dropClass = ['Crops-Vineyard','NewPlant']
pixdf = pixdf[~pixdf['LC25_name'].isin(dropClass)]
#pixdf = pixdf.drop("Description", axis=1)
pd.options.display.max_columns = None
print(pixdf['LC25_name'].value_counts())
#print(pd.DataFrame(pixdf.isnull().any()))

Crops-Soybeans       4485
Mixed-path           3902
Crops-mix            2988
Grassland-Managed    1884
Mixed-VegEdge        1828
Trees-Forest         1711
unknown              1668
Trees-disturbed      1195
Grassland-Wet        1062
Grassland-Natural    1033
TreePlant             830
Shrub                 791
Crops-Yerba-Mate      606
Crops-Rice            516
Grass_tree-mix        464
NoVeg_Water           421
NoVeg_Bare            397
Crops-Sugar           369
Crops-Banana          313
Crops-Orchard         289
NoVeg_Built           284
Cleared               266
Crops-Corn            244
Trees-Forest_palm     213
Burnt-woody            74
Name: LC25_name, dtype: int64
columns with NaN: ['OID1', 'NOTES', 'PureCrop', 'Date', 'Photo', 'surface', 'estrat', 'LC_UNQ_y', 'USE_NAME', 'Segmentation', 'LCTrans', 'LCTrans_name', 'LC5_name', 'LC5_y', 'LC25', 'perLC17E', 'LC25_name', 'LC3_y', 'LC3_name', 'LC4_y', 'LC4_name', 'LC2', 'LC2_name', 'LC_crops', 'LC_crops_name', 'Description']


'\nfor index, row in pixdf.loc[:, NaNrows].iterrows():\n    print(index)'

#### check for nan columns if desired

In [None]:
nancols = pixdf.columns[pixdf.isna().any()].tolist()
print(f'columns with NaN: {nancols}')

#for i in nancols:
#    print(pixdf[i])

## prep pixel datasets by reducing sample

### Option1: by sampling method (reducing dominant CAN soy pts that are not verified in GE)

In [None]:
#### model_name = pixdf_25Soy
##   removes 3/4 of the soy points because they are far overrepresented
pixdf_25Soy = pixdf[(pixdf['rand']>.9) | (pixdf['SampMethod'] != 'CAN - unverified in GE')]
print('25Soy sample breakdown by LC25 class:')
print(pixdf_25Soy['LC17_name'].value_counts())
pixdf_path = os.path.join(classification_params['model_dir'],'pixdf_25Soy.csv')
#pd.DataFrame.to_csv(pixdf_25Soy, pixdf_path)

### Option2: by balancing classes

In [48]:
##   first collect the sample ratios from the LUT

def new_training_data(bal_ratio, new_name, cutoff = 400):
    
    lut=pd.read_csv('../Class_LUT.csv')
    ordered = lut.sort_values('perLC17E')[["perLC17E", "LC25_name"]]
    #print(ordered)
    
    ##   now clean this df for NaN and repeats, check if the percents add up correctly, then divide and round
    
    ordered = ordered.dropna()
    ordered = ordered.drop_duplicates(subset = "perLC17E") 
    #print(ordered)
    
    tot = ordered["perLC17E"].sum()
    # print(tot)
    # lessssss gooooooo close to 1
    
    mmin = ordered["perLC17E"].min()
    ordered["perLC17E"] = ordered["perLC17E"]//mmin
    #print(ordered)
    
    ##   find the scale factor, actually just use an appropriate number
    
    ## rescale the training data set

    counts = pixdf['LC25_name'].value_counts().rename_axis("LC25_name").reset_index(name="counts")
    print(counts)
    print(f'Total sample size before balancing is: {sum(counts["counts"])}')
    
    ratiodf = ordered.merge(counts, left_on="LC25_name", right_on="LC25_name", how='left')
    ratiodf['ratios'] = np.where(ratiodf["counts"] < cutoff, 1, ((ratiodf["perLC17E"] * bal_ratio) / ratiodf["counts"]))
    
    pixdf_ratios_rebal = pixdf.merge(ratiodf[['LC25_name','ratios']],left_on="LC25_name", right_on="LC25_name", how='left')
    pixdf_ratios_rebal = pixdf_ratios_rebal[pixdf_ratios_rebal['rand'] < pixdf_ratios_rebal['ratios']]
    print(pixdf_ratios_rebal['LC25_name'].value_counts())
    totsamp = sum(pixdf_ratios_rebal['LC25_name'].value_counts())
    print(f'Total sample size after balancing is: {totsamp}')
    
    pixdf_path = os.path.join(classification_params['local_dir'],'r_tests/pixdf_{}_rebal_{}.csv'.format(new_feature_mod, new_name))
    pd.DataFrame.to_csv(pixdf_ratios_rebal, pixdf_path)

#for i in range(1, 20):
#    bal = math.exp(i / 7)
#    #print(bal)
#    new_training_data(bal_ratio = bal, new_name = "{}".format(i), cutoff = 400)

new_training_data(bal_ratio = 5.5, new_name = "base", cutoff = 400)


            LC25_name  counts
0      Crops-Soybeans    4485
1          Mixed-path    3902
2           Crops-mix    2988
3   Grassland-Managed    1884
4       Mixed-VegEdge    1828
5        Trees-Forest    1711
6             unknown    1668
7     Trees-disturbed    1195
8       Grassland-Wet    1062
9   Grassland-Natural    1033
10          TreePlant     830
11              Shrub     791
12   Crops-Yerba-Mate     606
13         Crops-Rice     516
14     Grass_tree-mix     464
15        NoVeg_Water     421
16         NoVeg_Bare     397
17        Crops-Sugar     369
18       Crops-Banana     313
19      Crops-Orchard     289
20        NoVeg_Built     284
21            Cleared     266
22         Crops-Corn     244
23  Trees-Forest_palm     213
24        Burnt-woody      74
Total sample size before balancing is: 27833
Trees-Forest         1292
Grassland-Managed    1236
Grassland-Wet        1057
Crops-Soybeans        935
Grassland-Natural     479
NoVeg_Bare            397
Shrub              

KeyError: 'local_dir'

In [None]:
# use this to fix any mess ups u make 
"""
dir = "/home/downspout-cel/paraguay_lc/classification/RF"
for filename in os.listdir(dir):
    if filename.startswith("pixdf_{}".format(new_feature_mod)):
        filepath = os.path.join(dir, filename)
        os.remove(filepath)
"""

## prep pixel datasets by reducing sample
### First by sampling method (reducing dominant CAN soy pts)
### can also do by field size, etc.

In [20]:
#### model_name = pixdf_25Soy
##   removes 3/4 of the soy points because they are far overrepresented
pixdf_25Soy = pixdf[(pixdf['rand']>.9) | (pixdf['SampMethod'] != 'CAN - unverified in GE')]
print('25Soy sample breakdown by LC25 class:')
print(pixdf_25Soy['LC25_name'].value_counts())
pixdf_path = os.path.join(classification_params['local_dir'],'r_tests/pixdf_{}_25Soy.csv'.format(new_feature_mod))
pd.DataFrame.to_csv(pixdf_25Soy, pixdf_path)

25Soy sample breakdown by LC25 class:
LC25_name
Mixed-path           3902
Crops-mix            2988
Grassland-Managed    1884
Mixed-VegEdge        1828
Trees-Forest         1711
unknown              1668
Trees-disturbed      1195
Grassland-Wet        1062
Grassland-Natural    1033
TreePlant             830
Shrub                 791
Crops-Soybeans        767
Crops-Yerba-Mate      606
Crops-Rice            516
Grass_tree-mix        464
NoVeg_Water           421
NoVeg_Bare            397
Crops-Sugar           369
Crops-Banana          313
Crops-Orchard         289
NoVeg_Built           284
Cleared               266
Crops-Corn            244
Trees-Forest_palm     213
Burnt-woody            74
Name: count, dtype: int64


In [21]:
#### model_name = pixdf_lessSoy
# removes all soy points that sre not verified in Google Earth
pixdf_lessSoy = pixdf[pixdf['SampMethod'] != 'CAN - unverified in GE']
print('there are now {} pts in the training set after dropping CAN soy'.format(len(pixdf_lessSoy)))
print('LessSoy sample breakdown by LC25 class:')
print(pixdf_lessSoy['LC25_name'].value_counts())
pixdf_path = os.path.join(classification_params['local_dir'],'r_tests/pixdf_{}_lessSoy_Nov.csv'.format(new_feature_mod))
pd.DataFrame.to_csv(pixdf_lessSoy, pixdf_path)

there are now 23710 pts in the training set after dropping CAN soy
LessSoy sample breakdown by LC25 class:
LC25_name
Mixed-path           3902
Crops-mix            2988
Grassland-Managed    1884
Mixed-VegEdge        1828
Trees-Forest         1711
unknown              1668
Trees-disturbed      1195
Grassland-Wet        1062
Grassland-Natural    1033
TreePlant             830
Shrub                 791
Crops-Yerba-Mate      606
Crops-Rice            516
Grass_tree-mix        464
NoVeg_Water           421
NoVeg_Bare            397
Crops-Sugar           369
Crops-Soybeans        356
Crops-Banana          313
Crops-Orchard         289
NoVeg_Built           284
Cleared               266
Crops-Corn            244
Trees-Forest_palm     213
Burnt-woody            74
Name: count, dtype: int64


In [22]:
#### model_name = pixdf_bal1000
pixdf1 = pixdf[(pixdf['PurePixel'] != 'No') | (pixdf['LC25_name'].str.contains('mix', na=False, case=False))]
pixdf2 = pixdf1[(pixdf['rand']>.84) | (pixdf['SampMethod'] != 'CAN - unverified in GE')]
pixdf3 = pixdf2[(pixdf2['rand']>.65) | (pixdf2['LC25_name'] != 'Mixed-VegEdge')]
pixdf4 = pixdf3[(pixdf3['rand']>.65) | (pixdf3['LC25_name'] != 'Crops-mix')]
pixdf5 = pixdf4[(pixdf4['rand']>.86) | (pixdf4['LC25_name'] != 'Mixed-path')]
pixdf6 = pixdf5[(pixdf5['rand']>.50) | (pixdf5['LC25_name'] != 'Crops-Yerba-Mate')]
pixdf7 = pixdf6[(pixdf6['rand']>.37) | (pixdf6['LC25_name'] != 'Grassland-Managed')]
pixdf8 = pixdf7[(pixdf7['rand']>.36) | (pixdf7['LC25_name'] != 'Trees-Forest')]
print('pixdf_bal0 sample breakdown by LC25 class:')
print(pixdf8['LC25_name'].value_counts())
pixdf_path = os.path.join(classification_params['local_dir'],'r_tests/pixdf_{}_base1000.csv'.format(new_feature_mod))
pd.DataFrame.to_csv(pixdf8, pixdf_path)pixdf3 = pixdf2[(pixdf2['rand']>.45) | (pixdf2['LC25_name'] != 'Mixed-VegEdge')]
pixdf4 = pixdf3[(pixdf3['rand']>.65) | (pixdf3['LC25_name'] != 'Crops-mix')]
pixdf5 = pixdf4[(pixdf4['rand']>.76) | (pixdf4['LC25_name'] != 'Mixed-path')]
pixdf6 = pixdf5[(pixdf5['rand']>.01) | (pixdf5['LC25_name'] != 'Crops-Yerba-Mate')]
pixdf7 = pixdf6[(pixdf6['rand']>.42) | (pixdf6['LC25_name'] != 'Grassland-Managed')]
pixdf8 = pixdf7[(pixdf7['rand']>.38) | (pixdf7['LC25_name'] != 'Trees-Forest')]
print('pixdf_bal0 sample breakdown by LC25 class:')
print(pixdf8['LC25_name'].value_counts())
pixdf_path = os.path.join(classification_params['local_model_dir'],'pixdf_base1000.csv')
pd.DataFrame.to_csv(pixdf5, pixdf_path)

pixdf_bal0 sample breakdown by LC25 class:
LC25_name
unknown              1659
Grassland-Managed    1108
Trees-Forest         1071
Grassland-Wet        1062
Grassland-Natural    1012
Crops-Soybeans       1006
Crops-mix            1006
Shrub                 778
TreePlant             735
Mixed-VegEdge         647
Trees-disturbed       597
Mixed-path            588
Grass_tree-mix        464
Crops-Rice            461
NoVeg_Bare            382
NoVeg_Water           370
Crops-Sugar           322
Crops-Banana          313
Crops-Orchard         289
NoVeg_Built           276
Cleared               266
Crops-Yerba-Mate      239
Trees-Forest_palm     213
Crops-Corn            200
Burnt-woody            74
Name: count, dtype: int64


In [13]:
### example ###########################
#pixdf = pixdf[(pixdf['rand']>.5) | (pixdf['LC17_name'] != 'Mixed-VegEdge')]]
#print(pixdf['LC17_name'].value_counts())

## if polygons are available, can combine pixel and polygon dfs and create rf datasets for points with polygons and those without:

In [14]:
poly_data = pd.read_csv(classification_params['samp_poly'])
#rename column names that also occur in pixel df
poly_data.rename(columns={'area':'areaSeg'}, inplace=True)
all_data = pixdf.merge(poly_data, left_on='OID_', right_on='OID_', how='left')
polypixdf_path = os.path.join(classification_params['model_dir'],'pts_polyData_joinCheck.csv')
pd.DataFrame.to_csv(all_data, polypixdf_path, sep=',', na_rep='NaN', index=True)

KeyError: 'samp_poly'

### first create dataset for points outside of polygons (here we have no variables to add to the original model)

In [15]:
outsideSeg = all_data[all_data['areaSeg'].isna()]
print(f'of the {all_data.shape[0]} sample points in our dataset, {outsideSeg.shape[0]} are outside of our segmented polygons')
print(outsideSeg['LC17_name'].value_counts())
out_dir = os.path.join(classification_params['model_dir'],'outside')
rfout17 = rf_model(outsideSeg,out_dir,'All','Permutation',29,'Fullsamp')

NameError: name 'all_data' is not defined

And for model with more balanced soy representation (25Soy):

In [16]:
all_data_25Soy = pixdf_25Soy.merge(poly_data, left_on='OID_', right_on='OID_', how='left')
outsideSeg_25Soy = all_data_25Soy[all_data_25Soy['areaSeg'].isna()]
print(outsideSeg_25Soy['LC17_name'].value_counts())
out_dir = os.path.join(classification_params['model_dir'],'outside')
rfout17_lessSoy = rf_model(outsideSeg_lessSoy,out_dir,'All','Permutation',29,'LessSoy')

NameError: name 'poly_data' is not defined

#### now create dataset for points inside of polygons (here we want to add some variables first)

In [17]:
#poly_data['AvgU'] = poly_data.apply(lambda x:count([x[c] for c in df.columns if c.endswith('U')]),axis=1)
#TODO: calculate these in pandas as above
poly_data.rename(columns={'areaSeg':'var_areaSeg','AVGU':'var_AVGU','AVGR':'var_AVGR','AVGSTD':'var_AVGSTD','MAXR':'var_MAXR','MINR':'var_MINR','STDU':'var_STDU','MINU':'var_MINU','MAXU':'var_MAXU','rU':'var_RU'}, inplace=True)
polyvars = pixdf.merge(poly_data, left_on='OID_', right_on='OID_', how='left')
withinSeg = polyvars[polyvars['var_areaSeg'] > 0]

print(withinSeg['LC17_name'].value_counts())
out_dir = os.path.join(classification_params['model_dir'],'within')
#rfin17 = rf_model...

NameError: name 'poly_data' is not defined

In [18]:
all_data_lessSoy = pixdf_lessSoy.merge(poly_data, left_on='OID_', right_on='OID_', how='left')
withinSeg_lessSoy = all_data_lessSoy[all_data_lessSoy['var_areaSeg']>0]
print(withinSeg_lessSoy['LC17_name'].value_counts())
out_dir = os.path.join(classification_params['model_dir'],'within')
#rfin17_lessSoy = rf_model(...

NameError: name 'poly_data' is not defined

## To save an html copy of this notebook with all outputs:
(these two cells should be last in notebook)

In [None]:
%%script echo skipping
### comment out above line and run this cell to print output as html

out_name = str(basic_config['country']+'6a_RandomFoest_VariableDataframe'+'_model'+str(classification_params['model_name'])+'_'+'Tests1')
!jupyter nbconvert --output-dir='./Outputs' --to html --no-input --ExecutePreprocessor.store_widget_state=True --output=$out_name 6b_RandomFoest_ModelComparisons.ipynb