In [None]:
import os
import sys
from pathlib import Path
import pandas as pd

%matplotlib inline

In [None]:
# Put the LUCinSA_helpers directory (resolved relative to the current working
# directory) on the import path so the helper module below can be found.
sys.path.append(r"../LUCinSA_helpers")
# Star import: every public name of rf1_create_model lands in the notebook namespace.
# NOTE(review): MulticlassRF, PrepTestTrain and get_holdout_scores are called later
# without being defined in this notebook, so they presumably come from here -- confirm,
# and consider importing them by name.
from rf1_create_model import *

In [None]:
# Class lookup table, without its free-text 'Description' column.
# Later cells merge against the name `LUT` (upper case), while this cell originally
# bound only `lut`; bind both so those merges do not depend on a name leaking in
# from somewhere else.
LUT = pd.read_csv('../Class_LUT.csv').drop(columns=['Description'])
lut = LUT  # original lower-case name, kept for any cell that still uses it
print(LUT.sort_values('LC22'))

#### Start with default models using pixel-only data:

In [None]:
# Read the pixel-level sample table and discard its own LC17 / LC2 columns
# before joining the class lookup table.
pix_data = pd.read_csv(
    'D:/NasaProject/Paraguay/sampling/samplePts_FINALdfs/RFdf.csv'
).drop(columns=['LC17', 'LC2'])

# Left join: every sample row is kept, lookup columns are attached where
# the sample's 'Class' equals the lookup's 'USE_NAME'.
pixdf = pd.merge(pix_data, LUT, how='left', left_on='Class', right_on='USE_NAME')

print('sample breakdown by LC17 class:')
print(pixdf['LC17_name'].value_counts())

print('default rf model with all sample pixels and pixel only data')
# NOTE(review): in_dir is assigned but not used anywhere in this cell.
in_dir = 'D:/NasaProject/Paraguay/ClassificationModels/RF'
out_dir = 'D:/NasaProject/Paraguay/ClassificationModels/RF'
# NOTE(review): this call passes 'crop_nocrop' where the other model cells pass 'All' -- confirm intent.
rfout17 = MulticlassRF(
    pixdf,
    out_dir,
    'crop_nocrop',
    'Permutation',
    29,
)

In [None]:
## Thin out the soy points because they are far overrepresented.
# A row survives when its 'rand' value is above .9 OR its 'SampMethod' is anything
# other than 'CAN - unverified in GE'.
# NOTE(review): the original note said 3/4 of the soy points are removed, but the
# cutoff used is .9 -- confirm which fraction is intended.
high_rand = pixdf['rand'].gt(.9)
other_method = pixdf['SampMethod'].ne('CAN - unverified in GE')
pixdf_lessSoy = pixdf[high_rand | other_method]

print('New sample breakdown by LC17 class:')
print(pixdf_lessSoy['LC17_name'].value_counts())

out_dir = 'D:/NasaProject/Paraguay/ClassificationModels/RF/LessSoy'
rfout17 = MulticlassRF(
    pixdf_lessSoy,
    out_dir,
    'All',
    'Permutation',
    29,
)

#### Combine the pixel and polygon dataframes, then create RF datasets for points inside polygons and for those outside:

In [None]:
# Polygon (segment) attributes per sample point. The 'area' column is renamed to
# 'areaSeg' because the pixel dataframe has a column with the same name.
poly_data = pd.read_csv(
    'D:/NasaProject/Paraguay/ClassificationModels/RF/pts_polyData.csv'
).rename(columns={'area': 'areaSeg'})

# Left join on the shared 'OID_' key: all pixel rows are kept, polygon columns are
# NaN for points that have no matching polygon record.
all_data = pixdf.merge(poly_data, on='OID_', how='left')

# Dump the joined table (index included) so the join can be checked by hand.
all_data.to_csv(
    'D:/NasaProject/Paraguay/ClassificationModels/RF/pts_polyData_joinCheck.csv',
    sep=',',
    na_rep='NaN',
    index=True,
)

##### First, create the dataset for points outside of polygons (here we have no variables to add to the original model)

In [None]:
# Points with no polygon match have a missing 'areaSeg' after the left join.
no_segment = all_data['areaSeg'].isna()
outsideSeg = all_data.loc[no_segment]

print(f'of the {all_data.shape[0]} sample points in our dataset, {outsideSeg.shape[0]} are outside of our segmented polygons')
print(outsideSeg['LC17_name'].value_counts())

out_dir = 'D:/NasaProject/Paraguay/ClassificationModels/RF/outside'
rfout17 = MulticlassRF(
    outsideSeg,
    out_dir,
    'All',
    'Permutation',
    29,
)

Then fit the same model with a more balanced soy representation (LessSoy):

In [None]:
# Same outside-polygon selection, starting from the reduced-soy sample.
all_data_lessSoy = pd.merge(pixdf_lessSoy, poly_data, how='left', on='OID_')
no_segment_lessSoy = all_data_lessSoy['areaSeg'].isna()
outsideSeg_lessSoy = all_data_lessSoy.loc[no_segment_lessSoy]

print(outsideSeg_lessSoy['LC17_name'].value_counts())

out_dir = 'D:/NasaProject/Paraguay/ClassificationModels/RF/outside/LessSoy'
rfout17_lessSoy = MulticlassRF(
    outsideSeg_lessSoy,
    out_dir,
    'All',
    'Permutation',
    29,
)

##### Now create the dataset for points inside of polygons (here we want to add some variables first)

In [None]:
#poly_data['AvgU'] = poly_data.apply(lambda x:count([x[c] for c in df.columns if c.endswith('U')]),axis=1)
#TODO: calculate these in pandas as above

# Prefix the polygon attribute columns with 'var_'.
# After this cell `poly_data` no longer has an 'areaSeg' column (it becomes
# 'var_areaSeg'), so cells that filter on 'areaSeg' must run before this one.
seg_var_names = {
    'areaSeg': 'var_areaSeg',
    'AVGU': 'var_AVGU',
    'AVGR': 'var_AVGR',
    'AVGSTD': 'var_AVGSTD',
    'MAXR': 'var_MAXR',
    'MINR': 'var_MINR',
    'STDU': 'var_STDU',
    'MINU': 'var_MINU',
    'MAXU': 'var_MAXU',
    'rU': 'var_RU',
}
poly_data = poly_data.rename(columns=seg_var_names)

# Left join on 'OID_', then keep only rows with a strictly positive segment area
# (rows whose area is missing fail the comparison and are dropped).
polyvars = pd.merge(pixdf, poly_data, how='left', on='OID_')
has_segment = polyvars['var_areaSeg'] > 0
withinSeg = polyvars.loc[has_segment]

print(withinSeg['LC17_name'].value_counts())
out_dir = 'D:/NasaProject/Paraguay/ClassificationModels/RF/within'
rfin17 = MulticlassRF(
    withinSeg,
    out_dir,
    'All',
    'Permutation',
    29,
)

In [None]:
# Same within-polygon selection, starting from the reduced-soy sample.
# This rebinds `all_data_lessSoy`, now joined against the 'var_'-prefixed polygon columns.
all_data_lessSoy = pd.merge(pixdf_lessSoy, poly_data, how='left', on='OID_')
has_segment_lessSoy = all_data_lessSoy['var_areaSeg'] > 0
withinSeg_lessSoy = all_data_lessSoy.loc[has_segment_lessSoy]

print(withinSeg_lessSoy['LC17_name'].value_counts())

out_dir = 'D:/NasaProject/Paraguay/ClassificationModels/RF/within/LessSoy'
rfin17_lessSoy = MulticlassRF(
    withinSeg_lessSoy,
    out_dir,
    'All',
    'Permutation',
    29,
)

In [None]:
# Build the train / holdout inputs from the augmented sample table.
# Alternative input (disabled):
#pix_data = pd.read_csv('D:/NasaProject/Paraguay/sampling/samplePts_FINALdfs/RFdf.csv')
pix_data = pd.read_csv(
    'D:/NasaProject/Paraguay/sampling/samplePts_FINALdfs/RFdf_augmented.csv'
)
# Optional filter (disabled):
#pix_data = pix_data[pix_data['numPix'] < 50]

# Optional polygon-column renames (disabled):
#poly_data.rename(columns={'Seg1Dist':'var_Seg1Dist','Seg2Edge':'var_Seg2Edge','Seg3Prob':'var_Seg2Prob'},inplace=True)
#poly_data.rename(columns={'Seg1Dist':'var_Seg1Dist','Seg2Edge':'var_Seg2Edge','Seg3Prob':'var_Seg2Prob','areaSeg':'var_areaSeg','AVGU':'var_AVGU','AVGR':'var_AVGR','AVGSTD':'var_AVGSTD','MAXR':'var_MAXR','MINR':'var_MINR','STDU':'var_STDU','MINU':'var_MINU','MAXU':'var_MAXU','rU':'var_RU', 'DISTS':'var_DISTS'}, inplace=True)

# Join polygon attributes through the augmented table's 'OID_orig' key.
# This reuses `poly_data` exactly as earlier cells left it and rebinds `all_data`.
#all_data = pix_data.merge(poly_data, left_on='OID_', right_on='OID_', how='left')
all_data = pd.merge(pix_data, poly_data, how='left', left_on='OID_orig', right_on='OID_')
#all_data.drop(['LC17','LC2'], axis=1, inplace=True)

# The frame handed to PrepTestTrain is built from `pix_data`, not `all_data`,
# so the polygon columns joined above are not part of it.
df_in = pd.merge(pix_data, LUT, how='left', left_on='Class', right_on='USE_NAME')
#df_in = all_data.merge(LUT, left_on='Class', right_on='USE_NAME', how='left')
#withinSeg = df_in[df_in['var_areaSeg'] > 0]

# Holdout base-table export (disabled):
#HO_base0 = df_in[['OID_','areaSeg','Class','LC17','TESTSET20']]
#HO_base = HO_base0[(HO_base0['TESTSET20'] ==1)]
#print(HO_base)
#pd.DataFrame.to_csv(HO_base,'D:/NasaProject/Paraguay/ClassificationModels/RF/HO_base.csv', sep=',', na_rep='NaN', index=False)

out_dir = 'D:/NasaProject/Paraguay/ClassificationModels/RF'
PrepTestTrain(df_in, out_dir, 'All')

In [None]:
# Fit the model from the training CSV on disk (a file path is passed here,
# whereas the earlier model cells pass a dataframe).
out_dir = 'D:/NasaProject/Paraguay/ClassificationModels/RF'
trainfeatures = 'D:/NasaProject/Paraguay/ClassificationModels/RF/All_TRAINING.csv'
RF = MulticlassRF(
    trainfeatures,
    out_dir,
    'LC17',
    None,
    29,
)

In [None]:
# Score the holdout set with the first element of the MulticlassRF result.
# NOTE(review): the holdout CSV is read from an 'RFSets' subfolder while the training
# CSV is read from the RF folder itself -- confirm both paths match what PrepTestTrain writes.
holdoutDF = 'D:/NasaProject/Paraguay/ClassificationModels/RF/RFSets/All_HOLDOUT.csv'
HO = get_holdout_scores(
    holdoutDF,
    RF[0],
    out_dir,  # whatever `out_dir` was last bound to (the training cell sets it)
)

In [None]:
# Show the holdout results returned by get_holdout_scores as plain text.
print(HO)

In [None]:
# Cast observed and predicted class labels to 64-bit integers.
# `.astype('int64')` is the vectorized equivalent of applying `np.int64` per element,
# and it does not rely on `np`, which this notebook never imports itself.
HO['obs'] = HO['label'].astype('int64')
HO['pred'] = HO['pred'].astype('int64')

# Keep only the columns needed for the exported prediction table.
HO_clean = HO[['obs', 'pred', 'OID']]
print(HO_clean)

In [None]:
# Write the cleaned holdout predictions to disk (no index column, missing values as 'NaN').
#HOfinal = HO_clean.merge(HO_base, left_on='OID', right_on='OID_', how='left')
HO_clean.to_csv(
    'D:/NasaProject/Paraguay/ClassificationModels/RF/HO_Preds_8.csv',
    sep=',',
    na_rep='NaN',
    index=False,
)
