In [12]:
#import packages
import geopandas as gpd, pandas as pd, os, numpy as np

# Section 1: Reading in the data and exploring using pandas and geopandas
## Read the Costa_Classification_Data_Cleaned2.csv file into a DataFrame

In [13]:
# Load the cleaned classification samples into a DataFrame.
# The leading ../ in the relative path moves out of the templates directory
# into the CostaRica-Notebook directory.
csv_path = '../Costa_Rica_Data/Costa_Classification_Data_Cleaned2.csv'
df = pd.read_csv(csv_path)

## Get basic information from the DataFrame

In [14]:
# Size of the table, printed as (number of rows, number of columns).
n_rows, n_cols = df.shape
print((n_rows, n_cols))

# Every column name in the DataFrame.
display(df.columns)


### Split out response and predictor columns

In [15]:
# Identifier columns.
# NOTE: `id` shadows the Python builtin of the same name; the name is kept
# because later cells index the DataFrame with it.
id = ['plotid', 'sampleid']

# Response (label) columns.
resp = [
    'Use', 'Cover', 'Vegetation', 'Herbaceous', 'Grass', 'Cultivation',
    'WetLand', 'Terrain', 'Water', 'Another Class', 'SAF',
]

# Predictor columns.
# NOTE(review): 'aspectYesn' and 'Yeslt_1mMed' look like a search-and-replace
# artefact in the column names - confirm they match the CSV header.
pred = [
    'BLUE', 'GREEN', 'NIR', 'RED', 'SWIR1', 'SWIR2',
    'altura2', 'aspect', 'aspectcos', 'aspectdeg', 'aspectYesn',
    'brightness', 'clay_1mMed', 'diff', 'elevation', 'evi',
    'fpar', 'hand30_100', 'lai', 'mTPI', 'ndvi',
    'ocs_1mMed', 'sand_1mMed', 'savi', 'Yeslt_1mMed',
    'slope', 'topDiv', 'wetness',
]

### Look at summary stats for predictor columns

In [16]:
# Work on the predictor columns only.
predictors = df[pred]

# Summary statistics for each predictor.
display(predictors.describe())

# Number of NAs in each predictor column.
display(predictors.isna().sum())


### Exercise 1: Explaining NAs
- Why does altura2 have so many NAs (hint: look at the altura2 data in geemap)? Can a different value be substituted for the NAs?
- Why do savi, wetness, diff, and brightness have 26 NAs?
- Why does topDiv have NAs?
- Why does NDVI have NAs?
- Why does mTPI have NAs?
- What would happen if we did not address the NAs?

### Addressing NA in predictors
Here we will make two assumptions:
- NAs in altura2 should be changed to zero
- rows that still contain an NA in any column will be removed

In [17]:
# Assumption 1: a missing altura2 value is replaced with zero.
# `df` itself is updated because later cells read altura2 from it.
df['altura2'] = df['altura2'].fillna(0)

# Assumption 2: every row that still contains an NA in any column is removed.
# dropna() returns a new DataFrame, so `df` keeps all of its rows.
df2 = df.dropna()

### Look at summary stats for response columns

In [18]:
# Work on the response columns of the NA-free table.
responses = df2[resp]

# Summary statistics for each response column.
display(responses.describe())

# Number of NAs in each response column.
display(responses.isna().sum())

### Exercise 2: Unique Categories
- What are the unique categories for each response column and their frequency of occurrence?
- How is Not_Applicable different from No Information?

### Look at summary stats for plot id columns

In [19]:
# Summary statistics and NA counts for the identifier columns.
id_frame = df2[id]
display(id_frame.describe())
display(id_frame.isna().sum())

# Number of records in each plotid.
df_gr = id_frame.groupby('plotid')
cnt_sampleid = df_gr.size()
display(np.unique(cnt_sampleid))  # distinct record counts found across plots
display(cnt_sampleid[cnt_sampleid > 9])  # plots with more than 9 records

# Inspect plot 1630 to decide which sampleid records to use. The rows come
# from df2, the same table the counts above were computed from, so what is
# shown here matches those counts.
display(df2.loc[df2['plotid'] == 1630])

### Exercise 3: Unique identifiers
- Which column represents the plot and the subplot?
- How many subplots are there per plot?
- Why are there some plots with 18 records?
- Which plots have 18 records?
- What should we do with those records?

### Fixing the duplicated sampleids
#### There are many ways to address the duplicated (QAQC) plots. In this example we are going to keep just the first record, in table order, for each plotid and sampleid combination.

In [20]:
# Group on both plotid and sampleid so every unique combination is kept,
# then take the first value of each column within the group.
sample_keys = ['plotid', 'sampleid']
df3 = df2.groupby(sample_keys).first()
df3

### Exercise 4: Addressing duplicate samples
- How many subplots were removed?
- Does the total number of records make sense?
- Select for the last value. Do you get the same number of records?
- Can you think of another way to remove duplicated records?


## Aggregating to the plot


In [21]:
# Aggregate from df3, the table with NA rows removed and one record per
# plotid/sampleid pair. Aggregating the raw `df` would ignore both cleaning
# steps and count duplicated samples twice.
# reset_index() turns the plotid/sampleid index of df3 back into columns.
samples = df3.reset_index()

# Proportion (0-1) of samples in each Use class, per plot.
UseResp = pd.crosstab(samples['plotid'], samples['Use'], normalize='index')

# Mean and standard deviation of every predictor, per plot.
pred_agg = samples[['plotid'] + pred].groupby('plotid').agg(['mean', 'std'])

# Flatten the two-level (predictor, statistic) column index into single names
# such as "BLUE_mean" so the DataFrames can be merged.
pred_agg.columns = ["_".join(a) for a in pred_agg.columns.to_flat_index()]


### Merge the percent use columns with the mean and standard deviation of aggregated subplots to create a cleaned data frame

In [22]:
# Join the per-plot Use proportions with the per-plot predictor statistics,
# matching on plotid.
clean_df = UseResp.merge(pred_agg, on='plotid')
clean_df

### Exercise 5: Summarizing plots
- For each response variable create a clean percent category data frame with summarized mean, standard deviation, min, and max values. How many predictor variables do you have? 

## Save out the plot and plot_subplot data

In [None]:
# Output locations for the plot-level and the plot/subplot-level tables.
outpath1 = 'plot_data.csv'
outpath2 = 'plot_subplot_data.csv'

# Write each table only when its file does not exist yet; an existing file is
# left untouched, so delete it first to regenerate the output.
# NOTE(review): the subplot file is written from df2, which still holds the
# duplicated samples - confirm that df3 is not what was intended.
for out_path, frame in ((outpath1, clean_df), (outpath2, df2)):
    if not os.path.exists(out_path):
        frame.to_csv(out_path)