Currently the train/test/validation data doesn't contain clay_perc.

In [3]:
from pathlib import Path
import pandas as pd
import json

from src.data_utils import train_val_test_split, get_outlier_iqr

In [4]:
# Location of the raw modelling table (machine-specific absolute path).
dpath = Path('/Users/campbelli/Documents/geofm-plant-traits/data')

# Read 'PercentCovered' as text so the string comparison below is applied
# uniformly to every row.
# NOTE(review): pandas warned about this read_csv call; if that was a
# mixed-dtype warning, a column can end up holding both numbers and strings,
# and a numeric 70 would not equal the string '70'. Confirm which column(s)
# the warning referred to.
df = pd.read_csv(
    dpath / 'Master_Table_Modelling_LOWCORRELATIONS_wB8A.csv',
    dtype={'PercentCovered': str},
)

# Select only the pixels with 70% of the basal area covered and only
# community-weighted means. Take an explicit copy so the assignment below
# modifies an independent frame rather than a slice of `df`.
is_cwm_at_70 = (df['PercentCovered'] == '70') & (df['Type'] == 'CWMean')
cwms = df[is_cwm_at_70].copy()

# Replace the whole column (rather than writing through `.loc[:, ...]`) so it
# takes the numeric dtype returned by `to_numeric`. `errors='raise'` makes any
# unparseable trait value fail loudly instead of becoming NaN.
cwms['TraitValue'] = pd.to_numeric(cwms['TraitValue'], errors='raise')

  df = pd.read_csv(dpath / 'Master_Table_Modelling_LOWCORRELATIONS_wB8A.csv')


In [5]:
# Predictor columns, grouped by source.
# Variables marked ** have the same value for all pixels in a plot.
# (Note carried over from the original cell: variables commented out of these
# lists did not seem to be included in the paper.)

# Reflectance band columns with a short description of each.
band_descriptions = {
    'B2_real': 'Blue band, 490 nm',
    'B3_real': 'Green band, 560 nm',
    'B4_real': 'Red band, 665 nm',
    'B5_real': 'Red edge band, 705 nm',
    'B6_real': 'Red edge band, 740 nm',
    'B7_real': 'Red edge band, 783 nm',
    'B8_real': 'NIR band, 842 nm',
    'B11_real': 'SWIR band, 1610 nm',
    'B12_real': 'SWIR band, 2190 nm',
}
bands = list(band_descriptions)

# Every spectral index appears three times: the plain column plus its
# '_Corr' and '_Entropy' variants.
#   MSAVI2 = (2 × NIR + 1 - √((2 × NIR + 1)² - 8 × (NIR - Red))) / 2
#   NDRE   = (NIR - RedEdge) / (NIR + RedEdge)
index_names = ('MCARI', 'MSAVI2', 'NDRE')
index_suffixes = ('', '_Corr', '_Entropy')
spectral_indices = [
    f'{index_name}{suffix}'
    for index_name in index_names
    for suffix in index_suffixes
]

# Climate columns with a short description of each.
climate_descriptions = {
    'CWD': 'Climate Water Deficit',
    'SoilMoist': 'Soil Moisture',
    'SRAD': 'Solar irradiance',
    'Tmax': 'Mean annual maximum temperature',
}
climate_vars = list(climate_descriptions)

# Terrain and soil columns with a short description of each.
soil_descriptions = {
    'Slope': 'Terrain slope',
    'sand_perc_plot': 'Sand content percentage',
    'ph_h20_plot': 'Soil pH in water',
    'clay_perc_plot': 'Clay content percentage **',
    'CEC_mmol_kg_plot': 'Cation exchange capacity **',
}
soil_vars = list(soil_descriptions)

# Full, ordered list of predictor columns written to the data files.
cols = bands + spectral_indices + climate_vars + soil_vars

In [6]:
# Find plots where there is no Sentinel-2 data: keep the rows with a missing
# value in any band column, then count the non-null band values per plot.
missing_band_rows = cwms.loc[cwms[bands].isna().any(axis='columns')]
null_plots = missing_band_rows.groupby('New_Plot')[bands].count()
null_plots.index

Index(['TRU041006', 'TRU041007', 'TRU041008', 'TRU041019', 'TRU041020',
       'TRU041021', 'TRU041022', 'TRU041031', 'TRU041032', 'TRU041033',
       'TRU041034', 'TRU041043', 'TRU041044', 'TRU041053', 'TRU041054',
       'TRU041061', 'TRU04993', 'TRU04994', 'TRU04995', 'TRU04996'],
      dtype='object', name='New_Plot')

In [7]:
# Keep only the rows whose plot is not among the plots without Sentinel-2 data.
plots_without_s2 = null_plots.index
cwms = cwms.loc[~cwms['New_Plot'].isin(plots_without_s2)]

In [8]:
# Count the non-null 'New_Plot' entries for each trait variable, largest first.
rows_per_variable = cwms.groupby('variable')['New_Plot'].count()
rows_per_variable.sort_values(ascending=False)

variable
Ca.Percent      1888
K.Percent       1888
Mg.Percent      1888
P.Percent       1887
Thickness.mm    1885
Area.cm2        1884
N.Percent       1884
SLA.g.m2        1884
WD              1876
Asat            1548
C.Percent       1537
Fresh.mass.g    1270
Dry.mass.g      1269
Amax            1267
Name: New_Plot, dtype: int64

In [None]:
# Split data into train, test and validation sets per variable (ignore the unequal amount of data).
train_split = 0.7
validation_split = 0.1
test_split = 0.2

# Only the train and validation fractions are passed to the splitter, so check
# that the declared test fraction is consistent with them.
if abs(train_split + validation_split + test_split - 1.0) > 1e-9:
    raise ValueError('train_split, validation_split and test_split must sum to 1.')

# Output root and one sub-directory per split. These are created up front so
# that `to_csv` does not fail when the directories do not exist yet.
opath = Path('..') / 'data'
for split_dir in ('train', 'validation', 'test'):
    (opath / split_dir).mkdir(parents=True, exist_ok=True)

# Outlier-filtered rows of every variable, concatenated once after the loop.
kept_frames = []

for variable in cwms['variable'].unique():
    # Get the pixels for the current variable
    pixels = cwms[cwms['variable'] == variable]

    # Drop outliers using IQR method, with conservative threshold of 4*IQR.
    # The rows to drop are identified through the index of the returned object.
    outliers = get_outlier_iqr(pixels['TraitValue'], zlim=4)
    print(f"Outliers for {variable}: {len(outliers)}")
    pixels = pixels.drop(outliers.index).set_index('New_Plot')

    # Recorded before the missing-value check, so variables that are skipped
    # below are still part of `cwms_no_outliers`.
    kept_frames.append(pixels.reset_index())

    if pixels.isna().any().any():
        print(f"Variable {variable} has missing values.")
        continue

    train_pixels, val_pixels, test_pixels = train_val_test_split(pixels, train_split, validation_split)

    # (sub-directory, file-name tag, rows) for each split.
    splits = (
        ('train', 'train', train_pixels),
        ('validation', 'val', val_pixels),
        ('test', 'test', test_pixels),
    )
    for split_dir, tag, split_pixels in splits:
        # Drop columns not included in the analysis and save data.
        split_pixels[cols].to_csv(opath / split_dir / f'{variable}_{tag}_data.csv')
        # Save labels.
        split_pixels['TraitValue'].to_csv(opath / split_dir / f'{variable}_{tag}_labels.csv')

# A single concat avoids re-copying the accumulated frame on every iteration
# and avoids concatenating with an empty DataFrame.
cwms_no_outliers = pd.concat(kept_frames, join='outer') if kept_frames else pd.DataFrame()

Outliers for SLA.g.m2: 37
Outliers for Thickness.mm: 16
Outliers for Mg.Percent: 0
Outliers for K.Percent: 10
Outliers for Ca.Percent: 27
Outliers for N.Percent: 4
Outliers for Dry.mass.g: 57
Outliers for Fresh.mass.g: 19
Outliers for WD: 0
Outliers for P.Percent: 2
Outliers for C.Percent: 0
Outliers for Area.cm2: 54
Outliers for Amax: 0
Outliers for Asat: 0


## Calculate mean and standard deviation of traits
These statistics will be used for evaluating data during model validation.

In [None]:
# Mean and standard deviation of 'TraitValue' for each trait variable,
# computed over the outlier-filtered rows and keyed by variable name.
trait_names = cwms_no_outliers['variable']
trait_values = cwms_no_outliers['TraitValue']

trait_stats = {
    trait: {
        'mean': trait_values[trait_names == trait].mean(),
        'std': trait_values[trait_names == trait].std(),
    }
    for trait in trait_names.unique()
}

In [21]:
# Directory that receives the per-trait statistics (machine-specific absolute path).
metadata_path = Path('/Users/campbelli/Documents/geofm-plant-traits/data/metadata')
# Create the directory if it is missing so the write below does not raise
# FileNotFoundError.
metadata_path.mkdir(parents=True, exist_ok=True)

# Write the statistics as JSON; the encoding is pinned so the file does not
# depend on the platform's default locale.
with open(metadata_path / 'trait_stats.json', 'w', encoding='utf-8') as f:
    json.dump(trait_stats, f)