# Set up the environment

Before running the notebook, please install the necessary packages and environment by running the following shell commands in your terminal:

```bash
# Create the conda environment from the provided environment file
conda env create -f ../conda_env_pkgs.yml -n soc_model_env

# Activate the new environment
conda activate soc_model_env

# Launch Jupyter Notebook from within the environment
jupyter notebook


In [None]:
import json
import ee
import geemap
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import RFE
from sklearn.metrics import mean_squared_error, r2_score

from pprint import pprint as pp
import tabulate

# Authenticate and Initialize Earth Engine
ee.Authenticate()
ee.Initialize(project= "ee-christopherharrellgis")

# Optional: Display map
#Map = geemap.Map(basemap = "SATELLITE")
Map = geemap.Map()
#Map

In [201]:
# Global variables
start_year = 2023
end_year = 2024
month_start = 3
month_end = 2
max_cloud_cover = 20

#### Import SOC Samples and Study Area as Feature Collections

In [202]:
# Create a FeatureCollection from SOC_samples.csv
def create_feature_collection(df):
    features = []
    for idx, row in df.iterrows():
        point = ee.Geometry.Point([row['longitude'], row['latitude']])
        feature = ee.Feature(point, {
            'plot_no': row['plot_no'],
            'MgC_per_ha': row['MgC_per_ha'],
            'MgC_SE': row['MgC_SE']
        })
        features.append(feature)
    return ee.FeatureCollection(features)

# Load the study area GeoJSON file
study_area_geojson = '../soc/data/study_area.geojson'

# Load the SOC samples table
soc_samples_df = pd.read_csv("../soc/data/SOC_samples.csv")

with open(study_area_geojson) as f:
    geojson_data = json.load(f)

# Convert the study area to an Earth Engine FeatureCollection
study_area = ee.FeatureCollection(geojson_data)

# Convert the SOC samples table to an Earth Engine FeatureCollection
soc_samples_points = create_feature_collection(soc_samples_df)

Map.addLayer(study_area, {'color': 'red'}, 'Study Area', False)
Map.addLayer(soc_samples_points, {'color': 'yellow'}, 'Sample Points', False)
Map.centerObject(study_area, zoom=11)

In [203]:
""" wdpa = ee.FeatureCollection('WCMC/WDPA/current/polygons')

# Filter WDPA features that spatially intersect with your study area
overlapping_pas = wdpa.filter(ee.Filter.intersects('.geo', soc_samples_points.geometry()))

# Get the list of names
protected_areas_with_names = overlapping_pas.filter(ee.Filter.notNull(['NAME']))

# Get the names as a list
protected_area_names = protected_areas_with_names.aggregate_array('NAME')

# Print the names of protected areas that contain sample points
print('Protected Areas containing sample points:')
print(protected_area_names.getInfo())

Map.addLayer(overlapping_pas, {'color': 'blue'}, 'PAS') """

" wdpa = ee.FeatureCollection('WCMC/WDPA/current/polygons')\n\n# Filter WDPA features that spatially intersect with your study area\noverlapping_pas = wdpa.filter(ee.Filter.intersects('.geo', soc_samples_points.geometry()))\n\n# Get the list of names\nprotected_areas_with_names = overlapping_pas.filter(ee.Filter.notNull(['NAME']))\n\n# Get the names as a list\nprotected_area_names = protected_areas_with_names.aggregate_array('NAME')\n\n# Print the names of protected areas that contain sample points\nprint('Protected Areas containing sample points:')\nprint(protected_area_names.getInfo())\n\nMap.addLayer(overlapping_pas, {'color': 'blue'}, 'PAS') "

#### Generate an Image for each environmental covariate
- Elevation (Copernicus DEM)
- Slope (Copernicus DEM)
- Mean Annual Precipitation (MAP) (TerraClimate)
- Mean Annual Temperature (MAT) (TerraClimate)
- Mean NDVI (Sentinel-2 SR)
- Mean EVI (Sentinel-2 SR)
- ESA Landcover Classification (ESA WorldCover)

In [257]:

# COPERNICUS DEM (30m)
dem = ee.ImageCollection('COPERNICUS/DEM/GLO30').mosaic().select('DEM')

elevation = dem.reproject(crs='EPSG:4326', scale=10)
slope = ee.Terrain.slope(dem)

# Visualization parameters for Elevation
vis_params_elevation = {
    'min': 0,
    'max': 3000,
    'palette': ['#00FFFF', '#0000FF', '#008000', '#FFFF00', '#FF0000', '#800000']
}

# Visualization parameters for slope
vis_params_slope = {
    'min': 0,
    'max': 60,
    'palette': ['#00FFFF', '#0000FF', '#008000', '#FFFF00', '#FF0000', '#800000']
}

Map.addLayer(elevation.clip(study_area), vis_params_elevation, "elevation", False)
Map.addLayer(slope.clip(study_area), vis_params_slope, "slope", False)


In [None]:
def config_TC_bands(img):
    tmmn_band = img.select('tmmn').multiply(0.1)
    tmmx_band = img.select('tmmx').multiply(0.1)
    precip_band = img.select('pr')
    
    bands = precip_band.addBands([tmmn_band, tmmx_band], overwrite=True)
    return bands.copyProperties(img, img.propertyNames())

def calc_map(img):
    tavg = img.expression(
        '(tmmx + tmmn) / 2',
        {
            'tmmx': img.select('tmmx'),
            'tmmn': img.select('tmmn')
        }
    ).rename('tavg')
    return tavg.copyProperties(img, img.propertyNames())

imgCol_TC = ee.ImageCollection('IDAHO_EPSCOR/TERRACLIMATE') \
    .filter(ee.Filter.calendarRange(start_year, end_year, 'year')) \
    .filter(ee.Filter.calendarRange(month_start, month_end, 'month')) \
    .filterBounds(study_area) \
    .map(config_TC_bands)

terraclimate_temp = imgCol_TC.select(['tmmn', 'tmmx'])
terraclimate_precip = imgCol_TC.select(['pr'])

# Averaged monthly mean temperature
monthly_avg_temp = terraclimate_temp.map(calc_map).mean()

# Averaged monthly standard deviation of temperature
monthly_avg_temp_stddev = terraclimate_temp.reduce(ee.Reducer.stdDev()).rename(['std_tmmn', 'std_tmmx'])

# Averaged monthly mean of precipitation
monthly_avg_precip = terraclimate_precip.mean()

# Averaged monthly standard deviation of precipitation
monthly_avg_precip_stddev = terraclimate_precip.reduce(ee.Reducer.stdDev()).rename(['std_pr'])

avg_temp_vis_params = {
    'min': -10,  
    'max': 40,
    'palette': ['blue', 'cyan', 'green', 'yellow', 'orange', 'red', 'darkred']
}

stddev_temp_vis_params = {
    'min': 0,  
    'max': 15,
    'palette': ['white', 'lightyellow', 'yellow', 'orange', 'red', 'darkred']
}

avg_precip_vis_params = {
    'min': 0,
    'max': 2500,  
    'palette': ['lightblue', 'blue', 'darkblue', 'purple', 'darkred']
}

stddev_precip_vis_params = {
    'min': 0,
    'max': 500,
    'palette': ['white', 'lightgreen', 'green', 'yellow', 'orange', 'red']
}

Map.addLayer(monthly_avg_temp.clip(study_area), avg_temp_vis_params, 'temp_avg', False)

Map.addLayer(monthly_avg_temp_stddev.select('std_tmmn').clip(study_area), stddev_temp_vis_params, 'tmmn_stdev', False)
Map.addLayer(monthly_avg_temp_stddev.select('std_tmmx').clip(study_area), stddev_temp_vis_params, 'tmmx_stdev', False)

Map.addLayer(monthly_avg_precip.clip(study_area), avg_precip_vis_params, 'precip_avg', False)
Map.addLayer(monthly_avg_precip_stddev.clip(study_area), stddev_precip_vis_params, 'precip_stdev', False)


""" print(monthly_avg_temp.bandNames().getInfo())  # Check for 'monthly_avg_temp_stddev'
print(monthly_avg_temp_stddev.bandNames().getInfo())  # Check for 'monthly_avg_temp_stddev'
print(monthly_avg_precip.bandNames().getInfo())  # Check for 'monthly_avg_temp_stddev'
print(monthly_avg_precip_stddev.bandNames().getInfo())  # Check for 'monthly_avg_temp_stddev' """


In [237]:
def config_s2_bands(img):
    bands = img.select(['B2', 'B4', 'B8'])
    renamed_bands = bands.rename(['B', 'R', 'NIR'])
    return renamed_bands.copyProperties(img, img.propertyNames())

def mask_s2_clouds(image):
    """Masks clouds and cirrus based on SCL band."""
    scl = image.select(['SCL'])
    cloudShadow = scl.eq(3)
    cloudsLow = scl.eq(7)
    cloudsMed = scl.eq(8)
    cloudsHigh = scl.eq(9)
    cirrus = scl.eq(10)
    mask = (cloudShadow.Or(cloudsLow).Or(cloudsMed).Or(cloudsHigh).Or(cirrus).Not())
    return image.updateMask(mask).divide(10000).copyProperties(image, image.propertyNames())

imgCol_S2_SR = ee.ImageCollection('COPERNICUS/S2_SR') \
    .filter(ee.Filter.calendarRange(start_year, end_year, 'year'))\
    .filter(ee.Filter.calendarRange(month_start, month_end, 'month'))\
    .filter(ee.Filter.lt('CLOUDY_PIXEL_PERCENTAGE', 20)) \
    .filterBounds(study_area) \
    .map(mask_s2_clouds) \
    .map(config_s2_bands)
    
def calc_ndvi(img):
    ndvi = img.normalizedDifference(['NIR', 'R']).rename('NDVI')
    return ndvi.copyProperties(img, img.propertyNames())

def calc_evi(img):
    nir = img.select('NIR')
    red = img.select('R')
    blue = img.select('B')
    
    numerator = nir.subtract(red)
    denominator = nir.add(red.multiply(6)).subtract(blue.multiply(7.5)).add(1)
    
    evi = numerator.divide(denominator).multiply(2.5).rename('EVI')

    return evi.copyProperties(img, img.propertyNames())


mean_ndvi = imgCol_S2_SR.map(calc_ndvi).mean()
mean_evi = imgCol_S2_SR.map(calc_evi).mean()

# EVI visualization
evi_vis = {
    'min': 0.0,
    'max': 1.0,
    'palette': ['purple', 'white', 'green']
}

# NDVI visualization
ndvi_vis = {
    'min': 0.0,
    'max': 1.0,
    'palette': ['blue', 'white', 'green']
}

#NDVI_max:0.6291760206222534
#NDVI_min:-0.10195349901914597
""" ndvi_stats = mean_ndvi.reduceRegion(
    reducer=ee.Reducer.minMax(),
    geometry=study_area,  
    scale=30, 
    maxPixels=1e13  
) """

Map.addLayer(mean_ndvi.clip(study_area), ndvi_vis, 'NDVI', False)
Map.addLayer(mean_evi.clip(study_area), evi_vis, 'EVI', False)



In [239]:
land_cover = ee.ImageCollection('ESA/WorldCover/v200').first()

lc_vis = {
    'bands': ['Map']
}

Map.addLayer(land_cover.clip(study_area), lc_vis, 'Land Cover', False)

In [220]:
Map

Map(bottom=523855.0, center=[0.25165476947811843, 37.258395286565445], controls=(WidgetControl(options=['posit…

In [None]:
# This script was adapted from an Open-Source Github repo here: https://github.com/leonsnill/lst_landsat/blob/master/lst_landsat.py
# Global variables
#start_year = 2023
#end_year = 2024
#month_start = 3
#month_end = 2
#max_cloud_cover = 20
t_threshold = 0

# Algorithm Specifications
# min/max ndvi
ndvi_v = 0.63
ndvi_s = -0.1

# Veg, soil, water emissivity
epsilon_v = 0.985
epsilon_s = 0.96
epsilon_w = 0.99

# Coefficients for Landsat 8
cs_l8 = [0.04019, 0.02916, 1.01523,
         -0.38333, -1.50294, 0.20324,
         0.00918, 1.36072, -0.27514]

def config_l8_bands(img):
    bands = ['SR_B2', 'SR_B3', 'SR_B4', 'SR_B5', 'SR_B6', 'SR_B7']
    thermal_band = ['ST_B10']
    new_bands = ['B', 'G', 'R', 'NIR', 'SWIR1', 'SWIR2']
    new_thermal_bands = ['TIR']
    vnirswir = img.select(bands).multiply(0.0001).rename(new_bands)
    tir = img.select(thermal_band).multiply(0.1).rename(new_thermal_bands)
    return vnirswir.addBands(tir).copyProperties(img, ['system:time_start'])

# Cloud mask for Surface Reflectance products
def mask_l8_clouds(img):
    cloudShadowBitMask = ee.Number(2).pow(3).int()
    cloudsBitMask = ee.Number(2).pow(5).int()
    qa = img.select('QA_PIXEL')
    mask = qa.bitwiseAnd(cloudShadowBitMask).eq(0).And(
           qa.bitwiseAnd(cloudsBitMask).eq(0))
    return img.updateMask(mask)

# Radiometric Calibration
def fun_radcal(img):
    radiance = ee.Algorithms.Landsat.calibratedRadiance(img).rename('RADIANCE')
    return img.addBands(radiance)

# L to ee.Image
def fun_l_addband(img):
    l = ee.Image(img.get('L')).select('RADIANCE').rename('L')
    return img.addBands(l)

# NDVI - required for emissivity calculation
def fun_ndvi(img):
    ndvi = img.normalizedDifference(['NIR', 'R']).rename('NDVI')
    return img.addBands(ndvi)

# FVC (Fraction Vegetation Cover) - required for emissivity calculation
def fun_fvc(img):
    fvc = img.expression(
        '((NDVI-NDVI_s)/(NDVI_v-NDVI_s))**2',
        {
            'NDVI': img.select('NDVI'),
            'NDVI_s': ndvi_s,
            'NDVI_v': ndvi_v
        }
    ).rename('FVC')
    return img.addBands(fvc)

# Scale Emissivity - required for LST calculation
def fun_epsilon_scale(img):
    epsilon_scale = img.expression(
        'epsilon_s+(epsilon_v-epsilon_s)*FVC',
        {
            'FVC': img.select('FVC'),
            'epsilon_s': epsilon_s,
            'epsilon_v': epsilon_v
        }
    ).rename('EPSILON_SCALE')
    return img.addBands(epsilon_scale)

# Emissivity (Epsilon) - required for LST calculation
def fun_epsilon(img):
    pseudo = img.select(['NDVI']).set('system:time_start', img.get('system:time_start'))
    epsilon = pseudo.where(img.expression('NDVI > NDVI_v',
                                         {'NDVI': img.select('NDVI'),
                                          'NDVI_v': ndvi_v}), epsilon_v)
    epsilon = epsilon.where(img.expression('NDVI < NDVI_s && NDVI >= 0',
                                          {'NDVI': img.select('NDVI'),
                                           'NDVI_s': ndvi_s}), epsilon_s)
    epsilon = epsilon.where(img.expression('NDVI < 0',
                                          {'NDVI': img.select('NDVI')}), epsilon_w)
    epsilon = epsilon.where(img.expression('NDVI <= NDVI_v && NDVI >= NDVI_s',
                                          {'NDVI': img.select('NDVI'),
                                           'NDVI_v': ndvi_v,
                                           'NDVI_s': ndvi_s}), img.select('EPSILON_SCALE')).rename('EPSILON')
    return img.addBands(epsilon)

# Scale WV content
def fun_wv_scale(img):
    wv_scaled = ee.Image(img.get('WV')).multiply(0.1).rename('WV_SCALED')
    wv_scaled = wv_scaled.resample('bilinear')
    return img.addBands(wv_scaled)

# Atmospheric Functions - required for LST calculation
def fun_af1(img):
    af1 = img.expression(
        '('+str(cs_l8[0])+'*(WV**2))+('+str(cs_l8[1])+'*WV)+('+str(cs_l8[2])+')',
        {
            'WV': img.select('WV_SCALED')
        }
    ).rename('AF1')
    return img.addBands(af1)

def fun_af2(img):
    af2 = img.expression(
        '('+str(cs_l8[3])+'*(WV**2))+('+str(cs_l8[4])+'*WV)+('+str(cs_l8[5])+')',
        {
            'WV': img.select('WV_SCALED')
        }
    ).rename('AF2')
    return img.addBands(af2)

def fun_af3(img):
    af3 = img.expression(
        '('+str(cs_l8[6])+'*(WV**2))+('+str(cs_l8[7])+'*WV)+('+str(cs_l8[8])+')',
        {
            'WV': img.select('WV_SCALED')
        }
    ).rename('AF3')
    return img.addBands(af3)

# Gamma Function - required for LST calculation
def fun_gamma(img):
    gamma = img.expression('(BT**2)/(1324*L)',
                          {'BT': img.select('TIR'),
                           'L': img.select('L')
                          }).rename('GAMMA')
    return img.addBands(gamma)

# Delta Function - required for LST calculation
def fun_delta(img):
    delta = img.expression('BT-((BT**2)/1324)',
                          {'BT': img.select('TIR')
                          }).rename('DELTA')
    return img.addBands(delta)

# Land Surface Temperature calculation
def fun_lst(img):
    lst = img.expression(
        '(GAMMA*(((1/EPSILON)*(AF1*L+AF2))+AF3)+DELTA)-273.15',
        {
            'GAMMA': img.select('GAMMA'),
            'DELTA': img.select('DELTA'),
            'EPSILON': img.select('EPSILON'),
            'AF1': img.select('AF1'),
            'AF2': img.select('AF2'),
            'AF3': img.select('AF3'),
            'L': img.select('L')
        }
    ).rename('LST')
    return img.addBands(lst)

def fun_mask_lst(img):
    mask = img.select('LST').gt(t_threshold)
    return img.updateMask(mask)

# Create maxDifference-filter to match TOA and SR products
maxDiffFilter = ee.Filter.maxDifference(
    difference=2 * 24 * 60 * 60 * 1000,
    leftField='system:time_start',
    rightField='system:time_start'
)

# Define joins
join_wv = ee.Join.saveBest(
    matchKey='WV',
    measureKey='timeDiff'
)

join_l = ee.Join.saveBest(
    matchKey='L',
    measureKey='timeDiff'
)

# Landsat 8 OLI-TIRS
imgCol_L8_TOA = ee.ImageCollection('LANDSAT/LC08/C02/T1_TOA')\
    .filterBounds(study_area)\
    .filter(ee.Filter.calendarRange(start_year, end_year, 'year'))\
    .filter(ee.Filter.calendarRange(month_start, month_end, 'month'))\
    .filter(ee.Filter.lt('CLOUD_COVER_LAND', max_cloud_cover))\
    .select(['B10']) # Thermal Infared 1

imgCol_L8_SR = ee.ImageCollection('LANDSAT/LC08/C02/T1_L2')\
    .filterBounds(study_area)\
    .filter(ee.Filter.calendarRange(start_year, end_year, 'year'))\
    .filter(ee.Filter.calendarRange(month_start, month_end, 'month'))\
    .filter(ee.Filter.lt('CLOUD_COVER_LAND', max_cloud_cover))\
    .map(mask_l8_clouds)

imgCol_L8_SR = imgCol_L8_SR.map(config_l8_bands)

# NCEP/NCAR Water Vapor Product
imgCol_WV = ee.ImageCollection('NCEP_RE/surface_wv')\
    .filterBounds(study_area)\
    .filter(ee.Filter.calendarRange(start_year, end_year, 'year'))\
    .filter(ee.Filter.calendarRange(month_start, month_end, 'month'))

# TOA (Radiance) and SR
imgCol_L8_TOA = imgCol_L8_TOA.map(fun_radcal)
imgCol_L8_SR = ee.ImageCollection(join_l.apply(imgCol_L8_SR, imgCol_L8_TOA, maxDiffFilter))
imgCol_L8_SR = imgCol_L8_SR.map(fun_l_addband)

# Water Vapor
imgCol_L8_SR = ee.ImageCollection(join_wv.apply(imgCol_L8_SR, imgCol_WV, maxDiffFilter))
imgCol_L8_SR = imgCol_L8_SR.map(fun_wv_scale)

# Atmospheric Functions
imgCol_L8_SR = imgCol_L8_SR.map(fun_af1)
imgCol_L8_SR = imgCol_L8_SR.map(fun_af2)
imgCol_L8_SR = imgCol_L8_SR.map(fun_af3)

# Delta and Gamma Functions
imgCol_L8_SR = imgCol_L8_SR.map(fun_delta)
imgCol_L8_SR = imgCol_L8_SR.map(fun_gamma)

# Parameters and Indices
imgCol_L8_SR = imgCol_L8_SR.map(fun_ndvi)
imgCol_L8_SR = imgCol_L8_SR.map(fun_fvc)
imgCol_L8_SR = imgCol_L8_SR.map(fun_epsilon_scale)
imgCol_L8_SR = imgCol_L8_SR.map(fun_epsilon)

# LST
imgCol_L8_SR = imgCol_L8_SR.map(fun_lst)
imgCol_L8_SR = imgCol_L8_SR.map(fun_mask_lst)

# Calculate mean LST
mean_lst = imgCol_L8_SR.select(['LST']).mean()

vis_params_lst = {
    'min': -10,
    'max': 30,
    'palette': ['#313695', '#74add1', '#fdae61', '#a50026']
}

#Map.addLayer(mean_lst.clip(study_area), vis_params_lst, 'Mean LST')
mean_lst.getInfo()
lst_stats = mean_lst.reduceRegion(
    reducer=ee.Reducer.minMax(),
    geometry=study_area,  
    scale=30, 
    maxPixels=1e13  
)
lst_stats


In [320]:
# Stack the covariates into a single ImageCollection with correct band names
covariates = ee.Image.cat([
    elevation.rename('elevation'),
    slope.rename('slope'),
    monthly_avg_temp.rename('tavg'),
    monthly_avg_temp_stddev.rename(['std_tmmn', 'std_tmmx']),  
    monthly_avg_precip.rename('pr'),  
    monthly_avg_precip_stddev.rename('std_pr'),
    mean_ndvi.rename('ndvi'),  
    mean_evi.rename('evi'),  
    land_cover.rename('land_cover')
])

# Extracting values at the sample points
covariate_values = soc_samples_points.map(lambda feature: feature.set({
    'elevation': covariates.select('elevation').reduceRegion(ee.Reducer.mean(), feature.geometry(), 30).get('elevation'),
    'slope': covariates.select('slope').reduceRegion(ee.Reducer.mean(), feature.geometry(), 30).get('slope'),
    'std_tmmn': covariates.select('std_tmmn').reduceRegion(ee.Reducer.mean(), feature.geometry(), 30).get('std_tmmn'),
    'std_tmmx': covariates.select('std_tmmx').reduceRegion(ee.Reducer.mean(), feature.geometry(), 30).get('std_tmmx'),
    'pr': covariates.select('pr').reduceRegion(ee.Reducer.mean(), feature.geometry(), 30).get('pr'),
    'std_pr': covariates.select('std_pr').reduceRegion(ee.Reducer.mean(), feature.geometry(), 30).get('std_pr'),
    'ndvi': covariates.select('ndvi').reduceRegion(ee.Reducer.mean(), feature.geometry(), 30).get('ndvi'),
    'evi': covariates.select('evi').reduceRegion(ee.Reducer.mean(), feature.geometry(), 30).get('evi'),
    'land_cover': covariates.select('land_cover').reduceRegion(ee.Reducer.mode(), feature.geometry(), 30).get('land_cover')
}))



# Convert the result to pandas DataFrame
covariate_values = covariate_values.getInfo()

# Flatten the data by extracting 'properties' from each feature in 'features'
covariate_values_list = []
for feature in covariate_values['features']:
    covariate_values_dict = feature['properties']
    covariate_values_list.append(covariate_values_dict)

covariate_df = pd.DataFrame(covariate_values_list)
print(covariate_df.head())
#covariates.getInfo

      MgC_SE  MgC_per_ha    elevation       evi  land_cover      ndvi plot_no  \
0  11.074052   -5.917497  1736.530640  0.210128          30  0.204085     100   
1   6.831054    5.368321  1719.246338  0.217572          30  0.203475     101   
2   6.005767  -25.157348  1721.940674  0.228161          30  0.217788     102   
3  13.180585   -6.361791  1700.883789  0.252125          30  0.239380     103   
4   4.785345  -23.747264  1773.380371  0.306732          30  0.297429     104   

           pr     slope      std_pr  std_tmmn  std_tmmx  
0  128.166667  0.402805  183.919744  0.525182  1.292278  
1  128.166667  0.402805  183.919744  0.525182  1.292278  
2  128.166667  0.402805  183.919744  0.525182  1.292278  
3  142.541667  0.402805  211.564407  0.601950  1.236869  
4  142.541667  0.402805  211.564407  0.601950  1.236869  


###Training a model with sklearn

In [299]:
# Define features (X) and target variable (y)
X = covariate_df[['elevation', 'evi', 'land_cover', 'ndvi', 'pr', 'slope', 
        'std_pr', 'std_tmmn', 'std_tmmx']]  # Covariates
y = covariate_df['MgC_per_ha']  # Target variable (MgC_per_ha)

# Optional: Standardize the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Initialize the Random Forest Regressor
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# Train the model
rf.fit(X_train, y_train)

# Get feature importances
feature_importances = rf.feature_importances_

# Create a DataFrame for feature importances
importance_df = pd.DataFrame({
    'Feature': X.columns,
    'Importance': feature_importances
})

# Sort the features by importance
importance_df = importance_df.sort_values(by='Importance', ascending=False)


| Feature   |   Importance |
|:----------|-------------:|
| std_tmmn  |     0.230555 |
| elevation |     0.165769 |
| std_pr    |     0.160722 |
| evi       |     0.131279 |
| ndvi      |     0.121691 |

R^2 score on test set: 0.23630450247031576

In [None]:
# Plot feature importances
plt.figure(figsize=(10, 6))
plt.barh(importance_df['Feature'], importance_df['Importance'], color='skyblue')
plt.xlabel('Feature Importance')
plt.title('Feature Importances for Predicting MgC_per_ha')
plt.show()

In [None]:
# Define the top 5 features based on importance
top_features = ['elevation', 'std_tmmn', 'std_pr', 'evi', 'ndvi']

# Create the selected_covariates image stack
selected_covariates = covariates.select(top_features)

# Sample the covariates and SOC values from your soil samples
X = covariate_df[['elevation', 'std_tmmn', 'std_pr', 'evi', 'ndvi']].values
y = covariate_df['MgC_per_ha'].values

# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the Random Forest model
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Check the R^2 score on the test set
test_score = rf_model.score(X_test, y_test)
print(f"Test R^2 score: {test_score}")