### SOC Stock Model Work

The plan: 
- **Gather the covariates** using the SCORPAN model 
- **add harmonized and transformed soc** (response)
- **Clean and process training data** -- Remove variables with moderate to high amounts of missing data; impute variables with low to moderate amounts.
- **Handle categorical variables** --> Convert text/categorical variables (e.g., moist_color_name, friability) to numeric codes or one-hot encoding.
- **Handle numeric predictors**. Scale/normalize if using regression methods sensitive to magnitude (optional; regression kriging is usually OK without scaling). Ensure there are no extreme outliers that could bias the model.

In [1]:
## Prepare Training Data to estimate SOC stock throughout Angola
import pandas as pd

# Read the training table from CSV into a DataFrame; every later cell works on `training_data`.
# NOTE(review): this is an absolute, machine-specific path — consider a configurable data directory.
training_data = pd.read_csv(
    filepath_or_buffer="/Users/inesschwartz/Desktop/training_data_with_log_soc.csv"
)

In [18]:
# clean and process dataset

import pandas as pd

# Share of missing values per column, expressed in percent (0-100)
missing_pct = training_data.isnull().mean().mul(100)

# Bucket each column's percentage into three bands.
# Bin edges are right-inclusive: (-1, 10], (10, 30], (30, 100] — so a column at
# exactly 10% lands in the 'Low (<10%)' band and one at exactly 30% in 'Moderate'.
missing_category = pd.cut(
    x=missing_pct,
    bins=[-1, 10, 30, 100],
    labels=['Low (<10%)', 'Moderate (10–30%)', 'High (>30%)'],
)

# One row per column of `training_data`, worst offenders first
missing_summary = (
    missing_pct
    .to_frame('Missing_Percent')
    .assign(Category=missing_category)
    .sort_values('Missing_Percent', ascending=False)
)

In [3]:
# Column names that the missingness summary tags as 'High (>30%)'
high_missing_cols = missing_summary.index[
    (missing_summary['Category'] == 'High (>30%)').to_numpy()
].tolist()

# organic_carbon is removed together with the high-missingness columns
cols_to_drop = [*high_missing_cols, 'organic_carbon']

print("Columns to drop due to high missingness (>30%) and organic_carbon:")
print(cols_to_drop)

# Remove the columns from `training_data` itself; names that are not present
# (e.g. on a second run of this cell) are skipped silently.
training_data.drop(columns=cols_to_drop, errors='ignore', inplace=True)

# Persist the reduced table (row index is not written)
training_data.to_csv(
    "/Users/inesschwartz/Desktop/training_data_cleaning.csv",
    index=False,
)


Columns to drop due to high missingness (>30%) and organic_carbon:
['gypsum', 'caco3', 'structure_class', 'structure_type', 'conductivity', 'friability', 'total_n', 'exchangable_bases_sum', 'pore_diameter', 'thick_contents_count', 'thick_contents_nature', 'atm_15', 'p205', 'root_diameter', 'eg', 'durability', 'organic_carbon']


In [15]:
# Rows of the summary table whose category is 'Moderate (10–30%)'
moderate_missing = missing_summary.loc[missing_summary['Category'] == 'Moderate (10–30%)']

# Planned handling of columns with missing values:
# - drop moist_color_name, moist_chroma, moist_hue
# - categorical descriptions "structure_degree", "pore_quantity", "compaction":
#   replace missing/blank entries with "missing"
# - numeric descriptions "eq_hum", "cec", "ph_kcl", "dry_hue", "dry_chroma":
#   replace missing/blank entries with NaN (the null value for floats)

# Display the table as the cell output
moderate_missing


Unnamed: 0,Missing_Percent,Category
moist_color_name,29.315961,Moderate (10–30%)
moist_value,26.927253,Moderate (10–30%)
moist_chroma,26.818675,Moderate (10–30%)
moist_hue,24.647123,Moderate (10–30%)
structure_degree,21.715527,Moderate (10–30%)
free_iron,19.001086,Moderate (10–30%)
moisture_degree,18.892508,Moderate (10–30%)
eq_hum,17.698154,Moderate (10–30%)
pore_quantity,15.092291,Moderate (10–30%)
dry_color_name,10.423453,Moderate (10–30%)


In [13]:
# Rows of the summary table whose category is 'Low (<10%)'
# (this band also contains every column with no missing values at all)
low_missing = missing_summary.loc[missing_summary['Category'] == 'Low (<10%)']

# Display the table as the cell output
low_missing

Unnamed: 0,Missing_Percent,Category
compaction,9.337676,Low (<10%)
ph_kcl,7.817590,Low (<10%)
soluble_sodium,5.646037,Low (<10%)
dry_hue,4.668838,Low (<10%)
dry_chroma,4.234528,Low (<10%)
...,...,...
precip_coldest_quarter32733,0.000000,Low (<10%)
min_temp_coldest_month32733,0.000000,Low (<10%)
mean_temp_wettest_quarter32733,0.000000,Low (<10%)
mean_temp_warmest_quarter32733,0.000000,Low (<10%)


In [None]:
## Impute moderately-missing numeric columns with KNN (uses similarity between samples)
#
# `moderate_missing` is the summary table (one row per column name), so the
# columns to impute are taken from its index, not from the table itself.
# KNNImputer only accepts numeric input, so text columns in the moderate band
# are left alone here, as are columns already dropped from `training_data`.
# Only the moderate band is imputed in this cell; the low band is not touched.
# NOTE(review): neighbours are found using only the selected columns, on their
# raw scale — confirm that is the intended similarity measure.

from sklearn.impute import KNNImputer

moderate_present = [col for col in moderate_missing.index if col in training_data.columns]
knn_impute_cols = (
    training_data[moderate_present]
    .select_dtypes(include='number')
    .columns
    .tolist()
)

imputer = KNNImputer(n_neighbors=5)
if knn_impute_cols:
    # fit_transform returns a plain array with the same column order as the input
    training_data[knn_impute_cols] = imputer.fit_transform(training_data[knn_impute_cols])


identify numeric vs categorical variables

In [5]:
## identify numerica vs categorical variables

import pandas as pd

# Uses the `training_data` frame prepared by the earlier cells (nothing is loaded here).

# Separate numeric and categorical predictors.
# 'number' matches every numeric dtype (not only float64/int64), and 'category'
# is included alongside 'object', matching the detection used in the
# feature-selection cell further down.
numeric_cols = training_data.select_dtypes(include='number').columns.tolist()
categorical_cols = training_data.select_dtypes(include=['object', 'category']).columns.tolist()

# Remove the response variable from the predictors; filtering (rather than
# list.remove) avoids a ValueError if the column is not in the numeric list.
numeric_cols = [col for col in numeric_cols if col != 'log_soc_stock']

print("Numeric predictors:")
print(numeric_cols)

print("\nCategorical predictors:")
print(categorical_cols)


Numeric predictors:
['site_info_id', 'X_coord', 'Y_coord', 'MRVBF', 'RLD', 'aspect', 'aspect_classes', 'aspect_cos', 'aspect_sin', 'dem_filledfiltered', 'flow_accumulation', 'relief', 'ridge_levels', 'roughness', 'slope', 'twi_300m', 'valleydepth2', 'annual_mean_temp', 'annual_precip2', 'isothermality_32733', 'max_temp_warmest_month32733', 'mean_temp_driest_quarter32733', 'mean_temp_warmest_quarter32733', 'mean_temp_wettest_quarter32733', 'min_temp_coldest_month32733', 'precip_coldest_quarter32733', 'precip_driest_month32733', 'precip_driest_quarter32733', 'precip_seasonality2', 'precip_warmest_quarter32733', 'precip_wettest_month32733', 'precip_wettest_quarter32733', 'temp_annual_range32733', 'temp_seasonality32733', 'landsurface_value', 'eco_value', 'eco_subclass_code', 'eco_class_code', 'eco_division_code', 'faosoil_id', 'thick_sand', 'fine_sand', 'silt', 'clay', 'eq_hum', 'free_iron', 'organic_material', 'ph_h2o', 'ph_kcl', 'cec', 'soluble_sodium', 'porosity', 'bulk_density', 'hori

In [20]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# ------------------------------------------------------
# Missing-data handling and removal of unwanted columns
# ------------------------------------------------------

# Columns removed outright (names not present in the frame are skipped)
cols_to_drop = ['moist_color_name', 'moist_chroma', 'moist_hue']
training_data.drop(columns=cols_to_drop, errors='ignore', inplace=True)

# Categorical descriptions: both NaN and empty strings become the label "missing"
missing_cat_cols = ['structure_degree', 'pore_quantity', 'compaction']
for col in missing_cat_cols:
    if col not in training_data.columns:
        continue
    training_data[col] = (
        training_data[col]
        .fillna("missing")
        .replace({"": "missing"})
    )

# Listed columns: empty strings become NaN; existing NaN values stay as they are
missing_num_cols = ['eq_hum', 'cec', 'ph_kcl', 'dry_hue', 'dry_chroma']
for col in missing_num_cols:
    if col not in training_data.columns:
        continue
    training_data[col] = training_data[col].replace({"": np.nan})

# ------------------------------------------------------
# Feature selection inputs: target, predictors, column types
# ------------------------------------------------------

# Response vector and predictor table (everything except the response)
y = training_data.loc[:, 'log_soc_stock']
X = training_data.drop(columns='log_soc_stock', errors='ignore')

# Column types are detected from dtypes: object/category -> categorical, numeric -> numeric
all_categorical = list(X.select_dtypes(include=['object', 'category']).columns)
numeric_cols = list(X.select_dtypes(include=[np.number]).columns)
# NOTE(review): the printed numeric list includes 'site_info_id', which looks like
# an identifier rather than a covariate — confirm it should be a predictor.

# Safeguard: a categorical column is kept only if it has at most `cat_limit`
# distinct (non-null) values, so one-hot encoding does not explode in width.
cat_limit = 50
categorical_cols = list(
    filter(lambda name: X[name].nunique() <= cat_limit, all_categorical)
)
skipped_categorical = [name for name in all_categorical if name not in categorical_cols]

print("Categorical columns included:", categorical_cols)
print("Categorical columns skipped (too many categories):",
      skipped_categorical)
print("Numeric columns:", numeric_cols)

# Column-wise preprocessing: one-hot encode the retained categorical columns,
# hand the numeric columns to the model unchanged, discard everything else.
# Categories unseen at fit time are ignored when transforming.
preprocessor = ColumnTransformer(
    [
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols),
        ('num', 'passthrough', numeric_cols),
    ],
    remainder='drop',
)

# Random forest regressor with a fixed seed so repeated fits give the same result
model = RandomForestRegressor(random_state=42)

# Chain preprocessing and model so both are fitted in a single call
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', model),
])

# Fit on the full predictor table and response
# NOTE(review): numeric columns are passed through as-is, so any NaN still in
# them reaches the forest — confirm the installed scikit-learn accepts that.
pipeline.fit(X, y)

# Names of the one-hot columns produced by the fitted encoder
# (an empty list when no categorical columns were used)
ohe_feature_names = (
    pipeline.named_steps['preprocessor']
    .named_transformers_['cat']
    .get_feature_names_out(categorical_cols)
    if categorical_cols
    else []
)

# Same order as the ColumnTransformer output: encoded categoricals first, then numerics
all_feature_names = np.concatenate([ohe_feature_names, numeric_cols])

# Importance of each transformed feature according to the fitted forest
importances = pipeline.named_steps['model'].feature_importances_
feature_importance = (
    pd.DataFrame({'feature': all_feature_names, 'importance': importances})
    .sort_values('importance', ascending=False)
)

print("\nTop 20 Features by Importance:")
print(feature_importance.head(20))


Categorical columns included: ['district', 'landsurface_label', 'eco_subclass_clean', 'eco_class_clean', 'eco_division_clean', 'DOMSOI', 'africa_lithology_90m.img.vat_lithology', 'structure_degree', 'pore_quantity', 'dry_hue']
Categorical columns skipped (too many categories): ['profile', 'FAOSOIL', 'moisture_degree', 'root_quantity', 'texture', 'dry_color_name', 'compaction']
Numeric columns: ['site_info_id', 'X_coord', 'Y_coord', 'MRVBF', 'RLD', 'aspect', 'aspect_classes', 'aspect_cos', 'aspect_sin', 'dem_filledfiltered', 'flow_accumulation', 'relief', 'ridge_levels', 'roughness', 'slope', 'twi_300m', 'valleydepth2', 'annual_mean_temp', 'annual_precip2', 'isothermality_32733', 'max_temp_warmest_month32733', 'mean_temp_driest_quarter32733', 'mean_temp_warmest_quarter32733', 'mean_temp_wettest_quarter32733', 'min_temp_coldest_month32733', 'precip_coldest_quarter32733', 'precip_driest_month32733', 'precip_driest_quarter32733', 'precip_seasonality2', 'precip_warmest_quarter32733', 'preci