# LCLUC Random Forest

In [2]:
# Import packages
import os
import geopandas as gpd
import pandas as pd
import numpy as np
import rioxarray as rioxr
import xarray as xr
import rasterstats
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from sklearn.impute import KNNImputer
from sklearn.tree import DecisionTreeRegressor, plot_tree
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder

# Show every column when a DataFrame is displayed (None disables the column cap)
pd.options.display.max_columns = None


In [3]:
# Locate the data folder: it sits next to the directory the notebook runs from
wrk_dir = Path.cwd().parent  # one level above the current working directory
data_path = os.path.join(wrk_dir, 'data')

# Load the labelled samples from parquet
labels_file = os.path.join(data_path, 'all_label_data.parquet')
all_label_data = pd.read_parquet(labels_file)

In [4]:
# Build label sets that keep only selected classes and pool the rest into one catch-all class
def subset_labels(df, classes, other_label='nonforest'):
    """
    Create a dataset that keeps the specified classes and pools all others.

    Parameters
    ----------
    df : pandas.DataFrame
        Input table; must contain a 'class' column. It is not modified.
    classes : str or iterable
        Label value(s) to keep unchanged. A single string is treated as one
        class name rather than as a sequence of characters.
    other_label : str, default 'nonforest'
        Label assigned to every row whose class is not in `classes`.

    Returns
    -------
    pandas.DataFrame
        Copy of `df` whose 'class' column holds only the kept classes plus
        `other_label`, stored as a categorical.
    """
    # Guard against a bare string: `x in 'forest'` would be a substring test,
    # silently keeping labels such as 'for' or ''.
    if isinstance(classes, str):
        classes = [classes]
    keep = list(classes)

    # Work on a copy so the caller's frame is left untouched
    subset = df.copy()

    # Cast to object first so an already-categorical column can accept a
    # label that is not among its existing categories.
    labels = subset['class'].astype(object)

    # Vectorized relabel: keep values found in `keep`, replace the rest
    relabelled = labels.where(labels.isin(keep), other_label)

    # Store as categorical
    subset['class'] = relabelled.astype('category')

    return subset

# Binary labelling: 'forest' is kept, every other class is pooled
forest = subset_labels(df=all_label_data, classes=['forest'])

# Multi-class labelling: 'forest', 'water' and 'bare' are kept, the rest is pooled
forest_water_bare = subset_labels(
    df=all_label_data,
    classes=['forest', 'water', 'bare'],
)

In [5]:
# Samples per class for each labelling scheme, separated by a blank line
print(forest['class'].value_counts(), end="\n\n")
print(forest_water_bare['class'].value_counts())

class
nonforest    1030
forest        270
Name: count, dtype: int64

class
nonforest    670
bare         270
forest       270
water         90
Name: count, dtype: int64


The forest/non-forest labels are clearly imbalanced (270 forest vs. 1,030 non-forest samples), and this class imbalance should be accounted for in the model design.