In [16]:
!pip install tensorflow_decision_forests -U -qq
!pip install wurlitzer -U -qq

In [81]:
import tensorflow_decision_forests as tfdf
import matplotlib.pyplot as plt
import pandas as pd
import tensorflow as tf
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [82]:
# Load the semicolon-delimited CSV that holds the features and the "N_Fe"
# regression target.
data = pd.read_csv("/content/result.csv", delimiter=';', encoding='utf-8')

# Hold out 30% of the rows for testing; the fixed random_state keeps the
# split repeatable across runs.
train_df, test_df = train_test_split(data, test_size=0.3, random_state=25)

# Pass the task enum directly instead of instantiating a throwaway model just
# to read its `.task` attribute: that model was never trained, allocated a
# temporary training directory, and was overwritten by the cross-validation
# loop further down.
train_ds = tfdf.keras.pd_dataframe_to_tf_dataset(train_df, label="N_Fe", task=tfdf.keras.Task.REGRESSION)
test_ds = tfdf.keras.pd_dataframe_to_tf_dataset(test_df, label="N_Fe", task=tfdf.keras.Task.REGRESSION)

Use /tmp/tmprsdk1_oe as temporary training directory


In [61]:
#@title Define "set_cell_height".

from IPython.core.magic import register_line_magic
from IPython.display import Javascript
from IPython.display import display

# Some of the model training logs can cover the full
# screen if not compressed to a smaller viewport.
# This magic allows setting a max height for a cell.
@register_line_magic
def set_cell_height(size):
  """Line magic that limits the height of the current cell's output area.

  Builds a JavaScript snippet calling
  `google.colab.output.setIframeHeight` with `maxHeight` set to `size`
  and displays it so the front end executes it.

  Args:
    size: Maximum height to apply; converted with `str()` and inserted
      verbatim into the JavaScript call.
  """
  # NOTE(review): relies on the `google.colab` JavaScript API, so presumably
  # only effective when the notebook runs in Colab - confirm.
  js_prefix = "google.colab.output.setIframeHeight(0, true, {maxHeight: "
  js_suffix = "})"
  resize_call = Javascript(js_prefix + str(size) + js_suffix)
  display(resize_call)

In [83]:
# Keyword arguments unpacked into tfdf.keras.RandomForestModel when a model
# is built for each cross-validation fold.
model_config = dict(
    task=tfdf.keras.Task.REGRESSION,
    num_trees=150,
    max_depth=20,
    min_examples=5,
    num_candidate_attributes_ratio=0.6,
)

In [84]:
# k-fold cross-validation settings.
k = 5  # how many folds the training frame is cut into

# Rows per fold, rounded down (integer division of the training-set size).
fold_size = train_df.shape[0] // k

# Collects one mean-squared-error value per fold.
mse_scores = list()

In [85]:
%%time
%set_cell_height 300

# Start from an empty list so that re-running this cell does not append a
# second set of fold scores to those of a previous run (which would skew the
# mean computed afterwards).
mse_scores = []

for fold in range(k):
    start = fold * fold_size
    # The last fold absorbs the len(train_df) % k leftover rows; with plain
    # integer-sized folds those rows would never be used for validation.
    end = len(train_df) if fold == k - 1 else start + fold_size

    # Split data into train and validation sets: rows [start, end) are held
    # out, everything before and after is used for fitting.
    fold_train_df = pd.concat([train_df.iloc[:start], train_df.iloc[end:]])
    fold_val_df = train_df.iloc[start:end]

    fold_train_ds = tfdf.keras.pd_dataframe_to_tf_dataset(fold_train_df, label="N_Fe", task=tfdf.keras.Task.REGRESSION)
    fold_val_ds = tfdf.keras.pd_dataframe_to_tf_dataset(fold_val_df, label="N_Fe", task=tfdf.keras.Task.REGRESSION)

    # Create and train a fresh model for this fold.
    model = tfdf.keras.RandomForestModel(**model_config)
    # "accuracy" is a classification metric and is meaningless for a
    # regression target; track mean squared error instead.
    model.compile(metrics=["mse"])
    model.fit(fold_train_ds, verbose=0)

    # Evaluate on the validation set. The true labels are read back from the
    # same dataset object that is fed to predict().
    fold_predictions = model.predict(fold_val_ds).flatten()
    fold_true_labels = np.concatenate([labels.numpy() for _features, labels in fold_val_ds])
    mse_score = mean_squared_error(fold_true_labels, fold_predictions)
    mse_scores.append(mse_score)

<IPython.core.display.Javascript object>

Use /tmp/tmpiebuhhuf as temporary training directory
Use /tmp/tmpomnq4j1f as temporary training directory
Use /tmp/tmp0ucrru8j as temporary training directory
Use /tmp/tmp5dafleen as temporary training directory
Use /tmp/tmpvfpz91zt as temporary training directory
CPU times: user 29.5 s, sys: 1.45 s, total: 31 s
Wall time: 31.1 s


In [86]:
# Report the validation error of each fold, numbering folds from 1.
for fold_number, fold_mse in enumerate(mse_scores, start=1):
    print(f"Fold {fold_number} - Mean Squared Error: {fold_mse:.4f}")

Fold 1 - Mean Squared Error: 0.0167
Fold 2 - Mean Squared Error: 0.0208
Fold 3 - Mean Squared Error: 0.0198
Fold 4 - Mean Squared Error: 0.0290
Fold 5 - Mean Squared Error: 0.0221


In [87]:
# Average the per-fold errors into a single cross-validation score.
mean_mse = np.asarray(mse_scores).mean()
print(f"Mean Squared Error across folds: {mean_mse:.4f}")

Mean Squared Error across folds: 0.0217
