In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import tensorflow as tf
import tensorflow_decision_forests as tfdf
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

In [None]:
# Read the training CSV into a DataFrame and report its (rows, columns) shape.
train_file_path = "../input/house-prices-advanced-regression-techniques/train.csv"
dataset_df = pd.read_csv(train_file_path)
print(f"Full train dataset shape is {dataset_df.shape}")

In [None]:
# Hold out 20% of the samples for validation; the fixed seed keeps the split
# reproducible across runs.
# `tfdf` is the tensorflow_decision_forests module and is not callable; the
# four-way split is provided by scikit-learn's `train_test_split`.
# NOTE(review): `X` and `Y` are not defined in the visible cells — confirm they
# are built (features / target) before this cell runs.
X_train, X_val, Y_train, Y_val = train_test_split(X, Y, test_size=0.2, random_state=42)

In [None]:
# Plot the RMSE recorded in the training logs of `rf` against the number of
# trees trained so far.
# `plt` comes from the notebook's import cell, so it is not re-imported here.
# NOTE(review): `rf` is not defined in the visible cells — it must be a trained
# model exposing `make_inspector()`.
logs = rf.make_inspector().training_logs()
tree_counts = [log.num_trees for log in logs]
rmse_values = [log.evaluation.rmse for log in logs]

fig, ax = plt.subplots()
ax.plot(tree_counts, rmse_values)
ax.set_xlabel("Number of trees")
ax.set_ylabel("RMSE (out-of-bag)")
plt.show()

In [None]:
# Learning curve: fit a model on growing shares of the training split and
# compare its accuracy on the data it was fit on with its accuracy on the
# held-out validation set.
train_sizes = [5, 10, 25, 50, 75, 100]  # percentages of the training split
train_fractions = [size / 100 for size in train_sizes]

training_accuracies = []
validation_accuracies = []

for size, fraction in zip(train_sizes, train_fractions):
    # The sizes are percentages, so they must be converted to a fraction before
    # being compared with 1.0 or passed as `train_size`; comparing the raw
    # percentage made every iteration use the full training split.
    if fraction < 1.0:
        X_train_subset, _, Y_train_subset, _ = train_test_split(
            X_train, Y_train, train_size=fraction, random_state=42
        )
    else:
        # A float `train_size` has to be strictly below 1.0, so the 100% case
        # uses the training split unchanged.
        X_train_subset, Y_train_subset = X_train, Y_train

    # A fresh estimator per size keeps the points of the curve independent.
    # (`logs` is the list of training-log entries and cannot be fitted.)
    # NOTE(review): the estimator the original cell meant to fit is not visible.
    # A scikit-learn random forest classifier is used because the cell scores
    # with `accuracy_score`; if `Y` is a continuous target, switch to a
    # regressor and a regression metric.
    curve_model = RandomForestClassifier(random_state=42)
    curve_model.fit(X_train_subset, Y_train_subset)

    Y_train_pred = curve_model.predict(X_train_subset)
    train_accuracy = accuracy_score(Y_train_subset, Y_train_pred)
    training_accuracies.append(train_accuracy)

    Y_val_pred = curve_model.predict(X_val)
    val_accuracy = accuracy_score(Y_val, Y_val_pred)
    validation_accuracies.append(val_accuracy)

    print(f'Trained with {size}% of data: Training Accuracy: {train_accuracy:.4f}, Validation Accuracy: {val_accuracy:.4f}')

# The x-axis uses fractions so that it matches the "Proportion" axis label.
plt.figure(figsize=(10, 6))
plt.plot(train_fractions, training_accuracies, label='Training Accuracy', marker='o')
plt.plot(train_fractions, validation_accuracies, label='Validation Accuracy', marker='o')
plt.title('Learning Curves: Training and Validation Accuracy vs Training Size')
plt.xlabel('Training Size Proportion')
plt.ylabel('Accuracy')
plt.grid(True)
plt.legend()
plt.show()

In [None]:
# Horizontal bar chart of one variable-importance metric, one bar per feature,
# each bar annotated with its importance value.
# NOTE(review): `inspector` is not defined in the visible cells — presumably
# `rf.make_inspector()`; confirm it is created before this cell runs.
plt.figure(figsize=(12, 4))

# Metric to plot; it must be one of the keys of `inspector.variable_importances()`.
# NUM_AS_ROOT is not a per-class metric, so neither the comment nor the title
# should mention "class 1 vs the others".
variable_importance_metric = "NUM_AS_ROOT"
variable_importances = inspector.variable_importances()[variable_importance_metric]

# Extract the feature names and importance values.
#
# `variable_importances` is a list of <feature, importance> tuples.
feature_names = [vi[0].name for vi in variable_importances]
feature_importances = [vi[1] for vi in variable_importances]
# The features are ordered in decreasing importance value.
feature_ranks = range(len(feature_names))

bar = plt.barh(feature_ranks, feature_importances, label=[str(x) for x in feature_ranks])
plt.yticks(feature_ranks, feature_names)
# Invert the y-axis so that rank 0 is drawn at the top of the chart.
plt.gca().invert_yaxis()

# TODO: Replace with "plt.bar_label()" when available.
# Label each bar with its value, placed at the end of the bar.
for importance, patch in zip(feature_importances, bar.patches):
    plt.text(patch.get_x() + patch.get_width(), patch.get_y(), f"{importance:.4f}", va="top")

plt.xlabel(variable_importance_metric)
plt.title(f"Variable importance: {variable_importance_metric}")
plt.tight_layout()
plt.show()