<a href="https://colab.research.google.com/github/hiydavid/homl-learning/blob/main/projects/penguins.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# TensorFlow Decision Forests on the Penguins Dataset

Resources:
* Introducing TensorFlow Decision Forests: [Link](https://blog.tensorflow.org/2021/05/introducing-tensorflow-decision-forests.html)

---
# Load data & libraries

In [1]:
# installing tfdf and set path
!pip install -Uqq tensorflow_decision_forests

from google.colab import drive
drive.mount('/content/drive', force_remount = False)
path = '/content/drive/MyDrive/Colab/homl_chapters/projects/data/'

In [2]:
# import libs
import tensorflow_decision_forests as tfdf
import pandas as pd
import random

In [3]:
# read the penguins CSV from the data folder and summarise its columns
csv_file = path + "penguins.csv"
df = pd.read_csv(csv_file)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 344 entries, 0 to 343
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   species            344 non-null    object 
 1   island             344 non-null    object 
 2   bill_length_mm     342 non-null    float64
 3   bill_depth_mm      342 non-null    float64
 4   flipper_length_mm  342 non-null    float64
 5   body_mass_g        342 non-null    float64
 6   sex                333 non-null    object 
 7   year               344 non-null    int64  
dtypes: float64(4), int64(1), object(3)
memory usage: 21.6+ KB


In [4]:
# peek at the first ten rows
df.head(n=10)

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex,year
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,male,2007
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,female,2007
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,female,2007
3,Adelie,Torgersen,,,,,,2007
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,female,2007
5,Adelie,Torgersen,39.3,20.6,190.0,3650.0,male,2007
6,Adelie,Torgersen,38.9,17.8,181.0,3625.0,female,2007
7,Adelie,Torgersen,39.2,19.6,195.0,4675.0,male,2007
8,Adelie,Torgersen,34.1,18.1,193.0,3475.0,,2007
9,Adelie,Torgersen,42.0,20.2,190.0,4250.0,,2007


In [5]:
# Split the frame into reproducible train / test TensorFlow datasets.
SPLIT_SEED = 42   # fixed seed so Restart & Run All reproduces the same split
test_pct = 0.20   # fraction of rows held out for evaluation

# Draw the held-out index labels from a dedicated, seeded generator so the
# split neither depends on nor disturbs the global `random` state.
test_idx = random.Random(SPLIT_SEED).sample(
    df.index.tolist(),
    round(len(df) * test_pct),
)

# Select both partitions by index *label* (`.loc` / `.drop`). Mixing `.iloc`
# (positional) with `.drop` (label-based) only yields complementary sets when
# the frame carries a default 0..n-1 index.
test_df = df.loc[test_idx]
train_df = df.drop(index=test_idx)
assert len(train_df) + len(test_df) == len(df), "train/test split lost or duplicated rows"

# Convert each partition to a TF dataset, using the species column as the label.
train_ds = tfdf.keras.pd_dataframe_to_tf_dataset(train_df, label="species")
test_ds = tfdf.keras.pd_dataframe_to_tf_dataset(test_df, label="species")

In [15]:
# list the model classes exposed by the installed TF-DF package
available_models = tfdf.keras.get_all_models()
available_models

[tensorflow_decision_forests.keras.RandomForestModel,
 tensorflow_decision_forests.keras.GradientBoostedTreesModel,
 tensorflow_decision_forests.keras.CartModel]

---
# Random forest model

In [6]:
# fit a random forest, built with no constructor arguments, on the training split
rf_model = tfdf.keras.RandomForestModel()
rf_model.fit(x=train_ds)



<tensorflow.python.keras.callbacks.History at 0x7fbf6f05da50>

In [8]:
# request accuracy as the reported metric, then score the forest on the test split
rf_model.compile(metrics=["accuracy"])
rf_scores = rf_model.evaluate(test_ds)
print(rf_scores)

[0.0, 0.9855072498321533]


In [10]:
# render the tree at index 0 of the random forest inline in Colab
tfdf.model_plotter.plot_model_in_colab(rf_model, tree_idx=0)

In [11]:
# Print the random forest's summary report (the saved output below lists the
# model type, task, input features and variable importances).
rf_model.summary()

Model: "random_forest_model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
Total params: 1
Trainable params: 0
Non-trainable params: 1
_________________________________________________________________
Type: "RANDOM_FOREST"
Task: CLASSIFICATION
Label: "__LABEL"

Input Features (7):
	bill_depth_mm
	bill_length_mm
	body_mass_g
	flipper_length_mm
	island
	sex
	year

No weights

Variable Importance: MEAN_MIN_DEPTH:
    1.           "__LABEL"  3.292488 ################
    2.              "year"  3.277443 ###############
    3.               "sex"  3.219297 ###############
    4.       "body_mass_g"  2.887602 #############
    5.     "bill_depth_mm"  2.205376 #######
    6.            "island"  1.947990 ######
    7. "flipper_length_mm"  1.625046 ###
    8.    "bill_length_mm"  1.127859 

Variable Importance: NUM_AS_ROOT:
    1. "flipper_length_mm" 126.000000 ################
    2.    "bill_length_mm" 102.

---
# Gradient boosting model

In [19]:
# Gradient boosted trees with explicitly chosen hyper-parameters.
gb_params = dict(
    num_trees=100,
    growing_strategy="BEST_FIRST_GLOBAL",
    max_depth=3,
    split_axis="SPARSE_OBLIQUE",
)
gb_model = tfdf.keras.GradientBoostedTreesModel(**gb_params)

# Train on the same training split as the random forest.
gb_model.fit(x=train_ds)



<tensorflow.python.keras.callbacks.History at 0x7fbf6ed4fc90>

In [20]:
# Request accuracy as the reported metric, then score the boosted model on the test split.
gb_model.compile(metrics=["accuracy"])
gb_scores = gb_model.evaluate(test_ds)
print(gb_scores)

[0.0, 0.9710144996643066]


In [22]:
# Print the gradient boosted trees model's summary report (the saved output
# below lists the model type, task, input features and variable importances).
gb_model.summary()

Model: "gradient_boosted_trees_model_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
Total params: 1
Trainable params: 0
Non-trainable params: 1
_________________________________________________________________
Type: "GRADIENT_BOOSTED_TREES"
Task: CLASSIFICATION
Label: "__LABEL"

Input Features (7):
	bill_depth_mm
	bill_length_mm
	body_mass_g
	flipper_length_mm
	island
	sex
	year

No weights

Variable Importance: MEAN_MIN_DEPTH:
    1.           "__LABEL"  2.875057 ################
    2.               "sex"  2.868254 ###############
    3.       "body_mass_g"  2.834666 ###############
    4.              "year"  2.801290 ###############
    5. "flipper_length_mm"  2.528245 #############
    6.    "bill_length_mm"  1.878940 ########
    7.            "island"  1.629691 ######
    8.     "bill_depth_mm"  0.811579 

Variable Importance: NUM_AS_ROOT:
    1.     "bill_depth_mm" 80.000000 ################