# Decision Tree

## Palmer Penguins Dataset

Predicting the body mass of penguins from the other numeric columns of the dataset, using a decision tree regressor.

In [1]:
from sklearn.model_selection import train_test_split
import polars as pl
import pandas as pd
from sklearn.tree import DecisionTreeRegressor

# Read the penguins table from the project's data directory.
df = pd.read_parquet("../data/penguins.parquet")

# Discard every row that lacks one of the body measurements used below.
measurement_cols = [
    "bill_length_mm",
    "bill_depth_mm",
    "flipper_length_mm",
    "body_mass_g",
]
df = df.dropna(subset=measurement_cols)

# Only numerical data is used for now: any column whose dtype is not
# float64/int64 is treated as categorical and excluded from the features.
numeric_dtypes = ["float64", "int64"]
categorical_cols = [
    name for name, dtype in df.dtypes.items() if dtype not in numeric_dtypes
]

# Target is the body mass; features are the remaining non-categorical columns.
y = df["body_mass_g"]
X = df.drop(columns=[*categorical_cols, "body_mass_g"])

# 80/20 train/validation split; the fixed seed keeps the split repeatable.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    train_size=0.8,
    test_size=0.2,
    random_state=0,
)

# NOTE: pandas objects are passed to scikit-learn directly. An earlier
# polars-based version of this cell needed a to_pandas() conversion because
# scikit-learn was not working with polars.

# Fit the regressor (fit() returns the estimator itself) and predict on the
# validation features.
body_mass_model = DecisionTreeRegressor(random_state=0).fit(X_train, y_train)
body_mass_preds = body_mass_model.predict(X_valid)

# Show the first cleaned rows, then the validation predictions as cell output.
print(df.head())
body_mass_preds

  species     island  bill_length_mm  bill_depth_mm  flipper_length_mm  \
0  Adelie  Torgersen            39.1           18.7              181.0   
1  Adelie  Torgersen            39.5           17.4              186.0   
2  Adelie  Torgersen            40.3           18.0              195.0   
4  Adelie  Torgersen            36.7           19.3              193.0   
5  Adelie  Torgersen            39.3           20.6              190.0   

   body_mass_g     sex  year  
0       3750.0    male  2007  
1       3800.0  female  2007  
2       3250.0  female  2007  
4       3450.0  female  2007  
5       3650.0    male  2007  


array([3250., 3550., 4450., 3750., 3650., 4050., 5300., 3625., 5550.,
       3950., 3000., 4600., 3950., 3900., 3750., 4300., 4375., 5050.,
       4600., 3200., 4150., 3550., 3900., 4850., 3600., 3000., 5000.,
       3800., 5000., 5550., 3150., 4100., 4250., 4500., 4875., 3400.,
       4050., 5500., 3500., 3975., 3600., 4100., 4450., 5000., 3650.,
       4200., 4800., 5400., 5000., 4250., 3975., 3250., 4250., 4050.,
       3200., 3550., 5400., 3350., 3975., 3200., 3975., 3950., 4450.,
       5050., 3075., 3200., 3550., 4300., 4250.])

In [2]:
from sklearn.metrics import mean_absolute_error

# Mean absolute error between the validation targets and the tree's
# predictions, in the target's units (the body_mass_g column).
mean_absolute_error(y_true=y_valid, y_pred=body_mass_preds)

357.60869565217394