# Random Forest

In [62]:
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
import polars as pl
import pandas as pd

# Load the penguins dataset from the project's data directory.
df = pl.read_parquet("../data/penguins.parquet")

# Drop rows that are missing any of the four physical measurements
# (this includes the regression target, body_mass_g). Other columns are
# not checked here, so nulls in them are left in place.
df = df.drop_nulls(
    subset=[
        "bill_length_mm",
        "bill_depth_mm",
        "flipper_length_mm",
        "body_mass_g",
    ]
)

# Preview ten rows; seeded so the same rows are shown on every run.
df.sample(10, seed=0)

species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex,year
str,str,f64,f64,f64,f64,str,i64
"""Gentoo""","""Biscoe""",43.6,13.9,217.0,4900.0,"""female""",2008
"""Gentoo""","""Biscoe""",43.5,14.2,220.0,4700.0,"""female""",2008
"""Adelie""","""Biscoe""",39.0,17.5,186.0,3550.0,"""female""",2008
"""Gentoo""","""Biscoe""",42.8,14.2,209.0,4700.0,"""female""",2007
"""Gentoo""","""Biscoe""",46.9,14.6,222.0,4875.0,"""female""",2009
"""Adelie""","""Biscoe""",41.4,18.6,191.0,3700.0,"""male""",2008
"""Gentoo""","""Biscoe""",49.1,15.0,228.0,5500.0,"""male""",2009
"""Chinstrap""","""Dream""",51.3,19.9,198.0,3700.0,"""male""",2007
"""Adelie""","""Dream""",39.5,16.7,178.0,3250.0,"""female""",2007
"""Adelie""","""Biscoe""",38.1,17.0,181.0,3175.0,"""female""",2009


In [67]:
from operator import index


# Target is body mass; every remaining column is a candidate feature.
y = df["body_mass_g"]
X = df.drop(["body_mass_g"])

# Hold out 20% of the rows for validation; the fixed seed keeps the split
# reproducible.
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, train_size=0.8, test_size=0.2, random_state=0
)

# Assign columns to a preprocessing branch based on their polars dtype.
# Both lists are derived from the training frame.
numerical_cols = [
    col for col in X_train.columns if X_train[col].dtype in [pl.Float64, pl.Int64]
]
categorical_columns = [
    col
    for col in X_train.columns
    if X_train[col].dtype in [pl.Categorical, pl.Utf8]
]

# Numeric features: fill missing values with the column median.
numerical_transformer = SimpleImputer(strategy="median")

# Categorical features: fill missing values with the most frequent category,
# then one-hot encode. Categories unseen during fit are ignored at predict
# time instead of raising.
categorical_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("onehot", OneHotEncoder(handle_unknown="ignore")),
    ]
)

# Route each column list to its transformer.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numerical_transformer, numerical_cols),
        ("cat", categorical_transformer, categorical_columns),
    ]
)

model = RandomForestRegressor(n_estimators=100, random_state=0)

# Preprocessing and model are bundled so the same transformations are applied
# at fit and predict time.
pipeline = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("model", model),
    ]
)

# The pipeline is fed pandas objects rather than the polars ones produced by
# the split above. NOTE(review): the original author found that scikit-learn
# did not work with polars input here; this may depend on the installed
# scikit-learn version -- confirm before removing the conversion.
X_train = X_train.to_pandas().reset_index(drop=True)
y_train = y_train.to_pandas().reset_index(drop=True)
X_valid = X_valid.to_pandas().reset_index(drop=True)
y_valid = y_valid.to_pandas().reset_index(drop=True)

pipeline.fit(X_train, y_train)

# Predictions on the held-out rows; displayed as the cell output.
y_pred = pipeline.predict(X_valid)
y_pred

array([4058.  , 3535.75, 3964.5 , 3487.75, 3956.5 , 4028.75, 5479.  ,
       3476.25, 5430.5 , 3447.5 , 3305.75, 4589.25, 4412.25, 3844.  ,
       3534.5 , 4251.  , 4534.75, 5663.5 , 3734.75, 3367.5 , 3842.75,
       3514.75, 3800.5 , 5499.75, 3985.  , 3206.  , 5515.5 , 4132.25,
       5511.5 , 5332.25, 3824.75, 4041.5 , 4203.75, 4449.25, 4633.75,
       3516.5 , 4000.25, 5358.25, 3539.25, 3990.75, 3389.  , 4127.5 ,
       4112.75, 4604.75, 3899.  , 3993.  , 4339.75, 5236.  , 4750.  ,
       3644.75, 4110.5 , 4042.75, 4092.5 , 3845.75, 3337.25, 3987.25,
       5529.  , 3765.5 , 3939.75, 3623.5 , 4005.5 , 3471.75, 4062.25,
       5681.5 , 3098.75, 3431.  , 3150.25, 4458.25, 3978.25])

In [70]:
# Predict body mass for one hand-written penguin. The frame must carry the
# same feature columns the pipeline was fitted on, with matching types:
# `year` is an integer column in the training data, so it is passed as an
# int here rather than as a string.
random_pred = pipeline.predict(
    pd.DataFrame(
        {
            "species": "Adelie",
            "island": "Dream",
            "bill_length_mm": 40.0,
            "bill_depth_mm": 17.0,
            "flipper_length_mm": 180.0,
            "sex": "female",
            "year": 2007,
        },
        index=[0],  # scalar values need an explicit index to form one row
    )
)
random_pred

array([3399.75])

In [58]:
# Mean absolute error of the pipeline's predictions on the validation split.
# Author's note: much better than Decision Tree on numerical data only.
score = mean_absolute_error(y_true=y_valid, y_pred=y_pred)
score

255.9963768115942