# Random Forest

In [12]:
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
import polars as pl
import pandas as pd

# Load the penguins table and, in the same expression, discard every row that
# is missing any of the four body measurements (dropna's default is how="any").
df = pd.read_parquet("../data/penguins.parquet").dropna(
    subset=[
        "bill_length_mm",
        "bill_depth_mm",
        "flipper_length_mm",
        "body_mass_g",
    ]
)

# Eyeball ten randomly chosen rows of the cleaned frame.
df.sample(10)

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex,year
218,Gentoo,Biscoe,46.2,14.4,214.0,4650.0,,2008
114,Adelie,Biscoe,39.6,20.7,191.0,3900.0,female,2009
241,Gentoo,Biscoe,52.1,17.0,230.0,5550.0,male,2009
119,Adelie,Torgersen,41.1,18.6,189.0,3325.0,male,2009
55,Adelie,Biscoe,41.4,18.6,191.0,3700.0,male,2008
306,Chinstrap,Dream,40.9,16.6,187.0,3200.0,female,2008
295,Chinstrap,Dream,49.2,18.2,195.0,4400.0,male,2007
172,Gentoo,Biscoe,50.2,14.3,218.0,5700.0,male,2007
66,Adelie,Biscoe,35.5,16.2,195.0,3350.0,female,2008
254,Gentoo,Biscoe,47.2,15.5,215.0,4975.0,female,2009


In [15]:
# Target is the body mass; every other column is a candidate feature.
y = df["body_mass_g"]
X = df.drop(["body_mass_g"], axis=1)

# Hold out 20% of the rows for validation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, train_size=0.8, test_size=0.2, random_state=0
)

# Route columns to a preprocessing branch by dtype. Both lists are derived
# from X_train so the two selections are made against the same frame.
# NOTE(review): a column whose dtype matches neither filter (e.g. category,
# int32) ends up in no list and is dropped by ColumnTransformer's default
# remainder="drop" -- confirm that is intended if the schema changes.
numerical_cols = [
    col for col in X_train.columns if X_train[col].dtype in ["float64", "int64"]
]
categorical_columns = [
    col for col in X_train.columns if X_train[col].dtype in ["object", "string"]
]

# Numerical branch: fill missing values with the column median.
numerical_transformer = SimpleImputer(strategy="median")

# Categorical branch: fill missing values with the most frequent label, then
# one-hot encode. handle_unknown="ignore" keeps predict() from failing on
# labels that were not seen during fit.
categorical_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("onehot", OneHotEncoder(handle_unknown="ignore")),
    ]
)

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numerical_transformer, numerical_cols),
        ("cat", categorical_transformer, categorical_columns),
    ]
)

# Seeded so the fitted forest (and therefore the predictions) is reproducible.
model = RandomForestRegressor(n_estimators=100, random_state=0)

# Bundling preprocessing with the model means the imputers and encoder are
# fitted on the training split only and re-applied unchanged at predict time.
pipeline = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("model", model),
    ]
)

pipeline.fit(X_train, y_train)

# Predicted body mass for every row of the validation split.
y_pred = pipeline.predict(X_valid)
y_pred

array([4058.  , 3535.75, 3964.5 , 3487.75, 3956.5 , 4028.75, 5479.  ,
       3476.25, 5430.5 , 3447.5 , 3305.75, 4589.25, 4412.25, 3844.  ,
       3534.5 , 4251.  , 4534.75, 5663.5 , 3734.75, 3367.5 , 3842.75,
       3514.75, 3800.5 , 5499.75, 3985.  , 3206.  , 5515.5 , 4132.25,
       5511.5 , 5332.25, 3824.75, 4041.5 , 4203.75, 4449.25, 4633.75,
       3516.5 , 4000.25, 5358.25, 3539.25, 3990.75, 3389.  , 4127.5 ,
       4112.75, 4604.75, 3899.  , 3993.  , 4339.75, 5236.  , 4750.  ,
       3644.75, 4110.5 , 4042.75, 4092.5 , 3845.75, 3337.25, 3987.25,
       5529.  , 3765.5 , 3939.75, 3623.5 , 4005.5 , 3471.75, 4062.25,
       5681.5 , 3098.75, 3431.  , 3150.25, 4458.25, 3978.25])

In [16]:
# Predict the body mass of a single hand-specified penguin. The one-row frame
# carries the feature columns the pipeline was fitted on (every column of the
# training frame except body_mass_g).
random_pred = pipeline.predict(
    pd.DataFrame(
        {
            "species": "Adelie",
            "island": "Dream",
            "bill_length_mm": 40.0,
            "bill_depth_mm": 17.0,
            "flipper_length_mm": 180.0,
            "sex": "female",
            # Integer, like the year values in the training data; a string
            # here would only work through implicit str -> float coercion
            # inside the numerical imputer.
            "year": 2007,
        },
        index=[0],
    )
)
random_pred

array([3399.75])

In [17]:
# Mean absolute error on the held-out validation split, expressed in the same
# units as the body_mass_g target.
score = mean_absolute_error(y_true=y_valid, y_pred=y_pred)
score  # much better than the Decision Tree trained on numerical data only

255.9963768115942