In [1]:
import polars as pl
from sklearn.model_selection import train_test_split

In [2]:
# Load the penguins CSV. "NA" cells are parsed as nulls and `year` is read as
# a string column; rows with no recorded sex are dropped because `sex` is the
# label being predicted.
df = pl.read_csv(
    "./datasets/penguins.csv",
    null_values=["NA"],
    dtypes={"year": str},
).drop_nulls(subset=["sex"])

# Features: every column except the label.
x = df.drop("sex")

# Label: 1 where sex == "male", 0 otherwise. The single-column frame is
# flattened to a 1-D NumPy array, the shape sklearn wants for a target.
y = (
    df.select((pl.col("sex") == "male").alias("target").cast(int))
    .to_numpy()
    .ravel()
)

# Hold out 33% of the rows for testing; the fixed seed keeps the split
# repeatable between runs.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.33,
    random_state=1234,
)

In [3]:
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

In [4]:
# Numeric columns: fill missing values with the column mean, then standardise.
cols_numeric = ["bill_length_mm", "bill_depth_mm", "flipper_length_mm", "body_mass_g"]
pipeline_numeric = Pipeline(
    [
        ("impute_w_mean", SimpleImputer(strategy="mean")),
        ("scale_normal", StandardScaler()),
    ]
)

# Categorical columns: fill missing values with the most frequent category,
# then one-hot encode. `handle_unknown="ignore"` encodes categories that were
# not seen during fitting as all zeros instead of raising, and
# `sparse_output=False` yields a dense array.
cols_categorical = ["species", "island", "year"]
pipeline_categorical = Pipeline(
    [
        ("impute_w_most_frequent", SimpleImputer(strategy="most_frequent")),
        ("one_hot_encode", OneHotEncoder(handle_unknown="ignore", sparse_output=False)),
    ]
)

# Route each group of columns through its own pipeline. Columns not listed in
# either group are dropped (ColumnTransformer's default `remainder`).
preprocessing_transformer = ColumnTransformer(
    [
        ("numeric", pipeline_numeric, cols_numeric),
        ("categorical", pipeline_categorical, cols_categorical),
    ]
)

# Fit on the training data and transform it in a single pass. Calling `fit`
# and then `fit_transform` would fit the whole transformer twice for the
# same result.
x_train_preprocessed = preprocessing_transformer.fit_transform(x_train, y_train)

# ---
# Feature names are only available once the transformer has been fitted.
print(preprocessing_transformer.get_feature_names_out(), end="\n\n")
print(x_train_preprocessed)

['numeric__bill_length_mm' 'numeric__bill_depth_mm'
 'numeric__flipper_length_mm' 'numeric__body_mass_g'
 'categorical__species_Adelie' 'categorical__species_Chinstrap'
 'categorical__species_Gentoo' 'categorical__island_Biscoe'
 'categorical__island_Dream' 'categorical__island_Torgersen'
 'categorical__year_2007' 'categorical__year_2008'
 'categorical__year_2009']

[[-1.00173392  0.34890155 -1.47813963 ...  1.          0.
   0.        ]
 [ 0.00368887  0.08989674  1.22603896 ...  0.          1.
   0.        ]
 [-0.4716019   1.17771697 -0.26837552 ...  0.          1.
   0.        ]
 ...
 [ 1.53924369  1.48852275  0.22976264 ...  0.          1.
   0.        ]
 [ 0.93599001 -0.47991386  1.9376649  ...  0.          1.
   0.        ]
 [ 0.46069923 -0.27271    -0.69535109 ...  0.          1.
   0.        ]]


In [5]:
from sklearn.ensemble import RandomForestClassifier

# Make a RF Classifier and pipeline it with the preprocessor.
# The forest is seeded (same seed as the train/test split) so that the fitted
# model, and therefore the reported metrics, are reproducible when the
# notebook is re-run from a fresh kernel.
rf_clf = RandomForestClassifier(random_state=1234)

# Chaining preprocessing and the classifier means the same fitted transforms
# are applied automatically at both fit and predict time.
preprocess_model_pipeline = Pipeline(
    [("preprocessing", preprocessing_transformer), ("classifier", rf_clf)]
)

# Fit on the training split only, then predict labels for the held-out split.
preprocess_model_pipeline.fit(x_train, y_train)
y_predicted = preprocess_model_pipeline.predict(x_test)



In [6]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

# Metric name -> sklearn scoring function, in the order they are reported.
metric_functions = {
    "accuracy": accuracy_score,
    "precision": precision_score,
    "recall": recall_score,
    "f1": f1_score,
}

# Evaluate every metric on the held-out labels vs. the pipeline's predictions,
# keeping (name, value) pairs.
scores = [
    (metric_name, metric_function(y_test, y_predicted))
    for metric_name, metric_function in metric_functions.items()
]

# One (name, value) tuple per line.
for score in scores:
    print(score)

('accuracy', 0.9)
('precision', 0.8627450980392157)
('recall', 0.9166666666666666)
('f1', 0.8888888888888888)
