In [2]:
from titanic_model_processor.main import make_pipeline, fit, export
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler

In [3]:
X, y = fetch_openml("titanic", version=1, as_frame=True, return_X_y=True)

In [4]:
X.rename(columns={
    "home.dest": "homedest"
}, inplace=True)

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [6]:
pipeline = make_pipeline(
    num_vars=("age", "fare", "sex"), 
    cat_vars=("pclass", "embarked"),
    components=10,
    classifier=RandomForestClassifier(n_estimators=100),
    scaler=StandardScaler(),
    num_imp_strategy="median")

In [7]:
pipeline = fit(pipeline=pipeline, x_train=X_train, y_train=y_train)

ValueError: Cannot use median strategy with non-numeric data:
could not convert string to float: 'male'

In [None]:
pipeline.score(X_train, y_train)

0.9675262655205349

In [None]:
pipeline.score(X_test, y_test)

0.7709923664122137

In [None]:
export(pipeline=pipeline, file="titanic.pkl")