In [1]:
import marimo as mo
import polars as pl

# Titanic Tutorial with Marimo

In [2]:
# Read the labelled training split and the unlabelled test split from ./data.
train_data = pl.read_csv(source="./data/train.csv")
test_data = pl.read_csv(source="./data/test.csv")

In [3]:
# Slider (3 to 10) controlling how many rows the preview cells below display.
head_cnt = mo.ui.slider(start=3, stop=10, label="head count")
head_cnt

In [4]:
# Preview the first rows of the training split; the row count follows the slider.
train_data.head(head_cnt.value)

PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
i64,i64,i64,str,str,f64,i64,i64,str,f64,str,str
1,0,3,"""Braund, Mr. Owen Harris""","""male""",22.0,1,0,"""A/5 21171""",7.25,,"""S"""
2,1,1,"""Cumings, Mrs. John Bradley (Fl…","""female""",38.0,1,0,"""PC 17599""",71.2833,"""C85""","""C"""
3,1,3,"""Heikkinen, Miss. Laina""","""female""",26.0,0,0,"""STON/O2. 3101282""",7.925,,"""S"""


In [5]:
# Preview the first rows of the test split; the row count follows the slider.
test_data.head(head_cnt.value)

PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
i64,i64,str,str,f64,i64,i64,str,f64,str,str
892,3,"""Kelly, Mr. James""","""male""",34.5,0,0,"""330911""",7.8292,,"""Q"""
893,3,"""Wilkes, Mrs. James (Ellen Need…","""female""",47.0,1,0,"""363272""",7.0,,"""S"""
894,2,"""Myles, Mr. Thomas Francis""","""male""",62.0,0,0,"""240276""",9.6875,,"""Q"""


In [6]:
# "Survived" values of the female passengers in the training split.
women = (
    train_data.filter(pl.col("Sex") == "female").select(pl.col("Survived")).to_series()
)
# The mean of the survival flags is the surviving fraction (kept as a 0-1 ratio).
rate_women = women.mean()

# The heading promises a percentage, so render the ratio as one.
mo.md(
    f"""
    ## % of women who survived

    {rate_women:.1%}
    """
)

In [7]:
# "Survived" values of the male passengers in the training split.
men = train_data.filter(pl.col("Sex") == "male").select(pl.col("Survived")).to_series()
# The mean of the survival flags is the surviving fraction (kept as a 0-1 ratio).
rate_men = men.mean()

# The heading promises a percentage, so render the ratio as one.
mo.md(
    f"""
    ## % of men who survived

    {rate_men:.1%}
    """
)

In [8]:
# Offer only columns that exist in both splits: the model cell selects the
# chosen features from test_data as well, and this keeps the target column
# ("Survived", which the test split does not have) out of the feature list.
features = mo.ui.multiselect(
    options=[c for c in train_data.columns if c in test_data.columns],
    label="features",
    value=["Pclass", "Sex", "SibSp", "Parch"],
)

In [9]:
# Show the selector next to a live summary of what is currently selected.
_selected_summary = mo.md(f"Selected features: {', '.join(features.value)}")
mo.hstack([features, _selected_summary])

In [10]:
from sklearn.ensemble import RandomForestClassifier

# String-typed features are one-hot encoded; all other features pass through.
categorical_features = [f for f in features.value if train_data[f].dtype == pl.Utf8]

# Non-null values observed in either split, per categorical feature
# (sorted so the listing is stable between runs).
categories = {
    name: pl.concat([train_data[name], test_data[name]])
    .drop_nulls()
    .unique()
    .sort()
    .to_list()
    for name in categorical_features
}

y = train_data.select(pl.col("Survived")).to_numpy().ravel()

# Encode both splits in a single pass so that a category (or a null) that
# occurs in only one split still produces the same dummy columns for both.
# Encoding the splits separately can give X and X_test different columns,
# which breaks prediction.
_n_train = train_data.height
_encoded = pl.concat(
    [train_data.select(features.value), test_data.select(features.value)],
    how="vertical_relaxed",
).to_dummies(columns=categorical_features)
X = _encoded.head(_n_train).to_numpy()
X_test = _encoded.slice(_n_train).to_numpy()

# Fixed random_state keeps the fitted forest reproducible across runs.
model = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=1)
model.fit(X, y)

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,5
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [11]:
# Predict survival for every row of the encoded test split.
predictions = model.predict(X=X_test)

In [12]:
# Pair each test passenger id with its predicted label.
_passenger_ids = test_data.get_column("PassengerId")
_survived = predictions.tolist()
output = pl.DataFrame({"PassengerId": _passenger_ids, "Survived": _survived})
output.head(n=10)

PassengerId,Survived
i64,i64
892,0
893,1
894,0
895,0
896,1
897,0
898,1
899,0
900,1
901,0
