## Goal
Goal is to predict whether a passenger was transported to an alternate dimension (Transported column, which is true or false)

In [1]:
import polars as pl
import polars_ds as pds

In [82]:
def engineer_df(df):
  return (
    df
    .with_columns(
      # pl.col("Transported").cast(pl.Float64),
      pl.col("CryoSleep").cast(pl.Float64),
      pl.col("VIP").cast(pl.Float64),
      pid_prefix = pl.col("PassengerId").str.slice(0, 4).cast(pl.Float64),
      pid_suffix = pl.col("PassengerId").str.slice(5, 6).cast(pl.Float64),
      # Parse Cabin into three fields split by "/"
      cabin_deck = pl.col("Cabin").str.split("/").list.get(0),
      cabin_num = pl.col("Cabin").str.split("/").list.get(1).cast(pl.Float64),
      cabin_side = pl.col("Cabin").str.split("/").list.get(2),
    )
    .with_columns(
      pl.col("VIP").fill_null(0),
      pl.col("HomePlanet").fill_null("Unknown"),
      # pl.col("CryoSleep").fill_null(0),
      pl.col("Cabin").fill_null("Unknown"),
      pl.col("cabin_deck").fill_null("Unknown"),
      pl.col("cabin_num").fill_null(0),
      pl.col("cabin_side").fill_null("Unknown"),
      pl.col("Destination").fill_null("Unknown"),
      pl.col("Age").fill_null(0),
      pl.col("RoomService").fill_null(0),
      pl.col("FoodCourt").fill_null(0),
      pl.col("ShoppingMall").fill_null(0),
      pl.col("Spa").fill_null(0),
      pl.col("VRDeck").fill_null(0),
      pl.col("Name").fill_null("Unknown"),
    )
    # .with_columns(pl.col("Transported").cast(pl.Int64))
    .drop("PassengerId")
  )

In [59]:
def k_fold(df, k=5):
    n = len(df)
    df = df.with_columns(
        (pl.int_range(0, n) % k).alias("fold")
    )
    
    # Create list of (train, val) splits
    folds = []
    for i in range(k):
        train = df.filter(pl.col("fold") != i).drop("fold")
        val = df.filter(pl.col("fold") == i).drop("fold")
        folds.append((train, val))
    
    return folds



In [60]:
folds = k_fold(engineer_df(pl.read_csv("spaceship-titanic/train.csv")))

In [None]:
import ydf
train, val = folds[0]
model = (
  ydf.GradientBoostedTreesLearner(label="Transported")
  .train(train)
)

evaluation = model.evaluate(val)
evaluation

Feature Name is a CATEGORICAL feature whose dictionary has a single element. The feature will not be useful during model training.
Train model on 6954 examples
Model trained in 0:00:00.345174


AttributeError: 'Analysis' object has no attribute 'shap'

In [78]:
model.analyze(val, sampling=1.0)

In [75]:
model.predict(val)
val

HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported,pid_prefix,pid_suffix,cabin_deck,cabin_num,cabin_side
str,f64,str,str,f64,f64,f64,f64,f64,f64,f64,str,i64,f64,f64,str,f64,str
"""Europa""",0.0,"""B/0/P""","""TRAPPIST-1e""",39.0,0.0,0.0,0.0,0.0,0.0,0.0,"""Maham Ofracculy""",0,1.0,1.0,"""B""",0.0,"""P"""
"""Earth""",0.0,"""F/0/P""","""PSO J318.5-22""",44.0,0.0,0.0,483.0,0.0,291.0,0.0,"""Sandie Hinetthews""",1,5.0,1.0,"""F""",0.0,"""P"""
"""Europa""",1.0,"""B/1/P""","""TRAPPIST-1e""",34.0,0.0,0.0,0.0,0.0,0.0,0.0,"""Altardr Flatic""",1,8.0,2.0,"""B""",1.0,"""P"""
"""Earth""",0.0,"""Unknown""","""TRAPPIST-1e""",31.0,0.0,32.0,0.0,876.0,0.0,0.0,"""Justie Pooles""",0,12.0,1.0,"""Unknown""",0.0,"""Unknown"""
"""Earth""",0.0,"""F/6/P""","""55 Cancri e""",14.0,0.0,412.0,0.0,1.0,0.0,679.0,"""Philda Brighttt""",0,17.0,2.0,"""F""",6.0,"""P"""
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""Europa""",1.0,"""B/301/P""","""55 Cancri e""",19.0,0.0,0.0,0.0,0.0,0.0,0.0,"""Muonon Stranbeate""",1,9252.0,2.0,"""B""",301.0,"""P"""
"""Earth""",,"""F/1893/P""","""TRAPPIST-1e""",44.0,0.0,1030.0,1015.0,0.0,11.0,0.0,"""Annah Gilleyons""",1,9259.0,1.0,"""F""",1893.0,"""P"""
"""Earth""",1.0,"""G/1505/P""","""TRAPPIST-1e""",31.0,0.0,0.0,0.0,0.0,0.0,0.0,"""Agnesa Baldson""",1,9268.0,1.0,"""G""",1505.0,"""P"""
"""Europa""",0.0,"""A/97/P""","""TRAPPIST-1e""",0.0,0.0,0.0,0.0,0.0,0.0,0.0,"""Polaton Conable""",1,9275.0,1.0,"""A""",97.0,"""P"""


In [83]:
test = pl.read_csv("spaceship-titanic/test.csv")

In [84]:
test_input = engineer_df(test)

In [85]:
predictions = model.predict(test_input)

In [86]:
predictions

array([0.6102701 , 0.11902908, 0.9823433 , ..., 0.97203   , 0.5121196 ,
       0.75028384], shape=(4277,), dtype=float32)

In [90]:
output = pl.DataFrame({
  "PassengerId": test["PassengerId"],
  "Transported": predictions
}).with_columns(pl.when(pl.col("Transported") > 0.5).then(pl.lit("True")).otherwise(pl.lit("False")).alias("Transported"))
output.write_csv("spaceship-titanic/submission.csv", separator=",")
