In [56]:
from pandas import DataFrame
from common.utils import load_dataset, optimize_memory, get_params, DatasetType
import pandas as pd
from ydata_profiling import ProfileReport
import seaborn as sns
import matplotlib.pyplot as plt
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split

In [57]:
train_df: DataFrame = load_dataset("spaceship-titanic", DatasetType.TRAIN, index=False)
train_df.head()

Unnamed: 0_level_0,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


In [58]:
train_df.dtypes

HomePlanet       object
CryoSleep        object
Cabin            object
Destination      object
Age             float64
VIP              object
RoomService     float64
FoodCourt       float64
ShoppingMall    float64
Spa             float64
VRDeck          float64
Name             object
Transported        bool
dtype: object

In [59]:
train_df.shape

(8693, 13)

In [60]:
profile = ProfileReport(train_df, title="Spaceship Titanic Overview", explorative=True)
#profile.to_file("report.html")

In [67]:
train_df["Destination"] = train_df["Destination"].astype("category")
train_df["HomePlanet"] = train_df["HomePlanet"].astype("category")
train_df["CryoSleep"] = train_df["CryoSleep"].astype("bool")
train_df["VIP"] = train_df["VIP"].astype("bool")
train_df["Transported"] = train_df["Transported"].astype("bool")

if "Cabin" in train_df.columns:
    train_df[["Cabin_deck", "Cabin_num", "Cabin_side"]] = train_df["Cabin"] \
        .str.split("/", expand=True)
    train_df["Cabin_num"] = pd.to_numeric(train_df["Cabin_num"], errors="coerce")
    train_df["Cabin_deck"] = train_df["Cabin_deck"].astype("category")
    train_df["Cabin_side"] = train_df["Cabin_side"].astype("category")


drop_cols = ["Cabin", "Name"]

for col in drop_cols:
    if col in train_df.columns:
        train_df = train_df.drop(col, axis=1)



In [62]:
train_df.dtypes

HomePlanet      category
CryoSleep           bool
Destination     category
Age              float64
VIP                 bool
RoomService      float64
FoodCourt        float64
ShoppingMall     float64
Spa              float64
VRDeck           float64
Transported         bool
Cabin_deck      category
Cabin_num        float64
Cabin_side      category
dtype: object

In [63]:
train_df.to_parquet("data/spaceship-titanic-train.parquet")

In [64]:
test_df: DataFrame = load_dataset("spaceship-titanic", DatasetType.TEST, index=False)
test_df.head()

Unnamed: 0_level_0,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
0013_01,Earth,True,G/3/S,TRAPPIST-1e,27.0,False,0.0,0.0,0.0,0.0,0.0,Nelly Carsoning
0018_01,Earth,False,F/4/S,TRAPPIST-1e,19.0,False,0.0,9.0,0.0,2823.0,0.0,Lerome Peckers
0019_01,Europa,True,C/0/S,55 Cancri e,31.0,False,0.0,0.0,0.0,0.0,0.0,Sabih Unhearfus
0021_01,Europa,False,C/1/S,TRAPPIST-1e,38.0,False,0.0,6652.0,0.0,181.0,585.0,Meratz Caltilter
0023_01,Earth,False,F/5/S,TRAPPIST-1e,20.0,False,10.0,0.0,635.0,0.0,0.0,Brence Harperez


In [65]:
test_df = pd.read_csv("test.csv")

# Apply same preprocessing as train_df
test_df["Destination"] = test_df["Destination"].astype("category")
test_df["HomePlanet"] = test_df["HomePlanet"].astype("category")
test_df["CryoSleep"] = test_df["CryoSleep"].astype("bool")
test_df["VIP"] = test_df["VIP"].astype("bool")

if "Cabin" in test_df.columns:
    test_df[["Cabin_deck", "Cabin_num", "Cabin_side"]] = test_df["Cabin"].str.split("/", expand=True)
    test_df['Cabin_num'] = pd.to_numeric(test_df['Cabin_num'], errors='coerce')
    test_df["Cabin_deck"] = test_df["Cabin_deck"].astype("category")
    test_df["Cabin_side"] = test_df["Cabin_side"].astype("category")
    test_df = test_df.drop("Cabin", axis=1)

test_df = test_df.drop("Name", axis=1)

# Drop PassengerId from features for prediction, but keep it for submission
passenger_ids = test_df["PassengerId"]
X_test_final = test_df.drop("PassengerId", axis=1)

FileNotFoundError: [Errno 2] No such file or directory: 'test.csv'