- https://www.kaggle.com/competitions/spaceship-titanic/data
- https://www.kaggle.com/code/dansbecker/your-first-machine-learning-model
- Lazy Predict package: https://github.com/shankarpandala/lazypredict
- https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values
- https://www.kaggle.com/code/alexisbcook/missing-values

In [None]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Input CSV locations (relative paths, resolved against the working directory).
TRAIN_DATA: str = "data/train.csv"
TEST_DATA: str = "data/test.csv"
SUBMISSION_DATA: str = "data/sample_submission.csv"

# Destination file for the predictions written at the end of the notebook.
OUTPUT: str = "data/submission.csv"

# Passed as random_state to the tree-based models so reruns are repeatable.
SEED: int = 2912

In [None]:
# Read the training set, the test set and the sample submission (in that
# order); the sample submission is reused later as the output template.
train_df, test_df, submission_df = (
    pd.read_csv(csv_path) for csv_path in (TRAIN_DATA, TEST_DATA, SUBMISSION_DATA)
)

In [None]:
# Preview the first rows of the training data.
train_df.head()

In [None]:
# Preview the first rows of the test data.
test_df.head()

In [None]:
# (rows, columns) of the training data.
train_df.shape

In [None]:
# (rows, columns) of the test data.
test_df.shape

In [None]:
# Column dtypes as inferred by read_csv.
train_df.dtypes

In [None]:
# True when the number of distinct (non-null) PassengerId values equals the
# number of training rows, i.e. the id is unique per row.
train_df["PassengerId"].nunique() == train_df.shape[0]

In [None]:
# Same uniqueness check for the test set.
test_df["PassengerId"].nunique() == test_df.shape[0]

In [None]:
# Frequency of each HomePlanet value in train; dropna=False also counts NaN.
train_df["HomePlanet"].value_counts(dropna=False)

In [None]:
# Same for the test set.
test_df["HomePlanet"].value_counts(dropna=False)

In [None]:
# CryoSleep frequencies (train), NaN included.
train_df["CryoSleep"].value_counts(dropna=False)

In [None]:
# CryoSleep frequencies (test), NaN included.
test_df["CryoSleep"].value_counts(dropna=False)

In [None]:
# Destination frequencies (train), NaN included.
train_df["Destination"].value_counts(dropna=False)

In [None]:
# Destination frequencies (test), NaN included.
test_df["Destination"].value_counts(dropna=False)

In [None]:
# VIP frequencies (train), NaN included.
train_df["VIP"].value_counts(dropna=False)

In [None]:
# VIP frequencies (test), NaN included.
test_df["VIP"].value_counts(dropna=False)

In [None]:
# Cabin frequencies (train), NaN included.
train_df["Cabin"].value_counts(dropna=False)

In [None]:
# Cabin frequencies (test), NaN included.
test_df["Cabin"].value_counts(dropna=False)

In [None]:
# Target: class balance of the label column, NaN included.
train_df["Transported"].value_counts(dropna=False)

In [None]:
# Number of distinct non-null passenger names in train.
train_df["Name"].nunique()

In [None]:
# Dump the distinct values of every training column, one section per column.
for col, column_values in train_df.items():
    print(col)
    display(column_values.unique())
    print("---")

In [None]:
# Missing values per column (train).
train_df.isna().sum()

In [None]:
# Missing values per column (test).
test_df.isna().sum()

In [None]:
# Keep only the rows that have no missing value in any column. In the cells
# shown here train_nona_df is only inspected, not used for modelling.
train_nona_df = train_df.dropna(how="any")
train_nona_df.shape

In [None]:
# Number of rows removed by the dropna above.
train_df.shape[0] - train_nona_df.shape[0]

In [None]:
# Target distribution among the complete rows.
train_nona_df["Transported"].value_counts(dropna=False)

## Missing Values

In [None]:
# Missing-value counts per column, largest first (train).
train_df.isna().sum().sort_values(ascending=False)

In [None]:
# Missing-value counts per column, largest first (test).
test_df.isna().sum().sort_values(ascending=False)

In [None]:
# https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.select_dtypes.html
# Names of the numeric columns of train; these are the columns that the
# mean imputer is fitted on and applied to.
imputer_cols = train_df.select_dtypes(include="number").columns.to_list()
imputer_cols

In [None]:
# https://scikit-learn.org/stable/modules/generated/sklearn.impute.SimpleImputer.html
# Fill missing numeric values with the per-column mean learned from train.
imputer = SimpleImputer(strategy="mean")

# Learn the means on train and fill train in one step, then reuse the same
# (train-derived) means for test.
train_df[imputer_cols] = imputer.fit_transform(train_df[imputer_cols])
test_df[imputer_cols] = imputer.transform(test_df[imputer_cols])

# train_df.head(1)
test_df.head(1)

In [None]:
# List the column labels to pick the non-numeric ones handled next.
train_df.columns

In [None]:
# Columns that get most-frequent imputation and, afterwards, ordinal encoding.
cat_binary_imputer_cols = ["HomePlanet", "CryoSleep", "Destination", "VIP"]

In [None]:
# Fill missing categorical values with each column's most frequent value,
# as observed in the training data.
cat_binary_imputer = SimpleImputer(strategy="most_frequent")

# Fit on train and fill train in a single call.
train_df[cat_binary_imputer_cols] = cat_binary_imputer.fit_transform(
    train_df[cat_binary_imputer_cols]
)
# Apply the train-derived fill values to test.
test_df[cat_binary_imputer_cols] = cat_binary_imputer.transform(
    test_df[cat_binary_imputer_cols]
)

# train_df.head(1)
test_df.head(1)

In [None]:
# Re-check HomePlanet after imputation; with dropna=False a NaN row would
# still show up here if any value were left unfilled.
train_df["HomePlanet"].value_counts(dropna=False)

In [None]:
# Missing values still present per column after both imputers, largest first.
train_df.isna().sum().sort_values(ascending=False)

## Encoding categorical features

In [None]:
# Look at the imputed training data before the categorical columns are encoded.
train_df.head()

In [None]:
# https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OrdinalEncoder.html
# Map each category of the imputed categorical columns to a numeric code.
encoder = OrdinalEncoder()

# Learn the category -> code mapping on train and encode train in one call,
# then encode test with that same mapping.
train_df[cat_binary_imputer_cols] = encoder.fit_transform(
    train_df[cat_binary_imputer_cols]
)
test_df[cat_binary_imputer_cols] = encoder.transform(test_df[cat_binary_imputer_cols])

# train_df.head(1)
test_df.head(1)

## Training Data

In [None]:
# Target vector: the Transported column of the training data.
y = train_df["Transported"]
# y.head()

In [None]:
# Columns fed to the models. The four categorical ones (HomePlanet, CryoSleep,
# Destination, VIP) were ordinal-encoded earlier; PassengerId, Cabin and Name,
# which appear in the exploration cells, are left out.
features = [
    "HomePlanet",
    "CryoSleep",
    "Destination",
    "Age",
    "VIP",
    "RoomService",
    "FoodCourt",
    "ShoppingMall",
    "Spa",
    "VRDeck",
]

# Training feature matrix, columns in the order listed above.
X = train_df[features]
X.head()

In [None]:
# Restrict the test set to the model's feature columns, in the same order as X.
# This rebinds test_df, so columns outside `features` (e.g. PassengerId) are
# no longer available on it afterwards.
test_df = test_df[features]
test_df.head()

## Decision Tree

In [None]:
# Decision tree with default hyperparameters; only random_state is set, so
# reruns are repeatable.
dt_model = DecisionTreeClassifier(random_state=SEED)

# Fit on the full training set (no rows are held out here).
dt_model.fit(X, y)

In [None]:
# In-sample predictions: X is the same data the tree was fitted on.
y_pred = dt_model.predict(X)
y_pred

# dt_model.predict_proba(X)

In [None]:
# Accuracy on the rows the tree was fitted on is optimistically biased (an
# unpruned tree can largely memorise its training data), so show it next to a
# 5-fold cross-validated estimate, which scores only held-out rows.
from sklearn.model_selection import cross_val_score

# cross_val_score fits clones of dt_model on each fold, so the model fitted
# on the full training set is left untouched.
pd.Series(
    {
        "train": accuracy_score(y, y_pred),
        "cv_5fold_mean": cross_val_score(
            dt_model, X, y, cv=5, scoring="accuracy"
        ).mean(),
    },
    name="accuracy",
)

In [None]:
# submission_df["Transported"] = dt_model.predict(test_df)

In [None]:
# submission_df.head()

In [None]:
# submission_df.to_csv(OUTPUT, index=False)

## Random Forest

In [None]:
# Random forest with default hyperparameters; only random_state is set, so
# reruns are repeatable.
rf_model = RandomForestClassifier(random_state=SEED)

# Fit on the full training set; this is the model used for the submission.
rf_model.fit(X, y)

In [None]:
# Predict on the test features and store the result in the submission frame.
# NOTE(review): the predictions are assigned by position, which assumes
# test.csv and sample_submission.csv list passengers in the same order — confirm.
submission_df["Transported"] = rf_model.predict(test_df)

In [None]:
# Sanity-check the first rows of the submission.
submission_df.head()

In [None]:
# index=False keeps the DataFrame index out of the written CSV.
submission_df.to_csv(OUTPUT, index=False)

---