# Second Model

In [None]:
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.inspection import permutation_importance

from constants import TRAIN_DATA, TEST_DATA, SUBMISSION_DATA, OUTPUT, SEED

In [None]:
# Load the three CSV inputs in one pass; the locations are imported from the
# project's `constants` module. Read order is train, test, submission.
train_df, test_df, submission_df = (
    pd.read_csv(csv_source)
    for csv_source in (TRAIN_DATA, TEST_DATA, SUBMISSION_DATA)
)

In [None]:
# Preview the first rows of the freshly loaded training frame.
train_df.head()

In [None]:
# Missing-value count per training column, largest count first.
train_df.isna().sum().sort_values(ascending=False)

In [None]:
# Names for the three "/"-separated parts of the "Cabin" string.
cabin_cols = ["CabinDeck", "CabinNum", "CabinSide"]

# Split once, then attach the resulting pieces as new training columns.
train_cabin_parts = train_df["Cabin"].str.split("/", expand=True)
train_df[cabin_cols] = train_cabin_parts
train_df.head()

In [None]:
# Same "/" split of "Cabin" for the test frame, so both frames carry the
# cabin-derived columns.
test_cabin_parts = test_df["Cabin"].str.split("/", expand=True)
test_df[cabin_cols] = test_cabin_parts
test_df.head()

In [None]:
# Frequency of each "CabinDeck" value (cabin_cols[0]) in train; NaN is kept
# as its own bucket.
train_df[cabin_cols[0]].value_counts(dropna=False)

In [None]:
# Frequency of each "CabinDeck" value (cabin_cols[0]) in test; NaN is kept
# as its own bucket.
test_df[cabin_cols[0]].value_counts(dropna=False)

In [None]:
# Frequency of each "CabinNum" value (cabin_cols[1]) in train, NaN included.
train_df[cabin_cols[1]].value_counts(dropna=False)

In [None]:
# Frequency of each "CabinSide" value (cabin_cols[2]) in train, NaN included.
train_df[cabin_cols[2]].value_counts(dropna=False)

In [None]:
# Frequency of each "CabinSide" value (cabin_cols[2]) in test, NaN included.
test_df[cabin_cols[2]].value_counts(dropna=False)

In [None]:
# Column dtypes of the training frame, including the cabin-derived columns.
train_df.dtypes

In [None]:
# train_df["Name"].unique().tolist()
# test_df["Name"].unique().tolist()

## Missing Values

In [None]:
# Columns treated as numeric when filling in missing values.
num_imputer_cols = [
    "Age",
    "RoomService",
    "FoodCourt",
    "ShoppingMall",
    "Spa",
    "VRDeck",
]

In [None]:
# Fill numeric gaps with the per-column median. The statistics are learned
# from the training frame only and then reused unchanged for the test frame.
# (Alternative kept for reference: SimpleImputer(strategy="mean").)
num_imputer = SimpleImputer(strategy="median").fit(train_df[num_imputer_cols])

for frame in (train_df, test_df):
    frame[num_imputer_cols] = num_imputer.transform(frame[num_imputer_cols])

In [None]:
# Sanity check: the imputed numeric columns in train should now report zero
# missing values.
train_df[num_imputer_cols].isna().sum().sort_values(ascending=False)

In [None]:
# Sanity check: the imputed numeric columns in test should now report zero
# missing values.
test_df[num_imputer_cols].isna().sum().sort_values(ascending=False)

In [None]:
# Columns treated as categorical: four columns present in the raw data ...
cat_imputer_cols = ["HomePlanet", "CryoSleep", "Destination", "VIP"]
# ... plus the deck and side parts split out of "Cabin".
cat_imputer_cols += ["CabinDeck", "CabinSide"]

In [None]:
# Fill categorical gaps with each column's most frequent value, learned from
# the training frame only and then applied to both frames.
cat_imputer = SimpleImputer(strategy="most_frequent").fit(train_df[cat_imputer_cols])

for frame in (train_df, test_df):
    frame[cat_imputer_cols] = cat_imputer.transform(frame[cat_imputer_cols])

In [None]:
# Sanity check: the imputed categorical columns in train should now report
# zero missing values.
train_df[cat_imputer_cols].isna().sum().sort_values(ascending=False)

In [None]:
# Sanity check: the imputed categorical columns in test should now report
# zero missing values.
test_df[cat_imputer_cols].isna().sum().sort_values(ascending=False)

In [None]:
# Missing values left in every training column after both imputers; columns
# that were not passed to an imputer (e.g. "Cabin") may still show gaps.
train_df.isna().sum().sort_values(ascending=False)

In [None]:
# Missing values left in every test column after both imputers; columns that
# were not passed to an imputer (e.g. "Cabin") may still show gaps.
test_df.isna().sum().sort_values(ascending=False)

## Encoding categorical features

In [None]:
# Snapshot of the training frame before the categorical columns are encoded.
train_df.head()

In [None]:
# Map each categorical value to a numeric code learned from the training
# frame. A default OrdinalEncoder raises at transform time when it meets a
# category that was absent during fit, so a category that shows up only in the
# test frame would abort the notebook. Unseen categories are mapped to -1
# instead; codes for categories seen in training are unaffected.
encoder = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)

encoder.fit(train_df[cat_imputer_cols])

train_df[cat_imputer_cols] = encoder.transform(train_df[cat_imputer_cols])
test_df[cat_imputer_cols] = encoder.transform(test_df[cat_imputer_cols])

In [None]:
# After encoding: the categorical columns in train should now hold numeric
# codes instead of their original labels.
train_df.head()

In [None]:
# After encoding: the categorical columns in test should now hold numeric
# codes instead of their original labels.
test_df.head()

## Training Data

In [None]:
# Target vector: the "Transported" column of the training frame.
y = train_df["Transported"]

In [None]:
# Model inputs, in the column order the estimator will see them.
# "CabinDeck" and "CabinSide" are currently commented out of the feature set.
features = [
    "HomePlanet", "CryoSleep", "Destination", "Age", "VIP",
    "RoomService", "FoodCourt", "ShoppingMall", "Spa", "VRDeck",
    # "CabinDeck", "CabinSide",
]

# Training feature matrix restricted to the selected columns.
X = train_df.loc[:, features]
X.head()

In [None]:
# Restrict the test frame to the same feature columns, in the same order, as
# the training matrix X. This rebinds test_df, so every other test column is
# no longer available from this point on.
test_df = test_df[features]
test_df.head()

## Random Forest

In [None]:
# Random forest with library-default hyperparameters; only the random seed is
# pinned so repeated runs give the same model.
rf_model = RandomForestClassifier(random_state=SEED)

# Fit on the full training feature matrix and target.
rf_model.fit(X, y)

In [None]:
# Importances exposed by the fitted forest, labelled with the feature names it
# was trained on and shown highest first.
rf_fi = pd.Series(
    data=rf_model.feature_importances_,
    index=rf_model.feature_names_in_,
)
rf_fi.sort_values(ascending=False)

In [None]:
# Permutation importance of each feature, averaged over 10 shuffles.
# NOTE(review): this is scored on the same data the model was fitted on, so it
# describes the fitted model rather than held-out performance.
perm_fi = permutation_importance(
    estimator=rf_model,
    X=X,
    y=y,
    n_repeats=10,
    n_jobs=2,
    random_state=SEED,
)

# Mean importance per feature, shown highest first.
rf_perm_fi = pd.Series(
    perm_fi["importances_mean"],
    index=rf_model.feature_names_in_,
)
rf_perm_fi.sort_values(ascending=False)

In [None]:
# Predict on the prepared test features and write them into the submission
# frame. The predictions are assigned by position — assumes the submission rows
# are in the same order as the test rows; TODO confirm.
test_predictions = rf_model.predict(test_df)
submission_df["Transported"] = test_predictions

# Persist without the DataFrame index column.
submission_df.to_csv(OUTPUT, index=False)

---