In [1]:
import tensorflow as tf
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

df_train = pd.read_csv("data/train.csv")
df_test = pd.read_csv("data/test.csv")
sample_submission = pd.read_csv("data/sample_submission.csv")

In [2]:
df_train.head(5)

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


In [3]:
df_train.info()
df_train.describe()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   8693 non-null   object 
 1   HomePlanet    8492 non-null   object 
 2   CryoSleep     8476 non-null   object 
 3   Cabin         8494 non-null   object 
 4   Destination   8511 non-null   object 
 5   Age           8514 non-null   float64
 6   VIP           8490 non-null   object 
 7   RoomService   8512 non-null   float64
 8   FoodCourt     8510 non-null   float64
 9   ShoppingMall  8485 non-null   float64
 10  Spa           8510 non-null   float64
 11  VRDeck        8505 non-null   float64
 12  Name          8493 non-null   object 
 13  Transported   8693 non-null   bool   
dtypes: bool(1), float64(6), object(7)
memory usage: 891.5+ KB


Unnamed: 0,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck
count,8514.0,8512.0,8510.0,8485.0,8510.0,8505.0
mean,28.82793,224.687617,458.077203,173.729169,311.138778,304.854791
std,14.489021,666.717663,1611.48924,604.696458,1136.705535,1145.717189
min,0.0,0.0,0.0,0.0,0.0,0.0
25%,19.0,0.0,0.0,0.0,0.0,0.0
50%,27.0,0.0,0.0,0.0,0.0,0.0
75%,38.0,47.0,76.0,27.0,59.0,46.0
max,79.0,14327.0,29813.0,23492.0,22408.0,24133.0


In [4]:
sample_submission.head(5)

Unnamed: 0,PassengerId,Transported
0,0013_01,False
1,0018_01,False
2,0019_01,False
3,0021_01,False
4,0023_01,False


In [4]:
target_col = "Transported"
drop_cols = ["PassengerId", "Name"]
numeric_cols = ["Age", "RoomService", "FoodCourt", "ShoppingMall", "Spa", "VRDeck"]
categorical_cols = ["HomePlanet", "CryoSleep", "Cabin", "Destination", "VIP"]

In [5]:
y = df_train[target_col].astype(int)
X = df_train.drop(columns=drop_cols + [target_col])

In [6]:
X.head(5)

Unnamed: 0,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck
0,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0
1,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0
2,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0
3,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0
4,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0


In [7]:
y.head(5)

0    0
1    1
2    0
3    0
4    1
Name: Transported, dtype: int64

In [8]:
X_numeric = X[numeric_cols].copy()

In [9]:
X_categorical = X[categorical_cols].copy()

In [10]:
X_numeric.head()
X_categorical.head()

Unnamed: 0,HomePlanet,CryoSleep,Cabin,Destination,VIP
0,Europa,False,B/0/P,TRAPPIST-1e,False
1,Earth,False,F/0/S,TRAPPIST-1e,False
2,Europa,False,A/0/S,TRAPPIST-1e,True
3,Europa,False,A/0/S,TRAPPIST-1e,False
4,Earth,False,F/1/S,TRAPPIST-1e,False


In [11]:
X_numeric.head()

Unnamed: 0,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck
0,39.0,0.0,0.0,0.0,0.0,0.0
1,24.0,109.0,9.0,25.0,549.0,44.0
2,58.0,43.0,3576.0,0.0,6715.0,49.0
3,33.0,0.0,1283.0,371.0,3329.0,193.0
4,16.0,303.0,70.0,151.0,565.0,2.0


In [12]:
for col in numeric_cols:
    median = X_numeric[col].median()
    X_numeric[col] = X_numeric[col].fillna(median)


In [13]:
for col in categorical_cols:
    mode = X_categorical[col].mode().iloc[0]
    X_categorical[col] = X_categorical[col].fillna(mode)

  X_categorical[col] = X_categorical[col].fillna(mode)


In [16]:
## sanity check
X_numeric.isna().sum()
X_categorical.isna().sum()


HomePlanet     0
CryoSleep      0
Cabin          0
Destination    0
VIP            0
dtype: int64

In [17]:
X_cat_encoded = pd.get_dummies(X_categorical, drop_first=False)
X_cat_encoded.head()

Unnamed: 0,CryoSleep,VIP,HomePlanet_Earth,HomePlanet_Europa,HomePlanet_Mars,Cabin_A/0/P,Cabin_A/0/S,Cabin_A/1/S,Cabin_A/10/P,Cabin_A/10/S,...,Cabin_G/999/P,Cabin_G/999/S,Cabin_T/0/P,Cabin_T/1/P,Cabin_T/2/P,Cabin_T/2/S,Cabin_T/3/P,Destination_55 Cancri e,Destination_PSO J318.5-22,Destination_TRAPPIST-1e
0,False,False,False,True,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
1,False,False,True,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
2,False,True,False,True,False,False,True,False,False,False,...,False,False,False,False,False,False,False,False,False,True
3,False,False,False,True,False,False,True,False,False,False,...,False,False,False,False,False,False,False,False,False,True
4,False,False,True,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True


In [20]:
X_prepared = pd.concat([X_numeric, X_cat_encoded], axis=1)
print(X_prepared.shape)
X_prepared.isna().sum().sum()  # should be 0

(8693, 6574)


np.int64(0)

In [21]:
print(y.shape)

(8693,)


In [23]:
X_train, X_val, y_train, y_val = train_test_split(
    X_prepared, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)
    

In [24]:

rf = RandomForestClassifier(
    n_estimators=200,   # number of trees
    random_state=42,
    n_jobs=-1,          # use all cores
)

rf.fit(X_train, y_train)
y_val_pred = rf.predict(X_val)
val_acc = accuracy_score(y_val, y_val_pred)
print("Validition accuracy:", val_acc)

Validition accuracy: 0.7906843013225991


In [26]:
## BELOW IS THE FINAL MODEL FOR THE FIRST KAGGLE SUBMISSION


rf_final = RandomForestClassifier(
    n_estimators=200,
    random_state=42,
    n_jobs=-1
)

rf_final.fit(X_prepared, y)

0,1,2
,n_estimators,200
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [27]:
## save the medians / modes we used to fill the NaNs earlier during model training so we can run the same preprocessing on the 
## test data

numeric_medians = X_numeric.median()
cat_modes = X_categorical.mode().iloc[0]


In [29]:
X_test_raw = df_test.drop(columns=drop_cols)  # same cols_to_drop as before
X_test_numeric = X_test_raw[numeric_cols].copy()
X_test_categorical = X_test_raw[categorical_cols].copy()

In [30]:
X_test_numeric = X_test_numeric.fillna(numeric_medians)
for col in categorical_cols:
    X_test_categorical[col] = X_test_categorical[col].fillna(cat_modes[col])

  X_test_categorical[col] = X_test_categorical[col].fillna(cat_modes[col])


In [31]:
X_test_cat_encoded = pd.get_dummies(X_test_categorical, drop_first=False)
X_test_prepared = pd.concat([X_test_numeric, X_test_cat_encoded], axis=1)

In [33]:
## match the columns of the training data and the test data

X_test_prepared = X_test_prepared.reindex(
    columns=X_prepared.columns,   # same columns, same order as train
    fill_value=0                  # any missing dummy gets 0
)

In [34]:
X_test_prepared.shape  # (len(df_test), 6574)

(4277, 6574)

In [35]:
test_preds = rf_final.predict(X_test_prepared)  # 0/1


In [36]:
submission = pd.DataFrame({
    "PassengerId": df_test["PassengerId"],
    "Transported": test_preds.astype(bool),  # Kaggle expects True/False
})

In [37]:
submission.to_csv("submission_rf_v1.csv", index=False)