## RandomForest model - Spaceship Titanic Kaggle Competition

### Importing Libraries

In [12]:
import pandas as pd
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score


### Loading Training and Test Data

In [2]:
# Competition CSVs, resolved relative to the notebook's working directory.
train_path, test_path = 'train.csv', 'test.csv'

train_data = pd.read_csv(train_path)
test_data = pd.read_csv(test_path)

# Pull the target out of the training frame: `y` keeps the labels and
# `train_data` is left with the predictor columns only.
y = train_data.pop('Transported')

train_data

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines
...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,9276_01,Europa,False,A/98/P,55 Cancri e,41.0,True,0.0,6819.0,0.0,1643.0,74.0,Gravior Noxnuther
8689,9278_01,Earth,True,G/1499/S,PSO J318.5-22,18.0,False,0.0,0.0,0.0,0.0,0.0,Kurta Mondalley
8690,9279_01,Earth,False,G/1500/S,TRAPPIST-1e,26.0,False,0.0,0.0,1872.0,1.0,0.0,Fayey Connon
8691,9280_01,Europa,False,E/608/S,55 Cancri e,32.0,False,0.0,1049.0,0.0,353.0,3235.0,Celeon Hontichre


#### This is an overview of all the data provided for training purposes

### Combining Training and Test Data

In [3]:
# Stack both sets under an outer index level ('train' / 'test') so they can be
# transformed together and later pulled apart again with combined.loc[...].
frames = [train_data, test_data]
labels = ['train', 'test']
combined = pd.concat(frames, keys=labels)
combined

Unnamed: 0,Unnamed: 1,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name
train,0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy
train,1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines
train,2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent
train,3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent
train,4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
test,4272,9266_02,Earth,True,G/1496/S,TRAPPIST-1e,34.0,False,0.0,0.0,0.0,0.0,0.0,Jeron Peter
test,4273,9269_01,Earth,False,,TRAPPIST-1e,42.0,False,0.0,847.0,17.0,10.0,144.0,Matty Scheron
test,4274,9271_01,Mars,True,D/296/P,55 Cancri e,,False,0.0,0.0,0.0,0.0,0.0,Jayrin Pore
test,4275,9273_01,Europa,False,D/297/P,,,False,0.0,2680.0,0.0,0.0,523.0,Kitakan Conale


#### This is an overview of the combined training and test data. Combining the training and test sets helps to avoid unknown features in the test data

### Creating Total Spending Feature

In [None]:
# Amenity columns whose amounts are added up into a single spend figure.
total_spend = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']

# Row-wise sum. DataFrame.sum skips NaN by default, so a missing amenity amount
# contributes 0 to the total instead of turning the whole total into NaN.
combined['TotalSpending'] = combined[total_spend].sum(axis=1)

# End the cell on the new column so it is actually displayed; an assignment
# alone renders nothing.
combined['TotalSpending']

train  0           0.0
       1         736.0
       2       10383.0
       3        5176.0
       4        1091.0
                ...   
test   4272        0.0
       4273     1018.0
       4274        0.0
       4275     3203.0
       4276        0.0
Name: TotalSpending, Length: 12970, dtype: float64

### Splitting the Cabin Column into Deck and Side

In [5]:
# Cabin values look like 'B/0/P'. Splitting on '/' yields three positional
# columns; the first is kept as Deck and the third as Side. Rows with no Cabin
# stay NaN in both new columns.
cabin_parts = combined['Cabin'].str.split('/', expand=True)

combined['Deck'] = cabin_parts.iloc[:, 0]
combined['Side'] = cabin_parts.iloc[:, 2]

# Include NaN in the tally to see how many passengers have no cabin side.
combined['Side'].value_counts(dropna=False)

Side
S      6381
P      6290
NaN     299
Name: count, dtype: int64

### Selecting Features for Modeling

In [None]:
# Columns handed to the model: raw passenger attributes plus the engineered
# Deck, Side and TotalSpending columns.
features = [
    'CryoSleep', 'Age', 'VIP', 'RoomService', 'FoodCourt',
    'ShoppingMall', 'Spa', 'VRDeck', 'Deck', 'Side', 'TotalSpending'
]

# Selecting on the outer index level ('train' / 'test') separates the two sets
# again, now restricted to the chosen feature columns.
X_train_full = combined.loc['train', features]
X_test_full = combined.loc['test', features]

# Report both shapes so it is visible that the two sets share the same columns.
print(f'Train shape: {X_train_full.shape}')
print(f'Test shape: {X_test_full.shape}')


Train shape: (8693, 11)
Test shape: (4277, 11)


### Filling Missing Values ('Unknown' for Deck and Side, 0 for Everything Else)

In [7]:
# Placeholder category for rows where Deck / Side are missing.
categorical_fill = {'Deck': 'Unknown', 'Side': 'Unknown'}

# Two passes per frame: the categorical gaps get the placeholder label first,
# then every NaN still left in any other column is replaced with 0.
X_train_full = X_train_full.fillna(categorical_fill).fillna(0)
X_test_full = X_test_full.fillna(categorical_fill).fillna(0)

### Defining Categorical and Numerical Columns

In [8]:
# String-valued columns that are one-hot encoded.
categorical_cols = ['Deck', 'Side']

# The remaining feature columns, which are passed to the model as they are.
numerical_cols = [
    'CryoSleep', 'VIP', 'TotalSpending', 'Age', 'Spa',
    'VRDeck', 'RoomService', 'FoodCourt', 'ShoppingMall',
]

# Dense output so the result drops straight into a DataFrame; categories not
# seen during fit are encoded as all-zero rows instead of raising.
OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)

### One-Hot Encoding Categorical Features

In [None]:
# Fit the encoder on the training categories only, then reuse it for the test
# set. The encoder returns a bare array, so the source frame's index is passed
# back in explicitly: the concat below aligns on index, and a default RangeIndex
# would only line up by coincidence (producing NaN rows otherwise).
OH_cols_train = pd.DataFrame(
    OH_encoder.fit_transform(X_train_full[categorical_cols]),
    index=X_train_full.index,
)
OH_cols_test = pd.DataFrame(
    OH_encoder.transform(X_test_full[categorical_cols]),
    index=X_test_full.index,
)

# Drop the raw categorical columns; their encoded versions replace them.
num_X_train = X_train_full.drop(categorical_cols, axis=1)
num_X_test = X_test_full.drop(categorical_cols, axis=1)

OH_X_train = pd.concat([num_X_train, OH_cols_train], axis=1)
OH_X_test = pd.concat([num_X_test, OH_cols_test], axis=1)

# The encoded columns are labelled with integers; scikit-learn requires feature
# names to be all strings, so normalise every column label to str.
OH_X_train.columns = OH_X_train.columns.astype(str)
OH_X_test.columns = OH_X_test.columns.astype(str)

# End on the final training matrix so the cell displays it.
OH_X_train

Unnamed: 0,CryoSleep,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,TotalSpending,0,...,2,3,4,5,6,7,8,9,10,11
0,False,39.0,False,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,False,24.0,False,109.0,9.0,25.0,549.0,44.0,736.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
2,False,58.0,True,43.0,3576.0,0.0,6715.0,49.0,10383.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,False,33.0,False,0.0,1283.0,371.0,3329.0,193.0,5176.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,False,16.0,False,303.0,70.0,151.0,565.0,2.0,1091.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,False,41.0,True,0.0,6819.0,0.0,1643.0,74.0,8536.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
8689,True,18.0,False,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
8690,False,26.0,False,0.0,0.0,1872.0,1.0,0.0,1873.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
8691,False,32.0,False,0.0,1049.0,0.0,353.0,3235.0,4637.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


### Training a Random Forest Model

In [10]:
# https://www.kaggle.com/code/alexisbcook/categorical-variables
X_train, X_val, y_train_split, y_val = train_test_split(
    OH_X_train, y, test_size=0.2, random_state=40
)

forest_model = RandomForestClassifier(
    n_estimators=300,
    random_state=52
)

forest_model.fit(X_train, y_train_split)

preds = forest_model.predict(X_val)
accuracy = accuracy_score(y_val, preds)
accuracy


0.7849338700402531

### Training on Full Data and getting Submission CSV file

In [11]:
# forest_model was fitted on the 80% training split only. For the submission,
# build a fresh, unfitted forest with the same hyperparameters and fit it on
# every labelled row so the final model uses all of the available data.
final_model = RandomForestClassifier(**forest_model.get_params())
final_model.fit(OH_X_train, y)

test_predictions = final_model.predict(OH_X_test)

# One row per test passenger: the id column paired with the predicted label.
output = pd.DataFrame({
    'PassengerId': test_data['PassengerId'],
    'Transported': test_predictions
})

# Timestamped file name so repeated runs never overwrite an earlier submission.
submission_file_name = f'submission_{pd.Timestamp.now().strftime("%Y-%m-%d_%H-%M-%S")}.csv'
output.to_csv(submission_file_name, index=False)