In [117]:
# Import necessary libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from tensorflow import keras
from tensorflow.keras import layers
from scikeras.wrappers import KerasClassifier
from imblearn.over_sampling import RandomOverSampler
from sklearn.metrics import accuracy_score
import seaborn
from matplotlib import pyplot

# Load the dataset, split features from label, and visualize first 5 entries.
df = pd.read_csv('dataset/train.csv')
features = df.drop(columns = 'Transported')
label = df['Transported']
df.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


In [118]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder

def preprocess(data):
    # Feature engineering: Cabin encoding and total spent, age group, and path features injection.
    data[['cabin_deck', 'cabin_number', 'cabin_side']] = data['Cabin'].str.split('/', n=3, expand=True)
    data['cabin_number'] = (data['cabin_number']).astype('float')
    data['total_spent'] = data['RoomService']+data['FoodCourt']+data['ShoppingMall']+data['Spa']+data['VRDeck']
    data['age_0-18'] = data['Age'].apply(lambda x: True if x<=18.0 else False)
    data['path'] = data['HomePlanet']+data['Destination']
    # Drop least relevant and engined cabin column
    data.drop(columns=['PassengerId','Name','Cabin'],inplace=True)

    # Split features into numeric and categorical columns
    numeric_data = data.select_dtypes(include=[np.number])
    categorical_data = data.select_dtypes(exclude=[np.number])
    # Categorical data - mode
    c_imputator = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
    columns = categorical_data.columns.to_list()
    categorical_data = pd.DataFrame(c_imputator.fit_transform(categorical_data),columns = columns)
    categorical_data = pd.DataFrame(OneHotEncoder(sparse_output=False).fit_transform(categorical_data))
    # Numerical data - mean
    n_imputator = SimpleImputer(missing_values=np.nan, strategy='mean')
    columns = numeric_data.columns.to_list()
    numeric_data = pd.DataFrame(n_imputator.fit_transform(numeric_data),columns = columns)
    numeric_data = pd.DataFrame(StandardScaler().fit_transform(numeric_data))
    # Concat
    data = pd.concat([numeric_data,categorical_data], axis = 1)
    return data

features = preprocess(features)

In [119]:
# Split the resampled data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(features, label, test_size=0.2, random_state=1)

In [120]:
# Define a Keras neural network model
def create_keras_model():
    model = keras.Sequential([
        layers.Dense(128, input_dim=X_train.shape[1], activation="relu"),
        layers.Dense(64, activation="relu"),
        layers.Dense(32, activation="relu"),
        layers.Dense(1, activation="sigmoid")
    ])
    model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])
    return model

keras_classifier = KerasClassifier(build_fn=create_keras_model, epochs=50, batch_size=64, verbose=1)


In [121]:
# Ensemble Learning (Soft Voting)
models = [
    ("Random Forest", RandomForestClassifier(n_estimators=100, max_depth=5, random_state=1)),
    ("Gradient Boosting", GradientBoostingClassifier(n_estimators=100, random_state=1)),
    ("Decision Tree", DecisionTreeClassifier(criterion="entropy", max_depth=4, random_state=1)),
    ("SVM (RBF Kernel)", SVC(kernel="rbf", C=1.0, gamma=0.2, probability=True, random_state=1)),
    ("Logistic Regression", LogisticRegression(random_state=1, max_iter=50000)),
    ("Keras Neural Network", keras_classifier),
]

ensemble = VotingClassifier(models, voting="soft")
# Train the ensemble model
ensemble.fit(X_train, y_train)


Epoch 1/50


  X, y = self._initialize(X, y)


Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [122]:
X_test = pd.read_csv('dataset/test.csv')
X_test = preprocess(X_test)

In [123]:
# Evaluate the ensemble model on the validation set
ensemble_accuracy = accuracy_score(y_val, ensemble.predict(X_val))
print(f"Ensemble Model Accuracy: {ensemble_accuracy}")

# Get probability estimates for test data
test_probabilities = ensemble.predict_proba(X_test)[:, 1]
print(test_probabilities)

Ensemble Model Accuracy: 0.8234617596319724
[0.62798307 0.06399945 0.95665758 ... 0.90373485 0.67599944 0.67525544]


In [124]:
# Apply a threshold to convert probabilities to binary predictions
test_data = pd.read_csv('dataset/test.csv')
threshold = 0.6
test_predictions = (test_probabilities > threshold).astype(bool)

# Save the test predictions to a CSV file
output = pd.DataFrame({"PassengerId": test_data["PassengerId"], "Transported": test_predictions})
output.to_csv("submission_ensemble.csv", index=False)
print("Test predictions saved to submission_ensemble.csv")

Test predictions saved to submission_ensemble.csv
