In [154]:
# Import necessary libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from tensorflow import keras
from tensorflow.keras import layers
from scikeras.wrappers import KerasClassifier
from imblearn.over_sampling import RandomOverSampler
from sklearn.metrics import accuracy_score
import seaborn
from matplotlib import pyplot

# Load the dataset, split features from label, and visualize first 5 entries.
df = pd.read_csv('dataset/train.csv')
features = df.drop(columns = 'Transported')
label = df['Transported']
df.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


In [155]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder

def preprocess(data):
    # Feature engineering: Cabin encoding and total spent, age group, and path features injection.
    data[['cabin_deck', 'cabin_number', 'cabin_side']] = data['Cabin'].str.split('/', n=3, expand=True)
    data['cabin_number'] = (data['cabin_number']).astype('float')
    data['total_spent'] = data['RoomService']+data['FoodCourt']+data['ShoppingMall']+data['Spa']+data['VRDeck']
    data['age_0-18'] = data['Age'].apply(lambda x: True if x<=18.0 else False)
    data['path'] = data['HomePlanet']+data['Destination']
    # Drop least relevant and engined cabin column
    data.drop(columns=['PassengerId','Name','Cabin'],inplace=True)

    # Split features into numeric and categorical columns
    numeric_data = data.select_dtypes(include=[np.number])
    categorical_data = data.select_dtypes(exclude=[np.number])
    # Categorical data - mode
    columns = categorical_data.columns.to_list()
    categorical_data = pd.DataFrame(SimpleImputer(missing_values=np.nan, strategy='most_frequent').fit_transform(categorical_data),columns = columns)
    categorical_data = pd.DataFrame(OneHotEncoder(sparse_output=False).fit_transform(categorical_data))
    # Numerical data - mean
    columns = numeric_data.columns.to_list()
    numeric_data = pd.DataFrame(SimpleImputer(missing_values=np.nan, strategy='mean').fit_transform(numeric_data),columns = columns)
    numeric_data = pd.DataFrame(StandardScaler().fit_transform(numeric_data))
    # Concat
    data = pd.concat([numeric_data,categorical_data], axis = 1)
    return data

features = preprocess(features)

In [156]:
# Split the resampled data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(features, label, test_size=0.2, random_state=1)

In [157]:
# Define a Keras neural network model
def create_keras_model():
    model = keras.Sequential([
        layers.Dense(128, input_dim=X_train.shape[1], activation="relu"),
        layers.Dense(64, activation="relu"),
        layers.Dense(32, activation="relu"),
        layers.Dense(1, activation="sigmoid")
    ])
    model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])
    return model

keras_classifier = KerasClassifier(build_fn=create_keras_model, epochs=120, batch_size=64, verbose=1)


In [158]:
# Ensemble Learning (Soft Voting)
models = [
    ("Random Forest", RandomForestClassifier(n_estimators=100, max_depth=5, random_state=1)),
    ("Gradient Boosting", GradientBoostingClassifier(n_estimators=100, random_state=1)),
    ("Decision Tree", DecisionTreeClassifier(criterion="entropy", max_depth=4, random_state=1)),
    ("SVM (RBF Kernel)", SVC(kernel="rbf", C=1.0, gamma=0.2, probability=True, random_state=1)),
    ("Logistic Regression", LogisticRegression(random_state=1, max_iter=50000)),
    ("Keras Neural Network", keras_classifier),
]

ensemble = VotingClassifier(models, voting="soft")
# Train the ensemble model
ensemble.fit(X_train, y_train)


Epoch 1/120


  X, y = self._initialize(X, y)


Epoch 2/120
Epoch 3/120
Epoch 4/120
Epoch 5/120
Epoch 6/120
Epoch 7/120
Epoch 8/120
Epoch 9/120
Epoch 10/120
Epoch 11/120
Epoch 12/120
Epoch 13/120
Epoch 14/120
Epoch 15/120
Epoch 16/120
Epoch 17/120
Epoch 18/120
Epoch 19/120
Epoch 20/120
Epoch 21/120
Epoch 22/120
Epoch 23/120
Epoch 24/120
Epoch 25/120
Epoch 26/120
Epoch 27/120
Epoch 28/120
Epoch 29/120
Epoch 30/120
Epoch 31/120
Epoch 32/120
Epoch 33/120
Epoch 34/120
Epoch 35/120
Epoch 36/120
Epoch 37/120
Epoch 38/120
Epoch 39/120
Epoch 40/120
Epoch 41/120
Epoch 42/120
Epoch 43/120
Epoch 44/120
Epoch 45/120
Epoch 46/120
Epoch 47/120
Epoch 48/120
Epoch 49/120
Epoch 50/120
Epoch 51/120
Epoch 52/120
Epoch 53/120
Epoch 54/120
Epoch 55/120
Epoch 56/120
Epoch 57/120
Epoch 58/120
Epoch 59/120
Epoch 60/120
Epoch 61/120
Epoch 62/120
Epoch 63/120
Epoch 64/120
Epoch 65/120
Epoch 66/120
Epoch 67/120
Epoch 68/120
Epoch 69/120
Epoch 70/120
Epoch 71/120
Epoch 72/120
Epoch 73/120
Epoch 74/120
Epoch 75/120
Epoch 76/120
Epoch 77/120
Epoch 78/120
Epoch 7

In [159]:
X_test = pd.read_csv('dataset/test.csv')
X_test = preprocess(X_test)

In [160]:
# Evaluate the ensemble model on the validation set
ensemble_accuracy = accuracy_score(y_val, ensemble.predict(X_val))
print(f"Ensemble Model Accuracy: {ensemble_accuracy}")

# Get probability estimates for test data
test_probabilities = ensemble.predict_proba(X_test)[:, 1]
print(test_probabilities)

Ensemble Model Accuracy: 0.8182863714778609
[0.60352458 0.06350984 0.9566669  ... 0.90374423 0.67667278 0.62981105]


In [161]:
# Apply a threshold to convert probabilities to binary predictions
test_data = pd.read_csv('dataset/test.csv')
threshold = 0.5
test_predictions = (test_probabilities > threshold).astype(bool)

# Save the test predictions to a CSV file
output = pd.DataFrame({"PassengerId": test_data["PassengerId"], "Transported": test_predictions})
output.to_csv("submission_ensemble.csv", index=False)
print("Test predictions saved to submission_ensemble.csv")

Test predictions saved to submission_ensemble.csv
