# Data Preprocessing

In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import LSTM, Dense, Dropout, Input
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import xgboost as xgb

# Load the datasets and stack them (train rows first, then test rows) so that
# imputation and feature engineering are applied identically to both.
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')
df_all = pd.concat([train_df, test_df], sort=True).reset_index(drop=True)

# Save PassengerId for final submission (it is dropped from df_all below).
passenger_ids = test_df['PassengerId']

# Impute missing Age with the median Age of the passenger's (Sex, Pclass) group.
group_median_age = df_all.groupby(['Sex', 'Pclass'])['Age'].transform('median')
df_all['Age'] = df_all['Age'].fillna(group_median_age)

# Missing Embarked values are filled with 'S'.
df_all['Embarked'] = df_all['Embarked'].fillna('S')

# Filling the missing value in Fare with the median Fare of 3rd class alone passenger,
# i.e. the (Pclass=3, Parch=0, SibSp=0) group. A single tuple lookup with .loc is
# used instead of chained [3][0][0] indexing so the key is unambiguously label-based.
med_fare = df_all.groupby(['Pclass', 'Parch', 'SibSp'])['Fare'].median().loc[(3, 0, 0)]
df_all['Fare'] = df_all['Fare'].fillna(med_fare)

# Deck is the first character of Cabin; 'M' marks a missing cabin.
df_all['Deck'] = df_all['Cabin'].str[0].fillna('M')

# Merge decks into coarser groups. 'T' is folded into the A/B/C group
# (the original code first relabelled 'T' as 'A').
deck_groups = {
    'T': 'ABC', 'A': 'ABC', 'B': 'ABC', 'C': 'ABC',
    'D': 'DE', 'E': 'DE',
    'F': 'FG', 'G': 'FG',
}
df_all['Deck'] = df_all['Deck'].replace(deck_groups)

# Drop identifier / free-text columns that are not used as features.
df_all = df_all.drop(columns=['PassengerId', 'Name', 'Ticket', 'Cabin'])

# Log-transform Fare; log1p(x) == log(x + 1) and is well defined at x == 0.
df_all['Fare'] = np.log1p(df_all['Fare'])

In [None]:
from sklearn.preprocessing import LabelEncoder

# label encoding option vs one hot
one_hot = True
categorical_cols = ['Embarked', 'Sex', 'Deck']
if not one_hot:
    # label encoding: each categorical column becomes a single integer column
    label_encoder = LabelEncoder()
    for col in categorical_cols:
        df_all[col] = label_encoder.fit_transform(df_all[col])
else:
    # one hot: each category becomes its own indicator column
    df_all = pd.get_dummies(df_all, columns=categorical_cols)

# Convert boolean indicator columns to 0/1 integers. astype is used rather than
# pd.Categorical(...).codes because codes number the *observed* categories, so
# a column that happened to be all True would be encoded as all 0.
for col in df_all.select_dtypes(include=['bool']).columns:
    df_all[col] = df_all[col].astype('int8')

# df_all was built as [train rows, test rows] with a fresh 0..n-1 index, and
# passenger_ids holds one entry per test row, so the split point can be derived
# instead of hard-coded.
n_train = len(df_all) - len(passenger_ids)

# .copy() / a non-inplace drop give independent frames, which avoids the
# SettingWithCopyWarning raised when mutating a slice of df_all in place.
train_df = df_all.iloc[:n_train].copy()
test_df = df_all.iloc[n_train:].drop(columns='Survived')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df.drop('Survived', axis=1, inplace=True)


In [None]:
# Display the training split after encoding to inspect the final feature columns.
train_df

Unnamed: 0,Age,Fare,Parch,Pclass,SibSp,Survived,Embarked_C,Embarked_Q,Embarked_S,Sex_female,Sex_male,Deck_ABC,Deck_DE,Deck_FG,Deck_M
0,22.0,2.110213,0,3,1,0.0,0,0,1,0,1,0,0,0,1
1,38.0,4.280593,0,1,1,1.0,1,0,0,1,0,1,0,0,0
2,26.0,2.188856,0,3,0,1.0,0,0,1,1,0,0,0,0,1
3,35.0,3.990834,0,1,1,1.0,0,0,1,1,0,1,0,0,0
4,35.0,2.202765,0,3,0,0.0,0,0,1,0,1,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,27.0,2.639057,0,2,0,0.0,0,0,1,0,1,0,0,0,1
887,19.0,3.433987,0,1,0,1.0,0,0,1,1,0,1,0,0,0
888,22.0,3.196630,2,3,1,0.0,0,0,1,1,0,0,0,0,1
889,26.0,3.433987,0,1,0,1.0,1,0,0,0,1,1,0,0,0


In [None]:
# Drop NA values. The result is assigned rather than using inplace=True,
# because train_df / test_df may be slices of df_all and in-place mutation of a
# slice triggers pandas' SettingWithCopyWarning.
train_df = train_df.dropna()
test_df = test_df.dropna()

# Every test passenger needs a prediction: if dropna removed test rows, the
# predictions would no longer line up with passenger_ids, so fail loudly here.
if len(test_df) != len(passenger_ids):
    raise ValueError(
        f'test_df has {len(test_df)} rows after dropna but there are '
        f'{len(passenger_ids)} passenger ids; impute missing test values instead of dropping rows'
    )

# Separate features and target variable
X = train_df.drop(columns='Survived').to_numpy()
y = train_df['Survived'].to_numpy()
X_test = test_df.to_numpy()

# Split the training data for validation (80/20, fixed seed for reproducibility)
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the features. The scaler is fitted on the training split only and
# then applied unchanged to the validation and test sets.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_valid = scaler.transform(X_valid)
X_test = scaler.transform(X_test)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df.dropna(inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df.dropna(inplace=True)


# Neural Network

In [None]:
# Build the model: a small fully connected binary classifier.
# The input width is declared with an explicit Input layer; passing input_dim
# to the first Dense layer of a Sequential model makes Keras emit a warning.
dense = Sequential([
    Input(shape=(X_train.shape[1],)),
    # First hidden layer
    Dense(32, activation='relu'),
    # Second hidden layer, followed by batch normalisation and light dropout
    Dense(64, activation='relu', kernel_initializer='he_normal'),
    tf.keras.layers.BatchNormalization(),
    Dropout(0.1),
    # Third hidden layer with light dropout
    Dense(32, activation='relu', kernel_initializer='he_normal'),
    Dropout(0.1),
    # Output layer: a single sigmoid unit for the binary target
    Dense(1, activation='sigmoid'),
])

# Compile the model
dense.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [None]:
# Train the model on the training split, reporting loss/accuracy on the
# validation split after every epoch (verbose=2 prints one line per epoch).
fit_options = dict(
    validation_data=(X_valid, y_valid),
    epochs=100,
    batch_size=32,
    verbose=2,
)
history = dense.fit(X_train, y_train, **fit_options)

Epoch 1/100
23/23 - 2s - 98ms/step - accuracy: 0.6433 - loss: 0.6505 - val_accuracy: 0.6983 - val_loss: 0.6327
Epoch 2/100
23/23 - 0s - 18ms/step - accuracy: 0.7865 - loss: 0.4890 - val_accuracy: 0.7654 - val_loss: 0.5717
Epoch 3/100
23/23 - 0s - 5ms/step - accuracy: 0.7795 - loss: 0.4841 - val_accuracy: 0.7989 - val_loss: 0.5348
Epoch 4/100
23/23 - 0s - 6ms/step - accuracy: 0.8104 - loss: 0.4673 - val_accuracy: 0.7989 - val_loss: 0.5123
Epoch 5/100
23/23 - 0s - 6ms/step - accuracy: 0.8216 - loss: 0.4400 - val_accuracy: 0.8101 - val_loss: 0.4979
Epoch 6/100
23/23 - 0s - 7ms/step - accuracy: 0.8048 - loss: 0.4464 - val_accuracy: 0.7989 - val_loss: 0.4853
Epoch 7/100
23/23 - 0s - 5ms/step - accuracy: 0.8160 - loss: 0.4161 - val_accuracy: 0.7933 - val_loss: 0.4727
Epoch 8/100
23/23 - 0s - 5ms/step - accuracy: 0.8174 - loss: 0.4191 - val_accuracy: 0.7933 - val_loss: 0.4594
Epoch 9/100
23/23 - 0s - 5ms/step - accuracy: 0.8272 - loss: 0.4088 - val_accuracy: 0.8101 - val_loss: 0.4471
Epoch 10

In [None]:
# Evaluate on validation set; the model was compiled with a single metric
# (accuracy), so evaluate returns the pair [loss, accuracy].
val_metrics = dense.evaluate(X_valid, y_valid, verbose=0)
val_loss, val_accuracy = val_metrics
print(f'Validation Accuracy: {val_accuracy:.4f}')


Validation Accuracy: 0.8268


# Other Models and Ensemble

In [None]:
# Best Parameters: {'colsample_bytree': 0.8, 'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 200, 'subsample': 0.9}
# NOTE(review): the parameters passed below differ from the recorded set above — confirm which is intended.
# Benchmarking with XGBoost
xgb_params = dict(
    eval_metric='logloss',
    learning_rate=0.1,
    max_depth=8,
    n_estimators=300,
)
xgb_model = xgb.XGBClassifier(**xgb_params)
xgb_model.fit(X_train, y_train)

# Score hard predictions on the held-out validation split.
xgb_valid_pred = xgb_model.predict(X_valid)
xgb_accuracy = accuracy_score(y_true=y_valid, y_pred=xgb_valid_pred)
print(f'XGBoost Accuracy: {xgb_accuracy}')

XGBoost Accuracy: 0.8435754189944135


In [None]:
# Train a random forest model as a benchmark.
# random_state is fixed (it was None) so the reported accuracy is reproducible,
# matching the seed used for the train/validation split and logistic regression.
rf_model = RandomForestClassifier(
    n_estimators=400,
    criterion='gini',
    max_depth=None,
    min_samples_split=2,
    min_samples_leaf=1,
    min_weight_fraction_leaf=0.0,
    max_leaf_nodes=None,
    min_impurity_decrease=0.0,
    bootstrap=True,
    oob_score=False,
    random_state=42,
    verbose=0,
    warm_start=False,
    class_weight=None,
    ccp_alpha=0.0,
    max_samples=None,
)
rf_model.fit(X_train, y_train)

# Score hard predictions on the held-out validation split.
rf_valid_pred = rf_model.predict(X_valid)
rf_accuracy = accuracy_score(y_valid, rf_valid_pred)
print(f'Random Forest Accuracy: {rf_accuracy}')

Random Forest Accuracy: 0.8044692737430168


In [None]:
# Fit a logistic regression baseline (fit returns the estimator itself).
lr_model = LogisticRegression(random_state=42).fit(X_train, y_train)

# Score hard predictions on the held-out validation split.
lr_valid_pred = lr_model.predict(X_valid)
lr_accuracy = accuracy_score(y_true=y_valid, y_pred=lr_valid_pred)
print(f'Logistic Regression Accuracy: {lr_accuracy}')

Logistic Regression Accuracy: 0.8156424581005587


In [None]:
import numpy as np
import pandas as pd

# Hard 0/1 test-set predictions from every model, one column per model.
# All columns are cast to int so the vote table has a single dtype (the
# scikit-learn models otherwise return float labels, since y is float).
predictions = {
    'RandomForest': rf_model.predict(X_test).reshape(-1).astype(int),
    'LogisticRegression': lr_model.predict(X_test).reshape(-1).astype(int),
    'XGBoost': xgb_model.predict(X_test).reshape(-1).astype(int),
    # The network outputs a probability; threshold it at 0.5.
    'Dense': (dense.predict(X_test) >= 0.5).astype(int).reshape(-1),
}

# Convert predictions to a DataFrame for easy manipulation
pred_df = pd.DataFrame(predictions)

# Perform majority voting: predict 1 only when strictly more than half of the
# models vote 1. With an even number of models a tie therefore resolves to 0,
# which is what taking the first (smallest) row-wise mode did implicitly.
ensemble_pred = (pred_df.sum(axis=1) * 2 > pred_df.shape[1]).astype(int)

# Guard against a silent mismatch between predictions and passenger ids.
if len(ensemble_pred) != len(passenger_ids):
    raise ValueError(
        f'{len(ensemble_pred)} predictions for {len(passenger_ids)} passenger ids'
    )

# Save the results to a CSV file. Plain arrays are used so the two columns are
# paired by position rather than by pandas index alignment.
submission_df = pd.DataFrame({
    'PassengerId': passenger_ids.to_numpy(),
    'Survived': ensemble_pred.to_numpy(),
})
# NOTE(review): ':' is not a valid filename character on Windows — consider renaming.
submission_df.to_csv('submission6:47.csv', index=False)

print('Submission file created successfully!')


[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step
Submission file created successfully!


In [None]:
# Display the per-model test-set predictions (one column per model).
pred_df

Unnamed: 0,RandomForest,LogisticRegression,XGBoost,Dense
0,0.0,0.0,0,0
1,0.0,0.0,0,0
2,0.0,0.0,0,0
3,1.0,0.0,1,0
4,0.0,1.0,1,0
...,...,...,...,...
413,0.0,0.0,0,0
414,1.0,1.0,1,1
415,0.0,0.0,0,0
416,0.0,0.0,0,0


In [None]:
# Pairwise agreement: for every pair of models, the fraction of test rows on
# which both make the same prediction.
models = pred_df.columns
votes = pred_df.to_numpy()

# Compare every column with every other column in one broadcasted operation:
# the comparison has shape (rows, models, models); averaging over rows gives a
# float (models x models) matrix instead of an object-dtype frame.
agreement = (votes[:, :, None] == votes[:, None, :]).mean(axis=0)

# A model trivially agrees with itself; keep the diagonal empty (NaN).
np.fill_diagonal(agreement, np.nan)

pairwise_agreement = pd.DataFrame(agreement, index=models, columns=models)

print("Pairwise Agreement:")
print(pairwise_agreement)

Pairwise Agreement:
                   RandomForest LogisticRegression   XGBoost     Dense
RandomForest                NaN           0.842105   0.91866  0.894737
LogisticRegression     0.842105                NaN  0.832536  0.856459
XGBoost                 0.91866           0.832536       NaN  0.866029
Dense                  0.894737           0.856459  0.866029       NaN


In [None]:
# Pairwise Pearson correlation (the pandas default) between the models'
# test-set predictions.
correlation_matrix = pred_df.corr(method='pearson')

print("Correlation Matrix:", correlation_matrix, sep="\n")


Correlation Matrix:
                    RandomForest  LogisticRegression   XGBoost     Dense
RandomForest            1.000000            0.667767  0.826567  0.774873
LogisticRegression      0.667767            1.000000  0.646247  0.696741
XGBoost                 0.826567            0.646247  1.000000  0.709266
Dense                   0.774873            0.696741  0.709266  1.000000
