In [2]:
import pandas as pd
import seaborn as sns
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization, Input
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam, SGD, RMSprop
from tensorflow.keras.utils import set_random_seed



# Load the Titanic passenger table that ships with seaborn.
titanic = sns.load_dataset('titanic')

# Remove three columns that are not used as features or target below.
titanic_clean = titanic.drop(columns=['deck', 'embark_town', 'alive'])

# Fill missing ages with the median age, then discard every row that still
# has a missing value in any remaining column.
# NOTE(review): the median is taken over all rows, including the ones that
# end up in the test split below; consider imputing from x_train only.
titanic_clean['age'] = titanic_clean['age'].fillna(titanic_clean['age'].median())
titanic_clean = titanic_clean.dropna()

# Encode the two text columns as integers. The explicit int cast keeps the
# feature matrix purely numeric (a mapped category-dtype column would stay
# categorical) and raises immediately if a label is missing from the mapping
# instead of silently producing NaN.
titanic_clean['sex'] = (
    titanic_clean['sex'].map({'male': 0, 'female': 1}).astype(int)
)
titanic_clean['class'] = (
    titanic_clean['class'].map({'Third': 3, 'Second': 2, 'First': 1}).astype(int)
)

# Feature matrix and binary target.
# NOTE(review): in seaborn's dataset `class` looks like the text form of
# `pclass`, so after encoding the two columns are probably identical;
# confirm and consider dropping one of them.
x = titanic_clean[['pclass', 'sex', 'age', 'fare', 'class']]
y = titanic_clean['survived']

# Hold out 20% of the rows for testing; the fixed seed keeps the split stable.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

#model = LogisticRegression(max_iter = 1000)
#model.fit(x_train, y_train)
#y_pred = model.predict(x_test)
#acc = accuracy_score(y_test, y_pred)
#print("Accuracy:", acc)
#print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
#precision = precision_score(y_test, y_pred)
#recall = recall_score(y_test, y_pred)
#f1 = f1_score(y_test, y_pred)

#print(f"Precision: {precision:.2f}")
#print(f"Recall: {recall:.2f}")
#print(f"F1 Score: {f1:.2f}")

# -------------------------------------- #

#dt_model = DecisionTreeClassifier(random_state=42)
#dt_model.fit(x_train, y_train)
#y_pred_dt = dt_model.predict(x_test)
#acc_dt = accuracy_score(y_test, y_pred_dt)
#print(f"Decision Tree Accuracy: {acc_dt:.2f}")
#print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_dt))
#precision_dt = precision_score(y_test, y_pred_dt)
#recall_dt = recall_score(y_test, y_pred_dt)
#f1_dt = f1_score(y_test, y_pred_dt)

#print(f"Precision: {precision_dt:.2f}")
#print(f"Recall: {recall_dt:.2f}")
#print(f"F1 Score: {f1_dt:.2f}")

# -------------------------------------- #

#rf_model = RandomForestClassifier(n_estimators=200, random_state=42, max_depth=5, min_samples_split=5, max_features='sqrt')
#rf_model.fit(x_train, y_train)
#y_pred_rf = rf_model.predict(x_test)
#acc_rf = accuracy_score(y_test, y_pred_rf)
#print(f"Random Forest Tree Accuracy: {acc_rf:.2f}")
#print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_rf))
#precision_rf = precision_score(y_test, y_pred_rf)
#recall_rf = recall_score(y_test, y_pred_rf)
#f1_rf = f1_score(y_test, y_pred_rf)

#print(f"Precision: {precision_rf:.2f}")
#print(f"Recall: {recall_rf:.2f}")
#print(f"F1 Score: {f1_rf:.2f}")

# -------------------------------------- #

#rf = RandomForestClassifier(random_state=42)
#param_grid = {
#    'n_estimators': [100, 200, 300],
#    'max_depth': [5, 10, None]
#}
#grid_search = GridSearchCV(
#   estimator=rf,
#    param_grid=param_grid,
#    cv=3,                # 3-fold cross-validation
#    n_jobs=-1,           # Use all cores
#    verbose=2
#)
#grid_search.fit(x_train, y_train)
#print("Best parameters:", grid_search.best_params_)
#best_rf = grid_search.best_estimator_
#y_pred_best = best_rf.predict(x_test)

#print(f"Best RF Accuracy: {accuracy_score(y_test, y_pred_best):.2f}")
#print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_best))
#print(f"Precision: {precision_score(y_test, y_pred_best):.2f}")
#print(f"Recall: {recall_score(y_test, y_pred_best):.2f}")
#print(f"F1 Score: {f1_score(y_test, y_pred_best):.2f}")

# -------------------------------------- #

#xgb_model = xgb.XGBClassifier(
#    n_estimators=100,      # Number of trees
#    max_depth=5,           # Max depth of each tree
#    learning_rate=0.1,     # Step size shrinkage
#    random_state=42,
#    use_label_encoder=False,
#    eval_metric='logloss'  # Avoid warning for new versions
#)

#x_train_encoded = pd.get_dummies(x_train)
#x_test_encoded = pd.get_dummies(x_test)

#x_test_encoded = x_test_encoded.reindex(columns=x_train_encoded.columns, fill_value=0)

#xgb_model.fit(x_train_encoded, y_train)
#y_pred_xgb = xgb_model.predict(x_test_encoded)

#print(f"XGBoost Accuracy: {accuracy_score(y_test, y_pred_xgb):.2f}")
#print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_xgb))
#print(f"Precision: {precision_score(y_test, y_pred_xgb):.2f}")
#print(f"Recall: {recall_score(y_test, y_pred_xgb):.2f}")
#print(f"F1 Score: {f1_score(y_test, y_pred_xgb):.2f}")

# -------------------------------------- #
# Deep learning concepts - layers and optimizers in Keras

# Seed Python, NumPy and the Keras backend before the model is built so that
# repeated runs start from the same random state, in line with the
# random_state=42 used for the train/test split.
# NOTE(review): bit-exact repeatability on GPU may need extra settings.
set_random_seed(42)

# Scale the features: the scaler is fitted on the training split only and the
# same transform is then applied to the test split.
scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(x_train)
x_test_scaled = scaler.transform(x_test)

# Two hidden Dense blocks (64 then 32 units), each followed by batch
# normalization and 30% dropout, feeding one sigmoid output unit.
model = Sequential([
    Input(shape=(x_train_scaled.shape[1],)),
    Dense(64, activation='relu'),
    BatchNormalization(),
    Dropout(0.3),

    Dense(32, activation='relu'),
    BatchNormalization(),
    Dropout(0.3),

    Dense(1, activation='sigmoid')
])

model.compile(
    optimizer=Adam(learning_rate=0.001),
    loss='binary_crossentropy',
    metrics=['accuracy']
)

# Stop once val_loss has not improved for 5 epochs and roll back to the best
# weights; the checkpoint callback also writes the best model to disk.
early_stop = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
checkpoint = ModelCheckpoint("best_titanic_model.keras", save_best_only=True)

# 20% of the training rows are held back by Keras as the validation set.
history = model.fit(
    x_train_scaled,
    y_train,
    validation_split=0.2,
    epochs=50,
    batch_size=32,
    callbacks=[early_stop, checkpoint],
    verbose=1
)

# Threshold the sigmoid outputs at 0.5 to obtain 0/1 class labels.
y_pred_dl = (model.predict(x_test_scaled) > 0.5).astype("int32")

# Report the held-out test metrics.
acc_dl = accuracy_score(y_test, y_pred_dl)
print(f"Accuracy: {acc_dl:.2f}")
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_dl))
print("Precision:", precision_score(y_test, y_pred_dl))
print("Recall:", recall_score(y_test, y_pred_dl))
print("F1 Score:", f1_score(y_test, y_pred_dl))


Epoch 1/50
18/18 ━━━━━━━━━━━━━━━━━━━━ 3s 29ms/step - accuracy: 0.3602 - loss: 1.5566 - val_accuracy: 0.6853 - val_loss: 0.6563
Epoch 2/50
18/18 ━━━━━━━━━━━━━━━━━━━━ 0s 10ms/step - accuracy: 0.5914 - loss: 0.7827 - val_accuracy: 0.7483 - val_loss: 0.5949
Epoch 3/50
18/18 ━━━━━━━━━━━━━━━━━━━━ 0s 10ms/step - accuracy: 0.7181 - loss: 0.6231 - val_accuracy: 0.7413 - val_loss: 0.5635
Epoch 4/50
18/18 ━━━━━━━━━━━━━━━━━━━━ 0s 15ms/step - accuracy: 0.7628 - loss: 0.5704 - val_accuracy: 0.7483 - val_loss: 0.5432
Epoch 5/50
18/18 ━━━━━━━━━━━━━━━━━━━━ 1s 14ms/step - accuracy: 0.7742 - loss: 0.5381 - val_accuracy: 0.7413 - val_loss: 0.5303
Epoch 6/50
18/18 ━━━━━━━━━━━━━━━━━━━━ 0s 14ms/step - accuracy: 0.7544 - loss: 0.5373 - val_accuracy: 0.7483 - val_loss: 0.5181
Epoch 7/50
[1m18/18[0m [32m━━━━