In [52]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.model_selection import GridSearchCV
import joblib
import warnings
warnings.filterwarnings('ignore')  # silence annoying warnings

In [53]:
results = []
def evaluate_model(name, model, X_test, y_test, results):
    y_pred = model.predict(X_test)
    try:
        y_prob = model.predict_proba(X_test)[:, 1]
        roc_auc = roc_auc_score(y_test, y_prob)
    except AttributeError:
        y_prob = None
        roc_auc = None

    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    results.append({
        'Model': name,
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1 Score': f1,
        'ROC-AUC': roc_auc
    })

## preparing dataset

### uploading the dataset

In [54]:
df = pd.read_csv("..\data\cleaned_titanic.csv")

### deviding the data to variables and target

In [55]:
X = df.drop(columns=['Survived', 'PassengerId'])  # input features
y = df['Survived']  # target

### spilitting the data into training and testing 

In [56]:
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42
)

## model training

In [57]:
dt_model = DecisionTreeClassifier(random_state=42)
dt_model.fit(X_train, y_train)

y_pred = dt_model.predict(X_val)  # Predict using the validation set
print("Accuracy:", accuracy_score(y_val, y_pred))  # Check accuracy
evaluate_model("Decision Tree", dt_model, X_val, y_val, results)

Accuracy: 0.7877094972067039


In [58]:
print(classification_report(y_val, y_pred))

              precision    recall  f1-score   support

           0       0.83      0.81      0.82       105
           1       0.74      0.76      0.75        74

    accuracy                           0.79       179
   macro avg       0.78      0.78      0.78       179
weighted avg       0.79      0.79      0.79       179



## saving the model to be used in the application

In [59]:
joblib.dump(dt_model, "titanic_decisiontree_model.pkl")
print("Model saved!")

Model saved!


## Training Another Model for Comparatement Purposes 

### LogisticRegression 

In [60]:
from sklearn.linear_model import LogisticRegression

log_model = LogisticRegression(max_iter=1000)
log_model.fit(X_train, y_train)
y_pred = log_model.predict(X_val)
print("Accuracy:", accuracy_score(y_val, y_pred))
evaluate_model("Logistic Regression", log_model, X_val, y_val, results)

Accuracy: 0.8100558659217877


###  RandomForestClassifier

In [61]:
from sklearn.ensemble import RandomForestClassifier

rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)
y_pred = rf_model.predict(X_val)
print("Accuracy:", accuracy_score(y_val, y_pred))
evaluate_model("Random Forest", rf_model, X_val, y_val, results)

Accuracy: 0.8100558659217877


### KNeighborsClassifier

In [62]:
from sklearn.neighbors import KNeighborsClassifier

knn_model = KNeighborsClassifier(n_neighbors=5)
knn_model.fit(X_train, y_train)
y_pred = knn_model.predict(X_val)
print("Accuracy:", accuracy_score(y_val, y_pred))
evaluate_model("KNN", knn_model, X_val, y_val, results)

Accuracy: 0.7150837988826816


###  XGBoost

In [63]:
from sklearn.svm import SVC

xgb_model = SVC(kernel='rbf', probability=True)
xgb_model.fit(X_train, y_train)
y_pred = xgb_model.predict(X_val)
print("Accuracy:", accuracy_score(y_val, y_pred))
evaluate_model("XGBoost", xgb_model, X_val, y_val, results)

Accuracy: 0.659217877094972


## show model comparison

In [65]:
df_results = pd.DataFrame(results)
df_results.sort_values(by="F1 Score", ascending=False)

Unnamed: 0,Model,Accuracy,Precision,Recall,F1 Score,ROC-AUC
2,Random Forest,0.810056,0.77027,0.77027,0.77027,0.892535
1,Logistic Regression,0.810056,0.785714,0.743243,0.763889,0.882625
0,Decision Tree,0.787709,0.736842,0.756757,0.746667,0.803475
3,KNN,0.715084,0.709091,0.527027,0.604651,0.763192
4,XGBoost,0.659218,0.76,0.256757,0.383838,0.804118


## using the randomforest model cuz it has the best F1 and ROC-AUC, meaning it balances precision & recall and has strong probabilistic separation

In [66]:
joblib.dump(rf_model, "titanic_randomforest_model.pkl")
print("Model saved!")

Model saved!
