In [2]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report


## 1. Load the Preprocessed Features

In [11]:
data_path = '../data/processed/train.csv'
titanic_data = pd.read_csv(data_path)


titanic_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 29 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   PassengerId     891 non-null    int64  
 1   Survived        891 non-null    int64  
 2   Pclass          891 non-null    int64  
 3   Age             891 non-null    float64
 4   SibSp           891 non-null    int64  
 5   Parch           891 non-null    int64  
 6   Fare            891 non-null    float64
 7   Sex_female      891 non-null    bool   
 8   Sex_male        891 non-null    bool   
 9   Embarked_C      891 non-null    bool   
 10  Embarked_Q      891 non-null    bool   
 11  Embarked_S      891 non-null    bool   
 12  Title_Capt      891 non-null    bool   
 13  Title_Col       891 non-null    bool   
 14  Title_Countess  891 non-null    bool   
 15  Title_Don       891 non-null    bool   
 16  Title_Dr        891 non-null    bool   
 17  Title_Jonkheer  891 non-null    boo

In [7]:

# Split the data into features (X) and target (y)
X = titanic_data.drop('Survived', axis=1)
y = titanic_data['Survived']

# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)


## 2. Evaluate a variety of Models

### Logistic Regression

In [8]:

# Train and evaluate a Logistic Regression model
logreg_model = LogisticRegression()
logreg_model.fit(X_train, y_train)
logreg_preds = logreg_model.predict(X_val)

print("Logistic Regression:")
print("Accuracy:", accuracy_score(y_val, logreg_preds))
print("Classification Report:\n", classification_report(y_val, logreg_preds))


Logistic Regression:
Accuracy: 0.7932960893854749
Classification Report:
               precision    recall  f1-score   support

           0       0.81      0.85      0.83       105
           1       0.77      0.72      0.74        74

    accuracy                           0.79       179
   macro avg       0.79      0.78      0.78       179
weighted avg       0.79      0.79      0.79       179



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


### Random Forest Classifier

In [9]:

# Train and evaluate a Random Forest model
rf_model = RandomForestClassifier()
rf_model.fit(X_train, y_train)
rf_preds = rf_model.predict(X_val)

print("\nRandom Forest:")
print("Accuracy:", accuracy_score(y_val, rf_preds))
print("Classification Report:\n", classification_report(y_val, rf_preds))



Random Forest:
Accuracy: 0.8324022346368715
Classification Report:
               precision    recall  f1-score   support

           0       0.85      0.87      0.86       105
           1       0.81      0.78      0.79        74

    accuracy                           0.83       179
   macro avg       0.83      0.83      0.83       179
weighted avg       0.83      0.83      0.83       179



### XGBoost

In [10]:

# Train and evaluate an XGBoost model
xgb_model = XGBClassifier()
xgb_model.fit(X_train, y_train)
xgb_preds = xgb_model.predict(X_val)

print("\nXGBoost:")
print("Accuracy:", accuracy_score(y_val, xgb_preds))
print("Classification Report:\n", classification_report(y_val, xgb_preds))



XGBoost:
Accuracy: 0.7932960893854749
Classification Report:
               precision    recall  f1-score   support

           0       0.83      0.81      0.82       105
           1       0.74      0.77      0.75        74

    accuracy                           0.79       179
   macro avg       0.79      0.79      0.79       179
weighted avg       0.79      0.79      0.79       179



## 3. Serialize the Best Model

In [None]:
import joblib
from pathlib import Path

models_directory = Path('../models')
models_directory.mkdir(exist_ok=True)

models_filename = f""

joblib.dump(rf_model, 'RFC_model_0001.joblib')