In [519]:
%run titanic_data.ipynb

Note: you may need to restart the kernel to use updated packages.
scikit-learn has been installed.


In [520]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import log_loss

### Data preprocessing

In [521]:
# Select the features to be used for prediction
features = ['pclass', 'sex', 'age']
target = 'survived'

In [522]:
# Remove unnecessary columns
data = data.drop(['name', 'ticket', 'fare', 'cabin', 'embarked', 'boat', 'body', 'home.dest', 'sibsp', 'parch'], axis=1)

# Convert categorical variables to numerical
data['sex'] = data['sex'].map({'male': 0, 'female': 1})

### Split the dataset into features and target

In [523]:
# Handle missing values
data[features] = data[features].fillna(data[features].mean())

data[target] = data[target].fillna(data[target].mean()).astype(int)

### Split the data into training and testing sets
Here, 80% of the data is used for training (X_train and y_train), and 20% is used for testing (X_test and y_test). The test_size=0.2 parameter specifies the proportion of the dataset to be used for testing, and random_state=42 ensures reproducibility of the split.

In [524]:
X_train, X_test, y_train, y_test = train_test_split(data[features], data[target], test_size=0.2, random_state=42)

### Model training and prediction

We then train the model on the training data by calling the fit() method on the model object and passing in the features X_train and the corresponding target y_train. This step involves the algorithm learning patterns and relationships in the training data.

In [525]:
# Create a decision tree classifier
model = RandomForestClassifier()

# Train the model
model.fit(X_train, y_train)

In [526]:
# Predict on the test set
predictions = model.predict(X_test)

The accuracy represents the proportion of correct predictions made by the model.

In [527]:
# Calculate the accuracy of the model
accuracy = accuracy_score(y_test, predictions)
print("Accuracy:", accuracy)

Accuracy: 0.7557251908396947


### Calculate binary cross-entropy loss

In [528]:
# Calculate the predicted probabilities for the testing data
y_pred_proba = model.predict_proba(X_test)[:, 1]  # Select probabilities for the survived passenger

The Binary Cross-Entropy Loss measures the dissimilarity between the predicted probabilities and the true binary labels. It quantifies the difference between the predicted probability of the positive class and the actual binary outcome. The goal is to minimize this loss function during model training.

In [529]:
# Calculate the binary cross-entropy loss
loss = log_loss(y_test, y_pred_proba)
print("Binary Cross-Entropy Loss:", loss)

Binary Cross-Entropy Loss: 1.4714281383803414
