# Titanic Survival Prediction by Logistic Regression

In [1]:
# Install the titanic data package only when it is missing, so re-running
# the notebook does not re-download it (and does not fail when offline).
if (!requireNamespace("titanic", quietly = TRUE)) {
  install.packages("titanic")
}
library(titanic)
library(tidyverse)

In [2]:
data("titanic_train")

## Structural overview of the raw training data
glimpse(titanic_train)

## Remove every row that contains at least one NA
titanic_train <- na.omit(titanic_train)
cat("\nThe number of rows after dropping missing values:", nrow(titanic_train))

## Recode Survived (0/1) as a labelled factor: 0 -> "Died", 1 -> "Survived"
titanic_train$Survived <- factor(
  titanic_train$Survived,
  levels = c(0, 1),
  labels = c("Died", "Survived")
)

Rows: 891
Columns: 12
$ PassengerId <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17,…
$ Survived    <int> 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1…
$ Pclass      <int> 3, 1, 3, 1, 3, 3, 1, 3, 3, 2, 3, 1, 3, 3, 3, 2, 3, 2, 3, 3…
$ Name        <chr> "Braund, Mr. Owen Harris", "Cumings, Mrs. John Bradley (Fl…
$ Sex         <chr> "male", "female", "female", "female", "male", "male", "mal…
$ Age         <dbl> 22, 38, 26, 35, 35, NA, 54, 2, 27, 14, 4, 58, 20, 39, 14, …
$ SibSp       <int> 1, 1, 0, 1, 0, 0, 0, 3, 0, 1, 1, 0, 0, 1, 0, 0, 4, 0, 1, 0…
$ Parch       <int> 0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 1, 0, 0, 5, 0, 0, 1, 0, 0, 0…
$ Ticket      <chr> "A/5 21171", "PC 17599", "STON/O2. 3101282", "113803", "37…
$ Fare        <dbl> 7.2500, 71.2833, 7.9250, 53.1000, 8.0500, 8.4583

In [3]:
## Split data
# Reproducible 70% / 30% train-test split over the rows of titanic_train.
set.seed(70)
n <- nrow(titanic_train)                  # number of rows available

# sample.int() draws from 1..n without the `1:n` pitfall (1:0 is c(1, 0)).
# floor() makes the rounding of the 70% share explicit instead of relying
# on how sample() handles a fractional size.
id <- sample.int(n, size = floor(n * 0.7))

train_data <- titanic_train[id, ]         # 70% train

# Select the complement by positive index: `titanic_train[-id, ]` would
# return zero rows (not all rows) if `id` were empty.
test_data <- titanic_train[setdiff(seq_len(n), id), ]   # 30% test

In [6]:
## Train Model

# Logistic regression of Survived on class, sex, age and family counts.
# With a factor response, glm() treats the first level ("Died") as failure,
# so the fitted probabilities are probabilities of "Survived".
LogModel <- glm(
  Survived ~ Pclass + Sex + Age + SibSp + Parch,
  data = train_data,
  family = "binomial"
)
summary(LogModel)

# In-sample fitted probabilities and the class they imply at a 0.5 cut-off
train_data <- train_data %>%
  mutate(
    prob_survived = predict(LogModel, type = "response"),
    pred_survived = if_else(prob_survived >= 0.5, "Survived", "Died")
  )

# Actual outcome next to the prediction for the first ten training rows
head(
  train_data[, c("PassengerId", "Survived", "prob_survived", "pred_survived")],
  10
)


Call:
glm(formula = Survived ~ Pclass + Sex + Age + SibSp + Parch, 
    family = "binomial", data = train_data)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-2.7120  -0.5989  -0.4244   0.6297   2.3493  

Coefficients:
             Estimate Std. Error z value Pr(>|z|)    
(Intercept)  5.025173   0.634333   7.922 2.34e-15 ***
Pclass      -1.158931   0.162463  -7.133 9.79e-13 ***
Sexmale     -2.619153   0.257547 -10.170  < 2e-16 ***
Age         -0.036895   0.009619  -3.836 0.000125 ***
SibSp       -0.232488   0.148095  -1.570 0.116449    
Parch        0.046009   0.144085   0.319 0.749488    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 675.80  on 498  degrees of freedom
Residual deviance: 450.23  on 493  degrees of freedom
AIC: 462.23

Number of Fisher Scoring iterations: 4


Unnamed: 0_level_0,PassengerId,Survived,prob_survived,pred_survived
Unnamed: 0_level_1,<int>,<fct>,<dbl>,<chr>
876,876,Survived,0.73006514,Survived
347,347,Survived,0.77408174,Survived
395,395,Survived,0.68024838,Survived
393,393,Died,0.07117268,Died
176,176,Died,0.12770939,Died
379,379,Died,0.1407982,Died
810,810,Survived,0.91805532,Survived
371,371,Survived,0.52303905,Survived
70,70,Died,0.07620779,Died
524,524,Survived,0.90795046,Survived


In [None]:
## Test Model
# Score the held-out rows with the fitted model, then label each row at a
# 0.5 probability cut-off
test_data <- test_data %>%
  mutate(
    prob_survived = predict(LogModel, newdata = test_data, type = "response"),
    pred_survived = if_else(prob_survived >= 0.5, "Survived", "Died")
  )

# Actual outcome next to the prediction for the first ten test rows
head(
  test_data[, c("PassengerId", "Survived", "prob_survived", "pred_survived")],
  10
)

Unnamed: 0_level_0,PassengerId,Survived,prob_survived,pred_survived
Unnamed: 0_level_1,<int>,<fct>,<dbl>,<chr>
1,1,Died,0.1076518,Died
4,4,Survived,0.9123301,Survived
7,7,Died,0.321857,Died
11,11,Survived,0.7710625,Survived
19,19,Died,0.5429295,Survived
23,23,Survived,0.7300651,Survived
25,25,Died,0.6460597,Survived
31,31,Died,0.4430684,Died
36,36,Died,0.3693522,Died
39,39,Died,0.6033137,Survived


In [10]:
## Model Evaluation
# Train Confusion Matrix
# pred_survived is a character vector, so it is converted to a factor with
# the same levels as the actual outcome. This guarantees a 2 x 2 table with
# rows in level order: a class that is never predicted gets a row of zeros
# instead of being absent (which would make the indexing below fail).
conMat_Train <- table(
  factor(train_data$pred_survived, levels = levels(train_data$Survived)),
  train_data$Survived,
  dnn = c("Predicted", "Actual")
)

# Metrics for the Train Dataframe ("Survived" is the positive class).
# Cells are addressed by name (row = predicted, column = actual) rather than
# by position, so the formulas do not depend on row/column ordering.
# Precision and F1 are NaN when no row is predicted "Survived".
trainAccuracy <- (conMat_Train["Died", "Died"] +
                    conMat_Train["Survived", "Survived"]) / sum(conMat_Train)
trainPrecision <- conMat_Train["Survived", "Survived"] /
  (conMat_Train["Survived", "Died"] + conMat_Train["Survived", "Survived"])
trainRecall <- conMat_Train["Survived", "Survived"] /
  (conMat_Train["Died", "Survived"] + conMat_Train["Survived", "Survived"])
trainF1Score <- 2 * ((trainPrecision * trainRecall) /
                       (trainPrecision + trainRecall))

# Train Model Evaluation: one-row summary, later combined with the test row
Train <- data.frame(Model = "Train",
                    Accuracy = trainAccuracy,
                    Precision = trainPrecision,
                    Recall = trainRecall,
                    F1_Score = trainF1Score)

cat("Train Confusion Matrix:\n")
print(conMat_Train)
cat("\n")
cat("Train Model Evaluation:\n")
print(Train)

Train Confusion Matrix:
          Actual
Predicted  Died Survived
  Died      252       60
  Survived   42      145

Train Model Evaluation:
  Model  Accuracy Precision    Recall  F1_Score
1 Train 0.7955912 0.7754011 0.7073171 0.7397959


In [11]:
## Model Evaluation
# Test Confusion Matrix
# pred_survived is a character vector, so it is converted to a factor with
# the same levels as the actual outcome. This guarantees a 2 x 2 table with
# rows in level order: a class that is never predicted gets a row of zeros
# instead of being absent (which would make the indexing below fail).
conMat_Test <- table(
  factor(test_data$pred_survived, levels = levels(test_data$Survived)),
  test_data$Survived,
  dnn = c("Predicted", "Actual")
)

# Metrics for the Test Dataframe ("Survived" is the positive class).
# Cells are addressed by name (row = predicted, column = actual) rather than
# by position, so the formulas do not depend on row/column ordering.
# Precision and F1 are NaN when no row is predicted "Survived".
testAccuracy <- (conMat_Test["Died", "Died"] +
                   conMat_Test["Survived", "Survived"]) / sum(conMat_Test)
testPrecision <- conMat_Test["Survived", "Survived"] /
  (conMat_Test["Survived", "Died"] + conMat_Test["Survived", "Survived"])
testRecall <- conMat_Test["Survived", "Survived"] /
  (conMat_Test["Died", "Survived"] + conMat_Test["Survived", "Survived"])
testF1Score <- 2 * ((testPrecision * testRecall) /
                      (testPrecision + testRecall))

# Test Model Evaluation: one-row summary, later combined with the train row
Test <- data.frame(Model = "Test",
                   Accuracy = testAccuracy,
                   Precision = testPrecision,
                   Recall = testRecall,
                   F1_Score = testF1Score)

cat("Test Confusion Matrix:\n")
print(conMat_Test)
cat("\n")
cat("Test Model Evaluation:\n")
print(Test)

Test Confusion Matrix:
          Actual
Predicted  Died Survived
  Died      103       20
  Survived   27       65

Test Model Evaluation:
  Model  Accuracy Precision    Recall  F1_Score
1  Test 0.7813953 0.7065217 0.7647059 0.7344633


In [12]:
## Summary
# Stack the train and test metric rows, then display them as a tibble
TrainTest <- Train %>%
  bind_rows(Test)
as_tibble(TrainTest)

Model,Accuracy,Precision,Recall,F1_Score
<chr>,<dbl>,<dbl>,<dbl>,<dbl>
Train,0.7955912,0.7754011,0.7073171,0.7397959
Test,0.7813953,0.7065217,0.7647059,0.7344633
