In [1]:
titanic <- read.table('train.csv', sep=",", header = T)

In [2]:
rownames(titanic) <- titanic$PassengerId
titanic$PassengerId <- NULL
titanic$Name <- NULL

In [3]:
str(titanic)

'data.frame':	891 obs. of  10 variables:
 $ Survived: int  0 1 1 1 0 0 0 0 1 1 ...
 $ Pclass  : int  3 1 3 1 3 3 1 3 3 2 ...
 $ Sex     : Factor w/ 2 levels "female","male": 2 1 1 1 2 2 2 2 1 1 ...
 $ Age     : num  22 38 26 35 35 NA 54 2 27 14 ...
 $ SibSp   : int  1 1 0 1 0 0 0 3 0 1 ...
 $ Parch   : int  0 0 0 0 0 0 0 1 2 0 ...
 $ Ticket  : Factor w/ 681 levels "110152","110413",..: 524 597 670 50 473 276 86 396 345 133 ...
 $ Fare    : num  7.25 71.28 7.92 53.1 8.05 ...
 $ Cabin   : Factor w/ 148 levels "","A10","A14",..: 1 83 1 57 1 1 131 1 1 1 ...
 $ Embarked: Factor w/ 4 levels "","C","Q","S": 4 2 4 4 4 3 4 4 4 2 ...


In [4]:
titanic$Pclass <- factor(titanic$Pclass)
titanic$SibSp <- factor(titanic$SibSp)
titanic$Parch <- factor(titanic$Parch) # parch means number of parents
titanic$Embarked <- factor(titanic$Embarked)
# Categorize the values/information to numeric levels

In [5]:
sum(titanic$Cabin == "")
titanic$Has_Cabin <- factor(titanic$Cabin != "")
titanic$Cabin <- NULL
# If cabin is empty, we'll assign a null, and if the cabin does has value, then we factor the cabin

In [6]:
titanic$Ticket <- NULL

In [7]:
library(caret)
# attempt to streamline the model building and evaluation process.

Loading required package: lattice
Loading required package: ggplot2


In [8]:
set.seed(10)

In [9]:
train_index <- createDataPartition(titanic$Survived,p=0.8,list=FALSE)
training_data <- titanic[train_index,]
validation_data <- titanic[-train_index,]
# Train: evaluate, using resampling, the effect of model tuning parameters on performance
# choose the “optimal” model across these parameters
# estimate model performance from a training set

In [10]:
prop.table(table(training_data$Survived))
prop.table(table(validation_data$Survived))


        0         1 
0.6143058 0.3856942 


        0         1 
0.6235955 0.3764045 

In [40]:
colSums(is.na(training_data))
colSums(is.na(validation_data))
mean_age <- mean(training_data$Age, na.rm = T)
training_data$Age[is.na(training_data$Age)] <- mean_age
validation_data$Age[is.na(validation_data$Age)] <- mean_age
colSums(is.na(training_data))
colSums(is.na(validation_data))


In [41]:
str(training_data)

'data.frame':	713 obs. of  9 variables:
 $ Survived : int  0 1 1 0 0 0 0 1 1 0 ...
 $ Pclass   : Factor w/ 3 levels "1","2","3": 3 3 1 3 3 1 3 3 1 3 ...
 $ Sex      : Factor w/ 2 levels "female","male": 2 1 1 2 2 2 2 1 1 2 ...
 $ Age      : num  22 26 35 35 30.1 ...
 $ SibSp    : Factor w/ 7 levels "0","1","2","3",..: 2 1 2 1 1 1 4 2 1 1 ...
 $ Parch    : Factor w/ 7 levels "0","1","2","3",..: 1 1 1 1 1 1 2 2 1 1 ...
 $ Fare     : num  7.25 7.92 53.1 8.05 8.46 ...
 $ Embarked : Factor w/ 4 levels "","C","Q","S": 4 4 4 4 3 4 4 4 4 4 ...
 $ Has_Cabin: Factor w/ 2 levels "FALSE","TRUE": 1 1 2 1 1 2 1 2 2 1 ...


In [42]:
attach(training_data)

In [43]:
table(Pclass)
prop.table(table(Pclass, Survived), 1)

Pclass
  1   2   3 
174 151 388 

      Survived
Pclass         0         1
     1 0.3850575 0.6149425
     2 0.5033113 0.4966887
     3 0.7603093 0.2396907

In [45]:
table(Sex)
prop.table(table(Sex, Survived), 1)

Sex
female   male 
   251    462 

        Survived
Sex              0         1
  female 0.2629482 0.7370518
  male   0.8051948 0.1948052

In [47]:
table(SibSp)
prop.table(table(SibSp, Survived), 1)

SibSp
  0   1   2   3   4   5   8 
498 164  21   8  12   4   6 

     Survived
SibSp         0         1
    0 0.6485944 0.3514056
    1 0.4512195 0.5487805
    2 0.6666667 0.3333333
    3 0.8750000 0.1250000
    4 0.8333333 0.1666667
    5 1.0000000 0.0000000
    8 1.0000000 0.0000000

In [38]:
table(Parch)
prop.table(table(Parch, Survived), 1)

Parch
  0   1   2   3   4   5   6 
547  93  59   5   4   4   1 

     Survived
Parch         0         1
    0 0.6544790 0.3455210
    1 0.4301075 0.5698925
    2 0.5084746 0.4915254
    3 0.4000000 0.6000000
    4 1.0000000 0.0000000
    5 0.7500000 0.2500000
    6 1.0000000 0.0000000

In [39]:
table(Embarked)
prop.table(table(Embarked, Survived), 1)

Embarked
      C   Q   S 
  1 121  62 529 

        Survived
Embarked         0         1
         0.0000000 1.0000000
       C 0.4628099 0.5371901
       Q 0.6129032 0.3870968
       S 0.6502836 0.3497164

In [51]:
training_data[training_data$Sex == 'female',]$Survived

In [None]:
women_and_children_survived_with_less_than_3SibSp <- rep(0, length(titanic$Survived))
    women_and_children_survived_with_less_than_3SibSp[titanic$Sex == 'female'] <- 1
    women_and_children_survived_with_less_than_3SibSp[titanic$Age < 10] <- 1
    women_and_children_survived_with_less_than_3SibSp[titanic$SibSp < 4] <- 1