In [90]:
library(ggplot2)
library(randomForest)
library(dplyr)

In [91]:
set.seed(123)

In [92]:
train <- read.csv('~/library/titanic-train.csv')
head(train)

PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Thayer)",female,38.0,1,0,PC 17599,71.2833,C85,C
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q


In [93]:
test <- read.csv('~/library/titanic-test.csv')
head(test)

PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S
897,3,"Svensson, Mr. Johan Cervin",male,14.0,0,0,7538,9.225,,S


In [108]:
extractFeatures <- function(data) {
    features <- c(
        'Pclass',
        'Age',
        'Sex',
        'Parch',
        'SibSp',
        'Fare',
        'Embarked'
    )
    fea                                <- data[,features]
    fea$Pclass                         <- as.integer(fea$Pclass)
    fea$Age[ is.na(fea$Age) ]          <- -1
    fea$Age                            <- as.integer(fea$Age)
    fea$Sex                            <- as.factor(fea$Sex)
    fea$Parch                          <- as.integer(fea$Parch)
    fea$SibSp                          <- as.integer(fea$SibSp)
    fea$Fare[ is.na(fea$Fare) ]        <- median(fea$Fare, na.rm=TRUE)
    fea$Fare                           <- as.double(fea$Fare)
    fea$Embarked[ fea$Embarked == '' ] <- 'S'
    fea$Embarked                       <- as.factor(fea$Embarked)
    return(fea)
}

In [109]:
train_extract <- extractFeatures(train)
head(train_extract)

Pclass,Age,Sex,Parch,SibSp,Fare,Embarked
3,22,male,0,1,7.25,S
1,38,female,0,1,71.2833,C
3,26,female,0,0,7.925,S
1,35,female,0,1,53.1,S
3,35,male,0,0,8.05,S
3,-1,male,0,0,8.4583,Q


In [110]:
test_extract <- extractFeatures(test)
head(test_extract)

Pclass,Age,Sex,Parch,SibSp,Fare,Embarked
3,34,male,0,0,7.8292,Q
3,47,female,0,1,7.0,S
2,62,male,0,0,9.6875,Q
3,27,male,0,0,8.6625,S
3,22,female,1,1,12.2875,S
3,14,male,0,0,9.225,S


In [111]:
# data model
rf <- randomForest(train_extract, as.factor(train$Survived), ntree=100, importance=TRUE)
head(rf)

$call
randomForest(x = train_extract, y = as.factor(train$Survived), 
    ntree = 100, importance = TRUE)

$type
[1] "classification"

$predicted
  1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17  18  19  20 
  0   1   0   1   0   0   0   0   1   1   1   1   0   0   1   1   0   0   1   1 
 21  22  23  24  25  26  27  28  29  30  31  32  33  34  35  36  37  38  39  40 
  0   0   1   0   0   0   0   1   1   0   0   1   1   0   1   0   0   0   0   0 
 41  42  43  44  45  46  47  48  49  50  51  52  53  54  55  56  57  58  59  60 
  0   1   0   1   1   0   0   1   0   0   0   0   1   1   0   0   1   0   1   0 
 61  62  63  64  65  66  67  68  69  70  71  72  73  74  75  76  77  78  79  80 
  0   1   0   0   0   1   1   0   0   0   0   0   0   0   0   0   0   0   1   0 
 81  82  83  84  85  86  87  88  89  90  91  92  93  94  95  96  97  98  99 100 
  0   0   1   0   1   0   0   0   1   0   0   0   0   0   0   0   0   0   1   0 
101 102 103 104 105 106 107 108 109 110 111 

In [112]:
submission <- data.frame(PassengerId = test$PassengerId)
head(submission)

submission$Survived <- predict(rf, test_extract)
head(submission)

PassengerId
892
893
894
895
896
897


ERROR: Error in predict.randomForest(rf, test_extract): Type of predictors in new data do not match that of the training data.


In [None]:
write.csv(submission, file = 'titanic-result.csv', row.names=FALSE)

In [None]:
imp <- importance(rf, type=1)
featureImportance <- data.frame(feature=row.names(imp), Importance=imp[,1])