In [240]:
library(tidyverse)
library(caret)
library(tidyr)
library(dplyr)
library(pROC)

# Reading Dataset

In [225]:
data <- read.csv("/home/hasan/Data Set/titanic/train.csv")
head(data)

PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Thayer)",female,38.0,1,0,PC 17599,71.2833,C85,C
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q


# Data Preprocessing

In [226]:
# Selecting necessary columns
data <- select(data, Pclass, Sex, Age, SibSp, Parch, Fare, Embarked, Survived)
head(data)

Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Survived
3,male,22.0,1,0,7.25,S,0
1,female,38.0,1,0,71.2833,C,1
3,female,26.0,0,0,7.925,S,1
1,female,35.0,1,0,53.1,S,1
3,male,35.0,0,0,8.05,S,0
3,male,,0,0,8.4583,Q,0


In [227]:
#data <- data.frame(data)
#head(data)

In [228]:
dim(data)

In [229]:
# Checking missing values 
na_count <- sapply(data, function(y) sum(length(which(is.na(y)))))
na_count <- data.frame(na_count)
na_count

Unnamed: 0,na_count
Pclass,0
Sex,0
Age,177
SibSp,0
Parch,0
Fare,0
Embarked,0
Survived,0


In [230]:
# Removing rows with missing data
data <- data[rowSums(is.na(data)) == 0,]
head(data)

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Survived
1,3,male,22,1,0,7.25,S,0
2,1,female,38,1,0,71.2833,C,1
3,3,female,26,0,0,7.925,S,1
4,1,female,35,1,0,53.1,S,1
5,3,male,35,0,0,8.05,S,0
7,1,male,54,0,0,51.8625,S,0


In [231]:
# From categorical to numerical 
data$Sex <-  as.integer(data$Sex)
data$Embarked <- as.integer(data$Embarked)
data$Survived <- as.integer(data$Survived)
data$Survived = as.factor(data$Survived)
head(data)

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Survived
1,3,2,22,1,0,7.25,4,0
2,1,1,38,1,0,71.2833,2,1
3,3,1,26,0,0,7.925,4,1
4,1,1,35,1,0,53.1,4,1
5,3,2,35,0,0,8.05,4,0
7,1,2,54,0,0,51.8625,4,0


# Dataset Dividing

In [232]:
train_index <- createDataPartition(data$Survived, p=0.9, list=FALSE)
train_set <- data[train_index,]
test_set <- data[-train_index,] 

In [233]:
dim(train_set)
dim(test_set)

# Model

In [234]:
# To achieve reproducible model; set the random seed number
set.seed(100)

# Build model
model <- train(Survived ~ ., 
               data = train_set,
               method = "svmPoly",
               na.action = na.omit,
               preProcess=c("scale","center"),
               trControl= trainControl(method="none"),
               tuneGrid = data.frame(degree=1,scale=1,C=1)
)

In [235]:
model_train_pred <- predict(model, train_set)
model_test_pred <- predict(model, test_set)

In [236]:
confusionMatrix(model_test_pred, test_set$Survived)

Confusion Matrix and Statistics

          Reference
Prediction  0  1
         0 34 11
         1  8 18
                                          
               Accuracy : 0.7324          
                 95% CI : (0.6141, 0.8306)
    No Information Rate : 0.5915          
    P-Value [Acc > NIR] : 0.009644        
                                          
                  Kappa : 0.4372          
                                          
 Mcnemar's Test P-Value : 0.646355        
                                          
            Sensitivity : 0.8095          
            Specificity : 0.6207          
         Pos Pred Value : 0.7556          
         Neg Pred Value : 0.6923          
             Prevalence : 0.5915          
         Detection Rate : 0.4789          
   Detection Prevalence : 0.6338          
      Balanced Accuracy : 0.7151          
                                          
       'Positive' Class : 0               
                                    