In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# Apply seaborn's default styling to the figures drawn in this notebook.
sns.set()

# 1. Exploratory Data Analysis

In [2]:
# Read the train and test CSV files of the Titanic data set from the GitHub repository.
data_latih = pd.read_csv(
    filepath_or_buffer='https://raw.githubusercontent.com/imdwipayana/Titanic-Data-Analysis/main/train.csv'
)
data_uji = pd.read_csv(
    filepath_or_buffer='https://raw.githubusercontent.com/imdwipayana/Titanic-Data-Analysis/main/test.csv'
)

In [3]:
# Preview the first five rows of the training frame.
data_latih.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
# Preview the first five rows of the test frame.
data_uji.head(n=5)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


# 2. Feature Engineering

In [5]:
# Keep both frames in one list so each feature-engineering step below can be
# applied to the training and the test data in the same loop.
data_gabung = [data_latih, data_uji]

for data in data_gabung:
    # Extract the honorific from the name: the run of letters that follows a space
    # and ends with a period (e.g. "Mr" in "Braund, Mr. Owen Harris").
    # The pattern is a raw string so that `\.` reaches the regex engine as an escaped
    # dot instead of being parsed as an (invalid) Python string escape.
    data['Title'] = data['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)

In [6]:
# Numeric code for each title: the three common titles get their own code,
# every other listed title shares code 3.
gelar = {
    'Mr': 0,
    'Miss': 1,
    'Mrs': 2,
    **dict.fromkeys(
        ['Master', 'Dr', 'Rev', 'Col', 'Major', 'Mlle', 'Countess',
         'Ms', 'Lady', 'Jonkheer', 'Don', 'Dona', 'Mme', 'Capt', 'Sir'],
        3,
    ),
}

# Replace the title strings with their codes in both frames.
for data in data_gabung:
    data['Title'] = data['Title'].map(gelar)

In [7]:
# Encode 'Sex' numerically: male -> 0, female -> 1.
gender = dict(male=0, female=1)

for data in data_gabung:
    data['Sex'] = data['Sex'].map(gender)

In [8]:
# Age
# Some 'Age' values are missing; fill each gap with the median age of the
# passengers in the same frame that share the same 'Title' code.
# The filled column is assigned back to the frame: calling
# `fillna(..., inplace=True)` on `frame['Age']` acts on a temporary object, which
# triggers a chained-assignment warning on pandas 2.x and leaves the frame
# unchanged once Copy-on-Write is enabled.
data_latih['Age'] = data_latih['Age'].fillna(
    data_latih.groupby('Title')['Age'].transform('median')
)
data_uji['Age'] = data_uji['Age'].fillna(
    data_uji.groupby('Title')['Age'].transform('median')
)

In [9]:
# Embarked
# Encode the embarkation port letter as an integer code: S -> 0, C -> 1, Q -> 2.
embarked = dict(S=0, C=1, Q=2)

for data in data_gabung:
    data['Embarked'] = data['Embarked'].map(embarked)

In [10]:
def _most_common(codes):
    """Return the most frequent non-missing value in `codes` (NaN when every value is missing)."""
    modes = codes.mode()
    return modes.iloc[0] if not modes.empty else np.nan


# 'Embarked' holds nominal port codes, so gaps are filled with the most common
# port among passengers of the same 'Pclass'. A median of such codes could produce
# a value like 0.5 that does not correspond to any port.
# The result is assigned back to the frame because `fillna(..., inplace=True)` on
# a column selection acts on a temporary object and does not update the frame
# under pandas Copy-on-Write.
data_latih['Embarked'] = data_latih['Embarked'].fillna(
    data_latih.groupby('Pclass')['Embarked'].transform(_most_common)
)
data_uji['Embarked'] = data_uji['Embarked'].fillna(
    data_uji.groupby('Pclass')['Embarked'].transform(_most_common)
)

In [11]:
# Fare
# Fill missing fares with the median fare of the passenger's 'Pclass' within the
# same frame. The filled column is assigned back rather than using
# `fillna(..., inplace=True)` on `frame['Fare']`, which acts on a temporary object
# and does not update the frame under pandas Copy-on-Write.
data_latih['Fare'] = data_latih['Fare'].fillna(
    data_latih.groupby('Pclass')['Fare'].transform('median')
)
data_uji['Fare'] = data_uji['Fare'].fillna(
    data_uji.groupby('Pclass')['Fare'].transform('median')
)

In [12]:
# Cabin
# Reduce each cabin value to its first character; missing cabins stay missing.
for data in data_gabung:
    data['Cabin'] = data['Cabin'].str.slice(0, 1)

In [13]:
# Number the cabin letters in the order A, B, C, D, E, F, G, T -> 0..7.
cabin = {letter: code for code, letter in enumerate('ABCDEFGT')}

for data in data_gabung:
    data['Cabin'] = data['Cabin'].map(cabin)

In [14]:
# Fill missing cabin codes with the median cabin code of the passenger's 'Pclass'
# within the same frame. The filled column is assigned back rather than using
# `fillna(..., inplace=True)` on `frame['Cabin']`, which acts on a temporary object
# and does not update the frame under pandas Copy-on-Write.
data_latih['Cabin'] = data_latih['Cabin'].fillna(
    data_latih.groupby('Pclass')['Cabin'].transform('median')
)
data_uji['Cabin'] = data_uji['Cabin'].fillna(
    data_uji.groupby('Pclass')['Cabin'].transform('median')
)

In [15]:
# Family Size
data_latih['AnggotaKeluarga'] = data_latih['SibSp'] + data_latih['Parch'] + 1
data_uji['AnggotaKeluarga'] = data_uji['SibSp'] + data_uji['Parch'] + 1

In [16]:
# Drop the columns that are no longer used. 'PassengerId' is removed from the
# training frame only; the test frame keeps it.
data_latih = data_latih.drop(columns=['Ticket', 'SibSp', 'Parch', 'PassengerId'])
data_uji = data_uji.drop(columns=['Ticket', 'SibSp', 'Parch'])

In [17]:
# Remove the 'Name' column from both frames.
data_latih = data_latih.drop(columns='Name')
data_uji = data_uji.drop(columns='Name')

In [18]:
# Split the training frame into the label column and the feature columns.
target = data_latih['Survived']
data_fix_dilatih = data_latih.drop(columns='Survived')

In [19]:
# Preview the first five rows of the feature frame.
data_fix_dilatih.head(n=5)

Unnamed: 0,Pclass,Sex,Age,Fare,Cabin,Embarked,Title,AnggotaKeluarga
0,3,0,22.0,7.25,5.0,0.0,0,2
1,1,1,38.0,71.2833,2.0,1.0,2,2
2,3,1,26.0,7.925,5.0,0.0,1,1
3,1,1,35.0,53.1,2.0,0.0,2,2
4,3,0,35.0,8.05,5.0,0.0,0,1


In [20]:
# Count the remaining missing values per feature column.
data_fix_dilatih.isna().sum()

Pclass             0
Sex                0
Age                0
Fare               0
Cabin              0
Embarked           0
Title              0
AnggotaKeluarga    0
dtype: int64

In [21]:
# Count the remaining missing values per column of the test frame.
data_uji.isna().sum()

PassengerId        0
Pclass             0
Sex                0
Age                0
Fare               0
Cabin              0
Embarked           0
Title              0
AnggotaKeluarga    0
dtype: int64

# 3. Predictive Model

In [22]:
from sklearn.svm import SVC

In [23]:
from sklearn.model_selection import KFold, cross_val_score

# 10-fold splitter; rows are shuffled with a fixed seed so the folds are reproducible.
k_fold = KFold(
    n_splits=10,
    shuffle=True,
    random_state=0,
)

In [24]:
# Cross-validate a support vector classifier with default settings and print the
# accuracy obtained on each fold.
# NOTE(review): the features are passed in unscaled; SVMs are usually sensitive to
# feature scale, so consider standardising them before fitting.
model = SVC()
score = cross_val_score(
    estimator=model,
    X=data_fix_dilatih,
    y=target,
    scoring='accuracy',
    cv=k_fold,
    n_jobs=1,
)
print(score)

[0.65555556 0.78651685 0.68539326 0.58426966 0.75280899 0.69662921
 0.70786517 0.68539326 0.59550562 0.62921348]


In [25]:
# Mean cross-validated accuracy, expressed as a percentage.
100 * score.mean()

67.79151061173533