# Titanic : Machine Learning from Disaster

# Part 1 - Data Preprocessing

## 1. Read Data and Analysis

In [79]:
import os
import numpy as np
import pandas as pd
import tensorflow as tf

Data Analysis:
1. Get rid of useless data (Name, Embarked ...)
2. Get rid of the part, which contains lots of missing data (Cabin)

In [80]:
# Read data from files
titanic = pd.read_csv('data_set/train.csv')
print('------ data info (column name, count ) ------')
print(titanic.info())
print('\n')
print('------ data describe (count, mean, min, max ...) ------')
print(titanic.describe())


------ data info (column name, count ) ------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB
None


------ data describe (count, mean, min, max ...) ------
       PassengerId    Survived      Pclass         Age       SibSp  \
count   891.000000  891.000000  891.000000  714.000000  891.000000   
mean    446.000000    0.383838    2.308642   29.699118    0.523008   
std     257.353842    0.486592    0.836071   14.526497    1.102743   
min       1.000000    0.00

## 2. Preprocessing

###  i. Fill in Null Data (Age)


In [81]:
# The Age column has some data missing, so use age_median to fill in
titanic['Age'] = titanic['Age'].fillna(titanic['Age'].median())
print(titanic.describe())

       PassengerId    Survived      Pclass         Age       SibSp  \
count   891.000000  891.000000  891.000000  891.000000  891.000000   
mean    446.000000    0.383838    2.308642   29.361582    0.523008   
std     257.353842    0.486592    0.836071   13.019697    1.102743   
min       1.000000    0.000000    1.000000    0.420000    0.000000   
25%     223.500000    0.000000    2.000000   22.000000    0.000000   
50%     446.000000    0.000000    3.000000   28.000000    0.000000   
75%     668.500000    1.000000    3.000000   35.000000    1.000000   
max     891.000000    1.000000    3.000000   80.000000    8.000000   

            Parch        Fare  
count  891.000000  891.000000  
mean     0.381594   32.204208  
std      0.806057   49.693429  
min      0.000000    0.000000  
25%      0.000000    7.910400  
50%      0.000000   14.454200  
75%      0.000000   31.000000  
max      6.000000  512.329200  


### ii. Change  char to int （Sex : male = 1， female = 0）and Embarked (S, C, Q)
 

In [82]:
# convert ['male', 'female'] values to [1, 0]
try:
    print('------ Preprocessing Sex ------')
    print(titanic['Sex'].unique())
    titanic.loc[titanic['Sex'] == 'male', 'Sex'] = 1
    titanic.loc[titanic['Sex'] == 'female', 'Sex'] = 0

    print(titanic['Sex'].unique())


    print('------ Preprocessing Embarked (Data contains NAN) ------')
    print(titanic['Embarked'].unique())
    # Data contains 'nan'. Because 'S' is the mode, so fill in with 'S'
    titanic['Embarked'] = titanic['Embarked'].fillna('S')
    titanic.loc[titanic['Embarked'] == 'S', 'Embarked'] = 0
    titanic.loc[titanic['Embarked'] == 'C', 'Embarked'] = 1
    titanic.loc[titanic['Embarked'] == 'Q', 'Embarked'] = 2
except:
    print("------ Data has been updated ------")
    print('Data for Sex ', titanic['Sex'].unique())
    print('Data for embarked ', titanic['Embarked'].unique())





------ Preprocessing Sex ------
['male' 'female']
[1 0]
------ Preprocessing Embarked (Data contains NAN) ------
['S' 'C' 'Q' nan]


### iii. Feature Extraction - Ignore useless parts (name)

In [83]:
# select features and labels for training
dataset_X = titanic[['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']]
dataset_Y = titanic[['Survived']]
print(dataset_X)
print(dataset_Y)


     Pclass  Sex   Age  SibSp  Parch      Fare  Embarked
0         3    1  22.0      1      0    7.2500         0
1         1    0  38.0      1      0   71.2833         1
2         3    0  26.0      0      0    7.9250         0
3         1    0  35.0      1      0   53.1000         0
4         3    1  35.0      0      0    8.0500         0
5         3    1  28.0      0      0    8.4583         2
6         1    1  54.0      0      0   51.8625         0
7         3    1   2.0      3      1   21.0750         0
8         3    0  27.0      0      2   11.1333         0
9         2    0  14.0      1      0   30.0708         1
10        3    0   4.0      1      1   16.7000         0
11        1    0  58.0      0      0   26.5500         0
12        3    1  20.0      0      0    8.0500         0
13        3    1  39.0      1      5   31.2750         0
14        3    0  14.0      0      0    7.8542         0
15        2    0  55.0      0      0   16.0000         0
16        3    1   2.0      4  

## 3. Divide dataset to (training set) and (test set)


Prevent from overfitting

In [84]:
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split


# split training data and test data (30% is test data)
X_train, X_test, y_train, y_test = train_test_split(
    dataset_X.as_matrix(), 
    dataset_Y.as_matrix(), 
    test_size = 0.2, 
    random_state = 42)

  import sys
  


# Import Machie Learning Model

In [85]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

# Cross Validation (K - fold)

In [86]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

k_fold = KFold(n_splits = 10, shuffle = True, random_state = 0)


# Use KNN

In [87]:
clf = KNeighborsClassifier(n_neighbors = 13)
scoring = 'accuracy'

score = cross_val_score(clf, dataset_X, dataset_Y, cv = k_fold, n_jobs = 1, scoring = scoring)
print(score)

# kNN score
round(np.mean(score)*100, 2)


[0.66666667 0.72222222 0.71830986 0.67605634 0.71830986 0.77464789
 0.69014085 0.61971831 0.71830986 0.69014085]


  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)


69.95

# Decision Tree

In [88]:
clf = DecisionTreeClassifier()
scoring = 'accuracy'

score = cross_val_score(clf, dataset_X, dataset_Y, cv = k_fold, n_jobs = 1, scoring = scoring)
print(score)

# kNN score
round(np.mean(score)*100, 2)

[0.72222222 0.73611111 0.74647887 0.71830986 0.8028169  0.74647887
 0.73239437 0.85915493 0.78873239 0.77464789]


76.27

# Use sklearn Random Forest

In [89]:
clf = RandomForestClassifier()
scoring = 'accuracy'

score = cross_val_score(clf, dataset_X, dataset_Y, cv = k_fold, n_jobs = 1, scoring = scoring)
print(score)

# kNN score
round(np.mean(score)*100, 2)

[0.76388889 0.73611111 0.83098592 0.69014085 0.8028169  0.8028169
 0.76056338 0.88732394 0.83098592 0.77464789]


  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)


78.8

# SVM

In [90]:
clf = SVC()
scoring = 'accuracy'

score = cross_val_score(clf, dataset_X, dataset_Y, cv = k_fold, n_jobs = 1, scoring = scoring)
print(score)

# kNN score
round(np.mean(score)*100, 2)

[0.59722222 0.625      0.76056338 0.61971831 0.69014085 0.70422535
 0.61971831 0.69014085 0.71830986 0.63380282]


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


66.59

# Output testset

In [91]:
clf = RandomForestClassifier()
clf.fit(X_train, y_train)


# testdata preprocessing
dataset_t = pd.read_csv('data_set/test.csv')
dataset_t['Age'] = dataset_t['Age'].fillna(dataset_t['Age'].median())

dataset_t.loc[dataset_t['Sex'] == 'male', 'Sex'] = 1
dataset_t.loc[dataset_t['Sex'] == 'female', 'Sex'] = 0
dataset_t['Embarked'] = dataset_t['Embarked'].fillna('S')
dataset_t.loc[dataset_t['Embarked'] == 'S', 'Embarked'] = 0
dataset_t.loc[dataset_t['Embarked'] == 'C', 'Embarked'] = 1
dataset_t.loc[dataset_t['Embarked'] == 'Q', 'Embarked'] = 2
dataset_t = dataset_t.fillna(0)

datafinal = dataset_t[['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']]
print(datafinal.info())


pid = dataset_t[['PassengerId']]
# list -> dataframe

pre = clf.predict(datafinal)
pred =pd.DataFrame(pre, columns=['Survived'])
result = pd.concat([pid, pred], axis=1)

result.to_csv('titanic.csv', encoding = 'utf-8')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 7 columns):
Pclass      418 non-null int64
Sex         418 non-null int64
Age         418 non-null float64
SibSp       418 non-null int64
Parch       418 non-null int64
Fare        418 non-null float64
Embarked    418 non-null int64
dtypes: float64(2), int64(5)
memory usage: 22.9 KB
None


  
