![titanic-img](https://storage.googleapis.com/kaggle-competitions/kaggle/3136/logos/header.png)  
* Data Fields from [Titanic: Machine Learning from Disaster | Kaggle](https://www.kaggle.com/c/titanic)
# Titanic: Machine Learning from Disaster
* Reference
    * LD Freeman's [Kaggle notebook](https://www.kaggle.com/ldfreeman3/a-data-science-framework-to-achieve-99-accuracy)

# Import Libraries

In [1]:
import sys
import pandas as pd
import matplotlib
import numpy as np
import scipy as sp

import IPython
from IPython import display

import sklearn
import random
import time

import warnings
warnings.filterwarnings('ignore')

from subprocess import check_output
# print(check_output(['ls', './data']).decode('utf8'))

# Load Data Modelling Libraries

In [5]:
# common model algorithms
from sklearn import svm, tree, linear_model, neighbors, naive_bayes, ensemble, discriminant_analysis, gaussian_process
from xgboost import XGBClassifier

# common model helpers
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn import feature_selection
from sklearn import model_selection
from sklearn import metrics

# Visualization
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.pylab as pylab
import seaborn as sns
# from pandas.tools.plotting import scatter_matrix

# configure visualization defaults
%matplotlib inline
# Order matters here: the ggplot style sheet is applied first, then seaborn's
# 'white' style is set afterwards, so any settings both touch end up with the
# seaborn value.
mpl.style.use('ggplot')
sns.set_style('white')
# Default figure size for every plot created after this point (width, height).
pylab.rcParams['figure.figsize'] = 12, 8


In [7]:
# Read the two Titanic CSV files from the local data/ directory.
data_raw = pd.read_csv('data/train.csv')
data_val = pd.read_csv('data/test.csv')

# Work on a deep copy so that data_raw stays untouched as a reference.
data1 = data_raw.copy(deep=True)

# The list holds references (not copies), so cleaning steps that loop over it
# and modify each element in place also modify data1 and data_val.
data_cleaner = [data1, data_val]

# DataFrame.info() writes its report to stdout itself and returns None;
# wrapping it in print() only adds a stray "None" line after the report.
data_raw.info()

# Last expression of the cell: show 10 random rows of the raw training data.
data_raw.sample(10)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB
None


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
482,483,0,3,"Rouse, Mr. Richard Henry",male,50.0,0,0,A/5 3594,8.05,,S
725,726,0,3,"Oreskovic, Mr. Luka",male,20.0,0,0,315094,8.6625,,S
199,200,0,2,"Yrois, Miss. Henriette (""Mrs Harbeck"")",female,24.0,0,0,248747,13.0,,S
142,143,1,3,"Hakkarainen, Mrs. Pekka Pietari (Elin Matilda ...",female,24.0,1,0,STON/O2. 3101279,15.85,,S
568,569,0,3,"Doharr, Mr. Tannous",male,,0,0,2686,7.2292,,C
688,689,0,3,"Fischer, Mr. Eberhard Thelander",male,18.0,0,0,350036,7.7958,,S
266,267,0,3,"Panula, Mr. Ernesti Arvid",male,16.0,4,1,3101295,39.6875,,S
704,705,0,3,"Hansen, Mr. Henrik Juul",male,26.0,1,0,350025,7.8542,,S
364,365,0,3,"O'Brien, Mr. Thomas",male,,1,0,370365,15.5,,Q
631,632,0,3,"Lundahl, Mr. Johan Svensson",male,51.0,0,0,347743,7.0542,,S


# Data cleaning

In [9]:
# Report the per-column count of missing values for each frame, followed by a
# dashed separator line after each report.
null_reports = (
    ('Train columns with null values:\n', data1),
    ('Test/Validation columns with null values:\n', data_val),
)
for heading, frame in null_reports:
    print(heading, frame.isna().sum())
    print('-' * 10)

# Summary statistics for every column (numeric and non-numeric) of the raw
# training data; displayed as the cell's output.
data_raw.describe(include='all')

Train columns with null values:
 PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64
----------
Test/Validation columns with null values:
 PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64
----------


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
count,891.0,891.0,891.0,891,891,714.0,891.0,891.0,891,891.0,204,889
unique,,,,891,2,,,,681,,147,3
top,,,,"Nakid, Miss. Maria (""Mary"")",male,,,,CA. 2343,,C23 C25 C27,S
freq,,,,1,577,,,,7,,4,644
mean,446.0,0.383838,2.308642,,,29.699118,0.523008,0.381594,,32.204208,,
std,257.353842,0.486592,0.836071,,,14.526497,1.102743,0.806057,,49.693429,,
min,1.0,0.0,1.0,,,0.42,0.0,0.0,,0.0,,
25%,223.5,0.0,2.0,,,20.125,0.0,0.0,,7.9104,,
50%,446.0,0.0,3.0,,,28.0,0.0,0.0,,14.4542,,
75%,668.5,1.0,3.0,,,38.0,1.0,0.0,,31.0,,


In [10]:
# Fill missing values in both frames. Each frame is filled from its own
# statistics (median for Age and Fare, most frequent value for Embarked).
#
# The filled column is assigned back to the frame instead of calling
# `dataset[col].fillna(..., inplace=True)`: that form mutates a column
# selection in place, which is chained assignment and does not update the
# parent frame when pandas Copy-on-Write is active.
for dataset in data_cleaner:
    dataset['Age'] = dataset['Age'].fillna(dataset['Age'].median())

    dataset['Embarked'] = dataset['Embarked'].fillna(dataset['Embarked'].mode()[0])

    dataset['Fare'] = dataset['Fare'].fillna(dataset['Fare'].median())

# Columns removed from the training frame only; data_val keeps them.
drop_column = ['PassengerId', 'Cabin', 'Ticket']
# The drop stays in place because data_cleaner holds a reference to this exact
# object; rebinding data1 to a new frame would detach it from that list.
# errors='ignore' makes re-running the cell safe once the columns are gone.
data1.drop(drop_column, axis=1, inplace=True, errors='ignore')

# Verify that no nulls remain in the columns that were filled.
print(data1.isnull().sum())
print('-'*10)
print(data_val.isnull().sum())

Survived    0
Pclass      0
Name        0
Sex         0
Age         0
SibSp       0
Parch       0
Fare        0
Embarked    0
dtype: int64
----------
PassengerId      0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          327
Embarked         0
dtype: int64


In [11]:
# Derive new feature columns on both frames.
for dataset in data_cleaner:
    # SibSp + Parch + 1 (the +1 presumably counts the passenger themself).
    dataset['FamilySize'] = dataset['SibSp'] + dataset['Parch'] + 1

    # IsAlone starts at 1 and is cleared for every row with FamilySize > 1.
    # A single .loc call on the frame is used instead of
    # `dataset['IsAlone'].loc[mask] = 0`, which is chained indexing: it
    # triggers SettingWithCopyWarning and does not write back to the frame
    # when pandas Copy-on-Write is active.
    dataset['IsAlone'] = 1
    dataset.loc[dataset['FamilySize'] > 1, 'IsAlone'] = 0

    # Title is the text between the first ', ' and the following '.' in Name,
    # e.g. "Rouse, Mr. Richard Henry" -> "Mr".
    dataset['Title'] = dataset['Name'].str.split(', ', expand=True)[1].str.split('.', expand=True)[0]

    # Fare split into 4 quantile-based bins.
    # NOTE(review): bin edges are computed per frame, so data1 and data_val
    # can end up with different intervals - confirm this is intended.
    dataset['FareBin'] = pd.qcut(dataset['Fare'], 4)

    # Age (truncated to int) split into 5 equal-width bins, also per frame.
    dataset['AgeBin'] = pd.cut(dataset['Age'].astype(int), 5)

# Titles occurring fewer than stat_min times in the training frame are grouped
# under 'Misc'.
stat_min = 10

# Boolean Series indexed by title: True where the title is rare.
title_names = (data1['Title'].value_counts() < stat_min)

# NOTE(review): the grouping is applied to data1 only; data_val keeps its raw
# titles - confirm whether the validation frame should be mapped the same way.
data1['Title'] = data1['Title'].apply(lambda x: 'Misc' if title_names.loc[x] else x)
print(data1['Title'].value_counts())
print('-'*10)

data1.info()
data_val.info()
data1.sample(10)

Mr        517
Miss      182
Mrs       125
Master     40
Misc       27
Name: Title, dtype: int64
----------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 14 columns):
Survived      891 non-null int64
Pclass        891 non-null int64
Name          891 non-null object
Sex           891 non-null object
Age           891 non-null float64
SibSp         891 non-null int64
Parch         891 non-null int64
Fare          891 non-null float64
Embarked      891 non-null object
FamilySize    891 non-null int64
IsAlone       891 non-null int64
Title         891 non-null object
FareBin       891 non-null category
AgeBin        891 non-null category
dtypes: category(2), float64(2), int64(6), object(4)
memory usage: 85.5+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 16 columns):
PassengerId    418 non-null int64
Pclass         418 non-null int64
Name           418 non-null object
Sex            418 non-null 

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Embarked,FamilySize,IsAlone,Title,FareBin,AgeBin
775,0,3,"Myhrman, Mr. Pehr Fabian Oliver Malkolm",male,18.0,0,0,7.75,S,1,1,Mr,"(-0.001, 7.91]","(16.0, 32.0]"
815,0,1,"Fry, Mr. Richard",male,28.0,0,0,0.0,S,1,1,Mr,"(-0.001, 7.91]","(16.0, 32.0]"
472,1,2,"West, Mrs. Edwy Arthur (Ada Mary Worth)",female,33.0,1,2,27.75,S,4,0,Mrs,"(14.454, 31.0]","(32.0, 48.0]"
207,1,3,"Albimona, Mr. Nassef Cassem",male,26.0,0,0,18.7875,C,1,1,Mr,"(14.454, 31.0]","(16.0, 32.0]"
871,1,1,"Beckwith, Mrs. Richard Leonard (Sallie Monypeny)",female,47.0,1,1,52.5542,S,3,0,Mrs,"(31.0, 512.329]","(32.0, 48.0]"
244,0,3,"Attalah, Mr. Sleiman",male,30.0,0,0,7.225,C,1,1,Mr,"(-0.001, 7.91]","(16.0, 32.0]"
696,0,3,"Kelly, Mr. James",male,44.0,0,0,8.05,S,1,1,Mr,"(7.91, 14.454]","(32.0, 48.0]"
353,0,3,"Arnold-Franchi, Mr. Josef",male,25.0,1,0,17.8,S,2,0,Mr,"(14.454, 31.0]","(16.0, 32.0]"
332,0,1,"Graham, Mr. George Edward",male,38.0,0,1,153.4625,S,2,0,Mr,"(31.0, 512.329]","(32.0, 48.0]"
52,1,1,"Harper, Mrs. Henry Sleeper (Myna Haxtun)",female,49.0,1,0,76.7292,C,2,0,Mrs,"(31.0, 512.329]","(48.0, 64.0]"


# Convert Formats