### Import required libraries

In [2]:
import pandas as pd
import numpy as np

## Import train and test set
#### properties
| Variable | Definition | Key |
| :------- | ---------: | --: |
| survival | Survival | 0 = No, 1 = Yes |
| pclass | Ticket class | 1 = 1st, 2 = 2nd, 3 = 3rd |
| sex | Sex | |
| Age | Age in years | |
| sibsp | # of siblings / spouses aboard the Titanic | |
| parch | # of parents / children aboard the Titanic | |
| ticket | Ticket number | |
| fare | Passenger fare | |
| cabin | Cabin number | |
| embarked | Port of Embarkation | C = Cherbourg, Q = Queenstown, S = Southampton |

In [81]:
# Load the two data splits; paths are relative to the notebook's working directory.
# `train` carries the `Survived` label that the model is fitted on further down.
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')

In [5]:
# Presumably the sample submission shipped with the dataset; it is only inspected, not used for modelling.
gender_submission = pd.read_csv('data/gender_submission.csv')

In [125]:
# Peek at the first rows to see the PassengerId / Survived column layout.
gender_submission.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1


### Basic Feature Engineering with the Titanic Data

In [105]:
import string
def substrings_in_string(big_string, substrings):
    """Return the first entry of ``substrings`` that occurs in ``big_string``.

    Candidates are tried in the order given, so an earlier entry wins when
    several of them match.

    Returns the literal string 'NaN' when ``big_string`` is not a ``str``
    (e.g. a missing value). When it is a string but no candidate matches,
    ``big_string`` is printed and 'NaN' is returned.
    """
    if isinstance(big_string, str):
        for candidate in substrings:
            if candidate in big_string:
                return candidate
        # Nothing matched: echo the offending value so it can be inspected.
        print(big_string)
    return 'NaN'

In [106]:
def add_title(df):
    """Add a normalised ``Title`` column to ``df`` in place.

    The title is the first entry of ``title_list`` found as a substring of
    the ``Name`` column (via ``substrings_in_string``), then rare titles are
    folded into 'Mr', 'Mrs' or 'Miss'. 'Dr' is resolved using the ``Sex``
    column. 'Master' and unmatched names ('NaN') are kept unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``Name`` and ``Sex`` columns. Modified in place.

    Returns
    -------
    None
    """
    # Order matters: 'Mrs' must be tried before 'Mr', which is a substring of it.
    title_list = ['Mrs', 'Mr', 'Master', 'Miss', 'Major', 'Rev',
                  'Dr', 'Ms', 'Mlle', 'Col', 'Capt', 'Mme', 'Countess',
                  'Don', 'Jonkheer']
    df['Title'] = df['Name'].map(lambda x: substrings_in_string(x, title_list))

    # Rare titles and the common title each one is folded into.
    merged_titles = {
        'Don': 'Mr', 'Major': 'Mr', 'Capt': 'Mr',
        'Jonkheer': 'Mr', 'Rev': 'Mr', 'Col': 'Mr',
        'Countess': 'Mrs', 'Mme': 'Mrs',
        'Mlle': 'Miss', 'Ms': 'Miss',
    }

    def replace_titles(x):
        title = x['Title']
        if title == 'Dr':
            # The Sex column holds lowercase 'male'/'female'; compare
            # case-insensitively so male doctors are not labelled 'Mrs'.
            return 'Mr' if str(x['Sex']).lower() == 'male' else 'Mrs'
        return merged_titles.get(title, title)

    df['Title'] = df.apply(replace_titles, axis=1)

In [107]:
def add_properties(df):
    """Add four derived feature columns to ``df`` in place.

    * ``Deck`` - first entry of the deck-code list found in ``Cabin``
      (via ``substrings_in_string``).
    * ``Age*Class`` - ``Age`` multiplied by ``Pclass``.
    * ``Family_Size`` - ``SibSp`` plus ``Parch``.
    * ``Fare_Per_Person`` - ``Fare`` divided by ``Family_Size + 1``.

    Returns None; the frame passed in is modified.
    """
    deck_codes = ['A', 'B', 'C', 'D', 'E', 'F', 'T', 'G', 'Unknown']

    def deck_of(cabin):
        # Cabin number -> deck code, tried in the order listed above.
        return substrings_in_string(cabin, deck_codes)

    df['Deck'] = df['Cabin'].map(deck_of)
    df['Age*Class'] = df['Age'].mul(df['Pclass'])

    relatives_aboard = df['SibSp'].add(df['Parch'])
    df['Family_Size'] = relatives_aboard
    # The +1 adds one to the relatives count before dividing the fare.
    df['Fare_Per_Person'] = df['Fare'].div(relatives_aboard.add(1))

In [108]:
# Both helpers add their columns to the frame in place and return None,
# so nothing is reassigned here; `test` and `train` themselves are modified.
for df in [test, train]:
    add_title(df)
    add_properties(df)

### Decision Tree

In [111]:
# Preview the engineered training features (every column except the label).
# Built from `train` directly so this cell does not depend on a variable
# that is only defined in a later cell, and limited to the first 10 rows.
train.drop(['Survived'], axis=1).head(10)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title,Deck,Age*Class,Family_Size,Fare_Per_Person
0,1,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S,Mr,,66.0,1,3.625000
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,Mrs,C,38.0,1,35.641650
2,3,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S,Miss,,78.0,0,7.925000
3,4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S,Mrs,C,35.0,1,26.550000
4,5,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S,Mr,,105.0,0,8.050000
5,6,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q,Mr,,,0,8.458300
6,7,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S,Mr,E,54.0,0,51.862500
7,8,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.0750,,S,Master,,6.0,4,4.215000
8,9,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S,Mrs,,81.0,2,3.711100
9,10,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C,Mrs,,28.0,1,15.035400


In [118]:
# Use only three columns as model inputs, rather than every column except the label.
train_X = train[['Pclass', 'SibSp', 'Parch']]
train_Y = train['Survived']

In [120]:
from sklearn import tree

# Columns used as model inputs; the same list selects from both splits so
# the train and test feature matrices always line up.
features_X = ['Pclass', 'SibSp', 'Parch']

train_X = train[features_X]
train_Y = train['Survived']

test_X = test[features_X]

# Fix the seed: scikit-learn permutes the features at every split, so
# without random_state the fitted tree (and the predictions derived from it)
# can differ from one run to the next.
clf = tree.DecisionTreeClassifier(random_state=0)
clf = clf.fit(train_X, train_Y)

In [121]:
# Predicted label (0 or 1) for each row of the test feature matrix.
clf.predict(test_X)

array([0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1,
       1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0,
       1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1,
       0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1,
       0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0,
       1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0,
       0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1,
       0, 0,

In [133]:
# One row per test passenger: PassengerId is the index and the tree's
# prediction is the single `Survived` column.
result = pd.DataFrame(
    {'Survived': clf.predict(test_X)},
    index=test['PassengerId'],
)

In [134]:
# to_csv writes the index by default, so the PassengerId index becomes the first CSV column.
result.to_csv('jeongmincha_submission.csv')