### Import required libraries

In [2]:
import pandas as pd
import numpy as np

## Import train and test set
#### properties
| Variable | Definition | Key |
| :------- | ---------: | --: |
| survival | Survival | 0 = No, 1 = Yes |
|pclass|Ticket class|1 = 1st, 2 = 2nd, 3 = 3rd|
|sex|Sex|
|Age|Age in years|
|sibsp|# of siblings / spouses aboard the Titanic|
|parch|# of parents / children aboard the Titanic|	
|ticket|Ticket number|
|fare|Passenger fare|	
|cabin|Cabin number|
|embarked|Port of Embarkation|

In [3]:
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')

In [4]:
gender_submission = pd.read_csv('data/gender_submission.csv')

In [5]:
gender_submission.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1


### Basic Feature Engineering with the Titanic Data

In [6]:
import string
def substrings_in_string(big_string, substrings):
    if not isinstance(big_string, str):
        return 'NaN'
    for substring in substrings:
        if big_string.find(substring) != -1:
            return substring
    print (big_string)
    return 'NaN'

In [7]:
def add_title(df):
    title_list=['Mrs', 'Mr', 'Master', 'Miss', 'Major', 'Rev',
                        'Dr', 'Ms', 'Mlle','Col', 'Capt', 'Mme', 'Countess',
                        'Don', 'Jonkheer']
    df['Title'] = df['Name'].map(lambda x: substrings_in_string(x, title_list))
    
    def replace_titles(x):
        title = x['Title']
        if title in ['Don', 'Major', 'Capt', 'Jonkheer', 'Rev', 'Col']:
            return 'Mr'
        elif title in ['Countess', 'Mme']:
            return 'Mrs'
        elif title in ['Mlle', 'Ms']:
            return 'Miss'
        elif title == 'Dr':
            if x['Sex'] == 'Male':
                return 'Mr'
            else:
                return 'Mrs'
        else:
            return title

    df['Title'] = df.apply(replace_titles, axis=1)

In [8]:
def add_properties(df):
    #Turning cabin number into Deck
    cabin_list = ['A', 'B', 'C', 'D', 'E', 'F', 'T', 'G', 'Unknown']
    df['Deck'] = df['Cabin'].map(lambda x: substrings_in_string(x, cabin_list))
    df['Age*Class'] = df['Age'] * df['Pclass']
    df['Family_Size'] = df['SibSp'] + df['Parch']
    df['Fare_Per_Person'] = df['Fare'] / (df['Family_Size']+1)

In [9]:
for df in [test, train]:
    add_title(df)
    add_properties(df)

### Cross validation

In [10]:
import numpy as np
from sklearn.model_selection import train_test_split
train_X = train[['Pclass', 'Family_Size']]
train_Y = train['Survived']
X_train, X_valid, Y_train, Y_valid = train_test_split(train_X, train_Y, test_size=0.2, random_state=0)

# train_len = len(train)
# ratio = 0.8
# split = train_len * ratio
# train_X = train[:train_len * split]
# test_X = train[train_len * split:]

### Decision Tree

In [12]:
from sklearn import tree, ensemble

features_X = ['Pclass', 'Family_Size', 'Sex']

train_X = train[features_X]
train_Y = train['Survived']
X_train, X_valid, Y_train, Y_valid = train_test_split(train_X, train_Y, test_size=0.3, random_state=0)

clf = tree.DecisionTreeClassifier()
clf = clf.fit(X_train, Y_train)

In [13]:
clf.score(X_valid, Y_valid)

0.70895522388059706

In [133]:
result = pd.DataFrame(columns=['Survived'], index=test['PassengerId'])
# result['PassengerId'] = pd.Series(test['PassengerId'])
result['Survived'] = clf.predict(test_X)

In [134]:
result.to_csv('jeongmincha_submission.csv')

### SVM

In [140]:
from sklearn import svm
train_X = train.drop(['PassengerId','Survived'], axis=1)
train_Y = train['Survived']
test_X = test.drop(['PassengerId'], axis=1)
clf = svm.LinearSVC()
clf.fit(train_X, train_Y)

ValueError: could not convert string to float: 'Mr'