In [91]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression

In [92]:
# input csv files (relative paths): labelled training set first, then the unlabelled test set
fname_train = 'train.csv'
fname_test = 'test.csv'

# Currently scores 76.6%
# TODO - how to encode name string, ticket string, cabin string?

In [229]:
 def load_data(fname, test=False):
    """
    Load a passenger csv and convert it to numpy arrays.

    fname: path of the csv (anything pd.read_csv accepts)
    test:  if True, no 'Survived' column is read and None is returned
           in place of the labels

    extract p data samples
    rep data as      np array, floats, n x p (n features x p samples)
    rep labels as    np array, int, length p (None when test=True)
    rep id as        np array, int, length p
        n = 9
        feature 0: 'Pclass', int
        feature 1: 'Age', float
        feature 2: 'Sex', int (convert str -> 'male': 0, 'female': 1)
        feature 3: 'SibSp' int
        feature 4: 'Parch' int
        feature 5: 'Fare', float
        feature 6: 'EmbarkC', bool (Embarked == 'C')
        feature 7: 'EmbarkS', bool (Embarked == 'S')
        feature 8: 'EmbarkQ', bool (Embarked == 'Q')
        remove feature: 'Name', str (REMOVE) -- prob add back somehow
        remove feature: 'Ticket', str (REMOVE) -- prob add back somehow
        remove feature: 'Cabin', str (REMOVE) -- prob add back somehow
    All columns:
              [u'PassengerId', u'Survived', u'Pclass', u'Name', u'Sex', u'Age',
               u'SibSp', u'Parch', u'Ticket', u'Fare', u'Cabin', u'Embarked'],

    Missing numeric values are replaced by the mean of their column, computed
    from the file being loaded. A missing 'Embarked' value leaves all three
    Embark* indicators at 0.

    returns n x p features, length-p labels (or None) and length-p ids (passenger numbers)
    """
    feature_columns = ['Pclass', 'Age', 'Sex', 'SibSp', 'Parch', 'Fare']
    embark_ports = ['C', 'S', 'Q']  # order fixes feature rows 6, 7, 8
    sex_codes = {'male': 0, 'female': 1}

    df = pd.read_csv(fname)

    # Encode the Sex column only, rather than every cell of the frame that
    # happens to read 'male'/'female'. Values that are already numeric (or
    # missing) pass through unchanged; pd.to_numeric raises on any other string.
    df['Sex'] = pd.to_numeric(df['Sex'].map(lambda v: sex_codes.get(v, v)))

    # Impute missing numeric values with the column mean. numeric_only keeps the
    # string columns (Name, Ticket, Cabin, Embarked) out of the mean; without it
    # current pandas raises a TypeError instead of silently skipping them.
    df = df.fillna(df.mean(numeric_only=True))

    n_base = len(feature_columns)
    n = n_base + len(embark_ports)
    p = len(df.index)

    # extract passenger ids (assignment into an int array casts the values)
    data_id = np.zeros(p, dtype=int)
    data_id[:] = df['PassengerId'].values

    # extract labels (survival 0/1); the test csv has none
    if test:
        data_labels = None
    else:
        data_labels = np.zeros(p, dtype=int)
        data_labels[:] = df['Survived'].values

    # numeric features: one row per feature, one column per sample
    data_features = np.zeros((n, p))
    data_features[:n_base, :] = df[feature_columns].values.T

    # expand Embarked into one 0/1 indicator row per port
    for offset, port in enumerate(embark_ports):
        data_features[n_base + offset, :] = (df['Embarked'] == port).values

    return data_features, data_labels, data_id


def build_model(train_features, train_labels):
    """
    Fit a logistic regression classifier on the training data.

    train_features: n x p array (n features x p samples)
    train_labels:   length-p array of class labels

    Prints the model's score on the training data itself (not a held-out
    estimate) and returns the fitted model.
    """
    # just logistic regression; transpose so each sample is a row for fit/score
    X = train_features.T
    y = train_labels
    # NOTE(review): multi_class is deprecated in recent scikit-learn releases --
    # confirm against the installed version before upgrading.
    model = LogisticRegression(random_state=0, solver='lbfgs', multi_class='multinomial').fit(X, y)
    # one pre-formatted argument: prints the same text under Python 2 and Python 3
    print('model training score: %s' % model.score(X, y))
    return model


def predict(model, test_features):
    """
    Return the model's predictions for the given features.

    test_features is laid out n x p (features x samples); it is transposed
    so that each sample is a row before being passed to model.predict.
    """
    samples_by_row = test_features.T
    return model.predict(samples_by_row)


def save_predictions(test_id, test_predictions, fname):
    """
    Write a two-column integer csv of (PassengerId, Survived) rows to fname.

    test_id and test_predictions supply the first and second column; the
    file starts with the header line 'PassengerId,Survived'.
    """
    n_rows = len(test_id)
    table = np.zeros((n_rows, 2), dtype=int)
    # fill the columns in output order: ids first, predictions second
    for col, values in enumerate((test_id, test_predictions)):
        table[:, col] = values[:]
    np.savetxt(fname, table, fmt='%d', delimiter=',', header='PassengerId,Survived', comments='')

In [230]:
# load training data: features are n x p (features x samples); labels and ids have one entry per sample
train_features, train_labels, train_id = load_data(fname_train)

     PassengerId  Survived  Pclass  \
0              1         0       3   
1              2         1       1   
2              3         1       3   
3              4         1       1   
4              5         0       3   
5              6         0       3   
6              7         0       1   
7              8         0       3   
8              9         1       3   
9             10         1       2   
10            11         1       3   
11            12         1       1   
12            13         0       3   
13            14         0       3   
14            15         0       3   
15            16         1       2   
16            17         0       3   
17            18         1       2   
18            19         0       3   
19            20         1       3   
20            21         0       2   
21            22         1       2   
22            23         1       3   
23            24         1       1   
24            25         0       3   
25          

In [226]:
# build model (build_model also prints the score on the training data)
model = build_model(train_features, train_labels)
# Notes from earlier imputation experiments
# (first number is presumably the printed training score -- TODO confirm):
# NaN to mean: 0.7946127946127947 - 74% test
# NaN to median: 0.7957351290684624 - 73% test

model training score: 0.8002244668911336


In [227]:
# predict on test data
# with test=True load_data returns None for the labels, so test_labels is None here
test_features, test_labels, test_id = load_data(fname_test, test=True)
test_predictions = predict(model, test_features)

<type 'numpy.ndarray'>


In [228]:
# save predictions as a csv with header 'PassengerId,Survived', one row per test passenger
save_predictions(test_id, test_predictions, 'submission.csv')