In [57]:
import numpy as np
import pandas as pd
import qgrid

from altair import Chart

In [58]:
qgrid.nbinstall(overwrite=True)
qgrid.set_defaults(remote_js=True, show_toolbar=False)

In [59]:
train = pd.read_csv('../data/titanic_train.csv')
test = pd.read_csv('../data/titanic_test.csv')
full_data = train.append(test)
full_data = full_data.drop('Survived', axis=1)

full_data.head()

Unnamed: 0,Age,Cabin,Embarked,Fare,Name,Parch,PassengerId,Pclass,Sex,SibSp,Ticket
0,22.0,,S,7.25,"Braund, Mr. Owen Harris",0,1,3,male,1,A/5 21171
1,38.0,C85,C,71.2833,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,2,1,female,1,PC 17599
2,26.0,,S,7.925,"Heikkinen, Miss. Laina",0,3,3,female,0,STON/O2. 3101282
3,35.0,C123,S,53.1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,4,1,female,1,113803
4,35.0,,S,8.05,"Allen, Mr. William Henry",0,5,3,male,0,373450


In [60]:
qgrid.show_grid(train[['Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Pclass', 'Fare', 'Embarked', 'Survived', 'Cabin', 'Ticket']])

## Helper Functions

In [61]:
def any_nan(series): 
    return series.isnull().values.any()

def get_title(name): 
    return name.split(',')[1].split('.')[0].strip()

## Find Series with missing values

In [62]:
for col in test.columns:
    missing = map(any_nan, [train[col], test[col]])
    if True in missing:
        print col, 'has NaN values', missing

Age has NaN values [True, True]
Fare has NaN values [False, True]
Cabin has NaN values [True, True]
Embarked has NaN values [True, False]


In [63]:
for item in ['Age', 'Fare', 'Cabin', 'Embarked']:
    print item + ': Train missing', len(train[item][~train[item].notnull()]) ,'values; Test missing', len(test[item][~test[item].notnull()]), 'values' 

Age: Train missing 177 values; Test missing 86 values
Fare: Train missing 0 values; Test missing 1 values
Cabin: Train missing 687 values; Test missing 327 values
Embarked: Train missing 2 values; Test missing 0 values


## Munge missing data

### Embarked and Fare have few missing values.
#### Use Embarked's mode value

In [64]:
print full_data.Embarked.value_counts()

S    914
C    270
Q    123
Name: Embarked, dtype: int64


In [65]:
train.Embarked = train.Embarked.fillna('S')

#### Use mean Fare of passenger's Pclass 

In [66]:
test.Pclass[~test.Fare.notnull()]

152    3
Name: Pclass, dtype: int64

In [67]:
full_data.Pclass[full_data.Pclass == 3].mean()

3.0

In [68]:
test.Fare = test.Fare.fillna(3.0)

### Cabin is missing a substantial number of values.
#### Consider usefulness of this feature

In [69]:
temp = train[train.Cabin.notnull()]
temp = temp.assign(Section = [c[0] for c in temp.Cabin.values])
t = temp[['Section', 'Survived']].groupby(['Section'], as_index=False).mean()
t.assign(Count = [temp.Section[temp.Section == foo].count() for foo in t.Section.values])

Unnamed: 0,Section,Survived,Count
0,A,0.466667,15
1,B,0.744681,47
2,C,0.59322,59
3,D,0.757576,33
4,E,0.75,32
5,F,0.615385,13
6,G,0.5,4
7,T,0.0,1


In [70]:
print len(full_data.Cabin[full_data.Cabin.notnull()]), 'values present;', len(full_data.Cabin[~full_data.Cabin.notnull()]), 'values missing'

295 values present; 1014 values missing


#### Remove 'Cabin' from train and test data

In [71]:
train = train.drop('Cabin', axis=1)
test = test.drop('Cabin', axis=1)
full_data = full_data.drop('Cabin', axis=1)

### Generate missing age data

#### males age < 14 have the title 'Master' and >=14 have title 'Mr' or other.
#### females do not have as clear a rule for determining age

##### Will Generate 'Title' feature prior to cleaning age data

In [72]:
train = train.assign(Title = train.Name.apply(get_title))
test = test.assign(Title = test.Name.apply(get_title))
full_data = full_data.assign(Title = full_data.Name.apply(get_title))

In [73]:
full_data.Title.value_counts()

Mr              757
Miss            260
Mrs             197
Master           61
Rev               8
Dr                8
Col               4
Major             2
Mlle              2
Ms                2
Mme               1
the Countess      1
Don               1
Lady              1
Sir               1
Jonkheer          1
Dona              1
Capt              1
Name: Title, dtype: int64

### Group and combine similar titles

In [74]:
for dataset in [train, test, full_data]:
    dataset.Title = dataset.Title.replace(['Mlle', 'Miss'], 'Ms')
    dataset.Title = dataset.Title.replace('Mme', 'Mrs')
    dataset.Title = dataset.Title.replace(['Col', 'Major'], 'Officer')
    dataset.Title = dataset.Title.replace(['the Countess', 'Dona'], 'Lady')
    dataset.Title = dataset.Title.replace(['Jonkheer', 'Don'], 'Sir')
full_data.Title.value_counts()

Mr         757
Ms         264
Mrs        198
Master      61
Dr           8
Rev          8
Officer      6
Lady         3
Sir          3
Capt         1
Name: Title, dtype: int64

### Examine grouped data

In [75]:
train[['Pclass', 'Survived']].groupby(['Pclass'], as_index=False).mean()

Unnamed: 0,Pclass,Survived
0,1,0.62963
1,2,0.472826
2,3,0.242363


In [76]:
train[['Sex', 'Survived']].groupby(['Sex'], as_index=False).mean()

Unnamed: 0,Sex,Survived
0,female,0.742038
1,male,0.188908


In [77]:
train[['Embarked', 'Survived']].groupby(['Embarked'], as_index=False).mean()

Unnamed: 0,Embarked,Survived
0,C,0.553571
1,Q,0.38961
2,S,0.339009


### Find Average age and Std Dev. for 'Master', 'Miss', 'Mr', 'Mrs', and any remaining

In [78]:
full_data.Title[full_data.Age.isnull()].value_counts()

Mr        176
Ms         51
Mrs        27
Master      8
Dr          1
Name: Title, dtype: int64

In [79]:
titles = list(full_data.Title[full_data.Age.isnull()].unique())
for title in titles:
    ages = full_data.Age[(full_data.Age.notnull()) & (full_data.Title == title)]
    age_avg = ages.mean()
    age_std = ages.std()
    
    train_age_null_cnt = int(train.Age[(train.Age.isnull()) & (train.Title == title)].sum())
    if train_age_null_cnt:
        train_age_list = np.random.randint(age_avg - age_std, age_avg + age_std, size=train_age_null_cnt)
        train.Age[(np.isnan(train.Age)) & (train.Title == title)] = train_age_list
    
    test_age_null_cnt = int(test.Age[(test.Age.isnull()) & (test.Title == title)].sum())
    if test_age_null_cnt:
        test_age_list = np.random.randint(age_avg - age_std, age_avg + age_std, size=test_age_null_cnt)
        test.Age[(test.Age.isnull()) & (test.Title == title)] = test_age_list

    full_age_null_cnt = int(full_data.Age[(full_data.Age.isnull()) & (full_data.Title == title)].sum())
    if full_age_null_cnt:
        full_age_list = np.random.randint(age_avg - age_std, age_avg + age_std, size=full_age_null_cnt)
        full_data.Age[(full_data.Age.isnull()) & (full.Title == title)] = full_age_list


In [80]:
for dataset in [train, test]:
    age_avg = dataset.Age.mean()
    age_std = dataset.Age.std()
    age_null_cnt = dataset.Age.isnull().sum()

    age_null_random_list = np.random.randint(age_avg - age_std, age_avg + age_std, size=age_null_cnt)
    dataset.Age[np.isnan(dataset.Age)] = age_null_random_list
    dataset.Age = dataset.Age.astype(int)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys


In [81]:
bins = [0, 6, 12, 24, 36, 48, 60, 84]
train['CategoricalAge'] = pd.cut(train.Age, bins)
train[['CategoricalAge', 'Survived']].groupby(['CategoricalAge'], as_index=False).mean()

Unnamed: 0,CategoricalAge,Survived
0,"(0, 6]",0.65
1,"(6, 12]",0.318182
2,"(12, 24]",0.361624
3,"(24, 36]",0.382353
4,"(36, 48]",0.339394
5,"(48, 60]",0.448276
6,"(60, 84]",0.227273


### Aggregate Sibling/Spouse and Parent/Child counts to get FamilySize

In [82]:
for dataset in [train, test]:
    dataset['FamilySize'] = dataset['SibSp'] + dataset['Parch'] + 1
    
train[['FamilySize', 'Survived']].groupby(['FamilySize'], as_index=False).mean()

Unnamed: 0,FamilySize,Survived
0,1,0.303538
1,2,0.552795
2,3,0.578431
3,4,0.724138
4,5,0.2
5,6,0.136364
6,7,0.333333
7,8,0.0
8,11,0.0


### Add new series for loners

In [83]:
for dataset in [train, test]:
    dataset['Loner'] = 0
    dataset.loc[dataset['FamilySize'] == 1, 'Loner'] = 1

train[['Loner', 'Survived']].groupby(['Loner'], as_index=False).mean()

Unnamed: 0,Loner,Survived
0,0,0.50565
1,1,0.303538


In [84]:
for dataset in [train, test]:
    dataset.Fare = dataset.Fare.fillna(train.Fare.median())
    
train['CategoricalFare'] = pd.qcut(train.Fare, 6)
train[['CategoricalFare', 'Survived']].groupby(['CategoricalFare'], as_index=False).mean()

Unnamed: 0,CategoricalFare,Survived
0,"(-0.001, 7.775]",0.205128
1,"(7.775, 8.662]",0.190789
2,"(8.662, 14.454]",0.366906
3,"(14.454, 26.0]",0.436242
4,"(26.0, 52.369]",0.417808
5,"(52.369, 512.329]",0.697987


## Clean Data

In [85]:
for dataset in [train, test]:
    # Age
    dataset.loc[dataset.Age <= 6, 'Age'] = 0
    dataset.loc[(dataset.Age > 12) & (dataset.Age <= 24), 'Age'] = 1
    dataset.loc[(dataset.Age > 24) & (dataset.Age <= 36), 'Age'] = 2
    dataset.loc[(dataset.Age > 36) & (dataset.Age <= 48), 'Age'] = 3
    dataset.loc[(dataset.Age > 48) & (dataset.Age <= 60), 'Age'] = 4
    dataset.loc[dataset.Age > 60, 'Age'] = 5
    
    # Sex
    dataset.Sex = dataset.Sex.map({'female': 0, 'male': 1}).astype(int)
    
    # Title
    dataset.Title = dataset.Title.map({"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Other": 5})
    dataset.Title = dataset.Title.fillna(0)
    
    # Fare
    dataset.loc[dataset.Fare <= 7.775, 'Fare'] = 0
    dataset.loc[(dataset.Fare > 7.775) & (dataset.Fare <= 8.662), 'Fare'] = 1
    dataset.loc[(dataset.Fare > 8.662) & (dataset.Fare <= 14.454), 'Fare'] = 2
    dataset.loc[(dataset.Fare > 14.454) & (dataset.Fare <= 26.0), 'Fare'] = 3
    dataset.loc[(dataset.Fare > 26.0) & (dataset.Fare <= 52.369), 'Fare'] = 4
    dataset.loc[dataset.Fare > 52.369, 'Fare'] = 5
    
    # Embarked
    dataset.Embarked = dataset.Embarked.map({'S': 0, 'C': 1, 'Q': 2}).astype(int)
    
# Feature Selection
drop_elements = ['PassengerId', 'Name', 'Ticket', 'SibSp', 'Parch']

train = train.drop(drop_elements, axis=1)
train = train.drop(['CategoricalAge', 'CategoricalFare', 'FamilySize'], axis=1)

test = test.drop(drop_elements, axis=1)

print train.head(10)

train=train.values
test=test.values

   Survived  Pclass  Sex  Age  Fare  Embarked  Title  Loner
0         0       3    1    1   0.0         0    1.0      0
1         1       1    0    3   5.0         1    3.0      0
2         1       3    0    2   1.0         0    0.0      1
3         1       1    0    2   5.0         0    3.0      0
4         0       3    1    2   1.0         0    1.0      1
5         0       3    1    2   1.0         2    1.0      1
6         0       1    1    4   4.0         0    1.0      1
7         0       3    1    0   3.0         0    4.0      0
8         1       3    0    2   2.0         0    3.0      0
9         1       2    0    1   4.0         1    3.0      0


In [86]:
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.metrics import accuracy_score, log_loss
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression

classifiers = [
    KNeighborsClassifier(3),
    SVC(probability=True),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    AdaBoostClassifier(),
    GradientBoostingClassifier(),
    GaussianNB(),
    LinearDiscriminantAnalysis(),
    QuadraticDiscriminantAnalysis(),
    LogisticRegression()
]

log_cols = ["Classifier", "Accuracy"]
log = pd.DataFrame(columns=log_cols)

sss = StratifiedShuffleSplit(n_splits=10, test_size=0.1, random_state=0)

X = train[0::, 1::]
y = train[0::, 0]

acc_dict = {}

for train_index, test_index in sss.split(X, y):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    for clf in classifiers:
        name = clf.__class__.__name__
        clf.fit(X_train, y_train)
        train_predictions = clf.predict(X_test)
        acc = accuracy_score(y_test, train_predictions)
        if name in acc_dict:
            acc_dict[name] += acc
        else:
            acc_dict[name] = acc

for clf in acc_dict:
    acc_dict[clf] = acc_dict[clf] / 10.0
    log_entry = pd.DataFrame([[clf, acc_dict[clf]]], columns=log_cols)
    log = log.append(log_entry)

plt.xlabel('Accuracy')
plt.title('Classifier Accuracy')

sns.set_color_codes("muted")
sns.barplot(x='Accuracy', y='Classifier', data=log, color="b")

NameError: name 'plt' is not defined

In [None]:
print train[0]
print test[0]