In [1]:
import numpy as np
import pandas as pd
import qgrid

from altair import Chart

In [2]:
qgrid.nbinstall(overwrite=True)
qgrid.set_defaults(remote_js=True, show_toolbar=False)

In [3]:
train = pd.read_csv('../data/titanic_train.csv')
test = pd.read_csv('../data/titanic_test.csv')
full_data = [train, test]

train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
qgrid.show_grid(train[['Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Pclass', 'Fare', 'Embarked', 'Survived', 'Cabin', 'Ticket']])

## Helper Functions

In [5]:
def any_nan(series): 
    return series.isnull().values.any()

def get_title(name): 
    return name.split(',')[1].split('.')[0].strip()

## Find Series with missing values

In [8]:
for col in test.columns:
    missing = map(any_nan, [train[col], test[col]])
    if True in missing:
        print col, 'has NaN values', missing

Age has NaN values [True, True]
Fare has NaN values [False, True]
Cabin has NaN values [True, True]
Embarked has NaN values [True, False]


In [13]:
train.Age.notnull()
# print 'Age: Train missing', train.Age[~train.Age.notnull()].count() ,'values;', 'Test missing', test.Age[~test.Age.notnull()].sum(), 'values' 

0       True
1       True
2       True
3       True
4       True
5      False
6       True
7       True
8       True
9       True
10      True
11      True
12      True
13      True
14      True
15      True
16      True
17     False
18      True
19     False
20      True
21      True
22      True
23      True
24      True
25      True
26     False
27      True
28     False
29     False
       ...  
861     True
862     True
863    False
864     True
865     True
866     True
867     True
868    False
869     True
870     True
871     True
872     True
873     True
874     True
875     True
876     True
877     True
878    False
879     True
880     True
881     True
882     True
883     True
884     True
885     True
886     True
887     True
888    False
889     True
890     True
Name: Age, Length: 891, dtype: bool

## Munge missing data

In [None]:
train.Embarked= train.Embarked.fillna('S')

### Examine grouped data

In [94]:
train[['Pclass', 'Survived']].groupby(['Pclass'], as_index=False).mean()

Unnamed: 0,Pclass,Survived
0,1,0.62963
1,2,0.472826
2,3,0.242363


In [96]:
train[['Sex', 'Survived']].groupby(['Sex'], as_index=False).mean()

Unnamed: 0,Sex,Survived
0,female,0.742038
1,male,0.188908


In [98]:
train[['Embarked', 'Survived']].groupby(['Embarked'], as_index=False).mean()

Unnamed: 0,Embarked,Survived
0,C,0.553571
1,Q,0.38961
2,S,0.339009


## Generate missing age data

### males age < 14 have the title 'Master' and >=14 have title 'Mr' or other.
### females do not have as clear a rule for determining age

#### Will Generate 'Title' feature prior to cleaning age data

In [101]:
for dataset in full_data:
    dataset['Title'] = dataset.Name.apply(get_title)

In [102]:
for dataset in full_data:
    dataset.Title = dataset.Title.replace(['Mlle', 'Ms'], 'Miss')
    dataset.Title = dataset.Title.replace('Mme', 'Mrs')

### Find Average age and Std Dev. for 'Master', 'Miss', 'Mr', 'Mrs', and any remaining

In [103]:
def gen_ages_by_title(title, train, test):
    train_ages = train.Age[(~train.Age.isnull()) & (train.Title == title)]
    test_ages = test.Age[(~test.Age.isnull()) & (test.Title == title)]
    all_ages = train_ages.append(test_ages)
    
    age_avg = all_ages.mean()
    age_std = all_ages.std()

    train_age_list = np.random.randint(
        age_avg - age_std, 
        age_avg + age_std, 
        size=len(train.Age[(train.Age.isnull()) & (train.Title == title)])
    )
    test_age_list = np.random.randint(
        age_avg - age_std, 
        age_avg + age_std, 
        size=len(test.Age[(test.Age.isnull()) & (test.Title == title)])
    )

    train.Age[(train.Age.isnull()) & (train.Title == title)] = train_age_list
    test.Age[(test.Age.isnull()) & (test.Title == title)] = test_age_list

In [104]:
titles = ['Master', 'Miss', 'Mr', 'Mrs']
for t in titles:
    gen_ages_by_title(t, train, test)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [105]:
for dataset in full_data:
    age_avg = dataset.Age.mean()
    age_std = dataset.Age.std()
    age_null_cnt = dataset.Age.isnull().sum()

    age_null_random_list = np.random.randint(age_avg - age_std, age_avg + age_std, size=age_null_cnt)
    dataset.Age[np.isnan(dataset.Age)] = age_null_random_list
    dataset.Age = dataset.Age.astype(int)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys


In [107]:
train['CategoricalAge'] = pd.cut(train.Age, 6)
train[['CategoricalAge', 'Survived']].groupby(['CategoricalAge'], as_index=False).mean()

Unnamed: 0,CategoricalAge,Survived
0,"(-0.08, 13.333]",0.569767
1,"(13.333, 26.667]",0.334405
2,"(26.667, 40.0]",0.387879
3,"(40.0, 53.333]",0.394737
4,"(53.333, 66.667]",0.348837
5,"(66.667, 80.0]",0.142857


### Aggregate Sibling/Spouse and Parent/Child counts to get FamilySize

In [109]:
for dataset in full_data:
    dataset['FamilySize'] = dataset['SibSp'] + dataset['Parch'] + 1
    
train[['FamilySize', 'Survived']].groupby(['FamilySize'], as_index=False).mean()

Unnamed: 0,FamilySize,Survived
0,1,0.303538
1,2,0.552795
2,3,0.578431
3,4,0.724138
4,5,0.2
5,6,0.136364
6,7,0.333333
7,8,0.0
8,11,0.0


### Add new series for loners

In [110]:
for dataset in full_data:
    dataset['Loner'] = 0
    dataset.loc[dataset['FamilySize'] == 1, 'Loner'] = 1

train[['Loner', 'Survived']].groupby(['Loner'], as_index=False).mean()

Unnamed: 0,Loner,Survived
0,0,0.50565
1,1,0.303538


In [111]:
for dataset in full_data:
    dataset.Fare = dataset.Fare.fillna(train.Fare.median())
    
train['CategoricalFare'] = pd.qcut(train.Fare, 6)
train[['CategoricalFare', 'Survived']].groupby(['CategoricalFare'], as_index=False).mean()

Unnamed: 0,CategoricalFare,Survived
0,"(-0.001, 7.775]",0.205128
1,"(7.775, 8.662]",0.190789
2,"(8.662, 14.454]",0.366906
3,"(14.454, 26.0]",0.436242
4,"(26.0, 52.369]",0.417808
5,"(52.369, 512.329]",0.697987


In [112]:
for dataset in full_data:
    dataset['Title'] = dataset.Name.apply(get_title)

In [113]:
for dataset in full_data:
    dataset.Title = dataset.Title.replace(['Mlle', 'Ms'], 'Miss')
    dataset.Title = dataset.Title.replace('Mme', 'Mrs')
    

In [114]:
blacklist = [t for t in train.Title.unique() if train.Title[train.Title == t].count() < 10]
try:
    for dataset in full_data:
        dataset.Title = dataset.Title.replace(blacklist, 'Other')
except:
    pass

train[['Title', 'Survived']].groupby(['Title'], as_index=False).mean()

Unnamed: 0,Title,Survived
0,Master,0.575
1,Miss,0.702703
2,Mr,0.156673
3,Mrs,0.793651
4,Other,0.347826


## Clean Data

In [115]:
for dataset in full_data:
    # Age
    dataset.loc[dataset.Age <= 13, 'Age'] = 0
    dataset.loc[(dataset.Age > 13) & (dataset.Age <= 26), 'Age'] = 1
    dataset.loc[(dataset.Age > 26) & (dataset.Age <= 40), 'Age'] = 2
    dataset.loc[(dataset.Age > 40) & (dataset.Age <= 53), 'Age'] = 3
    dataset.loc[(dataset.Age > 53) & (dataset.Age <= 66), 'Age'] = 4
    dataset.loc[dataset.Age > 66, 'Age'] = 5
    
    # Sex
    dataset.Sex = dataset.Sex.map({'female': 0, 'male': 1}).astype(int)
    
    # Title
    dataset.Title = dataset.Title.map({"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Other": 5})
    dataset.Title = dataset.Title.fillna(0)
    
    # Fare
    dataset.loc[dataset.Fare <= 7.775, 'Fare'] = 0
    dataset.loc[(dataset.Fare > 7.775) & (dataset.Fare <= 8.662), 'Fare'] = 1
    dataset.loc[(dataset.Fare > 8.662) & (dataset.Fare <= 14.454), 'Fare'] = 2
    dataset.loc[(dataset.Fare > 14.454) & (dataset.Fare <= 26.0), 'Fare'] = 3
    dataset.loc[(dataset.Fare > 26.0) & (dataset.Fare <= 52.369), 'Fare'] = 4
    dataset.loc[dataset.Fare > 52.369, 'Fare'] = 5
    
    # Embarked
    dataset.Embarked = dataset.Embarked.map({'S': 0, 'C': 1, 'Q': 2}).astype(int)
    
# Feature Selection
drop_elements = ['PassengerId', 'Name', 'Ticket', 'Cabin', 'SibSp', 'Parch']

train = train.drop(drop_elements, axis=1)
train = train.drop(['CategoricalAge', 'CategoricalFare', 'FamilySize'], axis=1)

test = test.drop(drop_elements, axis=1)

print train.head(10)

train=train.values
test=test.values

   Survived  Pclass  Sex  Age  Fare  Embarked  Title  Loner
0         0       3    1    1   0.0         0      1      0
1         1       1    0    2   5.0         1      3      0
2         1       3    0    1   1.0         0      2      1
3         1       1    0    2   5.0         0      3      0
4         0       3    1    2   1.0         0      1      1
5         0       3    1    2   1.0         2      1      1
6         0       1    1    4   4.0         0      1      1
7         0       3    1    0   3.0         0      4      0
8         1       3    0    2   2.0         0      3      0
9         1       2    0    1   4.0         1      3      0


In [116]:
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.metrics import accuracy_score, log_loss
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression

classifiers = [
    KNeighborsClassifier(3),
    SVC(probability=True),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    AdaBoostClassifier(),
    GradientBoostingClassifier(),
    GaussianNB(),
    LinearDiscriminantAnalysis(),
    QuadraticDiscriminantAnalysis(),
    LogisticRegression()
]

log_cols = ["Classifier", "Accuracy"]
log = pd.DataFrame(columns=log_cols)

sss = StratifiedShuffleSplit(n_splits=10, test_size=0.1, random_state=0)

X = train[0::, 1::]
y = train[0::, 0]

acc_dict = {}

for train_index, test_index in sss.split(X, y):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    for clf in classifiers:
        name = clf.__class__.__name__
        clf.fit(X_train, y_train)
        train_predictions = clf.predict(X_test)
        acc = accuracy_score(y_test, train_predictions)
        if name in acc_dict:
            acc_dict[name] += acc
        else:
            acc_dict[name] = acc

for clf in acc_dict:
    acc_dict[clf] = acc_dict[clf] / 10.0
    log_entry = pd.DataFrame([[clf, acc_dict[clf]]], columns=log_cols)
    log = log.append(log_entry)

plt.xlabel('Accuracy')
plt.title('Classifier Accuracy')

sns.set_color_codes("muted")
sns.barplot(x='Accuracy', y='Classifier', data=log, color="b")

NameError: name 'plt' is not defined

In [None]:
print train[0]
print test[0]