In [325]:
%matplotlib inline

In [326]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import qgrid
import seaborn as sns

In [327]:
qgrid.nbinstall(overwrite=True)
qgrid.set_defaults(remote_js=True, show_toolbar=False)

sns.set()

plt.rcParams['figure.figsize'] = (12, 8)
%config InlineBackend.figure_format='retina'

In [328]:
train = pd.read_csv('../data/train.csv')
test = pd.read_csv('../data/test.csv')
full_data = [train, test]
train.head()
# qgrid.show_grid(train[['Survived', 'Sex', 'Age', 'Pclass', 'SibSp', 'Parch', 'Fare', 'Embarked']].head(3))

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [329]:
train[['Pclass', 'Survived']].groupby(['Pclass'], as_index=False).mean()

Unnamed: 0,Pclass,Survived
0,1,0.62963
1,2,0.472826
2,3,0.242363


In [330]:
train[['Sex', 'Survived']].groupby(['Sex'], as_index=False).mean()

Unnamed: 0,Sex,Survived
0,female,0.742038
1,male,0.188908


In [331]:
for dataset in full_data:
    dataset.Embarked = dataset.Embarked.fillna('S')
train[['Embarked', 'Survived']].groupby(['Embarked'], as_index=False).mean()

Unnamed: 0,Embarked,Survived
0,C,0.553571
1,Q,0.38961
2,S,0.339009


In [332]:
for dataset in full_data:
    dataset['FamilySize'] = dataset['SibSp'] + dataset['Parch'] + 1
    

train[['FamilySize', 'Survived']].groupby(['FamilySize'], as_index=False).mean()

Unnamed: 0,FamilySize,Survived
0,1,0.303538
1,2,0.552795
2,3,0.578431
3,4,0.724138
4,5,0.2
5,6,0.136364
6,7,0.333333
7,8,0.0
8,11,0.0


In [333]:
for dataset in full_data:
    dataset['IsAlone'] = 0
    dataset.loc[dataset['FamilySize'] == 1, 'IsAlone'] = 1

train[['IsAlone', 'Survived']].groupby(['IsAlone'], as_index=False).mean()

Unnamed: 0,IsAlone,Survived
0,0,0.50565
1,1,0.303538


In [334]:
for dataset in full_data:
    age_avg = dataset.Age.mean()
    age_std = dataset.Age.std()
    age_null_cnt = dataset.Age.isnull().sum()

    age_null_random_list = np.random.randint(age_avg - age_std, age_avg + age_std, size=age_null_cnt)
    dataset.Age[np.isnan(dataset.Age)] = age_null_random_list
    dataset.Age = dataset.Age.astype(int)

train['CategoricalAge'] = pd.cut(train.Age, 6)
train[['CategoricalAge', 'Survived']].groupby(['CategoricalAge'], as_index=False).mean()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys


Unnamed: 0,CategoricalAge,Survived
0,"(-0.08, 13.333]",0.591549
1,"(13.333, 26.667]",0.329073
2,"(26.667, 40.0]",0.40597
3,"(40.0, 53.333]",0.368852
4,"(53.333, 66.667]",0.348837
5,"(66.667, 80.0]",0.142857


In [335]:
for dataset in full_data:
    dataset.Fare = dataset.Fare.fillna(train.Fare.median())
    
train['CategoricalFare'] = pd.qcut(train.Fare, 6)
train[['CategoricalFare', 'Survived']].groupby(['CategoricalFare'], as_index=False).mean()

Unnamed: 0,CategoricalFare,Survived
0,"(-0.001, 7.775]",0.205128
1,"(7.775, 8.662]",0.190789
2,"(8.662, 14.454]",0.366906
3,"(14.454, 26.0]",0.436242
4,"(26.0, 52.369]",0.417808
5,"(52.369, 512.329]",0.697987


In [336]:
def get_title(name): 
    try:
        return name.split(',')[1].split('.')[0].strip()
    except:
        return ''

for dataset in full_data:
    dataset['Title'] = dataset.Name.apply(get_title)

In [337]:
for dataset in full_data:
    dataset.Title = dataset.Title.replace(['Mlle', 'Ms'], 'Miss')
    dataset.Title = dataset.Title.replace('Mme', 'Mrs')
    

In [338]:
blacklist = [t for t in train.Title.unique() if train.Title[train.Title == t].count() < 10]
try:
    for dataset in full_data:
        dataset.Title = dataset.Title.replace(blacklist, 'Other')
except:
    pass

train[['Title', 'Survived']].groupby(['Title'], as_index=False).mean()

Unnamed: 0,Title,Survived
0,Master,0.575
1,Miss,0.702703
2,Mr,0.156673
3,Mrs,0.793651
4,Other,0.347826


## Clean Data

In [340]:
for dataset in full_data:
    # Age
    dataset.loc[dataset.Age <= 13, 'Age'] = 0
    dataset.loc[(dataset.Age > 13) & (dataset.Age <= 26), 'Age'] = 1
    dataset.loc[(dataset.Age > 26) & (dataset.Age <= 40), 'Age'] = 2
    dataset.loc[(dataset.Age > 40) & (dataset.Age <= 53), 'Age'] = 3
    dataset.loc[(dataset.Age > 53) & (dataset.Age <= 66), 'Age'] = 4
    dataset.loc[dataset.Age > 66, 'Age'] = 5
    
    # Sex
    dataset.Sex = dataset.Sex.map({y: x for x, y in enumerate(dataset.Sex.unique())}).astype(int)
    
    # Title
    dataset.Title = dataset.Title.map({y: x for x, y in enumerate(dataset.Title.unique())})
    
    # Fare
    dataset.loc[dataset.Fare <= 7.775, 'Fare'] = 0
    dataset.loc[(dataset.Fare > 7.775) & (dataset.Fare <= 8.662), 'Fare'] = 1
    dataset.loc[(dataset.Fare > 8.662) & (dataset.Fare <= 14.454), 'Fare'] = 2
    dataset.loc[(dataset.Fare > 14.454) & (dataset.Fare <= 26.0), 'Fare'] = 3
    dataset.loc[(dataset.Fare > 26.0) & (dataset.Fare <= 52.369), 'Fare'] = 4
    dataset.loc[dataset.Fare > 52.369, 'Fare'] = 5
    
    # Embarked
    dataset.Embarked = dataset.Embarked.map({y: x for x, y in enumerate(dataset.Embarked.unique())})
    
    # Feature Selection
    drop_elements = ['PassengerId', 'Name', 'Ticket', 'Cabin', 'SibSp', 'Parch']
    
    train = train.drop(drop_elements, axis=1)
    train = train.drop(['CategoricalAge', 'CategoricalFare', 'FamilySize'], axis=1)
    
    test = test.drop(drop_elements, axis=1)
    
    print train.head(10)
    
    train=train.values
    test=test.values

ValueError: Cannot setitem on a Categorical with a new category, set the categories first