In [92]:
import numpy as np
import pandas as pd

train = pd.read_csv("data/train.csv")
test  = pd.read_csv("data/test.csv")

In [93]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 90.5+ KB


In [94]:
def clean_data(titanic):
    
    titanic["Age"] = titanic["Age"].fillna(titanic["Age"].median())
    titanic["Age"].median()
    
    titanic.loc[titanic["Sex"] == "male", "Sex"] = 0
    titanic.loc[titanic["Sex"] == "female", "Sex"] = 1
    
    titanic["Embarked"] = titanic["Embarked"].fillna("S")

    titanic.loc[titanic["Embarked"] == "S", "Embarked"] = 0
    titanic.loc[titanic["Embarked"] == "C", "Embarked"] = 1
    titanic.loc[titanic["Embarked"] == "Q", "Embarked"] = 2

    titanic["Fare"] = titanic["Fare"].fillna(titanic["Fare"].median())
    
    titanic["FamilySize"] = titanic['Parch'] + titanic['SibSp']

    return titanic

In [95]:
def create_submission(model, train, test, predictors, filename):

    model.fit(train[predictors], train["Survived"])
    predictions = model.predict(test[predictors])

    submission = pd.DataFrame({
        "PassengerId": test["PassengerId"],
        "Survived": predictions
    })
    
    submission.to_csv(filename, index=False)

In [96]:
train_data = clean_data(train)
test_data  = clean_data(test)

In [97]:
train_data.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
PassengerId,891,446.0,257.353842,1.0,223.5,446.0,668.5,891.0
Survived,891,0.383838,0.486592,0.0,0.0,0.0,1.0,1.0
Pclass,891,2.308642,0.836071,1.0,2.0,3.0,3.0,3.0
Age,891,29.361582,13.019697,0.42,22.0,28.0,35.0,80.0
SibSp,891,0.523008,1.102743,0.0,0.0,0.0,1.0,8.0
Parch,891,0.381594,0.806057,0.0,0.0,0.0,0.0,6.0
Fare,891,32.204208,49.693429,0.0,7.9104,14.4542,31.0,512.3292
FamilySize,891,0.904602,1.613459,0.0,0.0,0.0,1.0,10.0


In [98]:
test_data.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
PassengerId,418,1100.5,120.810458,892.0,996.25,1100.5,1204.75,1309.0
Pclass,418,2.26555,0.841838,1.0,1.0,3.0,3.0,3.0
Age,418,29.599282,12.70377,0.17,23.0,27.0,35.75,76.0
SibSp,418,0.447368,0.89676,0.0,0.0,0.0,1.0,8.0
Parch,418,0.392344,0.981429,0.0,0.0,0.0,0.0,9.0
Fare,418,35.576535,55.850103,0.0,7.8958,14.4542,31.471875,512.3292
FamilySize,418,0.839713,1.519072,0.0,0.0,0.0,1.0,10.0


In [103]:
from sklearn.ensemble import RandomForestClassifier
from sklearn import cross_validation

predictors = ["Pclass", "Sex", "Age", "FamilySize", "Fare", "Embarked"]

model = RandomForestClassifier(
    random_state=1,
    n_estimators=150,
    min_samples_split=4,
    min_samples_leaf=2
)

scores = cross_validation.cross_val_score(
    model,
    train_data[predictors],
    train_data["Survived"],
    cv=3
)

print(scores.mean())

0.827160493827


In [100]:
# from sklearn.cross_validation import KFold

features = train[["Pclass", "Sex", "Age", "FamilySize", "Fare", "Embarked"]].values
target = train.Survived.values

def crossValidate(features, target, classifier, k_fold, r_state=None) :
    # derive a set of (random) training and testing indices
    k_fold_indices = KFold(len(features), n_folds=k_fold,
                           shuffle=True, random_state=r_state)
    
    # for each set of training and testing indices 
    # train the classifier, and score the results
    k_score_total = 0
    for train_indices, test_indices in k_fold_indices :

        model = classifier.fit(features[train_indices],
                           target[train_indices])

        k_score = model.score(features[test_indices],
                              target[test_indices])

        k_score_total = k_score_total + k_score

    # return the average accuracy
    return k_score_total/k_fold

In [101]:
for x in range (1,200,10):
    print crossValidate(features, target, RandomForestClassifier(x), 10, 0), x

0.766579275905 1
0.794581772784 11
0.810287141074 21
0.810312109863 31
0.808052434457 41
0.813670411985 51
0.804694132335 61
0.808052434457 71
0.809176029963 81
0.815917602996 91
0.809176029963 101
0.813670411985 111
0.814781523096 121
0.811435705368 131
0.810312109863 141
0.810287141074 151
0.806941323346 161
0.810274656679 171
0.811448189763 181
0.809176029963 191


In [102]:
create_submission(model, train_data, test_data, predictors, "submission.csv")