In [8]:
# pandas
import pandas as pd
from pandas import Series,DataFrame

# numpy, matplotlib, seaborn
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')
%matplotlib inline

# machine learning
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB

from sklearn.cross_validation import cross_val_score
from sklearn.grid_search import GridSearchCV

In [9]:
def clean(data):
    data["Age"]      = data["Age"].fillna(data["Age"].mean())
    data["Embarked"] = data["Embarked"].fillna("S")
    data.loc[data["Sex"] == "male", "Sex"] = 0
    data.loc[data["Sex"] == "female", "Sex"] = 1
    data.loc[data["Embarked"] == "S", "Embarked"] = 0
    data.loc[data["Embarked"] == "C", "Embarked"] = 1
    data.loc[data["Embarked"] == "Q", "Embarked"] = 2
    return data

In [10]:
# get titanic & test csv files as a DataFrame
titanic_df = pd.read_csv("./train.csv", dtype={"Age": np.float64}, )
test_df    = pd.read_csv("./test.csv", dtype={"Age": np.float64}, )
train_clean = clean(titanic_df)
test_clean  = clean(test_df)

In [11]:
predictors = ["Pclass","Sex","Age","Embarked","SibSp"]

In [12]:
xtrain = train_clean[predictors]
ytrain = train_clean['Survived']
ytest = test_clean[predictors]

In [13]:
RF = RandomForestClassifier(n_estimators=50)

In [14]:
# define the parameter values that should be searched
k_range = range(1, 101)
maxdepth_options = range(1,10)
# create a parameter grid: map the parameter names to the values that should be searched
param_grid = dict(n_estimators=k_range, max_depth=maxdepth_options)
# instantiate and fit the grid
grid = GridSearchCV(RF, param_grid, cv=10, scoring='accuracy')
grid.fit(xtrain, ytrain)

GridSearchCV(cv=10, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=50, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
       fit_params={}, iid=True, loss_func=None, n_jobs=1,
       param_grid={'n_estimators': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100], 'max_depth': [1, 2, 3, 4, 5, 6, 7, 8, 9]},
       pre_dispatch='2*n_jobs', refit=True, score_func

In [15]:
# examine the best model
print grid.best_score_
print grid.best_params_

0.836139169473
{'n_estimators': 20, 'max_depth': 4}


In [16]:
RFBest = RandomForestClassifier(n_estimators=20,max_depth=4)
RFBest.fit(xtrain,ytrain)
RFBest.score(xtrain,ytrain)

0.84175084175084181

In [17]:
knn = KNeighborsClassifier(n_neighbors = 6,weights='uniform')

In [18]:
# define the parameter values that should be searched
k_range = range(1, 101)
weights_options = ['uniform','distance']
# create a parameter grid: map the parameter names to the values that should be searched
param_grid_knn = dict(n_neighbors=k_range, weights=weights_options)
# instantiate and fit the grid
grid = GridSearchCV(knn, param_grid_knn, cv=10, scoring='accuracy')
grid.fit(xtrain, ytrain)

GridSearchCV(cv=10, error_score='raise',
       estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_neighbors=6, p=2, weights='uniform'),
       fit_params={}, iid=True, loss_func=None, n_jobs=1,
       param_grid={'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100], 'weights': ['uniform', 'distance']},
       pre_dispatch='2*n_jobs', refit=True, score_func=None,
       scoring='accuracy', verbose=0)

In [19]:
# examine the best model
print grid.best_score_
print grid.best_params_

0.789001122334
{'n_neighbors': 17, 'weights': 'distance'}


In [21]:
KNNBest = KNeighborsClassifier(n_neighbors=17,weights='distance')
KNNBest.fit(xtrain,ytrain)
KNNBest.score(xtrain,ytrain)

0.92480359147025815

In [22]:
LG = LogisticRegression(C=1.0, penalty='l1', tol=0.01)

In [23]:
# define the parameter values that should be searched
C_range = [0.01, 0.1, 1.0, 10.0, 100.]
penalty_options = ['l1','l2']
# create a parameter grid: map the parameter names to the values that should be searched
param_grid_lg = dict(C=C_range, penalty=penalty_options)
# instantiate and fit the grid
grid = GridSearchCV(LG, param_grid_lg, cv=10, scoring='accuracy')
grid.fit(xtrain, ytrain)

GridSearchCV(cv=10, error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr',
          penalty='l1', random_state=None, solver='liblinear', tol=0.01,
          verbose=0),
       fit_params={}, iid=True, loss_func=None, n_jobs=1,
       param_grid={'penalty': ['l1', 'l2'], 'C': [0.01, 0.1, 1.0, 10.0, 100.0]},
       pre_dispatch='2*n_jobs', refit=True, score_func=None,
       scoring='accuracy', verbose=0)

In [24]:
# examine the best model
print grid.best_score_
print grid.best_params_

0.800224466891
{'penalty': 'l2', 'C': 0.1}


In [25]:
LGBest = LogisticRegression(C=0.1,penalty='l2')
LGBest.fit(xtrain,ytrain)
LGBest.score(xtrain,ytrain)

0.8058361391694725

In [26]:
from sklearn.ensemble import VotingClassifier

ImportError: cannot import name VotingClassifier

In [28]:
ytest = test_clean[predictors]
predicted = KNNBest.predict(ytest)
submission = pd.DataFrame({
        "PassengerId": test_df["PassengerId"],
        "Survived": predicted
    })
submission.to_csv("result_knn.csv", index=False)