In [19]:
import pandas as pd
import sklearn
import numpy as np
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline  import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

train = pd.read_csv('./titanic/train.csv')
test = pd.read_csv('./titanic/test.csv')

# Подготовка данных

In [2]:
train_dummies = pd.get_dummies(train, columns=['Sex', 'Pclass', 'Embarked'])
test_dummies = pd.get_dummies(test, columns=['Sex', 'Pclass', 'Embarked'])


In [3]:
X_train = train_dummies.drop(['PassengerId', 'Survived', 'Name', 'Ticket', 'Cabin'], axis=1)
X_test = test_dummies.drop(['PassengerId', 'Name', 'Ticket', 'Cabin'], axis=1)

In [4]:
y_train = train_dummies['Survived']

In [5]:
from sklearn.preprocessing import Imputer

imputer = Imputer(missing_values='NaN', strategy='mean', axis=0, verbose=0, copy=True)
imputer.fit(X_train)
X_train_fit = imputer.transform(X_train)

imputer.fit(X_test)
X_test_fit = imputer.transform(X_test)

In [6]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler(copy=True, with_mean=True, with_std=True)

scaler.fit(X_train_fit)
X_train_scaled = scaler.transform(X_train_fit)

scaler.fit(X_test_fit)
X_test_scaled = scaler.transform(X_test_fit)

# LogisticRegression

In [7]:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression().fit(X_train_scaled, y_train)

In [121]:
predicted = model.predict(X_train_scaled)
predictedY = model.predict(X_test_scaled)
accuracy_score(predicted, y_train)

0.80471380471380471

In [21]:
kfold = KFold(n_splits=3, shuffle=True, random_state=145)

In [122]:
scores = cross_val_score(LogisticRegression(),
                         X_train_scaled, y_train, cv = kfold)
np.average(scores)

0.79012345679012341

# RandomForestClassifier

In [202]:
pipelineCheck = Pipeline(steps=[ ('imputer', imputer),
                            ('scale', scaler),
                           ('model', Ridge(alpha=16.0)) ] )

param_grid = {'model': [Ridge(alpha=10.5),
                        RandomForestClassifier(n_estimators=100, criterion='gini', max_depth=15, 
                                               min_samples_split=2, min_samples_leaf=3, 
                                               min_weight_fraction_leaf=0.0, max_features='auto', 
                                               max_leaf_nodes=None, min_impurity_split=1e-07, 
                                               bootstrap=True, oob_score=False, n_jobs=1, random_state=None, 
                                               verbose=0, warm_start=False, class_weight=None)]}

grid = GridSearchCV(pipelineCheck, param_grid, 
#                    scoring=make_scorer(r2_score),
                    cv=kfold)

In [207]:
model2 = grid.fit(X_train_scaled, y_train)
print('mean_test_score:', model2.cv_results_['mean_test_score'], end='\t')
predicted2 = grid.best_estimator_.predict(X_train_scaled)
print('accuracy_score:', accuracy_score(predicted2, y_train), end='\t')

scores2 = cross_val_score(RandomForestClassifier(),
                         X_train_scaled, y_train, cv = kfold)
print('average score:', np.average(scores2), end='\t')



mean_test_score: [ 0.37834376  0.82042649]	accuracy_score: 0.879910213244	average score: 0.79012345679	

# Расчет целевого показателя

In [197]:
predictedY2 = model2.predict(X_test_scaled)

# Создание файла

In [194]:
with open('submission4.csv', 'w') as out:
    out.write('PassengerId,Survived\n')
    for passenger, y in zip(test['PassengerId'], predictedY2):
        out.write('%s,%s\n' % (passenger, y))

В основном, оптимизировался расчет с использованием RandomForestClassifier по нескольким параметрам (количество деревьев, максимальная глубина, минимальная выборка в листе дерева). 
Оценка по кросс-валидации последней итерации RandomForestClassifier: 0.7957; LogisticRegression: 0.7901.
Оценка в таблице участников конкурса: 0.77512.