In [1]:
import warnings

warnings.filterwarnings("ignore")

import matplotlib.pyplot as plt
import missingno
import numpy as np
import pandas as pd
import seaborn as sns
from fancyimpute import MICE
from IPython.core.debugger import set_trace
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.pipeline import FeatureUnion, Pipeline, make_pipeline
from sklearn.preprocessing import LabelEncoder

plt.style.use("fivethirtyeight")

Using TensorFlow backend.


In [2]:
# Load the Kaggle Titanic train/test splits and lower-case every column name
# so the rest of the notebook can refer to columns uniformly.
# NOTE(review): these are absolute, machine-specific paths; the notebook will
# not run elsewhere without editing them.
train = pd.read_csv(
    "/home/iyed/.kaggle/competitions/titanic/train.csv"
).rename(columns=str.lower)
test = pd.read_csv(
    "/home/iyed/.kaggle/competitions/titanic/test.csv"
).rename(columns=str.lower)

### Transformers

In [3]:
# https://zablo.net/blog/post/pandas-dataframe-in-scikit-learn-feature-union
from scipy import sparse
from sklearn.externals.joblib import Parallel, delayed
from sklearn.pipeline import FeatureUnion, _fit_transform_one, _transform_one


class PandasFeatureUnion(FeatureUnion):
    """FeatureUnion whose dense outputs are joined with ``pd.concat``.

    The sub-transformers are run through joblib exactly as in the parent
    class; only the final stacking differs. If any output is a scipy sparse
    matrix the outputs are stacked with ``sparse.hstack`` and returned as CSR,
    otherwise they are concatenated column-wise by
    ``merge_dataframes_by_column``.

    NOTE(review): relies on the private sklearn helpers ``_fit_transform_one``
    and ``_transform_one`` with the positional order used below; confirm the
    order against the installed sklearn version before upgrading.
    """

    def fit_transform(self, X, y=None, **fit_params):
        """Fit every sub-transformer on ``X``/``y`` and return the merged output."""
        self._validate_transformers()
        jobs = (
            delayed(_fit_transform_one)(transformer, weight, X, y, **fit_params)
            for _, transformer, weight in self._iter()
        )
        fitted = Parallel(n_jobs=self.n_jobs)(jobs)

        if not fitted:
            # All transformers are None
            return np.zeros((X.shape[0], 0))

        # Each job yields an (output, fitted transformer) pair.
        outputs, transformers = zip(*fitted)
        self._update_transformer_list(transformers)

        if any(sparse.issparse(out) for out in outputs):
            return sparse.hstack(outputs).tocsr()
        return self.merge_dataframes_by_column(outputs)

    def merge_dataframes_by_column(self, Xs):
        """Concatenate the pandas objects in ``Xs`` side by side."""
        return pd.concat(Xs, axis="columns", copy=False)

    def transform(self, X):
        """Apply every sub-transformer to ``X`` and return the merged output."""
        jobs = (
            delayed(_transform_one)(transformer, weight, X)
            for _, transformer, weight in self._iter()
        )
        outputs = Parallel(n_jobs=self.n_jobs)(jobs)

        if not outputs:
            # All transformers are None
            return np.zeros((X.shape[0], 0))

        if any(sparse.issparse(out) for out in outputs):
            return sparse.hstack(outputs).tocsr()
        return self.merge_dataframes_by_column(outputs)

In [4]:
class NoFitMixin:
    """Mixin for stateless transformers: provides a ``fit`` that learns nothing."""

    def fit(self, X, y=None):
        """Ignore ``X`` and ``y`` and return the estimator itself."""
        return self


class MICEImputer(BaseEstimator, TransformerMixin, NoFitMixin):
    """Stateless transformer that fills the gaps of a single column with MICE.

    ``fit`` is inherited from ``NoFitMixin`` and learns nothing: a fresh
    ``MICE`` solver is built and run on whatever data is passed to
    ``transform``, so imputed values are derived from that data alone.
    """

    def transform(self, X, *args, **kwargs):
        """
            Fill a 1-D array missing values with MICE

        Parameters
        ----------
        X : pd.Series
            Column to impute. It is copied, never modified in place.
        *args, **kwargs
            Forwarded to the ``MICE`` constructor.

        Returns
        -------
        pd.DataFrame
            Single-column frame built from the (imputed) copy of ``X``.
            The same container type is returned whether or not ``X`` had
            missing values. Previously the no-null path returned a Series,
            which made downstream ``pd.get_dummies`` one-hot encode the
            column only when the input happened to be complete, changing the
            number of output columns between data sets.
        """
        assert isinstance(X, pd.Series)
        X = X.copy()    # Should avoid error of already full for repeat execution
        # Per the original author's note, MICE is skipped when nothing is
        # missing to avoid the error it raises on fully observed input.
        if X.isnull().any():
            mice = MICE(verbose=False, *args, **kwargs)
            imputed = mice.complete(X.values.reshape(-1, 1))
            X.loc[:] = imputed.reshape(X.loc[:].shape)
        return pd.DataFrame(X)


class ColumnSelector(BaseEstimator, TransformerMixin, NoFitMixin):
    """Pick columns out of a DataFrame as a pipeline step.

    Parameters
    ----------
    columns
        Key used to index the input frame (``X[columns]``).
    one_col : bool, default True
        When true, only the first selected column is returned (via
        ``.iloc[:, 0]``); otherwise the whole selection is returned.
    """

    def __init__(self, columns, one_col=True):
        self.columns = columns
        self.one_col = one_col

    def transform(self, X, y=None):
        """Return the selected column(s) of ``X``; ``y`` is ignored."""
        selected = X[self.columns]
        return selected.iloc[:, 0] if self.one_col else selected


class ColumnDummifier(BaseEstimator, TransformerMixin, NoFitMixin):
    """One-hot encode the input with ``pd.get_dummies`` (sparse, first level dropped).

    Nothing is learned in ``fit``: the dummy columns are computed from the
    data passed to ``transform``, so they depend on the values present in
    that data.
    """

    def transform(self, X, y=None):
        """Return ``pd.get_dummies`` of ``X``; ``y`` is ignored."""
        dummy_options = {"sparse": True, "drop_first": True}
        return pd.get_dummies(X, **dummy_options)


class LabelEncoderWNaN(TransformerMixin, BaseEstimator):
    """
    Applies the sklearn.LabelEncoder while keeping missing values
    """

    def fit(self, X, y=None):
        """Fit a ``LabelEncoder`` on the non-null entries of ``X``; ``y`` is ignored."""
        self.le_ = LabelEncoder()
        observed = X.loc[X.notnull()]
        self.le_.fit(observed)
        return self

    def transform(self, X, y=None):
        """Return a float copy of ``X`` with non-null entries replaced by their codes.

        Null entries are left untouched, so they are still missing in the
        result. The input is deep-copied first and is never modified.
        """
        encoded = X.copy(deep=True)    # Do not apply transform to the actual DF
        present = encoded.notnull()
        encoded.loc[present] = self.le_.transform(encoded.loc[present])
        return encoded.astype("float")

## Explore

In [5]:
# Split the training frame into the target column and everything else.
train_y = train["survived"]
train_x = train.drop("survived", axis=1)

In [6]:
# Feature preparation for the benchmark model: each named branch selects its
# column(s) from the input frame and the union joins the results side by side.
benchmark_pipeline = Pipeline(steps=[
    (
        "prep",
        PandasFeatureUnion(transformer_list=[
            # age: single column, gaps filled by MICEImputer
            ("age", make_pipeline(ColumnSelector(["age"]), MICEImputer())),
            # sex: single column turned into dummy indicators
            ("sex_dummy", make_pipeline(ColumnSelector(["sex"]), ColumnDummifier())),
            # embarked: label-encode (keeping NaN), impute, then dummify
            (
                "embarked",
                make_pipeline(
                    ColumnSelector(["embarked"]),
                    LabelEncoderWNaN(),
                    MICEImputer(),
                    ColumnDummifier(),
                ),
            ),
            # fare: single column, gaps filled by MICEImputer
            ("fare", make_pipeline(ColumnSelector(["fare"]), MICEImputer())),
            # remaining columns are passed through unchanged as a frame
            (
                "rest",
                make_pipeline(
                    ColumnSelector(["parch", "sibsp", "pclass"], one_col=False)
                ),
            ),
        ]),
    ),
])

In [12]:
# Show the (lower-cased) column names of the training frame.
train.columns

Index(['passengerid', 'survived', 'pclass', 'name', 'sex', 'age', 'sibsp',
       'parch', 'ticket', 'fare', 'cabin', 'embarked'],
      dtype='object')

## Benchmark model

In [7]:
# Benchmark model: the feature-preparation pipeline followed by a
# logistic-regression classifier with default settings.
lr_pipeline = make_pipeline(benchmark_pipeline, LogisticRegression())

In [8]:
# Fit on the feature frame only, so the target column is never part of the
# model input and fitting uses the same frame that is later passed to
# `predict`. The selected feature columns are the same as before.
lr_pipeline.fit(train_x, train_y)

Pipeline(memory=None,
     steps=[('pipeline', Pipeline(memory=None,
     steps=[('prep', PandasFeatureUnion(n_jobs=1,
          transformer_list=[('age', Pipeline(memory=None,
     steps=[('columnselector', ColumnSelector(columns=['age'], one_col=True)), ('miceimputer', MICEImputer())])), ('sex_dummy', Pipeline(memory=None,
...ty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))])

In [9]:
# Accuracy on the rows the model was fitted on (an in-sample score, not a
# held-out estimate).
accuracy_score(train_y, lr_pipeline.predict(train_x))

0.8024691358024691

In [10]:
# Load an external spreadsheet that is scored with the fitted pipeline below.
# NOTE(review): absolute, machine-specific path; the notebook will not run
# elsewhere without editing it.
validate = pd.read_excel("/home/iyed/Downloads/titanic3.xls")

In [11]:
# Accuracy of the fitted pipeline against the `survived` column of the
# external sheet.
# NOTE(review): if titanic3.xls contains the passengers of the Kaggle training
# file, this score is not an independent estimate; confirm the overlap.
accuracy_score(validate.survived, lr_pipeline.predict(validate))

0.7899159663865546