In [1]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier

from sklearn.pipeline import Pipeline

In [3]:
from sklearn.base import BaseEstimator, TransformerMixin

class PreProcessing(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass

    def transform(self, dataset):

        dataset = dataset.drop(['Ticket', 'Cabin'], axis=1)
        dataset['Title'] = dataset.Name.str.extract(' ([A-Za-z]+)\.', expand=False)
        dataset['Title'] = dataset['Title'].replace(['Lady', 'Countess','Capt', 'Col',\
                                                         'Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Rare')
        dataset['Title'] = dataset['Title'].replace('Mlle', 'Miss')
        dataset['Title'] = dataset['Title'].replace('Ms', 'Miss')
        dataset['Title'] = dataset['Title'].replace('Mme', 'Mrs')

        title_mapping = {"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Rare": 5}
        dataset['Title'] = dataset['Title'].map(title_mapping)
        dataset['Title'] = dataset['Title'].fillna(0)

        dataset = dataset.drop(['Name'], axis=1)

        dataset['Sex'] = dataset['Sex'].map( {'female': 1, 'male': 0} ).astype(int)
        '''
        guess_ages = np.zeros((2,3))
        for i in range(0, 2):
            for j in range(0, 3):
                guess_df = dataset[(dataset['Sex'] == i) & \
                                      (dataset['Pclass'] == j+1)]['Age'].dropna()

                age_guess = guess_df.median()

                # Convert random age float to nearest .5 age
                guess_ages[i,j] = int( age_guess/0.5 + 0.5 ) * 0.5

        for i in range(0, 2):
            for j in range(0, 3):
                dataset.loc[ (dataset.Age.isnull()) & (dataset.Sex == i) & (dataset.Pclass == j+1),\
                        'Age'] = guess_ages[i,j]
        '''
        dataset['Age'] = dataset['Age'].fillna(40).astype(int)

        dataset.loc[ dataset['Age'] <= 16, 'Age'] = 0
        dataset.loc[(dataset['Age'] > 16) & (dataset['Age'] <= 32), 'Age'] = 1
        dataset.loc[(dataset['Age'] > 32) & (dataset['Age'] <= 48), 'Age'] = 2
        dataset.loc[(dataset['Age'] > 48) & (dataset['Age'] <= 64), 'Age'] = 3
        dataset.loc[ dataset['Age'] > 64, 'Age'] = 4

        dataset['FamilySize'] = dataset['SibSp'] + dataset['Parch'] + 1
        dataset['IsAlone'] = 0
        dataset.loc[dataset['FamilySize'] == 1, 'IsAlone'] = 1

        dataset = dataset.drop(['Parch', 'SibSp', 'FamilySize'], axis=1)

        dataset['Age*Class'] = dataset.Age * dataset.Pclass

        dataset['Embarked'] = dataset['Embarked'].fillna("S")

        dataset['Embarked'] = dataset['Embarked'].map( {'S': 0, 'C': 1, 'Q': 2} ).astype(int)

        dataset['Fare'].fillna(dataset['Fare'].dropna().median(), inplace=True)

        dataset.loc[ dataset['Fare'] <= 7.91, 'Fare'] = 0
        dataset.loc[(dataset['Fare'] > 7.91) & (dataset['Fare'] <= 14.454), 'Fare'] = 1
        dataset.loc[(dataset['Fare'] > 14.454) & (dataset['Fare'] <= 31), 'Fare']   = 2
        dataset.loc[ dataset['Fare'] > 31, 'Fare'] = 3
        dataset['Fare'] = dataset['Fare'].astype(int)

        return dataset.as_matrix()
    
    def fit(self, dataset, y=None):
        return self

In [4]:
pipe = Pipeline([('preprocessing', PreProcessing()), ('RF', RandomForestClassifier())])
pipe

Pipeline(memory=None,
     steps=[('preprocessing', PreProcessing()), ('RF', RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))])

In [5]:
df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')
X_train = df_train.drop(["Survived", "PassengerId"], axis=1)
Y_train = df_train["Survived"]
X_test  = df_test.drop("PassengerId", axis=1).copy()

In [6]:
pipe.fit(X_train, Y_train)

Pipeline(memory=None,
     steps=[('preprocessing', PreProcessing()), ('RF', RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))])

In [8]:
pipe.predict(X_test.loc[0:5])

array([0, 0, 0, 0, 1, 0], dtype=int64)

In [15]:
import dill as pickle
filename = 'model_v1.pk'
with open('./model/'+filename, 'wb') as file:
    pickle.dump(pipe, file)

In [18]:
with open('./model/'+filename ,'rb') as f:
    loaded_model = pickle.load(f)
loaded_model.predict(X_test.loc[0:5])

array([0, 0, 0, 0, 1, 0], dtype=int64)