In [121]:
import pandas as pd
import numpy as np

from feature_engine.encoding import WoEEncoder
from feature_engine.imputation import (CategoricalImputer, 
                                       MeanMedianImputer)

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression

In [7]:
df_train = pd.read_csv('../dataset/train.csv')
df_test = pd.read_csv('../dataset/test.csv')

* Data preprocessing
* Linear model usage

In [126]:
TARGET = 'Transported'

FEATURES = [
    'HomePlanet',
    'CryoSleep',
    'Destination',
    'Age',
    'VIP',
    'RoomService',
    'FoodCourt',
    'ShoppingMall',
    'Spa',
    'VRDeck',
    'Cabin'
]

NUMERICAL_VARS = [
    'Age',
    'RoomService',
    'FoodCourt',
    'ShoppingMall',
    'Spa',
    'VRDeck'
]

CATEGORICAL_VARS = [
    'HomePlanet',
    'Destination',
    'Cabin',
    'VIP',
    'CryoSleep',
    'CabinDeck',
    'CabinSide'
]

NUM_MEAN_IMPUTE_VARS = [
    'Age',
    'RoomService',
    'FoodCourt',
    'ShoppingMall',
    'Spa',
    'VRDeck'
]

CAT_WOE_ENCODING = [
    'HomePlanet',
    'Destination',
    'CabinDeck',
    'CabinSide'
]

CAT_MISSING_IMPUTE_VARS = [
    'Cabin',
    'Destination',
    'HomePlanet',
]

CAT_ARBITRARY_IMPUTE_VARS = [
    'CryoSleep',
    'VIP'
]

UNUSED_FIELDS = [
    'PassengerId',
    'Name'
]

In [138]:
from sklearn.base import BaseEstimator, TransformerMixin

class CabinPreprocess(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass
        
    def fit(self, X, y=None):
        return self
        
    def transform(self, X):
        
        def helper_na(x):
            if x == 'NA':
                return ['NA', 'NA', 'NA']
            else:
                return x.split('/')
        
        splitted_series = X['Cabin'].apply(helper_na)
        X['CabinDeck'] = splitted_series.apply(lambda x: x[0])
        X['CabinSide'] = splitted_series.apply(lambda x: x[2])
        X.drop('Cabin', axis=1, inplace=True)
        return X
        

In [162]:
pipeline = Pipeline([
    (
        'CatMissingImputer', 
        CategoricalImputer(
             fill_value='NA',
             variables=CAT_MISSING_IMPUTE_VARS
         )
    ),
    (
        'CatMissingArbitraryImputer',
        CategoricalImputer(
            variables=CAT_ARBITRARY_IMPUTE_VARS,
            fill_value=False,
        )
    ),
    (
        'CabinPreprocess',
        CabinPreprocess()
    ),
    (
        'NumMeanImputer',
        MeanMedianImputer(
             variables=NUM_MEAN_IMPUTE_VARS
         )
    ),
    (
        'CatWOEEncoder',
        WoEEncoder(
            variables=CAT_WOE_ENCODING
        )
    ),
    (
        'StandardScaler',
        StandardScaler()
    ),
    (
        'LogisticRegression',
        LogisticRegression()
    )
])

In [163]:
X_, X, y_, y = train_test_split(df_train[FEATURES], df_train[TARGET])

In [164]:
pipeline.fit(X_, y_)

In [165]:
from sklearn.metrics import accuracy_score

In [167]:
accuracy_score(y, pipeline.predict(X))

0.795768169273229

In [161]:
pipeline.predict(df_test[FEATURES])

array([ True, False,  True, ...,  True,  True,  True])