In [2]:
import pandas as pd

data = pd.read_csv('data/train.csv')
X = data.copy()
y = X.pop('Transported')

In [4]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
from sklearn.impute import SimpleImputer

cat_cols = [cname for cname in X.columns if
                    X[cname].nunique() < 10 and 
                    X[cname].dtype == "object"]

num_cols = [cname for cname in X.columns if 
                X[cname].dtype in ['int64', 'float64']]

def assign(cell):
    if type(cell) == float:
        return pd.Series([0,0,0])
    cell = cell.split('/')
    return pd.Series([cell[0], cell[1], cell[2]])

def bool_to_int(X, cols):
    for col in cols:
        X[col] = X[col] *1
    return X


def pandas_transform(x):
    if 'PassengerId' in list(x.columns):
        x.drop('PassengerId', axis=1, inplace=True)
    
    if 'Cabin' in list(x.columns):
        x[['deck','room_num','side']] = x.Cabin.apply(assign)
        x.drop('Cabin', axis=1, inplace=True)
    
    x[['deck', 'side']] = x[['deck', 'side']].astype(str)
    x['room_num'] = x['room_num'].astype(int)
    
    deck_mean = {'0': 0,
     'T': 1,
     'A': 127,
     'D': 207,
     'E': 313,
     'C': 508,
     'B': 572,
     'F': 1229,
     'G': 1321}
    
    g1 = [0,1,2,3]
    g1[0] = list(deck_mean.keys())[:2]
    g1[1] = list(deck_mean.keys())[2:5]
    g1[2] = list(deck_mean.keys())[5:7]
    g1[3] = list(deck_mean.keys())[7:]

    for y in range(len(g1)):
        x.deck.replace(g1[y], y, inplace=True)
        
    x = bool_to_int(x, ['VIP', 'CryoSleep'])
    
    return x

cat_transform_ohe = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('ohe', OneHotEncoder(handle_unknown='ignore'))
])

cat_transform_ord = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('ord', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1))
])

imputers = ColumnTransformer([
    ('cat_imputer_ohe', cat_transform_ohe, ['deck','HomePlanet','Destination']),
    ('cat_imputer_ord', cat_transform_ord, ['CryoSleep', 'VIP', 'side']),
    ('num_imputer', SimpleImputer(strategy='mean'), num_cols)
    ])

def make_pipeline(model):
    return Pipeline([
        ('imputers', imputers),
        ('num_transforms', StandardScaler())
        # ,('model', model)
        ])

Deep learning approach

In [1]:
import tensorflow as tf
from tensorflow import keras

In [5]:
X = pandas_transform(X)
pipe = make_pipeline(0)

In [22]:
X_transformed = pipe.fit_transform(X)

In [23]:
X_transformed.shape

(8693, 19)

In [28]:
from keras import layers

model = keras.Sequential([
    layers.Dense(50, input_shape=[19], activation='relu'),
    layers.Dropout(0.3),
    layers.Dense(20, activation='relu'),
    layers.Dropout(0.1),
    layers.Dense(10, activation='relu'),
    layers.Dense(1, activation='sigmoid')
])

In [29]:
model.compile(optimizer='adam',loss=keras.losses.binary_crossentropy, metrics=['accuracy'])
history = model.fit(X_transformed,y, validation_split=0.15, epochs=30)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


In [34]:
pd.read_csv('data/test.csv').PassengerId

0       0013_01
1       0018_01
2       0019_01
3       0021_01
4       0023_01
         ...   
4272    9266_02
4273    9269_01
4274    9271_01
4275    9273_01
4276    9277_01
Name: PassengerId, Length: 4277, dtype: object

In [35]:
X_test = pd.read_csv('data/test.csv')
X_test = pandas_transform(X_test)
X_test = pipe.transform(X_test)
preds = model.predict(X_test)

In [42]:
preds.reshape(4277).shape

(4277,)

In [44]:
output = pd.DataFrame({'PassengerId': pd.read_csv('data/test.csv').PassengerId,
                       'Transported': preds.reshape(4277)>0.5})
output.to_csv('./submission3.csv', index=False)

Results: best model so far, but not by that much: Scored 0.79869 in kaggle's competition, as opposed to 0.79378 obtained with traditional algorithms