### Starting with loading and splitting the data

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split

# load data from data/train.csv into a pandas dataframe
df = pd.read_csv('data/train.csv')

y = df['Transported']

#copy X
X = df.copy(deep=True)
# split Cabin into Deck, Number and Side
X[['Cabin_Deck', 'Cabin_Number', 'Cabind_Side']] = X['Cabin'].str.split('/', expand=True)
# cast Cabin_number as int
X['Cabin_Number'] = X['Cabin_Number'].astype('float64')
# drop Cabin, PassengerId and Name
X.drop(['Cabin', 'PassengerId', 'Name', 'Transported'], axis=1, inplace=True)

# identify columns with bool data type
bool_cols = [col for col in X.columns if X[col].dtype == 'bool']
# set bool columns to int
X[bool_cols] = X[bool_cols].astype('int64')



X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.15, random_state=48)

### Create preprocessing pipeline

In [2]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, RobustScaler

# generate numerical columns and categorical column lists from X_train datatypes   
numerical_columns = X_train.select_dtypes(include=['float64', 'int64', 'bool']).columns
categorical_columns = X_train.select_dtypes(include=['object']).columns

# create inputers for numerical and categorical data
numerical_inputer = SimpleImputer(strategy='median')
categorical_inputer = SimpleImputer(strategy='most_frequent')
robust_scaler = RobustScaler()

# create one hot encoder for categorical data
one_hot_encoder = OneHotEncoder(handle_unknown='ignore')

# create column transformer for numerical and categorical data
preprocessor = ColumnTransformer(
    transformers=[
            ('num', Pipeline(steps = [('imputer', numerical_inputer),
                                      ('scaler', robust_scaler)]), numerical_columns),
            ('cat', Pipeline(steps = [('imputer', categorical_inputer),
                                      ('onehot', one_hot_encoder)]), categorical_columns)
])

# assemble the pipeline
preprocessing_pipeline = Pipeline(steps=[('preprocessor', preprocessor)])

# apply the pipeline to the training data
X_train_preprocessed = preprocessing_pipeline.fit_transform(X_train)
# apply to validation data
X_val_preprocessed = preprocessing_pipeline.transform(X_val)

# print the shape of the training and validation data
print("Training data shape:", X_train_preprocessed.shape)
print("Validation data shape:", X_val_preprocessed.shape)

Training data shape: (7389, 27)
Validation data shape: (1304, 27)


In [4]:
# Trying Deep Neural Network
#from tensorflow.keras.models import Sequential
#from tensorflow.keras.layers import Dense
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

# Create the neural network model
nn_model = Sequential([
    Dense(256, activation='relu', input_shape=(X_train_preprocessed.shape[1],)),
    Dense(128, activation='relu'),
    Dense(64, activation='relu'),
    Dense(1, activation='sigmoid')
])

nn_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
nn_model.fit(X_train_preprocessed, y_train, epochs=10, batch_size=32)

# Evaluate the model
nn_loss, nn_accuracy = nn_model.evaluate(X_val_preprocessed, y_val)
print("Neural Network Accuracy:", nn_accuracy)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Neural Network Accuracy: 0.8052147030830383


In [26]:

import keras_tuner as kt
from tensorflow.keras.layers import Dense, Dropout

def build_model(hp):
    model = Sequential()
    model.add(Dense(units=hp.Int('units', min_value=32, max_value=512, step=32),
                    activation='relu', input_shape=(X_train_preprocessed.shape[1],)))
    model.add(Dropout(hp.Float('dropout', min_value=0.0, max_value=0.5, default=0.25, step=0.05)))
    model.add(Dense(units=hp.Int('units', min_value=32, max_value=256, step=32)))
    model.add(Dropout(hp.Float('dropout', min_value=0.0, max_value=0.5, default=0.25, step=0.05)))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(optimizer='adam',
                  loss='binary_crossentropy',
                  metrics=['accuracy'])
    return model

tuner = kt.Hyperband(build_model,
                     objective='val_accuracy',
                     max_epochs=10,
                     directory='my_dir/test_02',
                     project_name='intro_to_kt')

tuner.search(X_train_preprocessed, y_train, epochs=10, validation_data=(X_val_preprocessed, y_val))

# Get the optimal hyperparameters
best_hps = tuner.get_best_hyperparameters(num_trials=1)[0]
print(best_hps.values)
# print best accuracy
print(tuner.get_best_models()[0].evaluate(X_val_preprocessed, y_val))

Trial 30 Complete [00h 00m 02s]
val_accuracy: 0.8174846768379211

Best val_accuracy So Far: 0.8190184235572815
Total elapsed time: 00h 00m 38s
{'units': 320, 'dropout': 0.30000000000000004, 'tuner/epochs': 2, 'tuner/initial_epoch': 0, 'tuner/bracket': 2, 'tuner/round': 0}
[0.41052091121673584, 0.8190184235572815]


In [18]:
best_model = tuner.get_best_models(num_models=1)[0]
best_model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 480)               13440     
                                                                 
 dense_1 (Dense)             (None, 480)               230880    
                                                                 
 dense_2 (Dense)             (None, 1)                 481       
                                                                 
Total params: 244801 (956.25 KB)
Trainable params: 244801 (956.25 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [25]:
# load the test data
df_test = pd.read_csv('data/test.csv')

X_test = df_test.copy(deep=True)
# split Cabin into Deck, Number and Side
X_test[['Cabin_Deck', 'Cabin_Number', 'Cabind_Side']] = X_test['Cabin'].str.split('/', expand=True)
# cast Cabin_number as int
X_test['Cabin_Number'] = X_test['Cabin_Number'].astype('float64')

# save passenger ids for submission
passenger_ids = X_test['PassengerId']

# drop Cabin, PassengerId and Name
X_test.drop(['Cabin', 'PassengerId', 'Name'], axis=1, inplace=True)

# identify columns with bool data type
bool_cols = [col for col in X_test.columns if X_test[col].dtype == 'bool']
# set bool columns to int
X_test[bool_cols] = X_test[bool_cols].astype('int64')

# apply the pipeline to the test data
X_test_preprocessed = preprocessing_pipeline.transform(X_test)

# use the best model to predict the test data
y_test_pred = best_model.predict(X_test_preprocessed)

# transform predictions into boolean values
y_test_submit = (y_test_pred > 0.5).astype(bool)

# create a dataframe with the passenger ids and predicted values
df_submit = pd.DataFrame({'PassengerId': passenger_ids, 'Transported': y_test_submit.flatten()})
# save the dataframe as a csv file
df_submit.to_csv('data/submit.csv', index=False)
print(df_submit.head())


  PassengerId  Transported
0     0013_01         True
1     0018_01        False
2     0019_01         True
3     0021_01         True
4     0023_01         True
