### Starting with loading and splitting the data

#### This version has worse performance.
- Calculated TotalSpending column.
- Then made categorical column binning the TotalSpending column

#### Will create a v3 that uses the TotalSpending column without bins.

#### Did not complete the analysis with processing of test data

In [10]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split

# load data from data/train.csv into a pandas dataframe
df = pd.read_csv('data/train.csv')

y = df['Transported'].astype('int64')

#copy X
X = df.copy(deep=True)
# split Cabin into Deck, Number and Side
X[['Cabin_Deck', 'Cabin_Number', 'Cabin_Side']] = X['Cabin'].str.split('/', expand=True)
# cast Cabin_number as int
X['Cabin_Number'] = X['Cabin_Number'].astype('float64')
# drop Cabin, PassengerId and Name
X.drop(['Cabin', 'PassengerId', 'Name', 'Transported'], axis=1, inplace=True)

# convert CryoSleep to bool
#X['CryoSleep'] = X['CryoSleep'].astype('bool')

# convert VIP to bool
#X['VIP'] = X['VIP'].astype('bool')

# identify columns with bool data type
bool_cols = [col for col in X.columns if X[col].dtype == 'bool']
# set bool columns to int
X[bool_cols] = X[bool_cols].astype('int64')

# create TotalSpending column
onboard_spending = ['FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'RoomService']
X['TotalSpending'] = X[onboard_spending].sum(axis=1, skipna=True)

# create categorical column for TotalSpending
X['TotalSpendingBin'] = pd.cut(X['TotalSpending'], 
                                         bins=[0, 1, 500, 1000, np.inf],
                                         labels=['zero', 'low', 'medium', 'high'],
                                         right=False)

# drop numeric columns for spending
X.drop(onboard_spending, axis=1, inplace=True)
X.drop(['TotalSpending'], axis=1, inplace=True)


X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.15, random_state=48)
print(X_train.head(10))
# print data types in X_train
X_train.dtypes

     HomePlanet  CryoSleep    Destination   Age  VIP Cabin_Deck  Cabin_Number  \
2461        NaN          0    TRAPPIST-1e  24.0    0          G         430.0   
250       Earth          0    55 Cancri e  50.0    0          F          61.0   
1458      Earth          0    TRAPPIST-1e  47.0    0          F         298.0   
2280       Mars          1    TRAPPIST-1e   7.0    0          F         508.0   
7465       Mars          0    TRAPPIST-1e  21.0    0          F        1655.0   
2628       Mars          0    55 Cancri e  43.0    0          F         538.0   
122       Earth          1  PSO J318.5-22  22.0    0          G          22.0   
572       Earth          1    TRAPPIST-1e  19.0    0          G          89.0   
1645     Europa          0    TRAPPIST-1e  28.0    0          E         130.0   
185       Earth          1  PSO J318.5-22  19.0    0          G          32.0   

     Cabin_Side TotalSpendingBin  
2461          P           medium  
250           P             high  
145

HomePlanet            object
CryoSleep              int64
Destination           object
Age                  float64
VIP                    int64
Cabin_Deck            object
Cabin_Number         float64
Cabin_Side            object
TotalSpendingBin    category
dtype: object

### Create preprocessing pipeline

In [11]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, RobustScaler

# generate numerical columns and categorical column lists from X_train datatypes   
numerical_columns = X_train.select_dtypes(include=['float64', 'int64', 'bool']).columns
categorical_columns = X_train.select_dtypes(include=['object']).columns

# create inputers for numerical and categorical data
numerical_inputer = SimpleImputer(strategy='median')
categorical_inputer = SimpleImputer(strategy='most_frequent')
robust_scaler = RobustScaler()

# not sure if this robust scaler is better
#robust_scaler = RobustScaler(quantile_range=(10.0, 90.0), unit_variance=True)

# create one hot encoder for categorical data
one_hot_encoder = OneHotEncoder(handle_unknown='ignore')

# create column transformer for numerical and categorical data
preprocessor = ColumnTransformer(
    transformers=[
            ('num', Pipeline(steps = [('imputer', numerical_inputer),
                                      ('scaler', robust_scaler)]), numerical_columns),
            ('cat', Pipeline(steps = [('imputer', categorical_inputer),
                                      ('onehot', one_hot_encoder)]), categorical_columns)
])

# assemble the pipeline
preprocessing_pipeline = Pipeline(steps=[('preprocessor', preprocessor)])

# apply the pipeline to the training data
X_train_preprocessed = preprocessing_pipeline.fit_transform(X_train)
# apply to validation data
X_val_preprocessed = preprocessing_pipeline.transform(X_val)

# print the shape of the training and validation data
print("Training data shape:", X_train_preprocessed.shape)
print("Validation data shape:", X_val_preprocessed.shape)

Training data shape: (7389, 20)
Validation data shape: (1304, 20)


In [12]:
# Trying Deep Neural Network
#from tensorflow.keras.models import Sequential
#from tensorflow.keras.layers import Dense
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

# Create the neural network model
nn_model = Sequential([
    Dense(256, activation='relu', input_shape=(X_train_preprocessed.shape[1],)),
    Dense(128, activation='relu'),
    Dense(64, activation='relu'),
    Dense(1, activation='sigmoid')
])

nn_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
nn_model.fit(X_train_preprocessed, y_train, epochs=10, batch_size=32)

# Evaluate the model
nn_loss, nn_accuracy = nn_model.evaluate(X_val_preprocessed, y_val)
print("Neural Network Accuracy:", nn_accuracy)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Neural Network Accuracy: 0.7300613522529602


In [13]:

import keras_tuner as kt
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.regularizers import l2

def build_model(hp):
    model = Sequential()
    model.add(Dense(units=hp.Int('units', min_value=32, max_value=512, step=32),
                    activation='relu', input_shape=(X_train_preprocessed.shape[1],)))
    model.add(Dropout(hp.Float('dropout', min_value=0.0, max_value=0.5, default=0.25, step=0.05)))
    model.add(Dense(units=hp.Int('units', min_value=32, max_value=256, step=32),
                    activation='relu',))
    model.add(Dropout(hp.Float('dropout', min_value=0.0, max_value=0.5, default=0.25, step=0.05)))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(optimizer='adam',
                  loss='binary_crossentropy',
                  metrics=['accuracy'])
    return model

tuner = kt.Hyperband(build_model,
                     objective='val_accuracy',
                     max_epochs=10,
                     directory='my_dir/test_06',
                     project_name='intro_to_kt')

tuner.search(X_train_preprocessed, y_train, epochs=10, validation_data=(X_val_preprocessed, y_val))

# Get the optimal hyperparameters
best_hps = tuner.get_best_hyperparameters(num_trials=1)[0]
print(best_hps.values)
# print best accuracy
print(tuner.get_best_models()[0].evaluate(X_val_preprocessed, y_val))

Trial 30 Complete [00h 00m 02s]
val_accuracy: 0.7277607321739197

Best val_accuracy So Far: 0.737730085849762
Total elapsed time: 00h 00m 36s
{'units': 224, 'dropout': 0.25, 'tuner/epochs': 10, 'tuner/initial_epoch': 0, 'tuner/bracket': 0, 'tuner/round': 0}
[0.511960506439209, 0.737730085849762]


In [14]:
best_model = tuner.get_best_models(num_models=1)[0]
best_model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 224)               4704      
                                                                 
 dropout (Dropout)           (None, 224)               0         
                                                                 
 dense_1 (Dense)             (None, 224)               50400     
                                                                 
 dropout_1 (Dropout)         (None, 224)               0         
                                                                 
 dense_2 (Dense)             (None, 1)                 225       
                                                                 
Total params: 55329 (216.13 KB)
Trainable params: 55329 (216.13 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [40]:
# load the test data
df_test = pd.read_csv('data/test.csv')

X_test = df_test.copy(deep=True)
# split Cabin into Deck, Number and Side
X_test[['Cabin_Deck', 'Cabin_Number', 'Cabind_Side']] = X_test['Cabin'].str.split('/', expand=True)
# cast Cabin_number as int
X_test['Cabin_Number'] = X_test['Cabin_Number'].astype('float64')

# save passenger ids for submission
passenger_ids = X_test['PassengerId']

# drop Cabin, PassengerId and Name
X_test.drop(['Cabin', 'PassengerId', 'Name'], axis=1, inplace=True)

# identify columns with bool data type
bool_cols = [col for col in X_test.columns if X_test[col].dtype == 'bool']
# set bool columns to int
X_test[bool_cols] = X_test[bool_cols].astype('int64')

# apply the pipeline to the test data
X_test_preprocessed = preprocessing_pipeline.transform(X_test)

# use the best model to predict the test data
y_test_pred = best_model.predict(X_test_preprocessed)

# transform predictions into boolean values
y_test_submit = (y_test_pred > 0.5).astype(bool)

# create a dataframe with the passenger ids and predicted values
df_submit = pd.DataFrame({'PassengerId': passenger_ids, 'Transported': y_test_submit.flatten()})
# save the dataframe as a csv file
df_submit.to_csv('data/submit_DNN_05.csv', index=False)
print(df_submit.head())


  PassengerId  Transported
0     0013_01        False
1     0018_01        False
2     0019_01         True
3     0021_01         True
4     0023_01        False
