## Titanic word2vec -> Raw data

In [59]:
import numpy as np
import pandas as pd

from keras.models import Sequential
from keras.models import Model

from keras.layers import Input
from keras.layers import Dense
from keras.layers import Activation
from keras.layers import BatchNormalization
from keras.layers import Dropout
from keras.layers import initializers
from keras.layers import regularizers
from keras.callbacks import EarlyStopping

from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers.core import Dense, Activation, Flatten
from keras.layers.wrappers import TimeDistributed
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import LSTM
 

from keras.optimizers import Adam, Adagrad, Adadelta

import keras.backend as K
from keras.utils.np_utils import to_categorical

from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.feature_selection import chi2, SelectKBest

import statsmodels.formula.api as smf
import statsmodels.api as sm
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.pylab as pylab
import seaborn as sns
%matplotlib inline


In [2]:
from numpy.random import seed
seed(123)
from tensorflow import set_random_seed
set_random_seed(234)


In [38]:
train = pd.read_csv("../raw_data/titanic_train.csv")
train.shape

(891, 12)

### Prep for test submission

In [29]:
test = pd.read_csv("../raw_data/titanic_test.csv")

In [30]:
test.columns

Index(['PassengerId', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch',
       'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [31]:
Pass = pd.DataFrame(test.PassengerId)

In [32]:
Pass.shape

(418, 1)

## Merge data for processing

In [39]:
df = train.append(test, ignore_index = True) #restart here if needed

In [40]:
junk = df.drop(['Survived'], axis = 1)

In [None]:
y = df['Survived']

y.iloc[890, ]

y = y[:891, ]

y.tail(1)

y = pd.get_dummies(y, drop_first = True)

labels = np.asarray(y)

In [55]:
junk = junk.astype(str)
junk = junk.apply(lambda x: ' '.join(x), axis=1) 

In [58]:
print(junk.head(2))
junk.shape #should be a column vec

0    22.0 nan S 7.25 Braund, Mr. Owen Harris 0 1 3 ...
1    38.0 C85 C 71.2833 Cumings, Mrs. John Bradley ...
dtype: object


(1309,)

In [60]:
np.unique(y, return_counts = True) # pretty balanced

(array([0, 1], dtype=uint8), array([549, 342]))

In [61]:
tokenizer = Tokenizer(num_words = None)

In [62]:
tokenizer.fit_on_texts(junk)

In [63]:
sequences = tokenizer.texts_to_sequences(junk)

In [64]:
# compare, just note that the cleaned data is now a keras object, Junk is not cleaned
print(sequences[0:1])
print(junk[0:1])

[[35, 1, 2, 3, 11, 23, 654, 6, 655, 253, 1, 4, 7, 5, 4, 18, 13, 1189]]
0    22.0 nan S 7.25 Braund, Mr. Owen Harris 0 1 3 ...
dtype: object


In [65]:
word_index = tokenizer.word_index # type = dict

In [68]:
print('Found {} unique tokens.'.format(len(word_index)))

Found 4487 unique tokens.


In [69]:
data = pad_sequences(sequences, maxlen = None) #numpyarray of dim (samples, maxlen)

In [71]:
len(data[0])
len(data[1])

27

In [73]:
print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', labels.shape)

Shape of data tensor: (1309, 27)
Shape of label tensor: (891, 1)


In [74]:
rev_word_index = {v+3:k for k,v in word_index.items()}

In [75]:
rev_word_index[0] = 'padding_char'
rev_word_index[1] = 'start_char'
rev_word_index[2] = 'oov_char'
rev_word_index[3] = 'unk_char'

In [76]:
example_feat = ' '.join([rev_word_index[word] for word in data[3]])
example_feat #notice the zeros -> this is the padding. All lowercase, etc -> cleaned


'padding_char padding_char padding_char padding_char padding_char padding_char padding_char padding_char 20 start_char c85 unk_char 83 0 cumings 7 johnston 17599 6607 2833 heikkinen start_char 05 0 mr 0 c123'

In [77]:
data.shape

labels.shape

junk = data[:891, ]

(1309, 27)

In [172]:
X_test_f = data[891:]

In [173]:
X_test_f = X_test_f.reset_index(drop=True)

In [174]:
X_test_f.shape

(418, 27)

In [177]:
X_test_f = pd.DataFrame(X_test_f)

In [82]:
X_train, X_test, y_train, y_test = train_test_split(junk, labels,
                                                    test_size=0.3)


In [83]:
print(X_train.shape)
print(X_test.shape)
print(y_test.shape)
print(y_train.shape)

(623, 27)
(268, 27)
(268, 1)
(623, 1)


In [84]:
max_features = max([max(x) for x in X_train] + 
                   [max(x) for x in X_test]) + 1
max_features # number of features we have in the dictionary

3429

In [156]:
K.clear_session()

model = Sequential()
model.add(Embedding(max_features, 128))

model.add(LSTM(64, dropout=0.2, recurrent_dropout=0.2))
model.add(BatchNormalization())

model.add(Dense(32, activation='relu'))
model.add(BatchNormalization())
model.add(Dropout(0.3))

model.add(Dense(16, activation='relu'))
model.add(BatchNormalization())
model.add(Dropout(0.2))

model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

In [157]:
model.fit(X_train, y_train,
          batch_size=32,
          epochs=20,
          callbacks=[EarlyStopping(monitor='acc', patience=1)],
          validation_split=0.3)

Train on 436 samples, validate on 187 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20


<keras.callbacks.History at 0x13958ce80>

In [158]:
score, acc = model.evaluate(X_test, y_test)
print('Test score:', score)
print('Test accuracy:', acc)


Test score: 0.447656134171
Test accuracy: 0.817164178215


In [159]:
y_pred = model.predict(X_test)

y_pred_class = [ ]
for i in y_pred:
    if i >= 0.5:
        y_pred_class.append(1)
    else:
        y_pred_class.append(0)


confusion_matrix(y_test, y_pred_class)

array([[153,  17],
       [ 32,  66]])

In [160]:
print(classification_report(y_test, y_pred_class))

             precision    recall  f1-score   support

          0       0.83      0.90      0.86       170
          1       0.80      0.67      0.73        98

avg / total       0.82      0.82      0.81       268



In [161]:
model.save('titanic2_model.h5') 

In [162]:
model.evaluate(X_test, y_test)



[0.44765613417127237, 0.8171641782148561]

In [180]:
X_test_f.shape

(418, 27)

In [181]:
y_pred = model.predict(X_test_f)

InvalidArgumentError: indices[0,22] = 3429 is not in [0, 3429)
	 [[Node: embedding_1/Gather = Gather[Tindices=DT_INT32, Tparams=DT_FLOAT, validate_indices=true, _device="/job:localhost/replica:0/task:0/cpu:0"](embedding_1/embeddings/read, embedding_1/Cast)]]

Caused by op 'embedding_1/Gather', defined at:
  File "/Users/jacobanderson/anaconda/envs/tfdeeplearning/lib/python3.5/runpy.py", line 193, in _run_module_as_main
    "__main__", mod_spec)
  File "/Users/jacobanderson/anaconda/envs/tfdeeplearning/lib/python3.5/runpy.py", line 85, in _run_code
    exec(code, run_globals)
  File "/Users/jacobanderson/anaconda/envs/tfdeeplearning/lib/python3.5/site-packages/ipykernel_launcher.py", line 16, in <module>
    app.launch_new_instance()
  File "/Users/jacobanderson/anaconda/envs/tfdeeplearning/lib/python3.5/site-packages/traitlets/config/application.py", line 658, in launch_instance
    app.start()
  File "/Users/jacobanderson/anaconda/envs/tfdeeplearning/lib/python3.5/site-packages/ipykernel/kernelapp.py", line 477, in start
    ioloop.IOLoop.instance().start()
  File "/Users/jacobanderson/anaconda/envs/tfdeeplearning/lib/python3.5/site-packages/zmq/eventloop/ioloop.py", line 177, in start
    super(ZMQIOLoop, self).start()
  File "/Users/jacobanderson/anaconda/envs/tfdeeplearning/lib/python3.5/site-packages/tornado/ioloop.py", line 888, in start
    handler_func(fd_obj, events)
  File "/Users/jacobanderson/anaconda/envs/tfdeeplearning/lib/python3.5/site-packages/tornado/stack_context.py", line 277, in null_wrapper
    return fn(*args, **kwargs)
  File "/Users/jacobanderson/anaconda/envs/tfdeeplearning/lib/python3.5/site-packages/zmq/eventloop/zmqstream.py", line 440, in _handle_events
    self._handle_recv()
  File "/Users/jacobanderson/anaconda/envs/tfdeeplearning/lib/python3.5/site-packages/zmq/eventloop/zmqstream.py", line 472, in _handle_recv
    self._run_callback(callback, msg)
  File "/Users/jacobanderson/anaconda/envs/tfdeeplearning/lib/python3.5/site-packages/zmq/eventloop/zmqstream.py", line 414, in _run_callback
    callback(*args, **kwargs)
  File "/Users/jacobanderson/anaconda/envs/tfdeeplearning/lib/python3.5/site-packages/tornado/stack_context.py", line 277, in null_wrapper
    return fn(*args, **kwargs)
  File "/Users/jacobanderson/anaconda/envs/tfdeeplearning/lib/python3.5/site-packages/ipykernel/kernelbase.py", line 283, in dispatcher
    return self.dispatch_shell(stream, msg)
  File "/Users/jacobanderson/anaconda/envs/tfdeeplearning/lib/python3.5/site-packages/ipykernel/kernelbase.py", line 235, in dispatch_shell
    handler(stream, idents, msg)
  File "/Users/jacobanderson/anaconda/envs/tfdeeplearning/lib/python3.5/site-packages/ipykernel/kernelbase.py", line 399, in execute_request
    user_expressions, allow_stdin)
  File "/Users/jacobanderson/anaconda/envs/tfdeeplearning/lib/python3.5/site-packages/ipykernel/ipkernel.py", line 196, in do_execute
    res = shell.run_cell(code, store_history=store_history, silent=silent)
  File "/Users/jacobanderson/anaconda/envs/tfdeeplearning/lib/python3.5/site-packages/ipykernel/zmqshell.py", line 533, in run_cell
    return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
  File "/Users/jacobanderson/anaconda/envs/tfdeeplearning/lib/python3.5/site-packages/IPython/core/interactiveshell.py", line 2698, in run_cell
    interactivity=interactivity, compiler=compiler, result=result)
  File "/Users/jacobanderson/anaconda/envs/tfdeeplearning/lib/python3.5/site-packages/IPython/core/interactiveshell.py", line 2802, in run_ast_nodes
    if self.run_code(code, result):
  File "/Users/jacobanderson/anaconda/envs/tfdeeplearning/lib/python3.5/site-packages/IPython/core/interactiveshell.py", line 2862, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-156-52768ffd2a09>", line 4, in <module>
    model.add(Embedding(max_features, 128))
  File "/Users/jacobanderson/anaconda/envs/tfdeeplearning/lib/python3.5/site-packages/keras/models.py", line 467, in add
    layer(x)
  File "/Users/jacobanderson/anaconda/envs/tfdeeplearning/lib/python3.5/site-packages/keras/engine/topology.py", line 619, in __call__
    output = self.call(inputs, **kwargs)
  File "/Users/jacobanderson/anaconda/envs/tfdeeplearning/lib/python3.5/site-packages/keras/layers/embeddings.py", line 138, in call
    out = K.gather(self.embeddings, inputs)
  File "/Users/jacobanderson/anaconda/envs/tfdeeplearning/lib/python3.5/site-packages/keras/backend/tensorflow_backend.py", line 1211, in gather
    return tf.gather(reference, indices)
  File "/Users/jacobanderson/anaconda/envs/tfdeeplearning/lib/python3.5/site-packages/tensorflow/python/ops/gen_array_ops.py", line 1359, in gather
    validate_indices=validate_indices, name=name)
  File "/Users/jacobanderson/anaconda/envs/tfdeeplearning/lib/python3.5/site-packages/tensorflow/python/framework/op_def_library.py", line 763, in apply_op
    op_def=op_def)
  File "/Users/jacobanderson/anaconda/envs/tfdeeplearning/lib/python3.5/site-packages/tensorflow/python/framework/ops.py", line 2395, in create_op
    original_op=self._default_original_op, op_def=op_def)
  File "/Users/jacobanderson/anaconda/envs/tfdeeplearning/lib/python3.5/site-packages/tensorflow/python/framework/ops.py", line 1264, in __init__
    self._traceback = _extract_stack()

InvalidArgumentError (see above for traceback): indices[0,22] = 3429 is not in [0, 3429)
	 [[Node: embedding_1/Gather = Gather[Tindices=DT_INT32, Tparams=DT_FLOAT, validate_indices=true, _device="/job:localhost/replica:0/task:0/cpu:0"](embedding_1/embeddings/read, embedding_1/Cast)]]
