In [1]:
import numpy as np
import pandas as pd

In [2]:
def load_train_data(path):
    train = pd.read_csv(path)
    
    train['Sex'][train['Sex'] == 'male'] = 0
    train['Sex'][train['Sex'] == 'female'] = 1
    train['Sex'] = train['Sex'].astype(int)
    
    fill_train = train.iloc[:, [0,1,2,4,5,6,7,9]]
    med = fill_train['Age'].median()
    fill_train.loc[:, 'Age'] = fill_train['Age'].fillna(med)
    
    fill_train = fill_train.values
    ids = fill_train[:, 0]
    labels = fill_train[:, 1]
    data = fill_train[:, 2:]
    
    return ids, data, labels, med

In [7]:
train_ids, train_data, train_labels, med = load_train_data('data/train.csv')
print(train_data.shape)

# データを揃える
## 平均取る
train_mean = np.mean(train_data, axis=0)
train_std = np.std(train_data, axis=0)

## 正規化 (normlize / standard)
## 分布を揃える
## (trainデータ - 平均) / 分散
train_data = (train_data - train_mean) / train_std

# 学習データの準備
## validation data
val_data = train_data[-100:] # 後ろから100番目とる
val_labels = train_labels[-100:]
## train data
train_data = train_data[:-100] # 後ろから数えて100個目まで取る
train_labels = train_labels[:-100]

(891, 6)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


In [4]:
train_mean

array([ 2.30864198,  0.35241302, 29.36158249,  0.52300786,  0.38159371,
       32.20420797])

In [6]:
train_std

array([ 0.83560193,  0.47772176, 13.01238827,  1.10212444,  0.80560476,
       49.66553444])

In [8]:
import tensorflow as tf
from tensorflow.keras import layers

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [11]:
class TitanicModel(tf.keras.Model):
    def __init__(self, output_dim):
        super(TitanicModel, self).__init__()
        # activation : 関数
        self.dense1 = layers.Dense(16, activation='relu') # 16次元に拡張
        self.dense2 = layers.Dense(16, activation='relu') # 16次元の層
        # 0~1で収める処理
        self.out = layers.Dense(output_dim, activation='sigmoid') # 1次元で0,1判定するときにsigmoidを利用. softmax
        
    def call(self, inputs):
        # 分岐判定を作るならここで
        # inputの次元定義しなくても勝手に設定してくれる
        # callしたら、一番最初の次元固定で処理する
        ## keras : define and run (対義語 difine by run)
        x = self.dense1(inputs)
        x = self.dense2(x)
        x = self.out(x)
        return x

In [12]:
def scheduler(epoch):
  # epoch : データを
  if epoch < 25:
    return 0.001
  else:
    return 0.001 * 0.9 ** (epoch - 24)

In [16]:
from datetime import datetime

model = TitanicModel(output_dim=1)
# optimizer : 基本何やっているか分からない
# Adamが自動で調整してくれる. learning_rate : うまくいかないときに調整するparams
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
              # lossについて
              # sigmoid : negative log_lossを利用
              # softmax -> cross entropy
              # 回帰 -> mean squared loss, L1

              loss=tf.losses.log_loss,
              # loss='binary_crossentropy',
              metrics=['accuracy'])

ckpt_path = './ckpt/titanic/titanic'
logdir="./logs/titanic-" + datetime.now().strftime("%Y%m%d-%H%M%S")
# logdir = "./logs"
# ModelCheckpoint : 重みを自動で付けてくれる
# TensorBoard : どう学習が進んでいるのか見れるようにする
## tensorboard --logdir=logs で実行.localhost:6006で閲覧
# LearningRateScheduler : 重みの更新と学習が進むごとのlearning_rateのチューニング
callbacks = [tf.keras.callbacks.ModelCheckpoint(ckpt_path, save_best_only=True, monitor='val_acc'),
             tf.keras.callbacks.TensorBoard(log_dir=logdir, histogram_freq=1),
             tf.keras.callbacks.LearningRateScheduler(scheduler),
            ]
# ⊿Wの計算
# batch : 1回の重みの更新
# batch_size=4 : 1 itelationで4サンプルの使用していることを意味
# 1 ecpock = (781/4). epochs = 50で 1epocksを50回実施
model.fit(train_data, train_labels, batch_size=4, epochs=50, callbacks=callbacks,
          validation_data=(val_data, val_labels))

Train on 791 samples, validate on 100 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<tensorflow.python.keras.callbacks.History at 0x129ee6e48>

In [17]:
cnt = 0
for IDX in range(100):
    predict = model.predict(val_data[IDX:IDX + 1])[0][0]
    if predict > 0.5:
        pred_label = 1
    else:
        pred_label = 0
    # print(pred_label, val_labels[IDX], pred_label == val_labels[IDX])
    if pred_label == val_labels[IDX]:
        cnt += 1
print(cnt)

87


In [18]:
model.load_weights(ckpt_path)

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x12a963978>

In [19]:
def load_test_data(path, med):
    test = pd.read_csv(path)
    
    test['Sex'][test['Sex'] == 'male'] = 0
    test['Sex'][test['Sex'] == 'female'] = 1
    test['Sex'] = test['Sex'].astype(int)
    
    fill = test.iloc[:, [0,1,3,4,5,6,8]]
    fill.loc[:, 'Age'] = fill['Age'].fillna(med)
    
    fill = fill.values
    ids = fill[:, 0]
    data = fill[:, 1:]
    
    return ids, data

In [20]:
test_ids, test_data = load_test_data('data/test.csv', med)
test_data = (test_data - train_mean) / train_std

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


In [22]:
# test dataを導入
predictions = model.predict(test_data)

result = []
for id, pred in zip (test_ids, predictions):
    if pred[0] < 0.5:
        label = 0
    else:
        label = 1
    result.append('%d,%d' % (id, label))
    
with open('result.csv', 'w') as fout:
    fout.write('PassengerId,Survived\n')
    fout.write('\n'.join(result))

In [23]:
len(test_data)

418

In [24]:
predictions

array([[0.11777797],
       [0.3803744 ],
       [0.09675562],
       [0.12236565],
       [0.5581681 ],
       [0.19540974],
       [0.6065711 ],
       [0.25720906],
       [0.6471713 ],
       [0.09003505],
       [0.12126753],
       [0.31761548],
       [0.9880656 ],
       [0.08899257],
       [0.98172957],
       [0.92703974],
       [0.15615001],
       [0.14009541],
       [0.5646351 ],
       [0.5163789 ],
       [0.29926986],
       [0.48647267],
       [0.97973514],
       [0.6829626 ],
       [0.82455117],
       [0.07786891],
       [0.98054814],
       [0.13452405],
       [0.34836778],
       [0.07570651],
       [0.11498386],
       [0.1680682 ],
       [0.39165708],
       [0.45663774],
       [0.46157908],
       [0.15310279],
       [0.61236715],
       [0.6341326 ],
       [0.12722161],
       [0.16634741],
       [0.1332728 ],
       [0.40318978],
       [0.11411482],
       [0.92096376],
       [0.9810474 ],
       [0.12618506],
       [0.32811046],
       [0.121