In [1]:
# タイタニック生存予測

In [2]:
# ドライブのマウント
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
%cd /content/drive/MyDrive/kaggle/titanic

/content/drive/MyDrive/kaggle/titanic


In [4]:
import pandas as pd
import numpy as np

In [5]:
# データの読み込み
train = pd.read_csv("data/train.csv")
test = pd.read_csv("data/test.csv")

In [6]:
# データの先頭の５行を表示
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [7]:
# データの型を調べる
train.dtypes

PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object

In [8]:
# nullかどうかを調べてから、その数を調べる
train.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [9]:
test.dtypes

PassengerId      int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object

In [10]:
test.isnull().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64

In [11]:
x = train[['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']]
y = train[['Survived']]

In [12]:
# one-hotエンコーディングをする
# 展開してそれぞれ0か１を入れる
x = pd.get_dummies(x, columns=['Pclass', 'Sex','Embarked'])
x

Unnamed: 0,Age,SibSp,Parch,Fare,Pclass_1,Pclass_2,Pclass_3,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S
0,22.0,1,0,7.2500,0,0,1,0,1,0,0,1
1,38.0,1,0,71.2833,1,0,0,1,0,1,0,0
2,26.0,0,0,7.9250,0,0,1,1,0,0,0,1
3,35.0,1,0,53.1000,1,0,0,1,0,0,0,1
4,35.0,0,0,8.0500,0,0,1,0,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...
886,27.0,0,0,13.0000,0,1,0,0,1,0,0,1
887,19.0,0,0,30.0000,1,0,0,1,0,0,0,1
888,,1,2,23.4500,0,0,1,1,0,0,0,1
889,26.0,0,0,30.0000,1,0,0,0,1,1,0,0


In [13]:
y = pd.get_dummies(y, columns=['Survived'])
y

Unnamed: 0,Survived_0,Survived_1
0,1,0
1,0,1
2,0,1
3,0,1
4,1,0
...,...,...
886,1,0
887,0,1
888,1,0
889,0,1


In [14]:
# nullの数を調べる
x['Age'].isnull().sum()

177

In [15]:
# nullを埋める
x['Age'] = x['Age'].fillna(x['Age'].mean())
# 埋まったか確認
x['Age'].isnull().sum()

0

In [16]:
x['Age'].max()

80.0

In [17]:
x['Age'] = x['Age']/80
x['Age']

0      0.275000
1      0.475000
2      0.325000
3      0.437500
4      0.437500
         ...   
886    0.337500
887    0.237500
888    0.371239
889    0.325000
890    0.400000
Name: Age, Length: 891, dtype: float64

In [18]:
x['Fare'].max()

512.3292

In [19]:
x['Fare'] = x['Fare']/512.3292
x['Fare']

0      0.014151
1      0.139136
2      0.015469
3      0.103644
4      0.015713
         ...   
886    0.025374
887    0.058556
888    0.045771
889    0.058556
890    0.015127
Name: Fare, Length: 891, dtype: float64

In [20]:
# 訓練データとテストデータに分割
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=0)

In [21]:
# ニューラルネットを試す
import tensorflow as tf
from tensorflow import keras
from keras.models import Sequential
from keras.layers import Dense, Activation
from keras import optimizers
#from keras.optimizers import Adam

In [22]:
model = keras.Sequential()
model.add(Dense(20, input_dim=12))
model.add(Activation('sigmoid'))
model.add(Dense(10, activation='sigmoid'))
model.add(Dense(2, activation='softmax'))


In [23]:
model.layers

[<keras.layers.core.dense.Dense at 0x7f9eec113850>,
 <keras.layers.core.activation.Activation at 0x7f9f640ca210>,
 <keras.layers.core.dense.Dense at 0x7f9ee80b6610>,
 <keras.layers.core.dense.Dense at 0x7f9ee80b6e50>]

In [24]:
#keras.optimizers.Adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=None, decay=0.0, amsgrad=False)
adam = keras.optimizers.Adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=None, decay=0.0, amsgrad=False)
#adam = optimizers.Adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=None, decay=0.0, amsgrad=False)
model.compile(optimizer = adam, loss = 'binary_crossentropy', metrics=['accuracy'])

  super(Adam, self).__init__(name, **kwargs)


In [25]:
model.fit(x_train, y_train, batch_size=100, epochs=100, verbose=2)

Epoch 1/100
7/7 - 1s - loss: 0.6882 - accuracy: 0.6138 - 911ms/epoch - 130ms/step
Epoch 2/100
7/7 - 0s - loss: 0.6811 - accuracy: 0.6138 - 24ms/epoch - 3ms/step
Epoch 3/100
7/7 - 0s - loss: 0.6757 - accuracy: 0.6138 - 12ms/epoch - 2ms/step
Epoch 4/100
7/7 - 0s - loss: 0.6719 - accuracy: 0.6138 - 17ms/epoch - 2ms/step
Epoch 5/100
7/7 - 0s - loss: 0.6690 - accuracy: 0.6138 - 14ms/epoch - 2ms/step
Epoch 6/100
7/7 - 0s - loss: 0.6669 - accuracy: 0.6138 - 13ms/epoch - 2ms/step
Epoch 7/100
7/7 - 0s - loss: 0.6653 - accuracy: 0.6138 - 19ms/epoch - 3ms/step
Epoch 8/100
7/7 - 0s - loss: 0.6639 - accuracy: 0.6138 - 14ms/epoch - 2ms/step
Epoch 9/100
7/7 - 0s - loss: 0.6627 - accuracy: 0.6138 - 16ms/epoch - 2ms/step
Epoch 10/100
7/7 - 0s - loss: 0.6616 - accuracy: 0.6138 - 13ms/epoch - 2ms/step
Epoch 11/100
7/7 - 0s - loss: 0.6604 - accuracy: 0.6138 - 15ms/epoch - 2ms/step
Epoch 12/100
7/7 - 0s - loss: 0.6593 - accuracy: 0.6138 - 17ms/epoch - 2ms/step
Epoch 13/100
7/7 - 0s - loss: 0.6581 - accurac

<keras.callbacks.History at 0x7f9ee7ff1610>

In [26]:
# テストデータで評価
model.fit(x_test, y_test, batch_size=100, epochs=100, verbose=2)

Epoch 1/100
3/3 - 0s - loss: 0.4446 - accuracy: 0.8027 - 11ms/epoch - 4ms/step
Epoch 2/100
3/3 - 0s - loss: 0.4440 - accuracy: 0.8027 - 12ms/epoch - 4ms/step
Epoch 3/100
3/3 - 0s - loss: 0.4432 - accuracy: 0.8072 - 9ms/epoch - 3ms/step
Epoch 4/100
3/3 - 0s - loss: 0.4425 - accuracy: 0.8117 - 14ms/epoch - 5ms/step
Epoch 5/100
3/3 - 0s - loss: 0.4418 - accuracy: 0.8072 - 13ms/epoch - 4ms/step
Epoch 6/100
3/3 - 0s - loss: 0.4417 - accuracy: 0.8072 - 11ms/epoch - 4ms/step
Epoch 7/100
3/3 - 0s - loss: 0.4413 - accuracy: 0.8027 - 9ms/epoch - 3ms/step
Epoch 8/100
3/3 - 0s - loss: 0.4408 - accuracy: 0.8027 - 7ms/epoch - 2ms/step
Epoch 9/100
3/3 - 0s - loss: 0.4404 - accuracy: 0.8072 - 10ms/epoch - 3ms/step
Epoch 10/100
3/3 - 0s - loss: 0.4399 - accuracy: 0.8072 - 6ms/epoch - 2ms/step
Epoch 11/100
3/3 - 0s - loss: 0.4395 - accuracy: 0.8072 - 6ms/epoch - 2ms/step
Epoch 12/100
3/3 - 0s - loss: 0.4393 - accuracy: 0.8027 - 9ms/epoch - 3ms/step
Epoch 13/100
3/3 - 0s - loss: 0.4384 - accuracy: 0.8027

<keras.callbacks.History at 0x7f9ee666ba10>

In [27]:
# submit
x_test = test[['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']]
submit = test[['PassengerId']]

x_test = pd.get_dummies(x_test, columns=['Pclass', 'Sex','Embarked'])
x_test

# Fareのnullを埋める
# nullにFareの平均値を埋め込む
x_test['Fare'] = x_test['Fare'].fillna(x_test['Fare'].mean())
# AgeとFareの正規化
x_test['Age'] = x_test['Age']/70
x_train['Fare'] = x_train['Fare']/263

submit[['Survived_0', 'Survived_1']] = model.predict(x_test)
submit

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  del sys.path[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[k] = np.nan
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  from ipykernel import kernelapp as app
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See t

Unnamed: 0,PassengerId,Survived_0,Survived_1
0,892,0.005983,0.994017
1,893,0.004738,0.995262
2,894,0.004666,0.995334
3,895,0.006039,0.993961
4,896,0.004296,0.995704
...,...,...,...
413,1305,,
414,1306,0.004166,0.995834
415,1307,0.008023,0.991977
416,1308,,


In [28]:
# nullを0で埋める
submit[['Survived_0', 'Survived_1']] = submit[['Survived_0', 'Survived_1']].fillna(0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


In [29]:
# 0に置き換えられたか確認
submit

Unnamed: 0,PassengerId,Survived_0,Survived_1
0,892,0.005983,0.994017
1,893,0.004738,0.995262
2,894,0.004666,0.995334
3,895,0.006039,0.993961
4,896,0.004296,0.995704
...,...,...,...
413,1305,0.000000,0.000000
414,1306,0.004166,0.995834
415,1307,0.008023,0.991977
416,1308,0.000000,0.000000


In [30]:
submit['Survived_0'].max()

0.88519686460495

In [31]:
submit['Survived_0'].mean()

0.0064793004972958255

In [32]:
submit['Survived_0'].max()

0.88519686460495

In [33]:
submit['Survived_0'].mean()

0.0064793004972958255

In [34]:
# one hot エンコーディングしたい

#for i in range(len(submit['Survived'])):
 # if submit['Survived'][i] > 0.5:
  #  submit['Survived'][i] = 1
  #elif submit['Survived'][i] <= 0.5:
  #  submit['Survived'][i] = 0

#submit['Survived']
#submit

In [35]:
# csvファイルを作成
submit.to_csv('submission/submit07.csv', index=False)