In [14]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import tensorflow as tf
from scipy import stats
from sklearn.preprocessing import MinMaxScaler

# Load the raw training data
df = pd.read_csv('./data/titanic/train.csv')
# `train` is a second name for the same DataFrame object (no copy is made),
# so every in-place step below is visible through `df` as well.
train = df

# Drop the columns that are not used as features
train.drop(['PassengerId', 'Name', 'Ticket', 'Fare', 'Cabin'], axis=1, inplace=True)
display(train)

# Encode the sex strings as numbers
sex_mapping = { 'male' : 0, 'female' : 1 }
train['Sex'] = train['Sex'].map(sex_mapping)

# Family: merge SibSp and Parch into a single column
train['Family'] = train['SibSp'] + train['Parch']
train.drop(['SibSp', 'Parch'], axis=1, inplace=True)

# Count the missing values per column
print(train.isnull().sum())

# Fill missing values: 'S' for Embarked, the column mean for Age
train['Embarked'] = train['Embarked'].fillna('S')
train['Age'] = train['Age'].fillna(train['Age'].mean())

# Encode the port of embarkation as a number
emb_mapping = { 'S' : 0, 'C' : 1, 'Q' : 2 }
train['Embarked'] = train['Embarked'].map(emb_mapping)

# Bin Age (numerical value -> categorical code):
#   [-inf, 8) -> 0, [8, 20) -> 1, [20, 65) -> 2, [65, inf) -> 3
# A single pd.cut call replaces the chain of in-place overwrites, which was only
# correct because every code (0-3) happened to be below the lowest threshold and
# therefore could not be picked up again by a later condition.
age_bin_edges = [float('-inf'), 8, 20, 65, float('inf')]
train['Age'] = pd.cut(train['Age'], bins=age_bin_edges,
                      right=False, labels=False).astype('float64')

display(train)

# Training data set: every column except the label as features, the label as a column vector
x_data = train.drop('Survived', axis=1, inplace=False).values
t_data = train['Survived'].values.reshape(-1,1)

# Logistic regression built with the TensorFlow 1.x graph API.
# The feature count is taken from the data instead of being hard-coded, so the
# placeholder and the weight matrix always match the columns of x_data.
n_features = x_data.shape[1]
X = tf.placeholder(shape=[None, n_features], dtype=tf.float32) # independent variables (features)
T = tf.placeholder(shape=[None,1], dtype=tf.float32) # dependent variable (label)

# Weight & bias
# Op-level seeds make the random initial values repeatable from run to run.
W = tf.Variable(tf.random.normal([n_features, 1], seed=1), name='weight')
b = tf.Variable(tf.random.normal([1], seed=2), name='bias')

# hypothesis
logit = tf.matmul(X,W) + b
H = tf.sigmoid(logit)

# loss function
loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=logit, labels=T))

# train
# NOTE(review): this rebinds the name `train`, which held the preprocessed
# DataFrame earlier in this cell; after this line that frame is reachable only
# through `df`. The name is kept so existing references to the op keep working.
train = tf.train.GradientDescentOptimizer(learning_rate=1e-3).minimize(loss)

# Create the session and initialise the variables
sess = tf.Session()
sess.run(tf.global_variables_initializer())

# Training: full-batch gradient descent, logging every 3000 steps
for step in range(30000):
    _, W_val, b_val, loss_val = sess.run([train, W, b, loss], 
                                        feed_dict={X:x_data, T:t_data})
    if step % 3000 == 0:
        print('W:{}, b:{}, loss:{}'.format(W_val,b_val,loss_val))

        
# Accuracy: threshold the sigmoid output at 0.5 and compare it with the labels
predict = tf.cast(tf.greater_equal(H, 0.5), tf.float32)
correct = tf.equal(predict, T)
accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))

# Evaluate on x_data / t_data
eval_feed = {X: x_data, T: t_data}
acc_val = sess.run(accuracy, feed_dict=eval_feed)
print('Accuracy : ', acc_val)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Embarked
0,0,3,male,22.0,1,0,S
1,1,1,female,38.0,1,0,C
2,1,3,female,26.0,0,0,S
3,1,1,female,35.0,1,0,S
4,0,3,male,35.0,0,0,S
...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,S
887,1,1,female,19.0,0,0,S
888,0,3,female,,1,2,S
889,1,1,male,26.0,0,0,C


Survived      0
Pclass        0
Sex           0
Age         177
Embarked      2
Family        0
dtype: int64


Unnamed: 0,Survived,Pclass,Sex,Age,Embarked,Family
0,0,3,0,2.0,0,1
1,1,1,1,2.0,1,1
2,1,3,1,2.0,0,0
3,1,1,1,2.0,0,1
4,0,3,0,2.0,0,0
...,...,...,...,...,...,...
886,0,2,0,2.0,0,0
887,1,1,1,1.0,0,0
888,0,3,1,2.0,0,3
889,1,1,0,2.0,1,0


Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
W:[[ 0.47835204]
 [ 1.2296634 ]
 [ 1.8161106 ]
 [ 1.2276794 ]
 [-1.332235  ]], b:[0.04648732], loss:2.7665464878082275
W:[[-0.7446479 ]
 [ 1.3801322 ]
 [ 0.66900355]
 [ 0.9427656 ]
 [-0.83403087]], b:[-0.40443042], loss:0.6664590239524841
W:[[-0.7356385]
 [ 1.5896393]
 [ 0.4659204]
 [ 0.8085331]
 [-0.3146602]], b:[-0.3724084], loss:0.5393332242965698
W:[[-0.7207163 ]
 [ 1.7348475 ]
 [ 0.31553876]
 [ 0.7045607 ]
 [-0.07557084]], b:[-0.34185705], loss:0.49971848726272583
W:[[-0.7009016 ]
 [ 1.8475946 ]
 [ 0.2197643 ]
 [ 0.6302577 ]
 [-0.01226858]], b:[-0.30350482], loss:0.4882500171661377
W:[[-0.68097365]
 [ 1.943531  ]
 [ 0.15473871]
 [ 0.57457304]
 [-0.0076914 ]], b:[-0.25980508], loss:0.48191753029823303
W:[[-0.6657208 ]
 [ 2.0273385 ]
 [ 0.10495625]
 [ 0.530568  ]
 [-0.01792898]], b:[-0.21444467], loss:0.4772951006889343
W:[[-0.655686  ]
 [ 2.1011844 ]
 [ 0.06416263]
 [ 0.49492273]
 [-0.0307

In [22]:
# Load the raw test data and preview it
test_data = pd.read_csv('./data/titanic/test.csv')
display(test_data.head())

# Drop the columns that are not used as features
test_data.drop(['PassengerId', 'Name', 'Ticket', 'Fare', 'Cabin'], axis=1, inplace=True)

# Encode the sex strings as numbers
sex_mapping = { 'male' : 0, 'female' : 1 }
test_data['Sex'] = test_data['Sex'].map(sex_mapping)

# Family: merge SibSp and Parch into a single column
test_data['Family'] = test_data['SibSp'] + test_data['Parch']
test_data.drop(['SibSp', 'Parch'], axis=1, inplace=True)

# Count the missing values per column
print(test_data.isnull().sum())

# Fill missing values.
# Embarked gets the same 'S' fill that the training cell applies, so a missing
# port cannot turn into NaN when it is mapped to a number below.
test_data['Embarked'] = test_data['Embarked'].fillna('S')
# NOTE(review): Age is filled with the mean of the test file itself, while the
# training cell uses the mean of the training file - confirm this is intended.
test_data['Age'] = test_data['Age'].fillna(test_data['Age'].mean())

# Encode the port of embarkation as a number
emb_mapping = { 'S' : 0, 'C' : 1, 'Q' : 2 }
test_data['Embarked'] = test_data['Embarked'].map(emb_mapping)

# Bin Age (numerical value -> categorical code):
#   [-inf, 8) -> 0, [8, 20) -> 1, [20, 65) -> 2, [65, inf) -> 3
# A single pd.cut call replaces the chain of in-place overwrites, which was only
# correct because every code (0-3) happened to be below the lowest threshold and
# therefore could not be picked up again by a later condition.
age_bin_edges = [float('-inf'), 8, 20, 65, float('inf')]
test_data['Age'] = pd.cut(test_data['Age'], bins=age_bin_edges,
                          right=False, labels=False).astype('float64')

display(test_data.head())

# NOTE(review): this cell stops after preprocessing - no feature matrix is
# built from test_data here and no prediction is run in this cell.


Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


Pclass       0
Sex          0
Age         86
Embarked     0
Family       0
dtype: int64


Unnamed: 0,Pclass,Sex,Age,Embarked,Family
0,3,0,2.0,2,0
1,3,1,2.0,0,1
2,2,0,2.0,2,0
3,3,0,2.0,0,0
4,3,1,2.0,0,2
