In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.linear_model import LogisticRegression
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Flatten, Dense
from tensorflow.keras.optimizers import SGD
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

In [2]:
# raw data
train = pd.read_csv('./data/titanic/train.csv')
test = pd.read_csv('./data/titanic/test.csv')
# display(df.head())


test.set_index('PassengerId',inplace=True)

# 필요없는 칼럼 삭제처리
train = train.drop(['PassengerId','Name','Ticket','Fare','Cabin'],
             axis=1,
             inplace=False)
test = test.drop(['Name','Ticket','Fare','Cabin'],
             axis=1,
             inplace=False)

# display(df.head())

# 컬럼을 보고 하나로 합칠 수 있는 칼럼은 하나로 합쳐줘요!
train['Family'] = train['SibSp'] + train['Parch']
train= train.drop(['SibSp','Parch'], axis=1,inplace=False)
display(train.head())

test['Family'] = test['SibSp'] + test['Parch']
test= test.drop(['SibSp','Parch'], axis=1,inplace=False)
display(test.head())

# 결측치 처리부터
train.isnull().sum()

test.isnull().sum()

# 'Embarked' column은 결측치가 2개
# 최빈값을 이용해서 missing value를 채워주자
train['Embarked'] = train['Embarked'].fillna('Q')

test['Embarked'] = test['Embarked'].fillna('Q')

# 'Age' column의 결측치는 평균값으로 대체
train['Age'] = train['Age'].fillna(train['Age'].mean())

test['Age'] = test['Age'].fillna(test['Age'].mean())

train.isnull().sum()
test.isnull().sum()

# 문자로 되어 있는 값을 숫자로 변경
gender_srting={'male' : 0, 'female':1}
train['Sex'] = train['Sex'].map(gender_srting)
test['Sex'] = test['Sex'].map(gender_srting)


embarked_string = {'S':0, 'C':1, 'Q':2}
train['Embarked'] = train['Embarked'].map(embarked_string)
test['Embarked'] = test['Embarked'].map(embarked_string)

def age_category(age):
    if((age >= 0) & (age < 25)):
        return 0
    elif ((age >=25) & (age < 50)):
        return 1
    else:
        return 2

train['Age'] = train['Age'].map(age_category)

test['Age'] = test['Age'].map(age_category)

display(train.head())
display(test.head())

Unnamed: 0,Survived,Pclass,Sex,Age,Embarked,Family
0,0,3,male,22.0,S,1
1,1,1,female,38.0,C,1
2,1,3,female,26.0,S,0
3,1,1,female,35.0,S,1
4,0,3,male,35.0,S,0


Unnamed: 0_level_0,Pclass,Sex,Age,Embarked,Family
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
892,3,male,34.5,Q,0
893,3,female,47.0,S,1
894,2,male,62.0,Q,0
895,3,male,27.0,S,0
896,3,female,22.0,S,2


Unnamed: 0,Survived,Pclass,Sex,Age,Embarked,Family
0,0,3,0,0,0,1
1,1,1,1,1,1,1
2,1,3,1,1,0,0
3,1,1,1,1,0,1
4,0,3,0,1,0,0


Unnamed: 0_level_0,Pclass,Sex,Age,Embarked,Family
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
892,3,0,1,2,0
893,3,1,1,0,1
894,2,0,2,2,0
895,3,0,1,0,0
896,3,1,0,0,2


In [3]:
# data split
train_x_data, test_x_data, train_t_data, test_t_data=\
train_test_split(train.drop('Survived', axis=1, inplace=False),
                 train['Survived'],
                 test_size=0.3,
                 random_state=1,
                 stratify=train['Survived'])

# 정규화
scaler = MinMaxScaler()
scaler.fit(train_x_data)
norm_train_x_data = scaler.transform(train_x_data)
norm_test_x_data = scaler.transform(test_x_data)

# norm_train_x_data.shape
# # train_t_data.shape

norm_test_x = scaler.transform(test)

In [4]:
# tensorflow
# placeholder
X = tf.placeholder(shape=[None,5], dtype=tf.float32)
T = tf.placeholder(shape=[None,1], dtype=tf.float32)

# Weight & bias
W = tf.Variable(tf.random.normal([5,1]))
b = tf.Variable(tf.random.normal([1]))

# Hypothesis, model
logit = tf.matmul(X,W) + b
H = tf.sigmoid(logit)

# loss function
loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=logit,
                                                              labels=T))
# train
train = tf.train.GradientDescentOptimizer(learning_rate=1e-4).minimize(loss)

# Session 초기화
sess = tf.Session()
sess.run(tf.global_variables_initializer())

# 반복학습
for step in range(300000):
    _, loss_val = sess.run([train, loss], feed_dict={X: norm_train_x_data,
                                                     T: train_t_data.values.reshape(-1,1)})
    if step % 30000 == 0:
        print('loss value : {}'.format(loss_val))


Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where



loss value : 0.699329674243927
loss value : 0.5518949627876282
loss value : 0.5232254862785339
loss value : 0.5081645250320435
loss value : 0.4972035586833954
loss value : 0.48872965574264526
loss value : 0.482054740190506
loss value : 0.47672247886657715
loss value : 0.47242993116378784
loss value : 0.4689275324344635


In [5]:
# 정확도 측정
# validation data 사용

predict = tf.cast(H>=0.5, dtype=tf.float32)
correct = tf.equal(predict, T)
accuracy = tf.reduce_mean(tf.cast(correct, dtype=tf.float32))
accuracy_val = sess.run(accuracy, feed_dict={X:norm_test_x_data,
                                             T:test_t_data.values.reshape(-1,1)})
print('Accuracy : {}'.format(accuracy_val))

Accuracy : 0.8208954930305481


In [None]:
# predict

result = sess.run(H, feed_dict={X:norm_test_x})
# display(result)

submit = tf.cast(result >= 0.5, dtype=tf.int32 )
predict = sess.run(submit)

result_df = pd.DataFrame(predict)
result_df['PassengerId'] = test.index
result_df.rename(columns={0:'Survived'},inplace=True)
# display(result_df)

result_df = result_df[['PassengerId','Survived']]
result_df.set_index('PassengerId',inplace=True)
display(result_df)
result_df.to_csv('./data/survive_submission.csv')