In [1]:
import pandas as pd
import tensorflow as tf
import numpy as np
from sklearn.preprocessing import StandardScaler


train = pd.read_csv('input/train.csv')
test = pd.read_csv('input/test.csv')

In [2]:
train_test_data = [train, test] # combining train and test dataset
title_mapping = {"Mr": 0, "Miss": 1, "Mrs": 2, 
                 "Master": 3, "Dr": 3, "Rev": 3, "Col": 3, "Major": 3, "Mlle": 3,"Countess": 3,
                 "Ms": 3, "Lady": 3, "Jonkheer": 3, "Don": 3, "Dona" : 3, "Mme": 3,"Capt": 3,"Sir": 3 }




# 1. Title 처리
for dataset in train_test_data:
    dataset['Title'] = dataset['Name'].str.extract(' ([A-Za-z]+)\.', expand=False)
    
    # Title 단순화
    dataset['Title'] =dataset['Title'].apply(lambda x: x if x in [ 'Mr', 'Miss', 'Mrs'] else 'etc')
    
    # Age Missing Data 처리
    dataset["Age"].fillna(dataset.groupby("Title")["Age"].transform("median"), inplace=True)
    
    # Fare Missing을 Pclass 중간값으로
    dataset["Fare"].fillna(dataset.groupby("Pclass")["Fare"].transform("median"), inplace=True)
    
    # Embarked Missing을 가장 많은 S로
    dataset['Embarked'] = dataset['Embarked'].fillna('S')
    
    # Cabin 정보를 첫번째 alphabet만
    dataset['Cabin'] = dataset['Cabin'].str[:1]
    # test data에 T class가 없기 때문에 이렇게 해야, get_dummies가 잘 작동
    dataset['Cabin'] = dataset['Cabin'].astype('category',categories=["A", "B", "C", "D", "E", "F", "G", "T"])
    
    
    # Fare Missing을 Pclass 빈도가 가장 많은 data로
    dataset['Cabin'] = dataset.groupby('Pclass').Cabin.transform(lambda x: x.fillna(x.mode()[0]))
    
    # 가족 data합치기
    dataset["FamilySize"] = dataset["SibSp"] + dataset["Parch"] + 1
    
    # 불필요한 자료 제거
    dataset.drop( ['Name','Ticket', 'SibSp', 'Parch'], axis=1, inplace=True)

 
train = train.drop(['PassengerId'], axis=1)    

# one hot encoding
train = pd.get_dummies(train,columns=['Title', 'Sex','Pclass','Cabin','Embarked'])
test = pd.get_dummies(test,columns=['Title', 'Sex','Pclass','Cabin','Embarked'])





train_data = train.drop('Survived', axis=1)
target_data = train['Survived']



  exec(code_obj, self.user_global_ns, self.user_ns)


In [3]:
train_data.head(7)

Unnamed: 0,Age,Fare,FamilySize,Title_Miss,Title_Mr,Title_Mrs,Title_etc,Sex_female,Sex_male,Pclass_1,...,Cabin_B,Cabin_C,Cabin_D,Cabin_E,Cabin_F,Cabin_G,Cabin_T,Embarked_C,Embarked_Q,Embarked_S
0,22.0,7.25,2,0,1,0,0,0,1,0,...,0,0,0,0,1,0,0,0,0,1
1,38.0,71.2833,2,0,0,1,0,1,0,1,...,0,1,0,0,0,0,0,1,0,0
2,26.0,7.925,1,1,0,0,0,1,0,0,...,0,0,0,0,1,0,0,0,0,1
3,35.0,53.1,2,0,0,1,0,1,0,1,...,0,1,0,0,0,0,0,0,0,1
4,35.0,8.05,1,0,1,0,0,0,1,0,...,0,0,0,0,1,0,0,0,0,1
5,30.0,8.4583,1,0,1,0,0,0,1,0,...,0,0,0,0,1,0,0,0,1,0
6,54.0,51.8625,1,0,1,0,0,0,1,1,...,0,0,0,1,0,0,0,0,0,1


In [4]:
inputs = train_data.values
targets = target_data.values.reshape(-1,1)


scaler = StandardScaler()
inputs_temp = scaler.fit_transform(inputs[:,:3] )
inputs=np.concatenate([inputs_temp,inputs[:,3:]],axis=1)
print(inputs.shape, targets.shape)

(891, 23) (891, 1)


In [179]:
# class MyTitanic():
#     def __init__(self,is_training=True,name=None):
#         self.name=name
#         self.is_training = is_training
        
#         self.build()
#     def build(self):
#         with tf.variable_scope(self.name):
#             self.X = tf.placeholder(tf.float32, shape=[None, 23])
#             self.Y = tf.placeholder(tf.float32, shape=[None,1])
        
            
#             x = tf.layers.dense(self.X,units=256,activation=tf.nn.relu) 
#             x = tf.layers.dropout(x,0.5,self.is_training)
#             x = tf.layers.dense(self.X,units=256,activation=tf.nn.relu)  
#             x = tf.layers.dropout(x,0.8,self.is_training)
#             x = tf.layers.dense(x,units=10,activation=tf.nn.relu)  
#             x = tf.layers.dense(x,units=10,activation=tf.nn.relu) 
#             logits = tf.layers.dense(x,units=1,activation=None)
           
#             self.predict = (tf.nn.sigmoid(logits) >=0.5)
#             self.loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(labels=self.Y,logits=logits))
            
#             self.optimizer = tf.train.AdamOptimizer(learning_rate=1e-2).minimize(self.loss)
            

# [32, dropout 0.5, 3, 1]  ==> test acc = 76.79
# [32, dropout 0.5, 3, 1]  epoch 5000 ==> test acc = 77.51
class MyTitanic():
    def __init__(self,is_training=True,name=None):
        self.name=name
        self.is_training = is_training
        
        self.build()
    def build(self):
        with tf.variable_scope(self.name):
            self.X = tf.placeholder(tf.float32, shape=[None, 23])
            self.Y = tf.placeholder(tf.float32, shape=[None,1])
        
            
            x = tf.layers.dense(self.X,units=32,activation=tf.nn.relu) 
            x = tf.layers.dropout(x,0.5,self.is_training)
            x = tf.layers.dense(x,units=3,activation=tf.nn.relu) 
            logits = tf.layers.dense(x,units=1,activation=None)
           
            self.predict = (tf.nn.sigmoid(logits) >=0.5)
            self.loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(labels=self.Y,logits=logits))
            
            self.optimizer = tf.train.AdamOptimizer(learning_rate=1e-2).minimize(self.loss)

In [186]:
tf.reset_default_graph()
tf.set_random_seed(1234)


with tf.variable_scope('model') as scope:
    model = MyTitanic(is_training=True,name="titanic")
    
with tf.variable_scope('model',reuse=True) as scope:
    model_test = MyTitanic(is_training=False,name="titanic")

    
sess = tf.Session()
sess.run(tf.global_variables_initializer())


for i in range(5000):
    sess.run(model.optimizer,feed_dict={model.X: inputs,model.Y: targets})
    if i% 100 ==0:
        loss = sess.run(model.loss,feed_dict={model.X: inputs,model.Y: targets})
        print('step: {}, loss = {:.4f}'.format(i, loss))
        
        
predict = sess.run(model.predict,feed_dict={model.X: inputs,model.Y: targets}).astype(np.int32)
acc = np.mean(1*(predict==targets))

print('train acc: ', acc)

step: 0, loss = 0.6214
step: 100, loss = 0.3193
step: 200, loss = 0.2602
step: 300, loss = 0.2400
step: 400, loss = 0.2309
step: 500, loss = 0.2222
step: 600, loss = 0.2180
step: 700, loss = 0.2150
step: 800, loss = 0.2085
step: 900, loss = 0.2047
step: 1000, loss = 0.2027
step: 1100, loss = 0.2023
step: 1200, loss = 0.1991
step: 1300, loss = 0.1966
step: 1400, loss = 0.1941
step: 1500, loss = 0.1917
step: 1600, loss = 0.1912
step: 1700, loss = 0.1909
step: 1800, loss = 0.1876
step: 1900, loss = 0.1868
step: 2000, loss = 0.1862
step: 2100, loss = 0.1847
step: 2200, loss = 0.1840
step: 2300, loss = 0.1840
step: 2400, loss = 0.1838
step: 2500, loss = 0.1822
step: 2600, loss = 0.1802
step: 2700, loss = 0.1819
step: 2800, loss = 0.1793
step: 2900, loss = 0.1795
step: 3000, loss = 0.1809
step: 3100, loss = 0.1782
step: 3200, loss = 0.1869
step: 3300, loss = 0.1806
step: 3400, loss = 0.1784
step: 3500, loss = 0.1767
step: 3600, loss = 0.1758
step: 3700, loss = 0.1728
step: 3800, loss = 0.173

In [187]:
test_temp = test.drop(['PassengerId'], axis=1).values
test_temp = np.concatenate([scaler.transform( test_temp[:, :3]),test_temp[:,3:]],axis=1)

test_predict = sess.run(model_test.predict,feed_dict={model_test.X: test_temp}).astype(np.int32).reshape(-1)

In [188]:
submission = pd.DataFrame({
        "PassengerId": test["PassengerId"],
        "Survived": test_predict
    })

submission.to_csv('submission.csv', index=False)