In [210]:
import os

In [211]:
import pandas as pd
import numpy as np
import tensorflow as tf
from IPython.display import clear_output

In [212]:
# Strings that should be read as missing values in both CSV files.
missing_tokens = ['', '?', '-']

# Training split
train = pd.read_csv("titanic_data/train.csv", na_values=missing_tokens)
# Testing split
test = pd.read_csv("titanic_data/test.csv", na_values=missing_tokens)

# Preview the first rows of the training frame.
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


###  Cleaning and filling data


In [213]:
# Training data
# Impute every missing cell from a neighbouring row: forward-fill first, then
# back-fill so gaps in the leading rows (which have nothing above them) are
# covered as well.
# DataFrame.ffill()/bfill() are the direct replacements for
# fillna(method='ffill'/'bfill'), which recent pandas releases deprecate.
train = train.ffill().bfill()

# Per-column count of remaining nulls (expected to be all zeros).
train.isnull().sum()

PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Cabin          0
Embarked       0
dtype: int64

In [215]:
# Testing data
# Impute every missing cell from a neighbouring row: back-fill first, then
# forward-fill so gaps in the trailing rows (which have nothing below them)
# are covered as well.
# DataFrame.bfill()/ffill() are the direct replacements for
# fillna(method='bfill'/'ffill'), which recent pandas releases deprecate.
# NOTE(review): the training frame is filled in the opposite order
# (ffill then bfill); the order here is kept as-is so predictions do not
# change - confirm whether the two should be aligned.
test = test.bfill().ffill()

# Per-column count of remaining nulls (expected to be all zeros).
test.isnull().sum()

PassengerId    0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Cabin          0
Embarked       0
dtype: int64

In [216]:
# Separate the target column from the feature frame.
# `pop` removes 'Survived' from `train` in place, so without the guard a
# second run of this cell would raise KeyError because the column is gone.
if 'Survived' in train.columns:
    labels = train.pop('Survived')

# Preview the feature frame, now without the target column.
train.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,C85,S
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,C85,S
3,4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,C123,S


In [217]:
# Columns fed to the model, grouped by how they are encoded.
# NOTE(review): "Name", "Ticket" and "PassengerId" look like near-unique
# identifiers; a vocabulary built from the training rows is unlikely to match
# unseen rows - confirm they are meant to be model inputs.
CATEGORICAL_COLUMN = ["Name", "Sex", "Ticket", "Cabin", "Embarked"]
NUMERICAL_COLUMN = ["PassengerId", "Pclass", "Age", "SibSp", "Parch", "Fare"]
features_column = []

for feature_name in CATEGORICAL_COLUMN:
    # The vocabulary is the set of distinct values in the training frame.
    # It is read from `train`, the frame this notebook actually defines
    # (`x_train` is not defined by any cell, so it failed on a fresh kernel).
    vocab = train[feature_name].unique()
    features_column.append(
        tf.feature_column.categorical_column_with_vocabulary_list(feature_name, vocab)
    )

for feature_name in NUMERICAL_COLUMN:
    features_column.append(
        tf.feature_column.numeric_column(feature_name, dtype=tf.float32)
    )

# Tabular view of the column definitions, for inspection only.
s = pd.DataFrame(features_column)
s.head()

Unnamed: 0,key,vocabulary_list,dtype,default_value,num_oov_buckets
0,Name,"(Braund, Mr. Owen Harris, Cumings, Mrs. John B...",<dtype: 'string'>,-1,0.0
1,Sex,"(male, female)",<dtype: 'string'>,-1,0.0
2,Ticket,"(A/5 21171, PC 17599, STON/O2. 3101282, 113803...",<dtype: 'string'>,-1,0.0
3,Cabin,"(C85, C123, E46, G6, C103, D56, A6, C23 C25 C2...",<dtype: 'string'>,-1,0.0
4,Embarked,"(S, C, Q)",<dtype: 'string'>,-1,0.0


In [218]:
# Input function factory
def making_data_ready(data_df, label_df, num_epochs=10, shuffle=True, batch_size=32):
    """Build a zero-argument input function for a TensorFlow estimator.

    Args:
        data_df: Feature table; converted with ``dict(data_df)`` so each key
            maps to one feature's values.
        label_df: Labels paired element-wise with the rows of ``data_df``.
        num_epochs: How many times the batched dataset is repeated.
        shuffle: When true, shuffle the examples with a buffer of 1000.
        batch_size: Number of examples per batch.

    Returns:
        A callable taking no arguments that returns the ``tf.data.Dataset``.
    """
    def input_function():
        # Pair the feature dict with its labels, one dataset element per row.
        features = dict(data_df)
        dataset = tf.data.Dataset.from_tensor_slices((features, label_df))
        # Shuffling is optional and happens before batching.
        dataset = dataset.shuffle(1000) if shuffle else dataset
        # Batch first, then repeat the batched stream num_epochs times.
        return dataset.batch(batch_size).repeat(num_epochs)

    return input_function

# Build the training input function from the feature frame and its labels,
# using the factory's defaults (10 epochs, shuffling on, batches of 32).
train_eval = making_data_ready(data_df=train, label_df=labels)
# No evaluation input function is built here: the `x_test` / `y_test` names a
# previous draft referred to are not defined anywhere in this notebook.

In [219]:
# Creating Model

# Linear classifier estimator wired to the feature columns assembled earlier.
linear_est = tf.estimator.LinearClassifier(
    feature_columns=features_column,
)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': 'C:\\Users\\india\\AppData\\Local\\Temp\\tmplo8h3rm8', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_experimental_max_worker_delay_secs': None, '_session_creation_timeout_secs': 7200, '_checkpoint_save_graph_def': True, '_service': None, '_cluster_spec': ClusterSpec({}), '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


In [231]:
# Training
# Fit the estimator on everything the training input function yields.
# The recorded log shows parameters being restored from an earlier checkpoint,
# so re-running this cell appears to continue training rather than restart it.
linear_est.train(input_fn=train_eval)

INFO:tensorflow:Calling model_fn.


  self.bias = self.add_variable(


INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from C:\Users\india\AppData\Local\Temp\tmplo8h3rm8\model.ckpt-560
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Calling checkpoint listeners before saving checkpoint 560...
INFO:tensorflow:Saving checkpoints for 560 into C:\Users\india\AppData\Local\Temp\tmplo8h3rm8\model.ckpt.
INFO:tensorflow:Calling checkpoint listeners after saving checkpoint 560...
INFO:tensorflow:loss = 0.44913316, step = 560
INFO:tensorflow:global_step/sec: 176.958
INFO:tensorflow:loss = 0.94337827, step = 660 (0.565 sec)
INFO:tensorflow:global_step/sec: 415.53
INFO:tensorflow:loss = 0.27886498, step = 760 (0.249 sec)
INFO:tensorflow:Calling checkpoint listeners before saving checkpoint 840...
INFO:tensorflow:Saving checkpoints for 840 into C:\Users\india\AppData\Local\Temp\tmplo8h3rm8\model.ckpt.
INFO:tens

<tensorflow_estimator.python.estimator.canned.linear.LinearClassifierV2 at 0x293f26af0a0>

In [232]:
def predict_labels(features, batch_size=256):
    """Wrap a feature table in a batched, label-free ``tf.data.Dataset``.

    Args:
        features: Feature table; converted with ``dict(features)``.
        batch_size: Number of rows per batch.

    Returns:
        The batched dataset, containing features only (no labels).
    """
    feature_tensors = dict(features)
    unbatched = tf.data.Dataset.from_tensor_slices(feature_tensors)
    return unbatched.batch(batch_size)
def predict_label():
    """Zero-argument input function that feeds the `test` frame to the estimator."""
    return predict_labels(test)


# `predict` returns the per-row outputs lazily; building the DataFrame consumes them.
predictions = linear_est.predict(input_fn=predict_label)
predict = pd.DataFrame(predictions)
predict

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from C:\Users\india\AppData\Local\Temp\tmplo8h3rm8\model.ckpt-840
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.


Unnamed: 0,logits,logistic,probabilities,class_ids,classes,all_class_ids,all_classes
0,[-2.0936434],[0.10971615],"[0.89028376, 0.10971618]",[0],[b'0'],"[0, 1]","[b'0', b'1']"
1,[0.9243085],[0.71591914],"[0.2840808, 0.71591914]",[1],[b'1'],"[0, 1]","[b'0', b'1']"
2,[-2.1936042],[0.10032627],"[0.8996737, 0.1003263]",[0],[b'0'],"[0, 1]","[b'0', b'1']"
3,[-1.8963635],[0.13052061],"[0.86947936, 0.13052061]",[0],[b'0'],"[0, 1]","[b'0', b'1']"
4,[1.5607119],[0.8264555],"[0.17354453, 0.8264555]",[1],[b'1'],"[0, 1]","[b'0', b'1']"
...,...,...,...,...,...,...,...
413,[-1.727992],[0.15084457],"[0.84915537, 0.1508446]",[0],[b'0'],"[0, 1]","[b'0', b'1']"
414,[4.520496],[0.9892336],"[0.010766448, 0.9892336]",[1],[b'1'],"[0, 1]","[b'0', b'1']"
415,[-1.7229767],[0.15148813],"[0.8485119, 0.15148814]",[0],[b'0'],"[0, 1]","[b'0', b'1']"
416,[-1.7122114],[0.15287712],"[0.84712285, 0.1528771]",[0],[b'0'],"[0, 1]","[b'0', b'1']"


In [233]:
# Cut-off applied to the class-0 probability (the first entry of each row's
# `probabilities`): at or above it the row is labelled 0, otherwise 1.
class0_cutoff = 0.504

# One pass over the column instead of building a row Series per index with
# `predict.loc[i]`; the resulting list is identical, in the same row order.
survive = [
    0 if probs[0] >= class0_cutoff else 1
    for probs in predict['probabilities']
]

# Pair each test PassengerId with its predicted label.
gender_submission = pd.DataFrame({
    "PassengerId": test['PassengerId'],
    "Survived": survive,
})
gender_submission.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1


In [234]:
# Load the existing submission file as a template and overwrite its 'Survived'
# column with the predicted labels.
# NOTE(review): this relies on the template rows being in the same order as
# `test` - confirm.
submission = pd.read_csv("titanic_data/gender_submission.csv").assign(Survived=survive)

# Write the result without the DataFrame index column.
submission.to_csv('submission.csv', index=False)