# Loading Training Data

In [1]:
import tensorflow as tf
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

  from ._conv import register_converters as _register_converters


In [2]:
# Read the training data from train.csv in the current working directory.
train = pd.read_csv('train.csv')

In [3]:
# Preview the first rows to check the available columns.
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
# Discard the columns that are not used anywhere in the analysis below.
train = train.drop(columns=['Name', 'Ticket', 'PassengerId'])

In [5]:
# Add some more useful columns
def IsChild(x):
    """Return 1 when the age ``x`` is below 18, otherwise 0.

    A missing age (NaN) compares False against 18 and therefore yields 0.
    """
    if x < 18:
        return 1
    return 0
# 1 for passengers younger than 18, 0 otherwise (rows with a missing Age get 0).
train['Child'] = train['Age'].map(IsChild)
# Boolean indicator that is True where Sex equals 'female'.
train['Female'] = train['Sex'] == 'female'

# Analyze Correlation

In [6]:
# Pairwise correlation of the numeric and boolean columns only.
# The frame still contains text columns (Sex, Cabin, Embarked). Older pandas
# skipped them silently inside corr(), while newer pandas raises on them, so
# the numeric/bool columns are selected explicitly to get the same table on both.
train.select_dtypes(include=['number', 'bool']).corr()

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare,Child,Female
Survived,1.0,-0.338481,-0.077221,-0.035322,0.081629,0.257307,0.122239,0.543351
Pclass,-0.338481,1.0,-0.369226,0.083081,0.018443,-0.5495,0.12562,-0.1319
Age,-0.077221,-0.369226,1.0,-0.308247,-0.189119,0.096067,-0.617063,-0.093254
SibSp,-0.035322,0.083081,-0.308247,1.0,0.414838,0.159651,0.324068,0.114631
Parch,0.081629,0.018443,-0.189119,0.414838,1.0,0.216225,0.31767,0.245489
Fare,0.257307,-0.5495,0.096067,0.159651,0.216225,1.0,-0.007546,0.182333
Child,0.122239,0.12562,-0.617063,0.324068,0.31767,-0.007546,1.0,0.10715
Female,0.543351,-0.1319,-0.093254,0.114631,0.245489,0.182333,0.10715,1.0


Fare, Child, Female and Pclass have the highest correlation (in absolute value) with Survived. Intuitively, women and children are more likely to survive. Fare and Pclass probably play an important role as well (at least according to screen adaptations :) )

In [7]:
# Five most frequent Cabin values; remove .head() to inspect the full distribution.
train['Cabin'].value_counts().head()

B96 B98        4
G6             4
C23 C25 C27    4
F2             3
D              3
Name: Cabin, dtype: int64

Cabin does not seem to be a useful feature at all: even the most frequent values occur only 4 times.

# Prepare features

In [8]:
# Count the missing entries in every column.
train.isna().sum()

Survived      0
Pclass        0
Sex           0
Age         177
SibSp         0
Parch         0
Fare          0
Cabin       687
Embarked      2
Child         0
Female        0
dtype: int64

In [9]:
# Impute the missing ages with the median age of the training data.
train['Age'] = train['Age'].fillna(value=train['Age'].median())

In [10]:
# Min-max scale Fare into the [0, 1] range.
# NOTE: this cell is not idempotent - running it again recomputes the bounds
# from the already-scaled column.
fare_min = train.Fare.min()
fare_max = train.Fare.max()
train.Fare = (train.Fare - fare_min) / (fare_max - fare_min)

In [11]:
# Age is bucketized at 18 / 30 / 50 / 70, which yields five age buckets.
Age = tf.feature_column.bucketized_column(
    source_column=tf.feature_column.numeric_column(key='Age'),
    boundaries=[18, 30, 50, 70],
)
# Passenger class is treated as a category rather than as an ordinal number.
Pclass = tf.feature_column.categorical_column_with_vocabulary_list(
    key='Pclass',
    vocabulary_list=[1, 2, 3],
)
# Fare is fed in as a plain numeric value (already scaled to [0, 1] on the frame).
Fare = tf.feature_column.numeric_column(key='Fare')
Sex = tf.feature_column.categorical_column_with_vocabulary_list(
    key='Sex',
    vocabulary_list=['male', 'female'],
)

In [12]:
# Feature columns handed to the estimator; the names match the DataFrame columns used as inputs.
feat_cols = [Age, Pclass, Fare, Sex]

# Linear classifier model

In [13]:
# Canned linear classifier built on the feature columns in feat_cols.
model = tf.estimator.LinearClassifier(feature_columns=feat_cols)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': 'C:\\Users\\trule\\AppData\\Local\\Temp\\5\\tmpdst_kew5', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': None, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x0000017A463308D0>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


In [14]:
# Model inputs: exactly the columns referenced by the feature columns.
X_data = train.loc[:, ['Age', 'Pclass', 'Fare', 'Sex']]
# Target: the Survived column.
y_label = train.loc[:, 'Survived']

In [15]:
from sklearn.model_selection import train_test_split

In [16]:
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X_data, y_label, test_size=0.2, random_state=42)

In [17]:
# Training input: shuffled batches repeated indefinitely (num_epochs=None),
# so the `steps` argument of train() decides when training stops.
input_fn = tf.estimator.inputs.pandas_input_fn(
    x=X_train,
    y=y_train,
    num_epochs=None,
    shuffle=True,
)

In [18]:
# Fit the classifier for 1000 training steps.
model.train(input_fn=input_fn, steps=1000)

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Saving checkpoints for 0 into C:\Users\trule\AppData\Local\Temp\5\tmpdst_kew5\model.ckpt.
INFO:tensorflow:loss = 88.72288, step = 1
INFO:tensorflow:global_step/sec: 278.292
INFO:tensorflow:loss = 56.873978, step = 101 (0.375 sec)
INFO:tensorflow:global_step/sec: 457.138
INFO:tensorflow:loss = 46.571846, step = 201 (0.219 sec)
INFO:tensorflow:global_step/sec: 426.665
INFO:tensorflow:loss = 60.553932, step = 301 (0.219 sec)
INFO:tensorflow:global_step/sec: 457.14
INFO:tensorflow:loss = 54.280693, step = 401 (0.219 sec)
INFO:tensorflow:global_step/sec: 457.14
INFO:tensorflow:loss = 65.35539, step = 501 (0.219 sec)
INFO:tensorflow:global_step/sec: 457.139
INFO:tensorflow:loss = 54.025497, step = 601 (0.234 sec)
INFO:tensorflow:global_step/se

<tensorflow.python.estimator.canned.linear.LinearClassifier at 0x17a463307f0>

In [45]:
# List every variable stored in the trained model (weights, bias and optimizer slots).
model.get_variable_names()

['global_step',
 'linear/linear_model/Age_bucketized/weights',
 'linear/linear_model/Age_bucketized/weights/part_0/Ftrl',
 'linear/linear_model/Age_bucketized/weights/part_0/Ftrl_1',
 'linear/linear_model/Fare/weights',
 'linear/linear_model/Fare/weights/part_0/Ftrl',
 'linear/linear_model/Fare/weights/part_0/Ftrl_1',
 'linear/linear_model/Pclass/weights',
 'linear/linear_model/Pclass/weights/part_0/Ftrl',
 'linear/linear_model/Pclass/weights/part_0/Ftrl_1',
 'linear/linear_model/Sex/weights',
 'linear/linear_model/Sex/weights/part_0/Ftrl',
 'linear/linear_model/Sex/weights/part_0/Ftrl_1',
 'linear/linear_model/bias_weights',
 'linear/linear_model/bias_weights/part_0/Ftrl',
 'linear/linear_model/bias_weights/part_0/Ftrl_1']

In [48]:
# Print (name, value) pairs for the variables whose name ends in 'weights',
# i.e. the per-feature weights and the bias, skipping the Ftrl slot variables.
print([
    (var_name, model.get_variable_value(var_name))
    for var_name in model.get_variable_names()
    if var_name.endswith('weights')
])

[('linear/linear_model/Age_bucketized/weights', array([[ 0.63349855],
       [-0.23712228],
       [-0.21339013],
       [-0.6173166 ],
       [-0.47678962]], dtype=float32)), ('linear/linear_model/Fare/weights', array([[0.8734556]], dtype=float32)), ('linear/linear_model/Pclass/weights', array([[ 1.0835161 ],
       [ 0.46494257],
       [-0.7967654 ]], dtype=float32)), ('linear/linear_model/Sex/weights', array([[-1.170805 ],
       [ 1.4122323]], dtype=float32)), ('linear/linear_model/bias_weights', array([-0.2481249], dtype=float32))]


1. Age: The weight for the first bucket (<18) is the highest --> children are more likely to survive.
2. Sex: The male and female weights clearly show that females are more likely to survive.

# Evaluate

In [19]:
# Evaluation input: the whole held-out split as a single batch, in its original
# order so that predictions line up with y_test.
input_fn_test = tf.estimator.inputs.pandas_input_fn(
    x=X_test,
    batch_size=len(X_test),
    shuffle=False,
)

In [20]:
# Set up prediction over the held-out split; results are consumed in the next cell.
pred_fn = model.predict(input_fn=input_fn_test)

In [21]:
# Consume pred_fn into a list with one prediction dict per row.
predictions = list(pred_fn)

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from C:\Users\trule\AppData\Local\Temp\5\tmpdst_kew5\model.ckpt-1000
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.


In [22]:
# Keep only the predicted class id (first element of 'class_ids') for each row.
final_preds = [row_pred['class_ids'][0] for row_pred in predictions]

In [23]:
from sklearn.metrics import classification_report

In [24]:
# Per-class precision, recall and F1 on the held-out split.
print(classification_report(y_test, final_preds))

             precision    recall  f1-score   support

          0       0.81      0.84      0.82       105
          1       0.76      0.72      0.74        74

avg / total       0.79      0.79      0.79       179



# Load submission data

In [25]:
test = pd.read_csv('test.csv')
# Apply the *training* preprocessing to the submission data. The model's Fare
# weight was learned on fares scaled with the training bounds, so the same
# fare_min / fare_max must be reused here instead of the submission set's own
# min/max. This assumes the Fare-scaling cell above ran exactly once, so the
# bounds still describe the raw training fares.
test.Fare = (test.Fare - fare_min) / (fare_max - fare_min)
# Guard against missing fares in the submission file: a NaN feature would make
# the prediction for that row meaningless. train.Fare is already scaled at this
# point, so its median is on the same scale as test.Fare.
test.Fare = test.Fare.fillna(value=train.Fare.median())
# Impute missing ages with the training median, mirroring how train.Age was filled.
test.Age = test.Age.fillna(value=train.Age.median())

In [26]:
# Preview the preprocessed submission data (Fare is scaled, Age is imputed).
test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,0.015282,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,0.013663,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,0.018909,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,0.016908,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,0.023984,,S


In [27]:
# Same input columns, in the same order, as were used for training.
X_test_final = test.loc[:, ['Age', 'Pclass', 'Fare', 'Sex']]

In [28]:
# Submission input: every row in one batch, unshuffled so that the predictions
# stay in the same order as the rows of the submission data.
input_fn_test_final = tf.estimator.inputs.pandas_input_fn(
    x=X_test_final,
    batch_size=len(X_test_final),
    shuffle=False,
)

In [29]:
# Set up prediction over the submission data (rebinds pred_fn from the evaluation step).
pred_fn = model.predict(input_fn=input_fn_test_final)

In [30]:
# Consume pred_fn into a list with one prediction dict per submission row.
predictions = list(pred_fn)

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from C:\Users\trule\AppData\Local\Temp\5\tmpdst_kew5\model.ckpt-1000
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.


In [31]:
# Keep only the predicted class id (first element of 'class_ids') for each row.
final_preds = [row_pred['class_ids'][0] for row_pred in predictions]

In [32]:
# Start an empty frame that will hold the submission columns.
pd_preds = pd.DataFrame()

In [33]:
# Take the ids straight from the submission data instead of hard-coding the
# range 892..1309, so they always line up row-for-row with the predictions.
# .values drops the index so the column is assigned purely by position.
pd_preds['PassengerId'] = test['PassengerId'].values

In [34]:
# Attach the predicted class ids; they are in the same row order as the submission data.
pd_preds['Survived'] = final_preds

In [35]:
# Write the submission file without the DataFrame index column.
pd_preds.to_csv('final_preds.csv', index=False)

In [36]:
# Sanity-check the first rows of the submission frame.
pd_preds.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1
