In [None]:
# Greeting printed by the first cell of the notebook.
banner = "Make good code today my friend!!"
print(banner)

In [None]:
from __future__ import print_function
import math
from IPython import display
from matplotlib import cm
from matplotlib import gridspec
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
from sklearn import metrics
import tensorflow as tf
from tensorflow.python.data import Dataset

# Only surface TensorFlow log records at ERROR level.
tf.logging.set_verbosity(tf.logging.ERROR)

# Load the train and test tables from disk.
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in ("./dataSet/train.csv", "./dataSet/test.csv")
)

# Keep independent copies of the frames exactly as they were read.
train_data_copy, test_data_copy = train_data.copy(), test_data.copy()


In [None]:
# Impute missing ages with the mean age of the same table.
train_data['Age'] = train_data['Age'].fillna(train_data['Age'].mean())
test_data['Age'] = test_data['Age'].fillna(test_data['Age'].mean())

# Encode Sex numerically: male -> 1, female -> 0.
mapping = {'male': 1, 'female': 0}
train_data, test_data = (
    frame.replace({'Sex': mapping}) for frame in (train_data, test_data)
)


def _add_indicators(frame, source_column, indicators):
    """Add one boolean column per (name, category) pair.

    Each new column is True where `frame[source_column]` equals the category.
    The frame is modified in place.
    """
    for column_name, category in indicators:
        frame[column_name] = frame[source_column] == category


# One-hot encoding of passenger class and port of embarkation.
_pclass_indicators = (('Pclass_0', 1), ('Pclass_1', 2), ('Pclass_2', 3))
_embarked_indicators = (('Embarked_0', 'S'), ('Embarked_1', 'C'), ('Embarked_2', 'Q'))

_add_indicators(train_data, 'Pclass', _pclass_indicators)
_add_indicators(test_data, 'Pclass', _pclass_indicators)
_add_indicators(train_data, 'Embarked', _embarked_indicators)
_add_indicators(test_data, 'Embarked', _embarked_indicators)


# display.display(train_data.head(10))
# display.display(test_data.head(10))
# display.display(train_data.describe())
# display.display(test_data.describe())

# TODO: drop unused columns (nothing is dropped in this cell yet)

In [None]:
# MODEL 1: frames that carry only the Sex feature (plus the label for training).
tmp = dict(Sex=train_data['Sex'], Survived=train_data['Survived'])
sex_train_example = pd.DataFrame(tmp)
sex_test_example = pd.DataFrame({'Sex': test_data['Sex']})
# sex_test_example

In [None]:
def get_quantile_based_boundaries(feature_values, num_buckets):
    """Return bucket boundaries placed at evenly spaced quantiles.

    Args:
      feature_values: pandas Series of numeric values.
      num_buckets: number of buckets wanted; the quantiles 1/num_buckets,
        2/num_buckets, ..., (num_buckets - 1)/num_buckets are computed.

    Returns:
      A sorted list of distinct floats. It can hold fewer than
      num_buckets - 1 entries when several quantiles coincide.
    """
    boundaries = np.arange(1.0, num_buckets) / num_buckets
    quantiles = feature_values.quantile(boundaries)
    # Heavily tied data makes neighbouring quantiles equal, and
    # tf.feature_column.bucketized_column rejects boundaries that are not
    # strictly increasing, so duplicates (and NaN) are removed here.
    return sorted(quantiles.dropna().unique().tolist())

def construct_feature_columns(input_features, bin_age):
    """Build the set of TensorFlow feature columns for the model.

    Args:
      input_features: iterable of feature names; each name becomes a
        numeric column.
      bin_age: when truthy, also add a bucketized "Age" column whose
        boundaries are quartiles of the global `train_data["Age"]`.

    Returns:
      A set of feature columns.
    """
    columns = set()
    for feature_name in input_features:
        columns.add(tf.feature_column.numeric_column(feature_name))

    if not bin_age:
        return columns

    age_column = tf.feature_column.numeric_column("Age")
    age_boundaries = get_quantile_based_boundaries(train_data["Age"], 4)
    columns.add(
        tf.feature_column.bucketized_column(age_column, boundaries=age_boundaries))
    return columns

In [None]:
# train input function
def train(train_example, learning_rate, batch_size, steps, periods, model_dir, bin_age=False):
    """Train a linear classifier on an 80/20 split and plot validation RMSE.

    Args:
      train_example: DataFrame with the feature columns plus a 'Survived'
        label column.
      learning_rate: learning rate for the gradient-descent optimizer.
      batch_size: batch size used by every input function.
      steps: training steps run in each period.
      periods: number of train-then-score rounds; one RMSE value is
        recorded per period.
      model_dir: base directory; estimators write to
        '<model_dir>/validation' and '<model_dir>/training'.
      bin_age: forwarded to construct_feature_columns.

    Returns:
      The LinearClassifier that was trained on the 80% sample.
    """
    train_set = train_example.sample(frac=0.8, replace=False, random_state=42)
    # Hold out every row that was not sampled. drop() keeps the original row
    # order and avoids indexing .loc with a set, which newer pandas rejects.
    cv_set = train_example.drop(train_set.index)
    feature_columns = construct_feature_columns(train_example.drop('Survived', axis=1), bin_age)

    def _make_input_fn(frame, num_epochs, shuffle):
        """Build a pandas_input_fn over `frame` with 'Survived' as the label."""
        return tf.estimator.inputs.pandas_input_fn(
            x=frame.drop('Survived', axis=1),
            y=frame.Survived,
            num_epochs=num_epochs,
            shuffle=shuffle,
            batch_size=batch_size,
        )

    # num_epochs=None repeats the data indefinitely; `steps` bounds training.
    train_input_fn = _make_input_fn(train_set, num_epochs=None, shuffle=True)
    cv_input_fn_tmp = _make_input_fn(cv_set, num_epochs=None, shuffle=False)
    # Single pass over the hold-out rows, used only for scoring.
    cv_input_fn = _make_input_fn(cv_set, num_epochs=1, shuffle=False)

    my_optimizer = tf.train.GradientDescentOptimizer(learning_rate=learning_rate)
    my_optimizer = tf.contrib.estimator.clip_gradients_by_norm(my_optimizer, 5.0)
    estimator = tf.estimator.LinearClassifier(
        feature_columns=feature_columns,
        optimizer=my_optimizer,
        model_dir=model_dir + '/validation',
    )
    # NOTE(review): estimator2 is fitted on the hold-out rows and never
    # evaluated or returned; it is kept only so '<model_dir>/training' is
    # still written. Confirm whether it is still wanted.
    estimator2 = tf.estimator.LinearClassifier(
        feature_columns=feature_columns,
        optimizer=my_optimizer,
        model_dir=model_dir + '/training',
    )

    validation_rmse = []
    for i in range(periods):
        # Train for one period.
        print("%d period:" % i, end=' ')
        estimator.train(input_fn=train_input_fn, steps=steps)
        estimator2.train(input_fn=cv_input_fn_tmp, steps=steps)

        # Score the hold-out rows: RMSE between predicted class ids and labels.
        validation_predictions = np.array(
            [item['class_ids'][0] for item in estimator.predict(input_fn=cv_input_fn)])
        rmse = math.sqrt(
            metrics.mean_squared_error(cv_set.Survived, validation_predictions))

        print("Validation RMSE: {0:f}, ".format(rmse))
        validation_rmse.append(rmse)

    # Plot RMSE per period; tight_layout runs last so it sees the final artists.
    plt.ylabel("RMSE")
    plt.xlabel("Periods")
    plt.title("RMSE vs. Periods")
    plt.plot(validation_rmse, label="validation")
    plt.legend()
    plt.tight_layout()
    return estimator

In [None]:
# MODEL 1: fit the Sex-only classifier.
estimator = train(
    sex_train_example,
    learning_rate=0.01,
    batch_size=5,
    steps=500,
    periods=10,
    model_dir='./tensorboard/model1_v4',
)

In [None]:
# Reference labels that predictions are scored against.
final_data = pd.read_csv(filepath_or_buffer="./dataSet/gender_submission.csv")

def calculate_accuracy(estimator, model_test, labels=None):
    """Return the fraction of predictions that match the reference labels.

    Args:
      estimator: object whose predict(input_fn=...) yields dicts with a
        'class_ids' entry; the first element of that entry is the
        predicted class.
      model_test: input function handed to estimator.predict.
      labels: optional sequence of expected classes. Defaults to the
        'Survived' column of the global `final_data`.

    Returns:
      Accuracy as a float in [0, 1].

    Raises:
      ValueError: if the number of predictions differs from the number of
        labels.
    """
    if labels is None:
        labels = final_data['Survived']

    predicted = np.array(
        [prediction['class_ids'][0]
         for prediction in estimator.predict(input_fn=model_test)])
    expected = np.asarray(labels)
    if predicted.shape != expected.shape:
        raise ValueError(
            "Lengths must match: got %d predictions for %d labels"
            % (predicted.size, expected.size))

    # mean() of the boolean matches is the accuracy; unlike sum() / count()
    # it cannot be truncated by integer division under Python 2.
    return (predicted == expected).mean()

In [None]:
def test_input(test_example):
    """Wrap `test_example` in a label-free input function for prediction.

    The returned input function makes a single, unshuffled pass over the
    rows of `test_example`.
    """
    return tf.estimator.inputs.pandas_input_fn(
        x=test_example,
        shuffle=False,
        num_epochs=1,  # only to predict
    )

# Score the Sex-only model on the test rows.
model_input = test_input(test_example=sex_test_example)
predict = calculate_accuracy(estimator=estimator, model_test=model_input)

print("predict = {}".format(predict))


In [None]:
# Predict a class for every test row.
predictions = list(estimator.predict(input_fn=model_input))
predicted_classes = [prediction['class_ids'][0] for prediction in predictions]

# Accuracy against the reference labels. mean() of the boolean matches avoids
# sum() / count(), which is integer division under Python 2.
gender_submission = pd.read_csv("./dataSet/gender_submission.csv")
final_accuracy = pd.Series(predicted_classes == gender_submission['Survived']).mean()
print(final_accuracy)

# Pair each PassengerId from the untouched test copy with its prediction and
# write the result to disk.
evaluation = test_data_copy[['PassengerId']].copy()
evaluation["Survived"] = predicted_classes
evaluation.to_csv("evaluation_submission.csv", index=False)
evaluation