#### 更新版本v1.1到v1.2

1. 特征标准化
2. 尝试其他优化器

In [1]:
import os

# Make the data folder the working directory so the relative CSV file names
# used by later cells resolve against it.
# NOTE(review): this is a machine-specific absolute path.
os.chdir(r'G:\pycharm-workspace\2018ATEC\data')

In [2]:
import math

from IPython import display
from matplotlib import cm
from matplotlib import gridspec
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
from sklearn import metrics
import tensorflow as tf
from tensorflow.python.data import Dataset

# Only emit TensorFlow log messages at ERROR level.
tf.logging.set_verbosity(tf.logging.ERROR)
# pandas display settings: at most 10 rows, up to 350 columns, and floats
# rendered with four decimal places.
pd.options.display.max_rows = 10
pd.set_option('display.max_columns', 350)
pd.options.display.float_format = '{:.4f}'.format

# Load the training CSV (file name is relative to the working directory).
anti_fraud_dataframe = pd.read_csv("atec_anti_fraud_train.csv")
# Keep only rows whose label is not -1
# (presumably -1 marks unlabeled samples — TODO confirm).
anti_fraud_dataframe = anti_fraud_dataframe[anti_fraud_dataframe.label != -1]
# Shuffle the row order by reindexing with a random permutation of the index.
# No random seed is set here, so the order can differ between runs.
anti_fraud_dataframe = anti_fraud_dataframe.reindex(
    np.random.permutation(anti_fraud_dataframe.index))
# Bare expression: shows the frame as the notebook cell's output.
anti_fraud_dataframe

  from ._conv import register_converters as _register_converters


MemoryError: 

In [None]:
def preprocess_features(anti_fraud_dataframe):
    """Pick the model's input columns and fill in missing values.

    Args:
        anti_fraud_dataframe: A pandas `DataFrame` containing at least the
            columns `f28`, `f29`, `f30`, `f31`, `f52`, `f53`, `f111` and
            `f112`.

    Returns:
        A new `DataFrame` with only those columns, in that order, where every
        missing value has been replaced by -1. The input frame is left
        untouched.
    """
    feature_names = [
        "f28",
        "f29",
        "f30",
        "f31",
        "f52",
        "f53",
        "f111",
        "f112",
    ]
    # Columns that are currently left out of the selection:
    # f113, f114, f115, f116, f185, f259, f260, f261, f270, f271

    # Selecting with a list of labels yields a new frame, and fillna returns
    # another new frame, so the caller's data is never modified.
    return anti_fraud_dataframe[feature_names].fillna(-1)

def preprocess_targets(anti_fraud_dataframe):
    """Extract the target column into its own frame.

    Args:
        anti_fraud_dataframe: A pandas `DataFrame` with a `label` column.

    Returns:
        A `DataFrame` with the single column `label`, indexed like the input.
    """
    return pd.DataFrame({"label": anti_fraud_dataframe["label"]})

# Show the selected feature columns and the target column of the full frame.
display.display(preprocess_features(anti_fraud_dataframe))
display.display(preprocess_targets(anti_fraud_dataframe))

In [None]:
def construct_feature_columns(input_features):
    """Create one numeric feature column per input feature.

    Args:
        input_features: An iterable of feature names. Iterating a pandas
            `DataFrame` yields its column labels, so a frame can be passed
            directly.

    Returns:
        A set of `tf.feature_column.numeric_column` objects, one per name.
    """
    return {
        tf.feature_column.numeric_column(feature_name)
        for feature_name in input_features
    }

In [None]:
def my_input_fn(features, targets, batch_size=1, shuffle=True, num_epochs=None):
    """Build the `(features, labels)` pair that feeds an Estimator.

    Args:
        features: Mapping-like object of feature name -> column values
            (callers in this file pass a pandas `DataFrame`).
        targets: Target values, one per row of `features`.
        batch_size: Number of examples per batch.
        shuffle: Whether to shuffle the examples (buffer of 10000 elements).
        num_epochs: Number of passes over the data; `None` is forwarded to
            `Dataset.repeat`, which then repeats indefinitely.

    Returns:
        Tuple `(features, labels)` taken from the next element of a one-shot
        iterator over the dataset.
    """
    # Turn every column into a plain NumPy array keyed by feature name.
    features = {key: np.array(value) for key, value in dict(features).items()}

    ds = Dataset.from_tensor_slices((features, targets))

    # Shuffle BEFORE batching so that individual examples are shuffled.
    # Shuffling after `batch().repeat()` only reorders whole batches (their
    # contents stay fixed) and fills the buffer with batches instead of rows.
    if shuffle:
        ds = ds.shuffle(10000)

    ds = ds.batch(batch_size).repeat(num_epochs)

    features, labels = ds.make_one_shot_iterator().get_next()
    return features, labels

In [None]:
def train_nn_regression_model(
    my_optimizer,
    steps,
    batch_size,
    hidden_units,
    training_examples,
    training_targets,
    validation_examples,
    validation_targets):
    """Train a `DNNRegressor` in ten periods, tracking RMSE after each one.

    After every period the model predicts on the full training and validation
    sets and the root mean squared error of both is recorded. Progress is
    printed and the two RMSE curves are plotted with matplotlib.

    Args:
        my_optimizer: Optimizer instance; it is wrapped with gradient clipping
            by norm (5.0) before use.
        steps: Total number of training steps, split evenly over the periods.
        batch_size: Batch size used for training.
        hidden_units: Hidden layer sizes passed to `DNNRegressor`.
        training_examples: `DataFrame` of training features.
        training_targets: `DataFrame` with a `label` column for training.
        validation_examples: `DataFrame` of validation features.
        validation_targets: `DataFrame` with a `label` column for validation.

    Returns:
        Tuple `(dnn_regressor, training_rmse, validation_rmse)`: the trained
        estimator and the per-period RMSE lists.
    """
    periods = 10
    # `/` is true division under Python 3, so this may be a float.
    steps_per_period = steps / periods

    clipped_optimizer = tf.contrib.estimator.clip_gradients_by_norm(my_optimizer, 5.0)
    dnn_regressor = tf.estimator.DNNRegressor(
        feature_columns=construct_feature_columns(training_examples),
        hidden_units=hidden_units,
        optimizer=clipped_optimizer
    )

    def training_input_fn():
        # Shuffled, endlessly repeating batches for training.
        return my_input_fn(training_examples,
                           training_targets["label"],
                           batch_size=batch_size)

    def predict_training_input_fn():
        # Single ordered pass over the training data for evaluation.
        return my_input_fn(training_examples,
                           training_targets["label"],
                           num_epochs=1,
                           shuffle=False)

    def predict_validation_input_fn():
        # Single ordered pass over the validation data for evaluation.
        return my_input_fn(validation_examples,
                           validation_targets["label"],
                           num_epochs=1,
                           shuffle=False)

    def _predict(input_fn):
        # Collect the first prediction value of every example into an array.
        return np.array([item['predictions'][0]
                         for item in dnn_regressor.predict(input_fn=input_fn)])

    print("Training model...")
    print("RMSE (on training data):")
    training_rmse = []
    validation_rmse = []
    for period in range(periods):
        dnn_regressor.train(
            input_fn=training_input_fn,
            steps=steps_per_period
        )

        training_predictions = _predict(predict_training_input_fn)
        validation_predictions = _predict(predict_validation_input_fn)

        training_rmse.append(math.sqrt(
            metrics.mean_squared_error(training_predictions, training_targets)))
        validation_rmse.append(math.sqrt(
            metrics.mean_squared_error(validation_predictions, validation_targets)))
        # Only the training error is reported while training runs.
        print("  period %02d : %0.2f" % (period, training_rmse[-1]))
    print("Model training finished.")

    # Plot both loss curves over the periods.
    plt.ylabel("RMSE")
    plt.xlabel("Periods")
    plt.title("Root Mean Squared Error vs. Periods")
    plt.tight_layout()
    plt.plot(training_rmse, label="training")
    plt.plot(validation_rmse, label="validation")
    plt.legend()

    print("Final RMSE (on training data):   %0.2f" % training_rmse[-1])
    print("Final RMSE (on validation data): %0.2f" % validation_rmse[-1])

    return dnn_regressor, training_rmse, validation_rmse

In [None]:
def linear_scale(series):
    """Linearly rescale a series so its minimum maps to -1 and maximum to 1.

    Args:
        series: A numeric pandas `Series`.

    Returns:
        A new `Series` with the same index. If all values are equal (zero
        range) every non-missing value maps to 0.0, the midpoint of the
        target interval, instead of producing NaN from a division by zero.
    """
    min_val = series.min()
    max_val = series.max()
    half_range = (max_val - min_val) / 2.0
    shifted = series - min_val
    if half_range == 0:
        # Constant column: `shifted` is all zeros; multiplying by 0.0 gives a
        # float result and keeps missing values missing.
        return shifted * 0.0
    # Vectorized arithmetic instead of a per-element Python lambda.
    return shifted / half_range - 1.0

def log_normalize(series):
    """Apply `log(x + 1)` to every element of a series.

    Args:
        series: A numeric pandas `Series`.

    Returns:
        A new `Series` holding the natural logarithm of each value plus one.
    """
    def _log_of_one_plus(value):
        return math.log(value + 1.0)

    return series.apply(_log_of_one_plus)

def clip(series, clip_to_min, clip_to_max):
    """Limit every element of a series to a lower and an upper bound.

    Args:
        series: A numeric pandas `Series`.
        clip_to_min: Values below this are raised to it.
        clip_to_max: Values above this are lowered to it.

    Returns:
        A new `Series` with each value bounded element by element.
    """
    def _bounded(value):
        raised = max(value, clip_to_min)
        return min(raised, clip_to_max)

    return series.apply(_bounded)

def z_score_normalize(series):
    """Standardize a series by subtracting its mean and dividing by its std.

    Args:
        series: A numeric pandas `Series`.

    Returns:
        A new `Series` with the same index. If the standard deviation is
        exactly zero (all values equal) the centered values, which are all
        zero, are returned instead of NaN from a division by zero.
    """
    mean = series.mean()
    std_dv = series.std()
    centered = series - mean
    if std_dv == 0:
        # Constant column: every deviation from the mean is already 0.
        return centered
    # Vectorized arithmetic instead of a per-element Python lambda.
    return centered / std_dv

def binary_threshold(series, threshold):
    """Turn a series into 0/1 flags by comparing against a threshold.

    Args:
        series: A numeric pandas `Series`.
        threshold: Values strictly greater than this map to 1.

    Returns:
        A new `Series` with 1 where the value exceeds `threshold`, else 0.
    """
    def _flag(value):
        if value > threshold:
            return 1
        return 0

    return series.apply(_flag)

In [None]:
def normalize(examples_dataframe):
    """Return a copy of the features with `f111` and `f112` rescaled.

    Args:
        examples_dataframe: A pandas `DataFrame` with columns `f111` and
            `f112`; other columns are carried over unchanged.

    Returns:
        A new `DataFrame` where the two columns have been passed through
        `linear_scale`. The input frame is not modified.
    """
    scaled = examples_dataframe.copy()
    for column in ("f111", "f112"):
        scaled[column] = linear_scale(examples_dataframe[column])
    return scaled

# Select and scale the features, then split by position: the first 696312 rows
# become the training set and the last 298419 rows the validation set.
# NOTE(review): the split sizes are hard-coded; the two parts only partition
# the data without overlap or gap if the filtered frame has exactly
# 696312 + 298419 rows — confirm against the actual data.
# NOTE(review): scaling happens before the split, so the min/max used for
# f111/f112 are computed over training and validation rows together.
normalized_dataframe = normalize(preprocess_features(anti_fraud_dataframe))
normalized_training_examples = normalized_dataframe.head(696312)
normalized_validation_examples = normalized_dataframe.tail(298419)

# Targets are taken from the same positional slices of the (shuffled) frame,
# so they line up row by row with the examples above.
training_targets = preprocess_targets(anti_fraud_dataframe.head(696312))
validation_targets = preprocess_targets(anti_fraud_dataframe.tail(298419))

# Print summary statistics of each split as a sanity check.
print("Training examples summary:")
display.display(normalized_training_examples.describe())
print("Validation examples summary:")
display.display(normalized_validation_examples.describe())

print("Training targets summary:")
display.display(training_targets.describe())
print("Validation targets summary:")
display.display(validation_targets.describe())

In [None]:
# Run 1: Adagrad optimizer (learning rate 0.5), 500 steps, batch size 100,
# hidden_units=[10, 10]. The per-period RMSE lists are kept for the optimizer
# comparison plot, and `dnn_regressor` is the model used later for the test
# predictions.
dnn_regressor, adagrad_training_losses, adagrad_validation_losses = train_nn_regression_model(
    my_optimizer=tf.train.AdagradOptimizer(learning_rate=0.5),
    steps=500,
    batch_size=100,
    hidden_units=[10, 10],
    training_examples=normalized_training_examples,
    training_targets=training_targets,
    validation_examples=normalized_validation_examples,
    validation_targets=validation_targets)

In [None]:
# Run 2: Adam optimizer (learning rate 0.009) with the same steps, batch size
# and hidden_units as the Adagrad run; RMSE lists are kept for comparison.
dnn_regressor1, adam_training_losses, adam_validation_losses = train_nn_regression_model(
    my_optimizer=tf.train.AdamOptimizer(learning_rate=0.009),
    steps=500,
    batch_size=100,
    hidden_units=[10, 10],
    training_examples=normalized_training_examples,
    training_targets=training_targets,
    validation_examples=normalized_validation_examples,
    validation_targets=validation_targets)

In [None]:
# Run 3: plain gradient descent (learning rate 0.005). Unlike the other two
# runs this one uses 5000 steps and a batch size of 50.
dnn_regressor2, gradientdescent_training_losses, gradientdescent_validation_losses = train_nn_regression_model(
    my_optimizer=tf.train.GradientDescentOptimizer(learning_rate=0.005),
    steps=5000,
    batch_size=50,
    hidden_units=[10, 10],
    training_examples=normalized_training_examples,
    training_targets=training_targets,
    validation_examples=normalized_validation_examples,
    validation_targets=validation_targets)

In [None]:
# Compare the three optimizers by plotting their per-period validation RMSE
# on one chart.
plt.ylabel("RMSE")
plt.xlabel("Periods")
plt.title("Root Mean Squared Error vs. Periods")
plt.plot(adagrad_validation_losses, label='Adagrad validation')
plt.plot(adam_validation_losses, label='Adam validation')
plt.plot(gradientdescent_validation_losses, label='GradientDescent validation')
# Assign the return value so the legend object is not echoed as cell output.
_ = plt.legend()

In [None]:
# Load the test CSV (file name is relative to the working directory) and show
# its summary statistics.
anti_fraud_dataframe_test_data = pd.read_csv("atec_anti_fraud_test_a.csv")
display.display(anti_fraud_dataframe_test_data.describe())

# Add a constant placeholder label column: my_input_fn requires a targets
# argument even when it is only used for prediction.
anti_fraud_dataframe_test_data['label'] = 0.0
# NOTE(review): normalize() rescales f111/f112 with the test file's own
# min/max, not the values computed on the training data — confirm intended.
test_examples = normalize(preprocess_features(anti_fraud_dataframe_test_data))
print("Test examples summary:")
display.display(test_examples.describe())

test_targets = pd.DataFrame()
test_targets["label"] = (anti_fraud_dataframe_test_data["label"])
display.display(test_targets.describe())

# Single ordered pass over the test rows so predictions keep the file's order.
predict_testing_input_fn = lambda: my_input_fn(test_examples, 
                                               test_targets["label"], 
                                               num_epochs=1, 
                                               shuffle=False)

In [None]:
# Predict on the test rows with `dnn_regressor` (the model returned by the
# Adagrad training run) and collect the first prediction value of every row.
test_predictions = dnn_regressor.predict(input_fn=predict_testing_input_fn)
test_predictions = np.array([item['predictions'][0] for item in test_predictions])
display.display(test_predictions, len(test_predictions))

# Wrap the predictions in a frame with a single `score` column.
predictions = pd.DataFrame()
predictions['score'] = pd.Series(test_predictions)
# Bare expression: shows the score column as the notebook cell's output.
predictions['score']

In [None]:
# Put the test ids and the predicted scores side by side (column-wise concat).
result = pd.concat([anti_fraud_dataframe_test_data["id"], predictions['score']], axis=1)
# Summary statistics before clamping.
result.describe()

# Replace negative scores with 0, then summarize again.
result.loc[result["score"]<0, "score"] = 0
result.describe()

In [None]:
# Write the id/score pairs to submission.csv in the working directory,
# without the index column.
result[["id", "score"]].to_csv('submission.csv', index=False)