In [9]:
import pandas as pd
import tensorflow as tf
from keras import Sequential
from keras.callbacks import ModelCheckpoint, EarlyStopping
from keras.layers import Activation
from keras.layers import Dense, np
from keras.optimizers import RMSprop
from keras.utils import to_categorical
from sklearn.ensemble import IsolationForest
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.utils import shuffle

# CONSTANTS
HIDDEN_UNITS = [64, 32, 32, 16]
LABEL = 'AdoptionSpeed'
TRAINING_TEST_SPLIT = 0.33
RANDOM_NUMBER_SEED = 42
N_CLASSES = 5
EPOCHS = 100
TRAIN_BATCH_SIZE = 10
TRAIN_FILENAME = 'weights.best.hdf5'

np.random.seed(RANDOM_NUMBER_SEED)


def prepare_data(data):
    pet_id = data.PetID

    # Remove unused features
    data.drop(['RescuerID', 'Description', 'PetID', 'State'], axis=1, inplace=True)

    # Apply binning to ages
    data['Age'] = pd.cut(data['Age'], [-1, 2, 3, 6, 255], labels=[0, 1, 2, 3])

    # Apply binning to fee
    data['Fee'] = pd.cut(data['Fee'], [-1, 50, 100, 200, 3000], labels=[0, 1, 2, 3])

    # Apply binning to photo amount
    data['PhotoAmt'] = pd.cut(data['PhotoAmt'], [-1, 1, 5, 10, 100], labels=[0, 1, 2, 3])

    # Apply binning to video amount
    data['VideoAmt'] = pd.cut(data['VideoAmt'], [-1, 1, 100], labels=[0, 1])

    # Replace names with 1 is present, 0 if not present
    data.loc[data['Name'].notnull(), 'Name'] = 1
    data.loc[data['Name'].isnull(), 'Name'] = 0

    # Fill missing continuous data
    data_continuous = data.select_dtypes(exclude=['object'])
    data_continuous.fillna(0, inplace=True)

    # Fill missing string data
    data_categorical = data.select_dtypes(include=['object'])
    data_categorical.fillna('NONE', inplace=True)

    final_data = data_continuous.merge(data_categorical, left_index=True, right_index=True)

    return final_data, data_categorical, data_continuous, pet_id, data.shape[1]


def create_mlp(input_dim, output_dim, arch=None):
    # Default mlp architecture
    model = Sequential([
        Dense(64, input_dim=19),
        Activation('relu'),
        Dense(32),
        Activation('relu'),
        Dense(32),
        Activation('relu'),
        Dense(16),
        Activation('relu'),
        Dense(5),
        Activation('sigmoid'),
    ])

    rmsprop = RMSprop(lr=0.001, rho=0.9, epsilon=1e-08, decay=0.0)

    model.compile(optimizer=rmsprop,
                  loss='mse',
                  metrics=['accuracy'])

    return model


if __name__ == '__main__':
    # Import and split
    train, train_categorical, train_continuous, train_pet_id, training_dimension = prepare_data(
        pd.read_csv('../all/train.csv'))
    test, test_categorical, test_continuous, test_pet_id, _ = prepare_data(pd.read_csv('../all/test/test.csv'))

    # Remove the outliers
    clf = IsolationForest(max_samples=100, random_state=RANDOM_NUMBER_SEED)
    clf.fit(train_continuous)
    y_no_outliers = clf.predict(train_continuous)
    y_no_outliers = pd.DataFrame(y_no_outliers, columns=['Top'])

    train_continuous = train_continuous.iloc[y_no_outliers[y_no_outliers['Top'] == 1].index.values]
    train_continuous.reset_index(drop=True, inplace=True)

    train_categorical = train_categorical.iloc[y_no_outliers[y_no_outliers['Top'] == 1].index.values]
    train_categorical.reset_index(drop=True, inplace=True)

    train = train.iloc[y_no_outliers[y_no_outliers['Top'] == 1].index.values]
    train.reset_index(drop=True, inplace=True)

    # Extract columns
    columns = list(train_continuous.columns)
    columns_minus_label = columns[:-1]

    features_continuous = list(train_continuous.columns)
    features_continuous.remove(LABEL)

    features_categorical = list(train_categorical.columns)

    # Extract matrices
    matrix_train = np.matrix(train_continuous)
    matrix_test = np.matrix(test_continuous)
    matrix_test_no_label = np.matrix(train_continuous.drop(LABEL, axis=1))
    matrix_train_y = np.array(train[LABEL])

    # Scale data
    train_scaler = MinMaxScaler()
    train_scaler.fit(matrix_train)

    test_scaler = MinMaxScaler()
    test_scaler.fit(matrix_test_no_label)

    train[columns] = pd.DataFrame(train_scaler.transform(matrix_train), columns=columns)
    train[LABEL] = matrix_train_y
    test[features_continuous] = pd.DataFrame(test_scaler.transform(matrix_test), columns=features_continuous)

    # Extract continuous and categorical features
    engineered_features = []

    for continuous_feature in features_continuous:
        engineered_features.append(tf.contrib.layers.real_valued_column(continuous_feature))

    for categorical_feature in features_categorical:
        sparse_column = tf.contrib.layers.sparse_column_with_hash_bucket(categorical_feature, hash_bucket_size=1000)

        engineered_features.append(tf.contrib.layers.embedding_column(sparse_id_column=sparse_column,
                                                                      dimension=16,
                                                                      combiner='sum'))

    # Split training set data between train and test
    x_train, x_test, y_train, y_test = train_test_split(train[features_continuous + features_categorical],
                                                        train[LABEL],
                                                        test_size=TRAINING_TEST_SPLIT,
                                                        random_state=RANDOM_NUMBER_SEED)
    # Convert back to DataFrame
    y_train = pd.DataFrame(y_train, columns=[LABEL])
    x_train = pd.DataFrame(x_train, columns=features_continuous + features_categorical) \
        .merge(y_train, left_index=True, right_index=True)

    y_test = pd.DataFrame(y_test, columns=[LABEL])
    x_test = pd.DataFrame(x_test, columns=features_continuous + features_categorical) \
        .merge(y_test, left_index=True, right_index=True)

    # Labels must be one-hot encoded for loss='categorical_crossentropy'
    y_train_onehot = to_categorical(y_train, N_CLASSES)
    y_test_onehot = to_categorical(y_test, N_CLASSES)
#     np.set_printoptions(threshold=np.inf)
#     print(y_train_onehot)
    # Get neural network architecture and save to disk
    model = create_mlp(input_dim=training_dimension, output_dim=N_CLASSES)

    with open(TRAIN_FILENAME, 'w') as f:
        f.write(model.to_yaml())

    # only save model weights for best performing model
    checkpoint = ModelCheckpoint(TRAIN_FILENAME,
                                 monitor='val_acc',
                                 verbose=1,
                                 save_best_only=True)

    # Stop training early if validation accuracy doesn't improve for long enough
    early_stopping = EarlyStopping(monitor='val_acc', patience=10)

    # Shuffle data for good measure before fitting
    x_train, y_train_onehot = shuffle(x_train, y_train_onehot)
    x_train, x_val, y_train_onehot, y_val = train_test_split(x_train,
                                                        y_train_onehot,
                                                        test_size=TRAINING_TEST_SPLIT,
                                                        random_state=RANDOM_NUMBER_SEED)
    x_train = x_train.drop([LABEL], axis=1);
    x_test = x_test.drop([LABEL], axis=1);
    x_val = x_val.drop([LABEL], axis=1)
    model.fit(x_train, y_train_onehot, validation_data=(x_test, y_test_onehot), epochs=EPOCHS,
              batch_size=TRAIN_BATCH_SIZE,
              shuffle=True,
              callbacks=[checkpoint, early_stopping])

    print('\nTesting ------------')
    # Evaluate the model with the metrics we defined earlier
    loss, accuracy = model.evaluate(x_val, y_val)

    print('test loss: ', loss)
    print('test accuracy: ', accuracy)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  downcast=downcast, **kwargs)


Train on 6056 samples, validate on 4453 samples
Epoch 1/100

Epoch 00001: val_acc improved from -inf to 0.36133, saving model to weights.best.hdf5
Epoch 2/100

Epoch 00002: val_acc did not improve from 0.36133
Epoch 3/100

Epoch 00003: val_acc improved from 0.36133 to 0.37548, saving model to weights.best.hdf5
Epoch 4/100

Epoch 00004: val_acc did not improve from 0.37548
Epoch 5/100

Epoch 00005: val_acc improved from 0.37548 to 0.37750, saving model to weights.best.hdf5
Epoch 6/100

Epoch 00006: val_acc did not improve from 0.37750
Epoch 7/100

Epoch 00007: val_acc did not improve from 0.37750
Epoch 8/100

Epoch 00008: val_acc did not improve from 0.37750
Epoch 9/100

Epoch 00009: val_acc did not improve from 0.37750
Epoch 10/100

Epoch 00010: val_acc did not improve from 0.37750
Epoch 11/100

Epoch 00011: val_acc did not improve from 0.37750
Epoch 12/100

Epoch 00012: val_acc improved from 0.37750 to 0.38087, saving model to weights.best.hdf5
Epoch 13/100

Epoch 00013: val_acc did n

In [3]:
x_train

Unnamed: 0,Type,Name,Age,Breed1,Breed2,Gender,Color1,Color2,Color3,MaturitySize,FurLength,Vaccinated,Dewormed,Sterilized,Health,Quantity,Fee,VideoAmt,PhotoAmt,AdoptionSpeed
13118,0.0,1.0,1.000000,1.000000,0.335505,0.0,0.166667,0.000000,0.000000,0.333333,0.5,0.0,0.5,0.5,0.0,0.000000,0.000000,0.0,0.000000,2
3218,0.0,1.0,0.666667,1.000000,0.000000,0.5,0.166667,0.571429,0.000000,0.333333,0.5,1.0,0.0,0.5,0.0,0.000000,0.000000,0.0,0.000000,4
6280,0.0,1.0,1.000000,1.000000,0.000000,0.5,0.000000,0.285714,1.000000,0.333333,0.0,0.5,0.5,0.5,0.0,0.000000,0.000000,0.0,0.333333,4
5187,0.0,1.0,1.000000,1.000000,0.000000,0.0,0.166667,0.000000,0.000000,0.333333,0.0,0.0,0.0,0.5,0.0,0.000000,0.000000,0.0,0.333333,4
1286,0.0,0.0,0.000000,1.000000,0.000000,1.0,0.000000,0.285714,0.000000,0.333333,0.0,0.0,0.0,0.5,0.0,0.157895,0.000000,0.0,0.666667,4
11874,0.0,1.0,0.000000,1.000000,0.000000,0.5,0.666667,1.000000,0.000000,0.333333,0.5,0.5,0.5,0.5,0.0,0.000000,0.000000,0.0,0.333333,1
7718,0.0,1.0,1.000000,1.000000,0.000000,0.0,0.166667,1.000000,0.000000,0.333333,0.5,0.0,0.0,0.0,0.0,0.052632,0.000000,0.0,0.000000,4
13095,1.0,1.0,0.000000,0.866450,0.000000,0.5,0.000000,0.571429,1.000000,0.000000,0.0,0.5,0.5,0.5,0.0,0.052632,0.000000,0.0,0.333333,3
6836,0.0,1.0,0.000000,1.000000,1.000000,0.0,0.166667,0.000000,0.000000,0.333333,0.5,0.5,0.0,0.5,0.0,0.000000,0.000000,0.0,0.000000,2
9580,0.0,1.0,1.000000,1.000000,0.000000,0.5,0.000000,0.285714,1.000000,0.333333,0.5,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.333333,4
