In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
# preprocessing/decomposition
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
from sklearn.decomposition import PCA, FastICA, FactorAnalysis, KernelPCA

# keras 
from keras.models import Sequential, load_model
from keras.layers import Dense, Dropout, BatchNormalization, Activation
from keras.wrappers.scikit_learn import KerasRegressor
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras import backend as K
from keras.regularizers import l2
from keras.layers.convolutional import MaxPooling2D, Convolution2D, AveragePooling2D
from keras.layers import Input, Dropout, Dense, Flatten, Activation
from keras.layers.normalization import BatchNormalization
from keras.layers.merge import concatenate
from keras import regularizers
from keras import initializers
from keras.layers.advanced_activations import ThresholdedReLU
from keras.models import Model
from keras.callbacks import ReduceLROnPlateau

# Utils
from keras.utils.layer_utils import convert_all_kernels_in_model
from keras.utils.data_utils import get_file

# model evaluation
from sklearn.model_selection import cross_val_score, KFold, train_test_split
from sklearn.metrics import r2_score, mean_squared_error

# supportive models
from sklearn.ensemble import GradientBoostingRegressor
# feature selection (from supportive model)
from sklearn.feature_selection import SelectFromModel

Using TensorFlow backend.


In [2]:
# Global settings and raw inputs for the notebook.
# NOTE(review): `seed` is not referenced by any other visible cell (the CV cell
# seeds NumPy with a literal 111) - confirm whether it is still needed.
seed = 1989
# NOTE(review): this rebinds the built-in name `type` for the whole kernel, so any
# later `type(...)` call would fail. It is not referenced in the visible cells -
# consider renaming or removing it.
type = 2
# Combined table; the preview below shows rows both with and without a target `y`.
all_dat = pd.read_csv("../data/cleanData.csv")
# Table that is joined to all_dat on its 'ID' column in the next cell.
test_id = pd.read_csv("../data/testID.csv")
all_dat.head()

Unnamed: 0,ID,y,X0,X1,X2,X3,X4,X5,X6,X8,...,sdX3,sdX4,sdX5,sdX6,sdX8,X0_A_Only,X0_A_Only_MeanY,X0_A_Only_SDY,dupCnt,IDScale
0,0,130.81,0.185185,0.777778,0.9,0.0,1.0,0.606061,0.818182,0.583333,...,0.380245,0.999251,0.666138,0.63232,0.145967,0,0,0,0.0,0.0
1,1,71.34112,0.944444,0.777778,0.26,0.833333,1.0,0.575758,0.0,0.916667,...,1.0,0.999251,0.666138,0.611275,0.601269,1,1,1,0.0,0.000119
2,2,,0.351852,0.037037,0.68,0.0,1.0,0.030303,0.545455,1.0,...,0.380245,0.999251,0.666138,0.589487,0.332904,0,0,0,0.0,0.000238
3,3,,0.944444,0.777778,0.88,0.833333,1.0,0.0,0.818182,0.375,...,1.0,0.999251,0.666138,0.63232,0.298495,1,1,1,0.0,0.000356
4,4,,0.944444,0.407407,0.26,0.833333,1.0,0.757576,1.0,0.541667,...,1.0,0.999251,0.666138,0.568091,0.788322,1,1,1,0.0,0.000475


In [3]:
# Rows with a known target form the training set; rows whose target is missing
# are the ones to be predicted. notnull()/isnull() are used as explicit boolean
# masks instead of negating a mask with unary minus.
train = all_dat[all_dat['y'].notnull()].copy()
test = all_dat[all_dat['y'].isnull()].copy()
# test_raw is built from all_dat (not from `test`): every row whose ID appears in
# test_id is kept, so it can also contain rows that have a known y.
test_raw = pd.merge(all_dat, test_id, on = 'ID', how='inner')
test_raw.head()

Unnamed: 0,ID,y,X0,X1,X2,X3,X4,X5,X6,X8,...,sdX3,sdX4,sdX5,sdX6,sdX8,X0_A_Only,X0_A_Only_MeanY,X0_A_Only_SDY,dupCnt,IDScale
0,1,71.34112,0.944444,0.777778,0.26,0.833333,1.0,0.575758,0.0,0.916667,...,1.0,0.999251,0.666138,0.611275,0.601269,1,1,1,0.0,0.000119
1,2,,0.351852,0.037037,0.68,0.0,1.0,0.030303,0.545455,1.0,...,0.380245,0.999251,0.666138,0.589487,0.332904,0,0,0,0.0,0.000238
2,3,,0.944444,0.777778,0.88,0.833333,1.0,0.0,0.818182,0.375,...,1.0,0.999251,0.666138,0.63232,0.298495,1,1,1,0.0,0.000356
3,4,,0.944444,0.407407,0.26,0.833333,1.0,0.757576,1.0,0.541667,...,1.0,0.999251,0.666138,0.568091,0.788322,1,1,1,0.0,0.000475
4,5,,0.407407,0.666667,0.88,0.333333,1.0,0.727273,0.727273,0.5,...,0.579123,0.999251,0.666138,0.491877,0.444345,0,0,0,0.0,0.000594


In [4]:
# Sanity check: print the (rows, columns) shape of the labelled `train` frame
# and of the unlabelled `test` frame.
print('\nTrain shape: {}\nTest shape: {}'.format(train.shape, test.shape))

def root_mean_squared_error(y_true, y_pred):
    """Root of the mean squared difference between predictions and targets.

    Built from Keras backend ops; the mean is taken over the last axis, so the
    result keeps all leading axes of the inputs.
    """
    squared_error = K.square(y_pred - y_true)
    mean_squared = K.mean(squared_error, axis=-1)
    return K.sqrt(mean_squared)
    
def r2_keras(y_true, y_pred):
    """Coefficient of determination (R^2) expressed with Keras backend ops.

    Computes 1 - SS_res / (SS_tot + K.epsilon()), where both sums run over
    every element of the tensors passed in. K.epsilon() is added to the
    denominator (presumably to keep it non-zero when all targets are equal).
    """
    residual_ss = K.sum(K.square(y_true - y_pred))
    deviation_from_mean = y_true - K.mean(y_true)
    total_ss = K.sum(K.square(deviation_from_mean))
    return 1 - residual_ss / (total_ss + K.epsilon())

def createDeepModel(dropout_level = 0.25, activation = 'tanh', input_dim = None):
    """Build the (uncompiled) deep fully connected regression network.

    The network is a stack of 11 hidden blocks, each made of
    Dense(he_normal) -> BatchNormalization -> Activation -> Dropout:
    five blocks of width 1024 followed by six blocks of width 768. Only the
    last hidden Dense layer carries an L2 kernel regularizer (1e-5). A single
    linear unit produces the regression output.

    Parameters
    ----------
    dropout_level : float
        Dropout rate applied after every hidden activation.
    activation : str
        Name of the activation used in every hidden block.
    input_dim : int or None
        Number of input features. When None (the default) the module-level
        `input_dims` is read at call time, which is what the function always
        did before this parameter existed.

    Returns
    -------
    keras.models.Sequential
        The assembled model; the caller is responsible for compiling it.
    """
    if input_dim is None:
        input_dim = input_dims

    hidden_units = [1024] * 5 + [768] * 6
    last_hidden = len(hidden_units) - 1

    model = Sequential()
    for position, units in enumerate(hidden_units):
        dense_kwargs = {'kernel_initializer': 'he_normal'}
        if position == 0:
            # only the first layer needs to be told the input width
            dense_kwargs['input_dim'] = input_dim
        if position == last_hidden:
            # the final hidden layer is the only regularized one
            dense_kwargs['kernel_regularizer'] = l2(1.e-5)

        model.add(Dense(units, **dense_kwargs))
        model.add(BatchNormalization())
        model.add(Activation(activation))
        model.add(Dropout(dropout_level))

    model.add(Dense(1, activation='linear'))
    return model

# Number of input features read by createDeepModel: every column of `train`
# except two. The feature matrix built in the next cell drops 'y' and 'ID',
# which is what the "- 2" accounts for.
input_dims = train.shape[1]-2



Train shape: (4251, 730)
Test shape: (4167, 730)


In [5]:
# Feature matrix: every column of the labelled frame except the target and the ID
X = train.drop(labels=['y', 'ID'], axis=1).values
# Target vector taken from the 'y' column
y = train.y.values
print(X.shape)

(4251, 728)


In [6]:
# Feature matrix for the rows to be scored, with the same two columns
# ('y' and 'ID') removed as for the training matrix
X_test = test_raw.drop(labels=['y', 'ID'], axis=1).values
print(X_test.shape)

(4209, 728)


In [7]:
# K-fold cross-validation setup
# `os` is needed below for the checkpoint file checks. KFold is already imported
# in the first cell, so it is not imported again here.
import os

numFolds = 5
kf = KFold(n_splits=numFolds)  # default settings: folds are not shuffled
kfidx = 1  # 1-based fold counter, used in log messages and checkpoint file names

# Per-fold collectors, reset every time this cell runs
yfull_test = []    # predictions on X_test, one array per fold
yfull_train = []   # out-of-fold predictions on the validation part
yfull_target = []  # targets matching yfull_train

# r2_keras is defined once, next to the model definition above; the verbatim
# second copy that used to live here has been removed.

# Fix NumPy's global RNG. Only NumPy is seeded here.
np.random.seed(111)

# Train one model per fold. The index arrays get their own names so that the
# `train` DataFrame defined earlier is not overwritten by the loop variable.
for train_idx, valid_idx in kf.split(X):
    # start every fold from a fresh backend session and a freshly built network
    K.clear_session()
    model = createDeepModel(dropout_level=0.15, activation='relu')
    model.compile(loss='mean_squared_error', # one may use 'mean_absolute_error' as alternative
                  optimizer='adadelta',
                  metrics=[r2_keras] # you can add several if needed
                 )

    if kfidx == 1:
        # summary() prints the table itself and returns None, so it is not
        # wrapped in print() (that would emit an extra "None" line)
        model.summary()

    X_train, X_valid = X[train_idx], X[valid_idx]
    y_train, y_valid = y[train_idx], y[valid_idx]

    # checkpoint file for this fold, written to the current working directory
    kfold_weights_path = os.path.join('', 'kf13_' + str(kfidx) + '.h5')
    print('Computing fold {} out of {}...'.format(kfidx, numFolds))
    callbacks = [
        # stop when val_loss has not improved for 35 epochs
        EarlyStopping(
            monitor='val_loss',
            patience=35, # was 10
            verbose=1),

        # keep only the best model (by val_loss) on disk
        ModelCheckpoint(
            kfold_weights_path,
            monitor='val_loss',
            save_best_only=True,
            verbose=0),

        # multiply the learning rate by 0.25 after 15 epochs without improvement
        ReduceLROnPlateau(monitor='val_loss', factor=0.25, patience=15, min_lr=0.00005, verbose=1),
    ]

    model.fit(X_train, y_train, epochs = 500, verbose = 0, batch_size=64,
              validation_data = (X_valid, y_valid),
              callbacks = callbacks, shuffle = True)

    # go back to the checkpointed weights, if a checkpoint file exists
    if os.path.isfile(kfold_weights_path):
        model.load_weights(kfold_weights_path)

    # predict once per data set and reuse the result for every metric below
    pred_train = model.predict(X_train)
    res_oof = model.predict(X_valid)

    # check performance on train set (square root of MSE, i.e. RMSE)
    print('RMSE train: {}'.format(mean_squared_error(y_train, pred_train)**0.5))
    print('R^2 train: {}'.format(r2_score(y_train, pred_train)))

    # check performance on validation set
    print('RMSE val: {}'.format(mean_squared_error(y_valid, res_oof)**0.5))
    print('R^2 val: {}'.format(r2_score(y_valid, res_oof)))

    # keep track of OOF results
    yfull_train.append(res_oof)
    yfull_target.append(y_valid)

    # predict results
    res = model.predict(X_test)
    yfull_test.append(res)

    # update the counter
    kfidx += 1
    
# Overall out-of-fold score: stack the per-fold validation predictions and their
# targets (kept in the same fold order) and evaluate them as one set.
train_full_result = np.concatenate(yfull_train, axis=0)
train_target_full = np.concatenate(yfull_target, axis=0)
# the square root of the MSE is reported, i.e. the RMSE
print('OVERALL RMSE OOF VAL: {}'.format(mean_squared_error(train_target_full, train_full_result)**0.5))
print('OVERALL R^2 OOF VAL: {}'.format(r2_score(train_target_full, train_full_result))) # R^2 val

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 1024)              746496    
_________________________________________________________________
batch_normalization_1 (Batch (None, 1024)              4096      
_________________________________________________________________
activation_1 (Activation)    (None, 1024)              0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 1024)              0         
_________________________________________________________________
dense_2 (Dense)              (None, 1024)              1049600   
_________________________________________________________________
batch_normalization_2 (Batch (None, 1024)              4096      
_________________________________________________________________
activation_2 (Activation)    (None, 1024)              0         
__________

In [8]:
# Ensemble the folds: start from a copy of the first fold's test predictions,
# add the remaining folds in place, then divide by the number of folds.
test_full_result = np.array(yfull_test[0])
for i in range(1, numFolds):
    fold_prediction = np.array(yfull_test[i])
    test_full_result += fold_prediction
test_full_result /= numFolds

# Build the two-column submission table (ID, averaged prediction) and write it
# to CSV without the index column.
test_df = {
    'ID': test_raw['ID'],
    'y': test_full_result.ravel(),
}
test_full_result_df = pd.DataFrame(test_df, columns = ['ID', 'y'])
test_full_result_df.to_csv('benchmark_keras_v7_oof_0_5639.csv', index=False)