In [1]:
import pandas as pd
import numpy as np

def PrepareData(trainpath,nrows=None):
    df = pd.read_csv(trainpath,nrows=nrows)
    df = df[df['target']==0]
    
    #categorical
    cats = pd.DataFrame()
    for col in df.columns:
        if 'cat' in col:
            cats = pd.concat([cats,pd.get_dummies(df[col],prefix=col)],axis=1)
            
    #binary        
    cols = []
    for col in df.columns:
        if 'bin' in col:
            cols.append(col)
    bins = df[cols]
    
    return np.array(pd.concat([bins,cats],axis=1))

In [9]:
from sklearn.model_selection import KFold

def KfoldCV(model, X):  
    kf = KFold(n_splits=3, shuffle=True, random_state=2)
    scores = []
    for train_index, test_index in kf.split(X):
        train, test = X[train_index], X[test_index]
        model.fit(train)
        score = model.evaluate(test)
        scores.append(score)
    scores = np.array(scores)

    return scores.mean(), scores.std()
    

In [18]:
from keras.models import Sequential
from keras.callbacks import EarlyStopping
from keras.layers import Dense, Dropout

class ThreeLayer(object):
    
    def __init__(self, hfrac=0.5, dfrac=0.2):
        self.hfrac = hfrac
        self.dfrac = dfrac
        self.model = Sequential()
    
    def fit(self,X):
        self.dim = len(X[0])
        self.hdim = int(self.hfrac*self.dim)
        
        self.model.add(Dense(self.hdim, input_dim=self.dim, activation='relu'))
        self.model.add(Dropout(self.dfrac))
        self.model.add(Dense(self.dim, activation='sigmoid')) 
        self.model.compile(loss='binary_crossentropy', optimizer='adadelta')
        
        callbacks = [EarlyStopping(monitor='val_loss', min_delta=1e-4, patience=2, verbose=0)]

        self.model.fit(X, X, 
                       epochs=100, 
                       batch_size=1000, 
                       validation_split=0.2,
                       callbacks=callbacks,
                       verbose=False)
        
    def evaluate(self, X):
        return self.model.evaluate(X,X)

In [24]:
trainpath = "/Users/guoli/Desktop/kaggle/Porto/train.csv"
X = PrepareData(trainpath,nrows=20000)

In [25]:
np.random.seed(2017)
model = ThreeLayer(hfrac=0.5)
KfoldCV(model, X)



(0.11964345814521637, 0.016618451546306215)

In [26]:
np.random.seed(2017)
model = ThreeLayer(hfrac=0.7)
KfoldCV(model, X)



(0.11324251168868939, 0.016684483777653754)

In [27]:
np.random.seed(2017)
model = ThreeLayer(hfrac=0.3)
KfoldCV(model, X)



(0.13215258103470642, 0.017003933254268969)

In [28]:
np.random.seed(2017)
model = ThreeLayer(hfrac=0.8)
KfoldCV(model, X)



(0.11375792567933662, 0.017044387617117324)

In [29]:
np.random.seed(2017)
model = ThreeLayer(hfrac=0.6)
KfoldCV(model, X)



(0.11573273572578598, 0.017295164847717343)