## Stacked models, then averaged, with leaked values

In [2]:
%run Functions.py

In [4]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np
from sklearn.base import BaseEstimator,TransformerMixin, ClassifierMixin
from sklearn.preprocessing import LabelEncoder
import xgboost as xgb
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.linear_model import ElasticNetCV, LassoLarsCV
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.pipeline import make_pipeline, make_union
from sklearn.utils import check_array
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeRegressor
from sklearn.random_projection import GaussianRandomProjection
from sklearn.random_projection import SparseRandomProjection
from sklearn.decomposition import PCA, FastICA
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import r2_score

class StackingEstimator(BaseEstimator, TransformerMixin):
    """Transformer that prepends a wrapped estimator's output to the features.

    Placed inside a pipeline, it lets later steps see the predictions of
    ``estimator`` as extra leading columns next to the original features.
    """

    def __init__(self, estimator):
        self.estimator = estimator

    def fit(self, X, y=None, **fit_params):
        """Fit the wrapped estimator on ``(X, y)`` and return ``self``."""
        self.estimator.fit(X, y, **fit_params)
        return self

    def transform(self, X):
        """Return ``X`` with synthetic prediction columns stacked on the left.

        Resulting column layout:
        ``[predict(X) | predict_proba(X) (classifiers only) | X]``.
        """
        X = check_array(X)
        model = self.estimator

        exposes_probabilities = (
            isinstance(model, ClassifierMixin)
            and hasattr(model, 'predict_proba')
        )
        if exposes_probabilities:
            # Class probabilities become synthetic features ahead of X.
            augmented = np.hstack((model.predict_proba(X), X))
        else:
            augmented = np.copy(X)

        # The plain prediction is always the first column of the output.
        prediction_column = np.reshape(model.predict(X), (-1, 1))
        return np.hstack((prediction_column, augmented))


train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')

# Integer-encode every string-valued column. The encoder's vocabulary is
# learned from the train and test values together so that both frames share
# one code per category.
object_columns = [name for name in train.columns if train[name].dtype == 'object']
for name in object_columns:
    train_values = list(train[name].values)
    test_values = list(test[name].values)

    encoder = LabelEncoder()
    encoder.fit(train_values + test_values)

    train[name] = encoder.transform(train_values)
    test[name] = encoder.transform(test_values)



n_comp = 12

# Every projection below is fit on the same predictor matrix (all training
# columns except the target), so build it once instead of once per projection.
train_features = train.drop(["y"], axis=1)

# tSVD
tsvd = TruncatedSVD(n_components=n_comp, random_state=420)
tsvd_results_train = tsvd.fit_transform(train_features)
tsvd_results_test = tsvd.transform(test)

# PCA
pca = PCA(n_components=n_comp, random_state=420)
pca2_results_train = pca.fit_transform(train_features)
pca2_results_test = pca.transform(test)

# ICA
ica = FastICA(n_components=n_comp, random_state=420)
ica2_results_train = ica.fit_transform(train_features)
ica2_results_test = ica.transform(test)

# GRP
grp = GaussianRandomProjection(n_components=n_comp, eps=0.1, random_state=420)
grp_results_train = grp.fit_transform(train_features)
grp_results_test = grp.transform(test)

# SRP
srp = SparseRandomProjection(n_components=n_comp, dense_output=True, random_state=420)
srp_results_train = srp.fit_transform(train_features)
srp_results_test = srp.transform(test)

# Record the raw (label-encoded) columns BEFORE the decomposition components
# are appended: the stacked pipeline is trained on these columns only.
# The list keeps the file's column order. Building it through set() would
# yield an iteration order that can change between interpreter sessions
# (string hashing is randomized), silently permuting the feature matrix.
usable_columns = [col for col in train.columns if col != 'y']

# Append decomposition components to both datasets, one component index at
# a time, in the order pca, ica, tsvd, grp, srp.
component_sets = (
    ('pca', pca2_results_train, pca2_results_test),
    ('ica', ica2_results_train, ica2_results_test),
    ('tsvd', tsvd_results_train, tsvd_results_test),
    ('grp', grp_results_train, grp_results_test),
    ('srp', srp_results_train, srp_results_test),
)
for i in range(1, n_comp + 1):
    for prefix, train_part, test_part in component_sets:
        column_name = prefix + '_' + str(i)
        train[column_name] = train_part[:, i - 1]
        test[column_name] = test_part[:, i - 1]

y_train = train['y'].values
y_mean = np.mean(y_train)
id_test = test['ID'].values
# finaltrainset and finaltestset feed the stacked model only; they do not
# contain the PCA, SVD, ... component columns added above.
finaltrainset = train[usable_columns].values
finaltestset = test[usable_columns].values


# For each seed: train an XGBoost model and a stacked scikit-learn pipeline,
# blend their test predictions, and accumulate the blend in sub['y'].
# After the loop the accumulated blends are averaged over the seeds.

n_seeds = 2  # seeds 1..n_seeds are used; the final average divides by this count

sub = pd.DataFrame()
sub['ID'] = id_test
sub['y'] = 0

# The DMatrix inputs do not depend on the seed, so build them once.
dtrain = xgb.DMatrix(train.drop('y', axis=1), y_train)
dtest = xgb.DMatrix(test)

num_boost_rounds = 1250

for fold in range(1, n_seeds + 1):
    # NOTE(review): presumably this is what makes the stacked pipeline differ
    # between iterations, since its estimators are given no random_state of
    # their own — confirm.
    np.random.seed(fold)
    xgb_params = {
        # NOTE(review): the number of boosting rounds actually used is
        # num_boost_rounds (passed to xgb.train); confirm whether 'n_trees'
        # has any effect in the installed xgboost version.
        'n_trees': 520,
        'eta': 0.0045,
        'max_depth': 4,
        'subsample': 0.93,
        'objective': 'reg:linear',
        'eval_metric': 'rmse',
        'base_score': y_mean, # base prediction = mean(target)
        'silent': True,
        'seed': fold,
    }

    # Train the XGBoost model, then predict the test data.
    # 'silent' is overridden to 0 for the actual training call.
    model = xgb.train(dict(xgb_params, silent=0), dtrain, num_boost_round=num_boost_rounds)
    y_pred = model.predict(dtest)

    # Train the stacked models, then predict the test data.
    stacked_pipeline = make_pipeline(
        StackingEstimator(estimator=LassoLarsCV(normalize=True)),
        StackingEstimator(estimator=GradientBoostingRegressor(learning_rate=0.001, loss="huber", max_depth=3, max_features=0.55, min_samples_leaf=18, min_samples_split=14, subsample=0.7)),
        LassoLarsCV()
    )

    stacked_pipeline.fit(finaltrainset, y_train)
    results = stacked_pipeline.predict(finaltestset)

    # R2 score on the entire training data for the blended prediction.
    # NOTE(review): the blend weights here (0.2855 / 0.7145) differ from the
    # ones applied to the test predictions below (0.25 / 0.75) — confirm
    # which pair is intended.
    print('R2 score on train data:')
    print(r2_score(y_train,stacked_pipeline.predict(finaltrainset)*0.2855 + model.predict(dtrain)*0.7145))

    # Blend both models' test predictions and accumulate over seeds.
    sub['y'] += y_pred*0.75 + results*0.25

# Turn the accumulated sum into the mean over all seeds.
sub['y'] /= n_seeds

# Known target values keyed by test ID. Rows of the submission whose ID
# appears here get this value instead of the model prediction.
leaks = {
    1:71.34112,
    12:109.30903,
    23:115.21953,
    28:92.00675,
    42:87.73572,
    43:129.79876,
    45:99.55671,
    57:116.02167,
    3977:132.08556,
    88:90.33211,
    89:130.55165,
    93:105.79792,
    94:103.04672,
    1001:111.65212,
    104:92.37968,
    72:110.54742,
    78:125.28849,
    105:108.5069,
    110:83.31692,
    1004:91.472,
    1008:106.71967,
    1009:108.21841,
    973:106.76189,
    8002:95.84858,
    8007:87.44019,
    1644:99.14157,
    337:101.23135,
    253:115.93724,
    8416:96.84773,
    259:93.33662,
    262:75.35182,
    1652:89.77625
    }
# Vectorized override: look every ID up in `leaks` (IDs without an entry give
# NaN) and fall back to the existing prediction for those rows. This replaces
# a Python-level call per row and also behaves sensibly on an empty frame.
sub['y'] = sub['ID'].astype(int).map(leaks).fillna(sub['y'])
#sub.to_csv('stacked-models.csv', index=False)

  g1 = arrayfuncs.min_pos((C - Cov) / (AA - corr_eq_dir + tiny))
  g2 = arrayfuncs.min_pos((C + Cov) / (AA + corr_eq_dir + tiny))




R2 score on train data:
0.6578930105


  g1 = arrayfuncs.min_pos((C - Cov) / (AA - corr_eq_dir + tiny))
  g2 = arrayfuncs.min_pos((C + Cov) / (AA + corr_eq_dir + tiny))




R2 score on train data:
0.658151843781


In [5]:
sub

Unnamed: 0,ID,y
0,1,71.341120
1,2,99.763818
2,3,81.973843
3,4,79.921164
4,5,122.194655
5,8,94.657468
6,10,113.840915
7,11,95.988436
8,12,109.309030
9,14,95.458349


## A NNet with Tensorflow

In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import tensorflow as tf
from sklearn.metrics import r2_score
from sklearn.model_selection import KFold

###################
# Load & process data
###################
def get_data(max_y=250):
    """Load the train/test CSV files and return model-ready NumPy arrays.

    Training rows whose target is not below ``max_y`` are dropped, a fixed
    subset of feature columns is kept, and every string-valued column is
    label-encoded with a vocabulary learned from the train and test values
    together.

    Args:
        max_y: Exclusive upper bound on the training target; rows with
            ``y >= max_y`` are discarded as outliers. Defaults to 250.

    Returns:
        A tuple ``(train, y_train, test_submit, test_submit_id)``:
        ``train`` and ``test_submit`` are 2-D feature arrays restricted to
        the selected columns, ``y_train`` is a column vector of shape
        ``(n_rows, 1)``, and ``test_submit_id`` holds the ``ID`` column of
        the test file.
    """
    #################
    # read datasets
    #################
    train = pd.read_csv('data/train.csv')
    test_submit = pd.read_csv('data/test.csv')

    # Get y and ID
    train = train[train.y < max_y]  # Optional: Drop y outliers
    y_train = train['y']
    test_submit_id = test_submit['ID']

    #########################
    # Create data
    #########################
    features = ['X0',
                'X5',
                'X118',
                'X127',
                'X47',
                'X315',
                'X311',
                'X179',
                'X314',
                'X232',
                'X29',
                'X263',
                'X261']

    # Build a new dataset using key parameters, lots of drops. Selecting the
    # feature columns also leaves 'y' behind, so no separate drop is needed.
    # .copy() gives independent frames so the encoded columns written below
    # do not land on a slice of the frames read from disk.
    train = train[features].copy()
    test_submit = test_submit[features].copy()

    # Label encoder
    for c in train.columns:
        if train[c].dtype == 'object':
            lbl = LabelEncoder()
            lbl.fit(list(train[c].values) + list(test_submit[c].values))
            train[c] = lbl.transform(list(train[c].values))
            test_submit[c] = lbl.transform(list(test_submit[c].values))

    # Convert to NumPy arrays. `.values` is used because `as_matrix()` is no
    # longer available in current pandas releases.
    train = train.values
    y_train = y_train.values.reshape(-1, 1)  # column vector, one row per sample
    test_submit = test_submit.values
    test_submit_id = test_submit_id.values

    return train, y_train, test_submit, test_submit_id

#####################
# Neural Network
#####################
# --- Training schedule ---
START = 0                # first step index of the training loop
STEPS = 500              # training loop runs over range(START, STEPS)
RESTORE = True           # referenced only by the commented-out checkpoint code

# --- Optimisation ---
LEARNING_RATE = 1e-4     # passed to the Adam optimizer
BETA = 1e-2              # multiplier on the L2 regularization term of the loss
DROPOUT = 0.5            # fed as keep_prob during training steps
RANDOM_SEED = 12345      # graph-level TensorFlow seed

# --- Data / network shape ---
IN_DIM = 13              # width of the input placeholder and first weight matrix
MAX_Y = 250              # the network's sigmoid output is multiplied by this
n_hidden_1 = 100         # units in the first hidden layer
n_hidden_2 = 50          # units in the second hidden layer

def weight_variable(shape):
    """Return a ``tf.Variable`` of the given shape for use as a weight matrix.

    Initial values come from ``tf.truncated_normal`` with ``stddev=0.01``.
    """
    return tf.Variable(tf.truncated_normal(shape, stddev=0.01))

def bias_variable(shape):
    """Return a ``tf.Variable`` of the given shape with every entry set to 0.03."""
    return tf.Variable(tf.constant(0.03, shape=shape))

def deep_network(inputs, keep_prob):
    """Build a two-hidden-layer regression network on top of ``inputs``.

    Args:
        inputs: Tensor that is matrix-multiplied with an
            ``[IN_DIM, n_hidden_1]`` weight matrix.
        keep_prob: Value passed to ``tf.nn.dropout`` after each hidden layer.

    Returns:
        A tuple ``(inputs, out, scaled_out, regularizers)``: the unchanged
        input tensor, the sigmoid output, that output multiplied by
        ``MAX_Y``, and the summed L2 terms of the three weight matrices.
    """
    # NOTE(review): the order in which the variables and dropout ops are
    # created may influence their seeded initial values under a graph-level
    # seed — confirm before reordering statements in this function.
    # Input -> Hidden Layer
    w1 = weight_variable([IN_DIM, n_hidden_1])
    b1 = bias_variable([n_hidden_1])
    # Hidden Layer -> Hidden Layer
    w2 = weight_variable([n_hidden_1, n_hidden_2])
    b2 = bias_variable([n_hidden_2])
    # Hidden Layer -> Output
    w3 = weight_variable([n_hidden_2, 1])
    b3 = bias_variable([1])

    # 1st Hidden layer with dropout
    h1 = tf.nn.relu(tf.matmul(inputs, w1) + b1)
    h1_dropout = tf.nn.dropout(h1, keep_prob)
    # 2nd Hidden layer with dropout
    h2 = tf.nn.relu(tf.matmul(h1_dropout, w2) + b2)
    h2_dropout = tf.nn.dropout(h2, keep_prob)

    # Run sigmoid on output to get 0 to 1
    out = tf.nn.sigmoid(tf.matmul(h2_dropout, w3) + b3)

    # L2 regularization term: weights only, biases are not penalized.
    # The caller scales this and adds it to the loss.
    regularizers = tf.nn.l2_loss(w1) + tf.nn.l2_loss(w2) + tf.nn.l2_loss(w3)

    # Stretch the (0, 1) sigmoid output by MAX_Y to produce the prediction
    scaled_out = tf.multiply(out, MAX_Y)  # Scale output
    return inputs, out, scaled_out, regularizers

def main(_):
    """Build the graph, train the network, and print fold-wise R2 scores.

    The model is trained for steps ``START`` .. ``STEPS - 1``. Each step
    shuffles the data into 10 folds and runs one optimizer update per fold
    on that fold's training indices. Every 10th step, the R2 score on each
    fold's held-out indices is printed. Afterwards the trained network is
    scored on 5 further 10-fold shuffles and the mean R2 is printed.

    NOTE(review): folds are reshuffled every step and every fold's training
    part is used for an update, so the rows scored here have also been
    trained on; the printed R2 values are not a true held-out estimate.

    Args:
        _: Unused; present because ``tf.app.run()`` calls ``main`` with one
            positional argument.
    """
    tf.set_random_seed(RANDOM_SEED)
    # One seeded generator drives every KFold shuffle, so the sequence of
    # splits is repeatable across runs while still differing from step to
    # step (each split() call advances the generator).
    fold_rng = np.random.RandomState(RANDOM_SEED)

    # Create the model
    x = tf.placeholder(tf.float32, [None, IN_DIM])

    # Define loss and optimizer
    y_ = tf.placeholder(tf.float32, [None, 1])

    # Dropout on hidden layers
    keep_prob = tf.placeholder("float")

    # Build the graph for the deep net
    inputs, out, scaled_out, regularizers = deep_network(x, keep_prob)

    # Normal loss function (RMSE)
    loss = tf.sqrt(tf.reduce_mean(tf.square(tf.subtract(y_, scaled_out))))

    # Loss function with L2 Regularization
    loss = tf.reduce_mean(loss + BETA * regularizers)

    # Optimizer
    train_step = tf.train.AdamOptimizer(LEARNING_RATE).minimize(loss)

    # In-graph R2 (1 - unexplained / total). Only the commented-out
    # alternative scoring path below uses it.
    total_error = tf.reduce_sum(tf.square(tf.subtract(y_, tf.reduce_mean(y_))))
    unexplained_error = tf.reduce_sum(tf.square(tf.subtract(y_, scaled_out)))
    accuracy = tf.subtract(1.0, tf.divide(unexplained_error, total_error))

    # Save model (only used by the commented-out checkpoint code below)
    saver = tf.train.Saver(max_to_keep=5)

    with tf.Session() as sess:
        #if RESTORE:
        #    print('Loading Model...')
        #    ckpt = tf.train.get_checkpoint_state('./models/neural/')
        #    saver.restore(sess, ckpt.model_checkpoint_path)
        #else:
        sess.run(tf.global_variables_initializer())

        # The test-set arrays are returned by get_data() but not used here.
        train, y_train, _test_submit, _test_submit_id = get_data()

        # Train until maximum steps reached or interrupted
        for i in range(START, STEPS):
            k_fold = KFold(n_splits=10, shuffle=True, random_state=fold_rng)
            #if i % 100 == 0:
            #    saver.save(sess, './models/neural/step_' + str(i) + '.cptk')

            for k, (ktrain, ktest) in enumerate(k_fold.split(train, y_train)):
                train_step.run(feed_dict={x: train[ktrain], y_: y_train[ktrain], keep_prob: DROPOUT})
                # Show test score every 10 iterations
                if i % 10 == 0:
                    # Tensorflow R2
                    #train_accuracy = accuracy.eval(feed_dict={
                    #    x: train[ktest], y_: y_train[ktest]})
                    # SkLearn metrics R2 (dropout disabled via keep_prob=1.0)
                    train_accuracy = r2_score(y_train[ktest],
                                              sess.run(scaled_out, feed_dict={x: train[ktest], keep_prob: 1.0}))
                    print('Step: %d, Fold: %d, R2 Score: %g' % (i, k, train_accuracy))

        ####################
        # CV (repeat 5 times)
        ####################
        CV = []
        for i in range(5):
            k_fold = KFold(n_splits=10, shuffle=True, random_state=fold_rng)
            for k, (ktrain, ktest) in enumerate(k_fold.split(train, y_train)):
                # The score gets its own name so the `accuracy` tensor above
                # is not overwritten by a Python float.
                fold_r2 = r2_score(y_train[ktest],
                                   sess.run(scaled_out, feed_dict={x: train[ktest], keep_prob: 1.0}))
                print('Step: %d, Fold: %d, R2 Score: %g' % (i, k, fold_r2))
                CV.append(fold_r2)
        print('Mean R2: %g' % (np.mean(CV)))

# Script entry point: hand control to TensorFlow's app runner, which calls
# main() defined above.
# NOTE(review): the captured notebook output ends with "SystemExit:", so
# tf.app.run() appears to exit the interpreter once main() returns — normal
# for a script, noisy inside a notebook; confirm before changing.
if __name__ == '__main__':
    tf.app.run()

Step: 0, Fold: 0, R2 Score: -4.7507
Step: 0, Fold: 1, R2 Score: -3.81577
Step: 0, Fold: 2, R2 Score: -4.30285
Step: 0, Fold: 3, R2 Score: -4.46236
Step: 0, Fold: 4, R2 Score: -4.16163
Step: 0, Fold: 5, R2 Score: -4.14147
Step: 0, Fold: 6, R2 Score: -4.05315
Step: 0, Fold: 7, R2 Score: -4.95255
Step: 0, Fold: 8, R2 Score: -5.01364
Step: 0, Fold: 9, R2 Score: -4.46376
Step: 10, Fold: 0, R2 Score: -0.262886
Step: 10, Fold: 1, R2 Score: -0.184338
Step: 10, Fold: 2, R2 Score: -0.157193
Step: 10, Fold: 3, R2 Score: -0.18155
Step: 10, Fold: 4, R2 Score: -0.191545
Step: 10, Fold: 5, R2 Score: -0.0717292
Step: 10, Fold: 6, R2 Score: -0.0736445
Step: 10, Fold: 7, R2 Score: -0.144103
Step: 10, Fold: 8, R2 Score: -0.134922
Step: 10, Fold: 9, R2 Score: -0.0880809
Step: 20, Fold: 0, R2 Score: -0.101706
Step: 20, Fold: 1, R2 Score: -0.0589188
Step: 20, Fold: 2, R2 Score: -0.0518843
Step: 20, Fold: 3, R2 Score: -0.101785
Step: 20, Fold: 4, R2 Score: -0.113034
Step: 20, Fold: 5, R2 Score: -0.0577997
St

Step: 210, Fold: 7, R2 Score: 0.599904
Step: 210, Fold: 8, R2 Score: 0.621885
Step: 210, Fold: 9, R2 Score: 0.602897
Step: 220, Fold: 0, R2 Score: 0.545358
Step: 220, Fold: 1, R2 Score: 0.640202
Step: 220, Fold: 2, R2 Score: 0.597606
Step: 220, Fold: 3, R2 Score: 0.537857
Step: 220, Fold: 4, R2 Score: 0.56837
Step: 220, Fold: 5, R2 Score: 0.598081
Step: 220, Fold: 6, R2 Score: 0.570602
Step: 220, Fold: 7, R2 Score: 0.662485
Step: 220, Fold: 8, R2 Score: 0.627427
Step: 220, Fold: 9, R2 Score: 0.598282
Step: 230, Fold: 0, R2 Score: 0.61397
Step: 230, Fold: 1, R2 Score: 0.628249
Step: 230, Fold: 2, R2 Score: 0.619514
Step: 230, Fold: 3, R2 Score: 0.564641
Step: 230, Fold: 4, R2 Score: 0.600018
Step: 230, Fold: 5, R2 Score: 0.551985
Step: 230, Fold: 6, R2 Score: 0.624007
Step: 230, Fold: 7, R2 Score: 0.587823
Step: 230, Fold: 8, R2 Score: 0.577407
Step: 230, Fold: 9, R2 Score: 0.590295
Step: 240, Fold: 0, R2 Score: 0.620987
Step: 240, Fold: 1, R2 Score: 0.584113
Step: 240, Fold: 2, R2 Scor

Step: 430, Fold: 0, R2 Score: 0.617738
Step: 430, Fold: 1, R2 Score: 0.60328
Step: 430, Fold: 2, R2 Score: 0.530473
Step: 430, Fold: 3, R2 Score: 0.565296
Step: 430, Fold: 4, R2 Score: 0.621615
Step: 430, Fold: 5, R2 Score: 0.52546
Step: 430, Fold: 6, R2 Score: 0.572421
Step: 430, Fold: 7, R2 Score: 0.656016
Step: 430, Fold: 8, R2 Score: 0.639338
Step: 430, Fold: 9, R2 Score: 0.676954
Step: 440, Fold: 0, R2 Score: 0.609136
Step: 440, Fold: 1, R2 Score: 0.6944
Step: 440, Fold: 2, R2 Score: 0.60982
Step: 440, Fold: 3, R2 Score: 0.591778
Step: 440, Fold: 4, R2 Score: 0.53584
Step: 440, Fold: 5, R2 Score: 0.595109
Step: 440, Fold: 6, R2 Score: 0.603682
Step: 440, Fold: 7, R2 Score: 0.579383
Step: 440, Fold: 8, R2 Score: 0.628436
Step: 440, Fold: 9, R2 Score: 0.584086
Step: 450, Fold: 0, R2 Score: 0.631948
Step: 450, Fold: 1, R2 Score: 0.639744
Step: 450, Fold: 2, R2 Score: 0.479207
Step: 450, Fold: 3, R2 Score: 0.592714
Step: 450, Fold: 4, R2 Score: 0.620391
Step: 450, Fold: 5, R2 Score: 0

SystemExit: 

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)
