In [10]:
# Imports
import os
import pandas as pd
import numpy as np
import datetime
import warnings
from scipy.optimize import minimize
from sklearn.metrics import r2_score
from IPython.display import display

# Definitions
pd.set_option('display.float_format', lambda x: '%.3f' % x)
%matplotlib inline
warnings.simplefilter('ignore')
N_JOBS = -1
SEED = 2017

In [11]:
# Stack all single Level 1 models OOF and test preds as entries for a Level 2 model
def get_level_one_outputs(OOF_targets) :
    level_one_OOF = pd.DataFrame()
    i = 0
    root_dir = "oof_preds/"
    for file_name in sorted(os.listdir(root_dir)) :
        preds_OOF = pd.read_csv(root_dir + file_name)
        cur_preds_OOF = pd.Series(preds_OOF.y, name = file_name[0:7])
        level_one_OOF = pd.concat([level_one_OOF, cur_preds_OOF], axis = 1)
        i += 1
    display(level_one_OOF.head())

    # Same for test set preds
    level_one_test = pd.DataFrame()
    i = 0
    root_dir = "preds/"
    for file_name in sorted(os.listdir(root_dir)) :
        preds_test = pd.read_csv(root_dir + file_name)
        cur_preds_test = pd.Series(preds_test.y, name = file_name[0:7])
        level_one_test = pd.concat([level_one_test, cur_preds_test], axis = 1)
        i += 1
    display(level_one_test.head())
    
    return(level_one_OOF, level_one_test)

In [12]:
'''# Define function to minimize
def target_func(weights) :
    final_prediction = 0
    for weight, prediction in zip(weights, preds_X_test):
        final_prediction += weight * prediction
    return(-r2_score(X_test_y, final_prediction))


# Optimize ensemble weights on held out fold
def optimize(preds_X_test, names) :
    scores = []
    weights = []
    for i in range(100):
        # Choose many random starting weights
        starting_values = np.random.uniform(size = len(preds_X_test))

        # Our weights are bound between 0 and 1
        bounds = [(0, 1)] * len(preds_X_test)

        res = minimize(
            target_func, 
            starting_values, 
            method = "SLSQP", 
            bounds = bounds, 
            options = {"maxiter" : 10000})

        scores.append(res["fun"])
        weights.append(res["x"])

    bestSC = -np.min(scores)
    bestWght = weights[np.argmin(scores)]

    print("\n Ensemble Score: {}".format(bestSC))
    print("\n Best Weights: {}".format(bestWght))
    print("\n Names: {}".format(names))

    return(bestSC, bestWght, names)'''

In [17]:
# Define function to minimize
def target_func(weights) :
    final_prediction = 0
    for weight, prediction in zip(weights, preds_X_test):
        final_prediction += weight * prediction
    return(-r2_score(X_test_y, final_prediction))


# Optimize ensemble weights on held out fold
def optimize(preds_X_test) :
    scores = []
    weights = []
    for i in range(100):
        # Choose many random starting weights
        starting_values = np.random.uniform(size = len(preds_X_test))

        # Our weights are bound between 0 and 1
        bounds = [(0, 1)] * len(preds_X_test)

        res = minimize(
            target_func, 
            starting_values, 
            method = "SLSQP", 
            bounds = bounds, 
            options = {"maxiter" : 10000})

        scores.append(res["fun"])
        weights.append(res["x"])

    bestSC = -np.min(scores)
    bestWght = weights[np.argmin(scores)]

    print("\n Ensemble Score: {}".format(bestSC))
    print("\n Best Weights: {}".format(bestWght))

    return(bestSC, bestWght)

<b>Script</b>

In [13]:
# Get data
OOF_targets = pd.read_csv("clean_data/OOF_targets.csv")
init_test = pd.read_csv("raw_data/test.csv")
test_ids = init_test.ID.values

display(OOF_targets.head())

Unnamed: 0,ID,y
0,1,99.15
1,2,91.98
2,3,91.52
3,4,91.57
4,5,90.11


In [14]:
# Create Level 2 inputs
level_one_OOF, level_one_test = get_level_one_outputs(OOF_targets)

Unnamed: 0,et_dc15,et_dc2_,et_dc3_,et_dc4_,et_dc5_,et_dc6_,et_dc7_
0,93.818,93.93,94.019,94.16,94.039,94.1,94.131
1,93.741,93.982,93.982,93.949,93.986,93.964,94.041
2,93.208,93.446,93.706,93.492,93.522,93.697,93.512
3,93.758,94.222,93.978,94.369,94.215,94.246,94.141
4,94.321,94.412,93.944,94.455,94.504,94.523,94.437


Unnamed: 0,et_dc15,et_dc2_,et_dc3_,et_dc4_,et_dc5_,et_dc6_,et_dc7_
0,78.668,78.365,77.753,78.544,78.549,79.078,78.543
1,93.689,93.904,93.871,93.892,93.962,94.017,93.963
2,77.303,77.51,77.42,77.199,77.466,77.166,77.539
3,78.276,78.111,77.434,78.561,78.164,78.466,78.226
4,112.993,112.932,112.707,113.328,112.907,113.103,113.004


In [16]:
# Get error of each OOF column
preds_X_test = []
for col in level_one_OOF.columns :
    print(OOF_targets.y.shape)
    print(level_one_OOF[col].shape)
    cv_score = r2_score(OOF_targets.y, level_one_OOF[col])
    print("Global OOF r2_score for " + col + " : " + str(cv_score))

    preds_X_test.append(level_one_OOF[col])

(4209,)
(4209,)
Global OOF r2_score for et_dc15 : 0.565423151536
(4209,)
(4209,)
Global OOF r2_score for et_dc2_ : 0.564604887927
(4209,)
(4209,)
Global OOF r2_score for et_dc3_ : 0.566072957735
(4209,)
(4209,)
Global OOF r2_score for et_dc4_ : 0.563813793244
(4209,)
(4209,)
Global OOF r2_score for et_dc5_ : 0.564090667366
(4209,)
(4209,)
Global OOF r2_score for et_dc6_ : 0.56416472274
(4209,)
(4209,)
Global OOF r2_score for et_dc7_ : 0.564040091236


In [18]:
# Optimize weights
bestSC, bestWght = optimize(preds_X_test)

NameError: name 'X_test_y' is not defined

In [None]:
'''# Get validation set data
la_preds_X_test = np.array(pd.read_csv("preds/la_preds_X_test.csv", header = None))
et_preds_X_test = np.array(pd.read_csv("preds/et_preds_X_test.csv", header = None))
xg_preds_X_test = np.array(pd.read_csv("preds/xg_preds_X_test.csv", header = None))

# Get test set data
la_preds_test = np.array(pd.read_csv("preds/la_preds_test.csv"))
et_preds_test = np.array(pd.read_csv("preds/et_preds_test.csv"))
xg_preds_test = np.array(pd.read_csv("preds/xg_preds_test.csv"))

preds_X_test = []
preds_X_test.append(la_preds_X_test[:, 0])
preds_X_test.append(et_preds_X_test[:, 0])
preds_X_test.append(xg_preds_X_test[:, 0])

preds_test = []
preds_test.append(la_preds_test[:, 1])
preds_test.append(et_preds_test[:, 1])
preds_test.append(xg_preds_test[:, 1])

names = []
names.append("la")
names.append("et")
names.append("xg")'''


In [None]:
'''# Scores of single models on validation set
print("la on held out fold : " + str(r2_score(X_test_y, la_preds_X_test)))
print("et on held out fold : " + str(r2_score(X_test_y, et_preds_X_test)))
print("xg on held out fold : " + str(r2_score(X_test_y, xg_preds_X_test)))'''


In [None]:
'''# Optimize weights
bestSC, bestWght, names = optimize(preds_X_test, names)'''

In [None]:
'''# Prepare submission using optimal weights
preds = 0
for i in range(0, len(bestWght)) :
    (bestWght[i] * preds_test[i])
    preds += (bestWght[i] * preds_test[i])'''

In [None]:
'''# Save predictions
pd.DataFrame({"ID": test_ids[0], "y": preds}).to_csv("preds/blend.csv", index = False)'''
