In [1]:
# Imports
import os
import pandas as pd
import numpy as np
import datetime
import warnings
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.optimize import minimize
from sklearn.metrics import r2_score
from IPython.display import HTML, display

# Definitions
pd.set_option('display.float_format', lambda x: '%.3f' % x)
%matplotlib inline
display(HTML("<style>.container { width: 90% !important; }</style>"))
warnings.simplefilter('ignore')
N_JOBS = -1
SEED = 2017

In [2]:
# Stack all single Level 1 models OOF and test preds as entries for a Level 2 model
def _stack_y_columns(root_dir):
    """Read the `y` column of every file in `root_dir` into one DataFrame.

    Files are visited in sorted name order. Each becomes one column named
    after the first seven characters of its file name. Entries that are not
    regular files (e.g. a `.ipynb_checkpoints` directory) are skipped.
    Returns an empty DataFrame when the directory holds no files.
    """
    columns = []
    for file_name in sorted(os.listdir(root_dir)):
        path = os.path.join(root_dir, file_name)
        if not os.path.isfile(path):
            continue
        columns.append(pd.Series(pd.read_csv(path).y, name=file_name[0:7]))

    if not columns:
        # pd.concat raises on an empty list; keep the "empty frame" result.
        return pd.DataFrame()
    # One concat at the end instead of re-copying the frame on every file.
    return pd.concat(columns, axis=1)


def get_level_one_data():
    """Load the Level 1 outputs used as inputs of the Level 2 blend.

    Reads every CSV found in three directories relative to the working
    directory and stacks their `y` columns side by side:

    * ``oof_preds/``   - out-of-fold predictions
    * ``oof_targets/`` - targets matching the out-of-fold predictions
    * ``preds/``       - test-set predictions

    Returns
    -------
    tuple of pandas.DataFrame
        ``(level_one_preds, level_one_targets, level_one_test)``, one column
        per file, columns ordered by sorted file name.
    """
    level_one_preds = _stack_y_columns("oof_preds/")
    level_one_targets = _stack_y_columns("oof_targets/")
    level_one_test = _stack_y_columns("preds/")
    return level_one_preds, level_one_targets, level_one_test

In [3]:
# Plot predictions
def plot_preds(targets, preds):
    """Scatter each model's predictions against its targets on one figure.

    Parameters
    ----------
    targets : indexable
        ``targets[i]`` holds the x values (real y) for model ``i``.
    preds : sized indexable
        ``preds[i]`` holds the y values (predicted y) for model ``i``;
        ``len(preds)`` sets the number of series drawn.

    A red diagonal from (50, 50) to (250, 250) marks perfect predictions
    inside that fixed range.
    """
    plt.style.use("fivethirtyeight")
    plt.figure(figsize=(10, 20))
    colormap = plt.cm.gist_rainbow
    # Axes.set_color_cycle is gone from current matplotlib; set_prop_cycle
    # is its replacement and takes the same list of colours.
    plt.gca().set_prop_cycle(
        color=[colormap(i) for i in np.linspace(0, 0.9, len(preds))]
    )
    plt.plot([50, 250], [50, 250], c="red", linewidth=2)
    for i in range(len(preds)):
        plt.scatter(targets[i], preds[i], marker="o", s=5, label="Model" + str(i + 1))
    plt.xlabel("Real y")
    plt.ylabel("Projected y")
    plt.legend()
    plt.show()

In [17]:
# Define function to minimize
def target_func(weights, targets_X_test, preds=None):
    """Return the negative R² of a weighted blend of model predictions.

    Parameters
    ----------
    weights : sequence of float
        One weight per model.
    targets_X_test : sequence
        Per-model target vectors. Only the first one is used as ground truth
        (assumes every model was scored against the same targets - TODO confirm).
    preds : sequence of array-like or 2-D array, optional
        Per-model predictions, one row per model. When omitted, the
        notebook-level ``preds_X_test`` is used, which keeps the old
        two-argument call working.

    Returns
    -------
    float
        ``-r2_score(targets_X_test[0], blend)``; negated so that a minimizer
        maximizes R².
    """
    if preds is None:
        # Legacy behaviour: read the predictions from the notebook global.
        preds = preds_X_test
    # (n_models,) . (n_models, n_samples) -> (n_samples,) weighted sum.
    final_pred = np.dot(weights, np.asarray(preds, dtype=float))
    return -r2_score(targets_X_test[0], final_pred)


# Optimize ensemble weights on held out fold
def optimize(preds_X_test, targets_X_test, n_restarts=100, seed=None):
    """Search blend weights that maximize R² using random-restart SLSQP.

    Parameters
    ----------
    preds_X_test : sequence of array-like
        Per-model predictions, all of the same length.
    targets_X_test : sequence
        Per-model target vectors, forwarded to ``target_func``.
    n_restarts : int, default 100
        Number of random starting points tried.
    seed : int, optional
        When given, starting points come from a private
        ``np.random.RandomState(seed)``. When omitted they come from NumPy's
        global generator, as before.

    Each weight is bounded to [0, 1]; no constraint is placed on their sum.

    Returns
    -------
    tuple
        ``(bestSC, bestWght)``: the best R² found and its weight vector. Both
        are also printed.

    Raises
    ------
    ValueError
        If ``n_restarts`` is smaller than 1.
    """
    if n_restarts < 1:
        raise ValueError("n_restarts must be at least 1")

    n_models = len(preds_X_test)
    # Build the prediction matrix once rather than on every objective call.
    pred_matrix = np.asarray(preds_X_test, dtype=float)
    rng = np.random if seed is None else np.random.RandomState(seed)

    # Our weights are bound between 0 and 1
    bounds = [(0, 1)] * n_models

    scores = []
    weights = []
    for _ in range(n_restarts):
        # Choose many random starting weights
        starting_values = rng.uniform(size=n_models)

        res = minimize(
            fun=target_func,
            x0=starting_values,
            # Pass the predictions explicitly so the objective scores the
            # data given to this function, not whatever global is in scope.
            args=(targets_X_test, pred_matrix),
            method="SLSQP",
            bounds=bounds,
            options={"maxiter": 10000},
        )

        scores.append(res["fun"])
        weights.append(res["x"])

    # Scores are negative R², so the smallest score is the best blend.
    bestSC = -np.min(scores)
    bestWght = weights[np.argmin(scores)]

    print("\n Ensemble Score: {}".format(bestSC))
    print("\n Best Weights: {}".format(bestWght))

    return bestSC, bestWght

<b>Script</b>

In [5]:
# Load the raw test set and keep its ID column as a NumPy array
init_test = pd.read_csv("raw_data/test.csv")
test_ids = init_test["ID"].values


In [6]:
# Build the Level 2 inputs from the saved Level 1 outputs, then preview
# the first and last rows of the out-of-fold predictions
level_one_preds, level_one_targets, level_one_test = get_level_one_data()
display(level_one_preds.head(), level_one_preds.tail())


Unnamed: 0,et_dc1_,et_dc2_,et_dc3_,et_dc4_,gb_dc1_,gb_dc2_,gb_dc3_,gb_dc4_,la_dc1_,la_dc2_,...,rf_dc3_,rf_dc4_,ri_dc1_,ri_dc2_,ri_dc3_,ri_dc4_,xg_dc1_,xg_dc2_,xg_dc3_,xg_dc4_
0,115.502,113.76,116.123,113.452,111.677,111.614,116.776,112.425,115.336,114.676,...,116.157,113.645,114.793,114.285,114.792,114.158,114.305,113.237,119.797,115.086
1,113.649,113.013,113.466,112.843,110.508,110.673,110.68,110.578,114.157,112.484,...,112.512,112.697,112.711,111.999,112.71,111.89,111.465,111.395,111.419,111.959
2,115.123,115.825,115.391,115.53,112.992,112.298,111.043,113.725,112.837,110.533,...,113.892,116.361,110.869,109.687,110.869,110.118,113.043,110.228,111.767,111.726
3,115.028,114.947,114.812,114.783,114.346,111.807,112.832,110.802,112.149,104.95,...,114.667,114.345,105.422,107.747,105.422,107.763,113.584,108.791,113.061,108.955
4,116.449,116.453,116.304,117.409,113.891,118.942,116.301,122.184,117.668,125.179,...,115.011,116.424,127.329,126.247,127.328,126.066,117.861,119.981,117.429,121.86


Unnamed: 0,et_dc1_,et_dc2_,et_dc3_,et_dc4_,gb_dc1_,gb_dc2_,gb_dc3_,gb_dc4_,la_dc1_,la_dc2_,...,rf_dc3_,rf_dc4_,ri_dc1_,ri_dc2_,ri_dc3_,ri_dc4_,xg_dc1_,xg_dc2_,xg_dc3_,xg_dc4_
3942,93.797,93.568,93.771,93.953,95.712,94.015,95.52,94.231,95.214,95.073,...,94.774,93.665,94.26,93.625,94.285,93.653,93.879,92.12,93.22,92.988
3943,93.583,93.462,93.467,93.677,94.436,93.882,93.321,94.028,94.72,95.068,...,93.601,93.453,93.233,93.302,93.302,93.322,93.009,92.061,91.666,92.552
3944,95.436,93.819,95.691,93.803,101.93,94.161,96.249,94.396,95.231,94.769,...,94.825,93.798,97.925,97.304,97.99,97.304,97.825,92.221,94.463,92.688
3945,93.306,93.719,93.515,93.704,94.122,95.563,93.862,94.708,94.896,94.744,...,93.296,93.759,93.175,94.175,93.27,94.19,94.368,94.019,93.023,94.31
3946,94.046,94.77,94.151,94.883,95.013,94.723,95.887,95.809,95.542,95.425,...,95.788,94.855,93.245,94.753,93.315,94.744,96.487,94.692,99.251,97.424


In [7]:
# Report the out-of-fold R² of every Level 1 model and collect its
# prediction / target columns as inputs for the weight search
preds_X_test, targets_X_test = [], []
for col in level_one_preds.columns:
    col_preds = level_one_preds[col]
    col_targets = level_one_targets[col]
    cv_score = r2_score(col_targets, col_preds)
    print("Global OOF r2_score for " + col + " : " + str(cv_score))
    preds_X_test.append(col_preds)
    targets_X_test.append(col_targets)
#plot_preds(targets_X_test, preds_X_test)


Global OOF r2_score for et_dc1_ : 0.573141923428
Global OOF r2_score for et_dc2_ : 0.570962983237
Global OOF r2_score for et_dc3_ : 0.571729325242
Global OOF r2_score for et_dc4_ : 0.56899946984
Global OOF r2_score for gb_dc1_ : 0.556668536736
Global OOF r2_score for gb_dc2_ : 0.559960120519
Global OOF r2_score for gb_dc3_ : 0.559796897974
Global OOF r2_score for gb_dc4_ : 0.555797134133
Global OOF r2_score for la_dc1_ : 0.588830724905
Global OOF r2_score for la_dc2_ : 0.555034568817
Global OOF r2_score for la_dc3_ : 0.588819290786
Global OOF r2_score for la_dc4_ : 0.551603743705
Global OOF r2_score for ll_dc1_ : 0.588807623522
Global OOF r2_score for ll_dc2_ : 0.54422420748
Global OOF r2_score for ll_dc3_ : 0.588857942044
Global OOF r2_score for ll_dc4_ : 0.552249064208
Global OOF r2_score for rf_dc1_ : 0.572794943196
Global OOF r2_score for rf_dc2_ : 0.570848219976
Global OOF r2_score for rf_dc3_ : 0.576381176152
Global OOF r2_score for rf_dc4_ : 0.568825836525
Global OOF r2_score fo

In [18]:
# Optimize weights
# optimize() prints and returns the best ensemble score and the weight
# vector that produced it (one weight per entry of preds_X_test).
bestSC, bestWght = optimize(preds_X_test, targets_X_test)


 Ensemble Score: 0.5904447198979512

 Best Weights: [  0.00000000e+00   0.00000000e+00   8.09700994e-19   0.00000000e+00
   0.00000000e+00   7.97705083e-18   4.88296622e-18   0.00000000e+00
   1.93222733e-01   1.95814000e-18   2.05002525e-01   0.00000000e+00
   2.08608861e-01   0.00000000e+00   2.13897839e-01   0.00000000e+00
   6.00906230e-19   9.21941119e-18   0.00000000e+00   4.88183499e-18
   2.48405292e-18   0.00000000e+00   1.66543051e-17   0.00000000e+00
   1.52043542e-02   1.67190288e-01   1.05514083e-17   9.51213412e-18]


In [None]:
# Prepare submission using optimal weights
# Weights are matched to test-prediction columns by position, so both must
# have the same length (and, presumably, the same model order - verify that
# preds/ and oof_preds/ list the same models).
assert len(bestWght) == level_one_test.shape[1], \
    "number of weights does not match number of test prediction columns"
preds = 0
for i in range(0, len(bestWght)) :
    # .iloc picks the i-th column by position; level_one_test[[i]] would look
    # up a column *labelled* i and fail, since the columns carry model names.
    # The list index [i] keeps the (n_rows, 1) shape.
    preds = preds + (bestWght[i] * level_one_test.iloc[:, [i]].values)

In [None]:
# Save predictions
# The file name records how many Level 1 models went into the blend.
file_name = "ens_preds/avg_" + str(level_one_preds.shape[1]) + ".csv"
# to_csv does not create missing folders, so make the output folder first.
if not os.path.isdir(os.path.dirname(file_name)) :
    os.makedirs(os.path.dirname(file_name))
pd.DataFrame({"ID": init_test.ID.values, "y": pd.DataFrame(preds)[0]}).to_csv(file_name, index = False)


In [None]:
# Inspect the blended test predictions as a single column (cell output only).
pd.DataFrame(preds)[0]

In [None]:
# NOTE(review): disabled code kept as a bare string literal, so running this
# cell only echoes the text. It loops over `models_OOF`, a name that no live
# cell in this notebook defines. Candidate for deletion.
'''# Get error of each OOF column
for i in range(len(models_OOF)) :
    cur_model_OOF = models_OOF[i]
    model_name = cur_model_OOF.columns[0][0:7]
    cv_score = r2_score(cur_model_OOF[[1]], cur_model_OOF[[0]])
    print("Global OOF r2_score for " + model_name + " : " + str(cv_score))
#plot_preds(targets_X_test, preds_X_test)'''


In [None]:
# NOTE(review): disabled alternative version of get_level_one_data, kept as a
# bare string literal (running this cell defines nothing). This variant pairs
# each model's OOF preds and targets in one frame and returns
# (models_OOF, level_one_test). Candidate for deletion.
'''# Stack all single Level 1 models OOF and test preds as entries for a Level 2 model
def get_level_one_data() :
    # Get level 1 preds and targets
    models_OOF = []
    i = 0
    preds_dir = "oof_preds/"
    targets_dir = "oof_targets/"
    for file_name in sorted(os.listdir(preds_dir)) :
        preds_OOF = pd.read_csv(preds_dir + file_name)
        targets_OOF = pd.read_csv(targets_dir + file_name.replace("preds", "targets"))
        cur_preds_OOF = pd.Series(preds_OOF.y, name = file_name[0:7])
        cur_targets_OOF = pd.Series(targets_OOF.y, name = file_name[0:7])
        cur_model_OOF = pd.DataFrame({file_name[0:7] + "_preds" : cur_preds_OOF, file_name[0:7] + "_targets" : cur_targets_OOF})
        models_OOF.append(cur_model_OOF)

    # Get test set preds
    level_one_test = pd.DataFrame()
    i = 0
    test_dir = "preds/"
    for file_name in sorted(os.listdir(test_dir)) :
        preds_test = pd.read_csv(test_dir + file_name)
        cur_preds_test = pd.Series(preds_test.y, name = file_name[0:7])
        level_one_test = pd.concat([level_one_test, cur_preds_test], axis = 1)
        i += 1
    display(level_one_test.head())
    
    return(models_OOF, level_one_test)'''