In [1]:
# Imports
import os
import pandas as pd
import numpy as np
import datetime
import warnings
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.optimize import minimize
from sklearn.metrics import r2_score
from IPython.display import HTML, display

# Definitions
pd.set_option('display.float_format', lambda x: '%.3f' % x)
%matplotlib inline
display(HTML("<style>.container { width: 90% !important; }</style>"))
warnings.simplefilter('ignore')
N_JOBS = -1
SEED = 2017

In [2]:
# Stack all single Level 1 models OOF and test preds as entries for a Level 2 model
def get_level_one_data() :
    level_one_preds = pd.DataFrame()
    i = 0
    root_dir = "oof_preds/"
    for file_name in sorted(os.listdir(root_dir)) :
        preds_OOF = pd.read_csv(root_dir + file_name)
        cur_preds_OOF = pd.Series(preds_OOF.y, name = file_name[0:7])
        level_one_preds = pd.concat([level_one_preds, cur_preds_OOF], axis = 1)
        i += 1
    display(level_one_preds.head())

    level_one_targets = pd.DataFrame()
    i = 0
    root_dir = "oof_targets/"
    for file_name in sorted(os.listdir(root_dir)) :
        targets_OOF = pd.read_csv(root_dir + file_name)
        cur_targets_OOF = pd.Series(targets_OOF.y, name = file_name[0:7])
        level_one_targets = pd.concat([level_one_targets, cur_targets_OOF], axis = 1)
        i += 1
    display(level_one_targets.head())

    # Same for test set preds
    level_one_test = pd.DataFrame()
    i = 0
    root_dir = "preds/"
    for file_name in sorted(os.listdir(root_dir)) :
        preds_test = pd.read_csv(root_dir + file_name)
        cur_preds_test = pd.Series(preds_test.y, name = file_name[0:7])
        level_one_test = pd.concat([level_one_test, cur_preds_test], axis = 1)
        i += 1
    display(level_one_test.head())
    
    return(level_one_preds, level_one_targets, level_one_test)

In [3]:
# Plot predictions
def plot_preds(targets, preds) :
    plt.style.use("fivethirtyeight")
    plt.figure(figsize=(10, 4))
    colormap = plt.cm.gist_rainbow
    plt.gca().set_color_cycle([colormap(i) for i in np.linspace(0, 0.9, len(preds))])
    plt.plot([50, 250], [50, 250], c = "red", linewidth = 2)
    for i in range(len(preds)) :        
        plt.scatter(targets[i], preds[i], marker = "o", s = 5, label = "Model" + str(i + 1))
    plt.xlabel("Real y")
    plt.ylabel("Projected y")
    plt.legend()
    plt.show()

In [4]:
# Define function to minimize
def target_func(weights) :
    final_pred = 0
    for weight, pred in zip(weights, preds_X_test):
        final_pred += weight * pred
    return(-r2_score(targets_X_test[0], final_pred))


# Optimize ensemble weights on held out fold
def optimize(preds_X_test) :
    scores = []
    weights = []
    for i in range(100):
        # Choose many random starting weights
        starting_values = np.random.uniform(size = len(preds_X_test))

        # Our weights are bound between 0 and 1
        bounds = [(0, 1)] * len(preds_X_test)

        res = minimize(
            target_func, 
            starting_values, 
            method = "SLSQP", 
            bounds = bounds, 
            options = {"maxiter" : 10000})

        scores.append(res["fun"])
        weights.append(res["x"])

    bestSC = -np.min(scores)
    bestWght = weights[np.argmin(scores)]

    print("\n Ensemble Score: {}".format(bestSC))
    print("\n Best Weights: {}".format(bestWght))

    return(bestSC, bestWght)

<b>Script</b>

In [5]:
# Get data
init_test = pd.read_csv("raw_data/test.csv")
test_ids = init_test.ID.values


In [6]:
# Create Level 2 inputs
level_one_preds, level_one_targets, level_one_test = get_level_one_data()

Unnamed: 0,et_dc23,et_dc24,et_dc27,et_dc28,xg_dc23,xg_dc24,xg_dc27,xg_dc28
0,116.226,113.476,115.581,113.427,116.837,115.885,117.329,115.643
1,115.1,112.909,114.666,112.715,112.731,111.803,112.834,111.764
2,115.158,116.418,115.223,116.386,111.913,111.069,111.245,112.022
3,114.757,113.759,114.118,114.348,112.205,109.941,110.541,109.916
4,116.92,116.723,116.924,116.564,120.582,121.36,118.645,122.208


Unnamed: 0,et_dc15,et_dc16,et_dc17,et_dc18,et_dc19,et_dc20,et_dc21,et_dc22,et_dc23,et_dc24,...,xg_dc19,xg_dc20,xg_dc21,xg_dc22,xg_dc23,xg_dc24,xg_dc25,xg_dc26,xg_dc27,xg_dc28
0,99.15,99.15,99.15,99.15,99.15,99.15,119.01,119.01,119.01,119.01,...,99.15,99.15,119.01,119.01,119.01,119.01,119.01,119.01,119.01,119.01
1,91.98,91.98,91.98,91.98,91.98,91.98,120.12,120.12,120.12,120.12,...,91.98,91.98,120.12,120.12,120.12,120.12,120.12,120.12,120.12,120.12
2,91.52,91.52,91.52,91.52,91.52,91.52,108.06,108.06,108.06,108.06,...,91.52,91.52,108.06,108.06,108.06,108.06,108.06,108.06,108.06,108.06
3,91.57,91.57,91.57,91.57,91.57,91.57,111.38,111.38,111.38,111.38,...,91.57,91.57,111.38,111.38,111.38,111.38,111.38,111.38,111.38,111.38
4,90.11,90.11,90.11,90.11,90.11,90.11,113.95,113.95,113.95,113.95,...,90.11,90.11,113.95,113.95,113.95,113.95,113.95,113.95,113.95,113.95


Unnamed: 0,et_dc23,et_dc24,et_dc27,et_dc28,xg_dc23,xg_dc24,xg_dc27,xg_dc28
0,75.624,76.583,75.744,76.339,74.801,76.983,75.344,76.758
1,93.606,93.446,93.692,93.413,94.787,92.791,93.038,92.669
2,77.253,76.937,76.822,76.83,76.811,78.136,76.838,78.243
3,77.549,77.312,77.252,77.241,76.055,76.763,76.16,76.928
4,112.24,112.272,112.251,112.175,110.074,112.383,110.646,112.311


In [7]:
# Get error of each OOF column
preds_X_test = []
targets_X_test = []
for col in level_one_preds.columns :
    cv_score = r2_score(level_one_targets[col], level_one_preds[col])
    print("Global OOF r2_score for " + col + " : " + str(cv_score))
    preds_X_test.append(level_one_preds[col])
    targets_X_test.append(level_one_targets[col])
plot_preds(targets_X_test, preds_X_test)


ValueError: Found input variables with inconsistent numbers of samples: [4249, 3947]

In [None]:
# Optimize weights
bestSC, bestWght = optimize(preds_X_test)

In [None]:
# Prepare submission using optimal weights
preds = 0
for i in range(0, len(bestWght)) :
    preds = preds + (bestWght[i] * level_one_test[[i]].values)

In [None]:
# Save predictions
file_name = "ens_preds/avg_" + str(level_one_preds.shape[1]) + ".csv"
pd.DataFrame({"ID": init_test.ID.values, "y": pd.DataFrame(preds)[0]}).to_csv(file_name, index = False)


In [None]:
pd.DataFrame(preds)[0]