In [1]:
# Imports
import pandas as pd
import numpy as np
import datetime
import warnings
from scipy.optimize import minimize
from sklearn.metrics import r2_score
from IPython.display import display

# Definitions
pd.set_option('display.float_format', lambda x: '%.3f' % x)
%matplotlib inline
warnings.simplefilter('ignore')
N_JOBS = -1
SEED = 2017

In [2]:
# Define function to minimize
def target_func(weights, preds=None, y_true=None):
    """Return the negative R^2 of a weighted blend of model predictions.

    The sign is flipped because ``scipy.optimize.minimize`` minimizes its
    objective, whereas a higher R^2 is better.

    Parameters
    ----------
    weights : sequence of float
        One blending weight per model.
    preds : sequence of array-like, optional
        Per-model predictions, in the same order as ``weights``.
        Defaults to the notebook-level ``preds_y_val``.
    y_true : array-like, optional
        Ground-truth targets the blend is scored against.
        Defaults to the notebook-level ``y_val``.

    Returns
    -------
    float
        ``-r2_score(y_true, sum_i weights[i] * preds[i])``.

    Raises
    ------
    ValueError
        If the number of weights differs from the number of prediction sets.
    """
    # Fall back to the notebook globals so plain ``target_func(weights)``
    # calls keep working exactly as before.
    if preds is None:
        preds = preds_y_val
    if y_true is None:
        y_true = y_val

    # zip() would silently drop the surplus entries of the longer sequence.
    if len(weights) != len(preds):
        raise ValueError(
            "got {} weights for {} prediction sets".format(len(weights), len(preds)))

    final_prediction = sum(
        weight * prediction for weight, prediction in zip(weights, preds))
    return -r2_score(y_true, final_prediction)


# Optimize ensemble weights on held out fold
def optimize(preds_y_val, names, y_true=None, n_starts=100, seed=None):
    """Find blending weights in [0, 1] that maximize R^2 on a held-out fold.

    SLSQP is run from ``n_starts`` random starting points and the best result
    is kept. The weights are only box-constrained; they are not forced to sum
    to one.

    Parameters
    ----------
    preds_y_val : sequence of array-like
        Per-model predictions on the held-out fold. These are the predictions
        that get scored (they are handed to ``target_func`` explicitly).
    names : list
        Labels for the models; only printed and returned unchanged.
    y_true : array-like, optional
        Held-out targets. Defaults to the notebook-level ``y_val``.
    n_starts : int, optional
        Number of random restarts (default 100).
    seed : int, optional
        Seed for the restart RNG. Defaults to the notebook-level ``SEED`` so
        repeated runs give the same weights.

    Returns
    -------
    tuple
        ``(bestSC, bestWght, names)``: the best R^2 found, the weight vector
        that achieved it, and ``names`` as passed in.

    Raises
    ------
    ValueError
        If ``preds_y_val`` is empty or ``n_starts`` is smaller than 1.
    """
    n_models = len(preds_y_val)
    if n_models == 0:
        raise ValueError("preds_y_val must contain at least one set of predictions")
    if n_starts < 1:
        raise ValueError("n_starts must be at least 1")
    if y_true is None:
        y_true = y_val

    # A local generator keeps the restarts reproducible without touching
    # NumPy's global random state.
    rng = np.random.RandomState(SEED if seed is None else seed)

    # Our weights are bound between 0 and 1 (identical for every restart).
    bounds = [(0, 1)] * n_models

    scores = []
    weights = []
    for _ in range(n_starts):
        # Choose many random starting weights
        starting_values = rng.uniform(size=n_models)

        res = minimize(
            target_func,
            starting_values,
            # Score the predictions passed to this function, not the globals.
            args=(preds_y_val, y_true),
            method="SLSQP",
            bounds=bounds,
            options={"maxiter": 10000})

        scores.append(res["fun"])
        weights.append(res["x"])

    # The objective is -R^2, so the smallest value is the best blend.
    best_idx = int(np.argmin(scores))
    bestSC = -scores[best_idx]
    bestWght = weights[best_idx]

    print("\n Ensemble Score: {}".format(bestSC))
    print("\n Best Weights: {}".format(bestWght))
    print("\n Names: {}".format(names))

    return (bestSC, bestWght, names)

<b>Script</b>

In [3]:
# Get data
# NOTE(review): `test` is not referenced again in the cells visible here — confirm it is still needed.
test = pd.read_csv("clean_data/test.csv")
# Read without a header row; column 0 is later used as the "ID" column of the submission.
test_ids = pd.read_csv("clean_data/test_ids.csv", header = None)


In [4]:
# Get validation set data
# Held-out targets (read without a header row).
y_val = np.array(pd.read_csv("clean_data/y_val.csv", header = None))

# "la" model: held-out features and its predictions on them.
la_X_val = np.array(pd.read_csv("clean_data/la_X_val.csv"))
la_preds_val = np.array(pd.read_csv("preds/la_preds_val.csv", header = None))

# "et" model: held-out features and its predictions on them.
et_X_val = np.array(pd.read_csv("clean_data/et_X_val.csv"))
et_preds_val = np.array(pd.read_csv("preds/et_preds_val.csv", header = None))

# Predictions and their labels, kept in the same (la, et) order.
preds_y_val = [la_preds_val, et_preds_val]
names = ["la_preds_val", "et_preds_val"]


In [5]:
# Remind scores of single models on validation set
# R^2 of each individual model on the held-out fold, as a baseline for the blended score.
print("la on held out fold : " + str(r2_score(y_val, la_preds_val)))
print("et on held out fold : " + str(r2_score(y_val, et_preds_val)))


la on held out fold : 0.464321084613
et on held out fold : 0.453500241938


In [6]:
# Optimize weights
# Search for the blending weights that give the best R^2 on the held-out fold.
# `names` is only printed and handed back by optimize(), so it is rebound to the same list.
bestSC, bestWght, names = optimize(preds_y_val, names)


 Ensemble Score: 0.4651836947681909

 Best Weights: [ 1.          0.00391728]

 Names: ['la_preds_val', 'et_preds_val']


In [7]:
# Load predictions on whole test set
la_preds = pd.read_csv("preds/la_simple.csv")
et_preds = pd.read_csv("preds/et_simple.csv")

# Keep the (la, et) order: the blending step pairs preds_test[i] with bestWght[i].
preds_test = [la_preds.y, et_preds.y]


In [8]:
# Prepare submission using optimal weights
# Start from zeros (one entry per test row) and accumulate each model's
# predictions scaled by its optimized weight.
preds = np.zeros(len(la_preds))
for i in range(len(bestWght)):
    preds += bestWght[i] * preds_test[i]

In [9]:
# Save predictions
# Write the blended predictions as a CSV with columns "ID" (column 0 of test_ids) and "y";
# index = False keeps the DataFrame index out of the file.
pd.DataFrame({"ID": test_ids[0], "y": preds}).to_csv("preds/blend.csv", index = False)
