In [1]:
# Imports
import pandas as pd
import numpy as np
from IPython.display import HTML, display
import warnings
import os
import time
from sklearn.model_selection import KFold
from sklearn.linear_model import LassoCV, LassoLarsCV, RidgeCV
from sklearn.ensemble import ExtraTreesRegressor, RandomForestRegressor, GradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import r2_score
from xgboost import XGBRegressor, DMatrix

# Visualization options
pd.set_option('display.float_format', lambda x: '%.5f' % x)
pd.set_option('display.max_columns', None)
%matplotlib inline
display(HTML("<style>.container { width: 90% !important; }</style>"))
warnings.filterwarnings("ignore")

# Global variables
SEED = 420
N_JOBS = 18
FOLDS = 10



In [2]:
# Stack all single Level 1 models OOF and test preds as entries for a Level 2 model
def create_level2_inputs(OOF_targets) :
    level_one_OOF = pd.DataFrame()
    i = 0
    root_dir = "preds_OOF/"
    for file_name in os.listdir(root_dir) :
        preds_OOF = pd.read_csv(root_dir + file_name)
        cur_preds_OOF = pd.Series(preds_OOF.y, name = str(i))
        level_one_OOF = pd.concat([level_one_OOF, cur_preds_OOF], axis = 1)
        i += 1
    display(level_one_OOF.head())

    # Same for test set preds
    level_one_test = pd.DataFrame()
    i = 0
    root_dir = "preds/"
    for file_name in os.listdir(root_dir) :
        preds_test = pd.read_csv(root_dir + file_name)
        cur_preds_test = pd.Series(preds_test.y, name = str(i))
        level_one_test = pd.concat([level_one_test, cur_preds_test], axis = 1)
        i += 1
    display(level_one_test.head())
    
    return(level_one_OOF, level_one_test)

In [3]:
def get_cv_score(model, X_train, X_train_y) :
    oof_preds = pd.DataFrame()
    oof_targets = pd.DataFrame()
    kf = KFold(n_splits = FOLDS, shuffle = True, random_state = SEED)
    for i, (train_index, test_index) in enumerate(kf.split(X_train)):
        #print("FOLD " + str(i + 1))
        X_train_fold, X_test_fold = X_train.iloc[train_index], X_train.iloc[test_index]
        y_train_fold, y_test_fold = X_train_y.iloc[train_index], X_train_y.iloc[test_index]
        
        # Fit and predict
        model.fit(X_train_fold, y_train_fold.y)
        preds_X_test_fold = model.predict(X_test_fold)
        
        # Assemble OOF predictions and targets
        oof_preds = pd.concat([pd.Series(oof_preds), pd.Series(preds_X_test_fold)], axis = 0)
        oof_targets = pd.concat([pd.Series(oof_targets), pd.Series(y_test_fold.y)], axis = 0)
        
    # Compute error on concatenated OOF predictions
    cv_score = r2_score(oof_targets, oof_preds)
    print("Global OOF r2_score : " + str(cv_score))
                    
    return(cv_score)

In [5]:
# Iterate on each available model definition
def run_level2(models, level_one_OOF, level_one_test, OOF_targets, test_ids) :
    for model_name, model  in models.items() :
        print("***** MODEL : " + model_name + " *****")

        # Get CV score
        cv_score = get_cv_score(model, level_one_OOF, OOF_targets)

        # Fit, predict
        model.fit(level_one_OOF, OOF_targets.y)
        preds_test = model.predict(level_one_test)

        # Save preds
        if isinstance(model, (RidgeCV, KNeighborsRegressor)) :
            # Handle format problems
            preds_test = pd.DataFrame(preds_test).iloc[:, 0].values            
        file_name = "preds/level2_" + model_name + "_preds_test.csv"
        pd.DataFrame({"ID": test_ids, "y": preds_test}).to_csv(file_name, index = False)


In [6]:
# Define models
models = {
    "la" : LassoCV(eps = 0.0001, 
                   n_alphas = 100, 
                   max_iter = 10000, 
                   tol = 0.0001,                     
                   normalize = True, 
                   precompute = True, 
                   random_state = SEED,
                   n_jobs = N_JOBS),
    "ll" : LassoLarsCV(max_n_alphas = 1000, 
                       max_iter = 10000,
                       normalize = True, 
                       precompute = True, 
                       n_jobs = N_JOBS),
    "ri" : RidgeCV(alphas = [0.0001, 0.0003, 0.0006, 0.001, 0.003, 0.006, 0.01, 0.03, 0.06, 0.1, 0.3, 0.6, 1, 3, 6],
                   normalize = True),
    "rf" : RandomForestRegressor(n_estimators = 200,
                                 criterion = "mse", 
                                 max_features = 0.75,
                                 max_depth = 8,
                                 min_samples_split = 5,
                                 min_samples_leaf = 2, 
                                 bootstrap = True, 
                                 n_jobs = N_JOBS, 
                                 random_state = SEED),
    "et" : ExtraTreesRegressor(n_estimators = 200,
                               criterion = "mse", 
                               max_features = 0.75,
                               max_depth = 8,
                               min_samples_split = 5,
                               min_samples_leaf = 2, 
                               bootstrap = True, 
                               n_jobs = N_JOBS, 
                               random_state = SEED), 
#    "kn" : KNeighborsRegressor(n_neighbors = 100, 
#                               weights = "distance", 
#                               p = 3, 
#                               n_jobs = N_JOBS), 
    "gb" : GradientBoostingRegressor(loss = "ls", 
                                     learning_rate = 0.03, 
                                     n_estimators = 200, 
                                     max_depth = 8,
                                     criterion = "friedman_mse",
                                     min_samples_split = 5,
                                     min_samples_leaf = 2,
                                     subsample = 0.75,
                                     max_features = 0.75, 
                                     random_state = SEED),
    "xg" : XGBRegressor(objective = "reg:linear", 
                        learning_rate = 0.03, 
                        n_estimators = 200, 
                        max_depth = 8,
                        min_child_weight = 2, 
                        subsample = 0.75, 
                        colsample_bytree = 0.75, 
                        colsample_bylevel = 0.75, 
                        nthread = N_JOBS,
                        seed = SEED)
}

<b>Script</b>

In [7]:
# Get data
OOF_targets = pd.read_csv("clean_data/OOF_targets.csv")
init_test = pd.read_csv("raw_data/test.csv")
test_ids = init_test.ID.values

In [8]:
# Create Level 2 inputs
level_one_OOF, level_one_test = create_level2_inputs(OOF_targets)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35
0,93.6746,93.81825,93.96026,93.74107,93.26511,93.52203,94.0187,92.72121,93.81505,93.33845,92.88982,93.04055,93.10428,93.37152,94.44538,93.94301,94.05792,93.79208,93.4595,93.51313,93.9891,93.87443,94.09961,93.95643,93.8139,92.78046,93.65997,94.15888,94.3274,94.16001,93.90604,94.03901,93.95624,93.9301,93.22628,94.1307
1,93.83344,93.74126,93.77025,93.75308,93.56305,93.13467,93.98182,93.65674,93.76341,92.74764,93.3237,93.66253,92.93102,92.98257,93.42953,93.789,93.67267,93.69734,93.71542,93.6984,94.00883,93.70994,93.96369,93.16937,94.83524,92.84693,93.81298,93.87815,95.13277,93.94887,93.73386,93.98563,93.98735,93.98229,93.41761,94.04106
2,93.38206,93.20597,93.26905,93.30804,91.43476,90.87791,93.70637,91.53871,93.14238,90.92743,90.5965,92.00553,91.35686,90.59,91.63435,93.26812,92.99599,93.29072,91.28838,92.55515,93.37028,91.19312,93.69737,91.24696,92.16187,91.77141,93.46975,93.49873,91.90289,93.49205,91.26245,93.52197,93.74961,93.44586,91.79105,93.51236
3,93.76958,93.76225,94.01277,94.01875,92.3748,92.05013,93.97784,92.62753,93.86256,92.13073,92.50835,91.78621,91.43083,91.40775,92.14256,93.98194,94.01908,93.88226,92.0006,92.6297,94.14469,92.21547,94.2456,91.9487,91.54463,91.81743,93.71504,94.26389,92.80727,94.36871,92.69655,94.2147,93.96872,94.22211,91.88335,94.14083
4,93.89437,94.32132,94.28809,94.18326,93.48251,93.16425,93.94438,93.28031,94.24012,93.57586,93.45882,93.77821,93.67146,93.09625,94.12957,94.32613,94.29537,94.18855,95.13079,94.0937,94.37584,93.67853,94.52297,93.23581,94.33716,93.14136,93.73709,94.39366,93.85577,94.45496,95.40146,94.50362,94.0142,94.41219,93.32481,94.43746


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42
0,78.36535,77.75277,77.14045,78.88329,78.49767,78.62871,77.8049,78.54425,77.26158,77.82254,78.62006,77.37821,77.87097,78.50745,78.49454,79.32145,79.26853,78.05093,78.3467,79.37068,78.28136,78.56267,77.93052,77.84616,77.69973,78.67789,77.23478,79.33321,78.16213,78.68913,78.71459,78.59412,77.67253,78.54895,78.54318,78.84533,79.27412,77.29384,78.99247,78.62371,78.00424,79.07819,78.6642
1,93.90436,93.87084,102.82574,96.54262,98.35094,94.01716,98.13713,93.89159,97.95395,96.35355,93.54512,100.73109,93.50914,110.46455,101.33544,97.24693,98.87353,96.0618,93.75373,98.29589,95.92467,93.56262,95.88253,93.57916,93.45908,93.5633,98.20374,98.56641,100.69314,93.53709,93.66931,100.8251,93.96683,93.96162,93.9625,97.20661,98.86199,95.79249,97.11054,93.95095,94.1575,94.01653,93.71397
2,77.50995,77.41967,78.23803,79.92087,81.11543,77.25984,79.04567,77.19856,80.40413,79.41566,77.29325,76.60947,77.38579,80.03847,81.63665,80.60226,79.00691,79.5914,77.60058,81.82232,78.813,77.50838,78.74232,77.3133,78.68549,77.23597,84.19683,80.43436,80.09332,77.29983,77.29233,81.24863,77.47751,77.46582,77.53913,79.94559,79.00222,79.60608,78.00878,77.18548,78.21062,77.16573,77.31384
3,78.11143,77.43426,77.89266,77.97121,77.9203,78.33085,78.32188,78.56101,77.67316,77.06278,78.12016,77.83444,77.28125,78.72546,78.25806,78.96616,79.19504,78.05206,77.97478,78.90921,77.67496,78.30639,77.5855,77.21711,77.4245,78.61697,77.56924,77.84442,78.66136,78.74078,78.13379,78.75278,77.40864,78.16416,78.22572,80.62379,79.19298,77.97347,78.8641,78.58316,77.87758,78.46633,78.23966
4,112.93191,112.70653,116.75742,115.36929,115.59367,112.79273,116.47668,113.32762,114.58219,114.4162,112.90066,114.08283,112.55397,116.55001,115.19833,115.29312,115.03568,112.63621,112.55807,117.58625,114.70511,112.69684,114.06457,112.31394,113.45873,112.60397,118.06808,115.59103,116.05825,112.99174,112.7831,115.23351,112.7204,112.90677,113.00354,114.95364,115.02289,115.22794,113.8122,113.12334,114.16381,113.10343,112.99319


In [9]:
# Iterate on each available model definition
run_level2(models, level_one_OOF, level_one_test, OOF_targets, test_ids)

***** MODEL : rf *****
Global OOF r2_score : 0.556854370732


ValueError: Number of features of the model must match the input. Model n_features is 36 and input n_features is 43 