In [1]:
# Imports
import os
import pandas as pd
import numpy as np
from IPython.display import display
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold, train_test_split
from sklearn.ensemble import ExtraTreesRegressor

# Definitions
pd.set_option('display.float_format', lambda x: '%.3f' % x)
%matplotlib inline
FOLDS = 5
N_JOBS = 6
SEED = 2016

#DATA = "Base"
DATA = "Time"


In [2]:
# Read the cleaned feature tables for the selected variant, plus the targets
train_path = "clean_data/train_" + DATA + ".csv"
test_path = "clean_data/test_" + DATA + ".csv"
train = pd.read_csv(train_path)
test = pd.read_csv(test_path)
y = pd.read_csv("raw_data/y.csv")

# Report the dimensions of every table that was loaded
for label, frame in (("train", train), ("test", test), ("y", y)):
    print(label + " : " + str(frame.shape))

# Keep the identifiers of the test rows at hand
test_ids = test.ID

train : (448169, 43)
test : (300891, 43)
y : (448169, 2)


In [3]:
# Hold out 25% of the labelled rows for the final performance check
X_train, X_test, y_train, y_test = train_test_split(
    train,
    y,
    test_size = 0.25,
    random_state = SEED)

# Report the dimensions of each piece of the split
for label, part in (("X_train", X_train), ("X_test", X_test),
                    ("y_train", y_train), ("y_test", y_test)):
    print(label + ".shape : " + str(part.shape))

X_train.shape : (336126, 43)
X_test.shape : (112043, 43)
y_train.shape : (336126, 2)
y_test.shape : (112043, 2)


In [4]:
# Write the hold-out IDs and targets to disk for later ensembling
name = "clean_data/y_test.csv"
y_test_out = y_test[["ID", "TARGET"]]
y_test_out.to_csv(name, index = False)

In [5]:
# Generate y_test predictions to be able to optimize ensemble weights.
# For each fold, a model is fit on the remaining folds of X_train; it then
# predicts the fold's own validation rows (out-of-fold predictions) and the
# whole hold-out set X_test. Hold-out predictions are averaged over the folds.
# NOTE(review): X_train/X_test keep every column of train. test has an ID column
# and train has the same column count, so ID is presumably used as a feature here
# — confirm that this is intended.
preds_y_train = np.zeros((X_train.shape[0],))
preds_y_testX = np.empty((FOLDS, X_test.shape[0]))
kf = KFold(n_splits = FOLDS, shuffle = True, random_state = SEED)
for i, (train_index, val_index) in enumerate(kf.split(X_train)):
    print("FOLD : " + str(i + 1))
    X_train_fold, X_val_fold = X_train.iloc[train_index], X_train.iloc[val_index]
    y_train_fold = y_train.TARGET.iloc[train_index]

    # Train model.
    # The split criterion is left at its default, which is mean squared error in
    # every scikit-learn release; the explicit "mse" spelling is rejected by
    # recent releases (it was renamed to "squared_error").
    et = ExtraTreesRegressor(
        n_estimators = 200,
        max_features = 0.5,
        max_depth = 10,
        min_samples_split = 10,
        min_samples_leaf = 5,
        bootstrap = True,
        n_jobs = N_JOBS,
        random_state = (SEED * i))  # a different seed per fold (0 for the first one)
    et.fit(X_train_fold, y_train_fold)

    # Save y_train and y_test predictions
    preds_y_train[val_index] = et.predict(X_val_fold)  # each row is filled by exactly one fold
    preds_y_testX[i, :] = et.predict(X_test)
    print("-----")

# Average predictions on test set
preds_y_test = preds_y_testX.mean(axis = 0)

FOLD : 1
-----
FOLD : 2
-----
FOLD : 3
-----
FOLD : 4
-----
FOLD : 5
-----


In [6]:
# Score the out-of-fold predictions against the training targets
oof_mse = mean_squared_error(y_train.TARGET, preds_y_train)
print("Average ExtraTreesRegressor MSE using OOF predictions : " + str(oof_mse))

# Score the fold-averaged predictions against the hold-out targets
holdout_mse = mean_squared_error(y_test.TARGET, preds_y_test)
print("Average ExtraTreesRegressor MSE on last hold-out fold : " + str(holdout_mse))

Average ExtraTreesRegressor MSE using OOF predictions : 100.645122382
Average ExtraTreesRegressor MSE on last hold-out fold : 102.917850957


In [7]:
# Now generate predictions on whole test set, to be used in ensemble for submissions.
# One model is fit per fold on the training part of that fold (taken from the
# full train table); each model predicts the whole test table and the per-fold
# predictions are averaged. The validation part of each fold is not used here.
preds_testX = np.empty((FOLDS, test.shape[0]))
kf = KFold(n_splits = FOLDS, shuffle = True, random_state = SEED)
for i, (train_index, val_index) in enumerate(kf.split(train)):
    print("FOLD : " + str(i + 1))
    X_train_fold = train.iloc[train_index]
    y_train_fold = y.TARGET.iloc[train_index]

    # Train model.
    # The split criterion is left at its default, which is mean squared error in
    # every scikit-learn release; the explicit "mse" spelling is rejected by
    # recent releases (it was renamed to "squared_error").
    et = ExtraTreesRegressor(
        n_estimators = 200,
        max_features = 0.5,
        max_depth = 10,
        min_samples_split = 10,
        min_samples_leaf = 5,
        bootstrap = True,
        n_jobs = N_JOBS,
        random_state = (SEED * i))  # a different seed per fold (0 for the first one)
    et.fit(X_train_fold, y_train_fold)

    # Save test set predictions
    preds_testX[i, :] = et.predict(test)
    print("-----")

# Average predictions on test set
preds_test = preds_testX.mean(axis = 0)

FOLD : 1
-----
FOLD : 2
-----
FOLD : 3
-----
FOLD : 4
-----
FOLD : 5
-----


In [8]:
# Save predictions: one CSV for the hold-out set (used to tune ensemble weights)
# and one for the full test set (used for submissions), both as ID/TARGET pairs.
y_test_name = "y_test_preds/ExtraTrees_" + DATA + ".csv"
test_name = "test_preds/ExtraTrees_" + DATA + ".csv"

# Create the output folders when they are missing, so that the results of the
# long training cells are not lost to a failed write.
for out_path in (y_test_name, test_name):
    out_dir = os.path.dirname(out_path)
    if not os.path.isdir(out_dir):
        os.makedirs(out_dir)

# index = False keeps the pandas row index out of the files
pd.DataFrame({"ID": y_test.ID, "TARGET": preds_y_test}, columns = ["ID", "TARGET"]).to_csv(y_test_name, index = False)
pd.DataFrame({"ID": test.ID, "TARGET": preds_test}, columns = ["ID", "TARGET"]).to_csv(test_name, index = False)