In [1]:
import pandas as pd
import numpy as np

from sklearn.linear_model import LinearRegression, Ridge, SGDRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.kernel_ridge import KernelRidge
from sklearn.model_selection import GridSearchCV

import xgboost
import time
import math
import random

from model_wrapper import (
    SKLearnModelWrapper,
    ModelWrapper
)

In [10]:
def write_to_file(filename, predictions):
    """Dump predictions to ``filename`` as a two-column CSV.

    The file begins with the header line ``Id,Prediction``; each following
    line holds the 1-based position of a prediction and its ``str()`` form.
    Any existing file at ``filename`` is overwritten.
    """
    with open(filename, "w") as out:
        out.write("Id,Prediction\n")
        # Ids are 1-based, hence start=1.
        out.writelines(
            "{},{}\n".format(row_id, str(value))
            for row_id, value in enumerate(predictions, start=1)
        )

class BoostRegression(SKLearnModelWrapper):
    """XGBoost regressor exposed through the ``SKLearnModelWrapper`` interface.

    Modified to take the training data (``X``, ``Y``) when initializing; the
    data is stored on the instance as ``self.X`` and ``self.Y``.
    """

    def __init__(self, X, Y, *args, debug=False, **kwargs):
        """Store the training data on the instance.

        Args:
            X: Training features; must support ``[:n]`` slicing.
            Y: Training targets; must support ``[:n]`` slicing.
            *args: Accepted but not used by this class.
            debug: When true, keep only the first 10 rows of ``X`` and ``Y``
                so experiments run on a tiny subset of the data.
            **kwargs: Accepted but not used by this class.
        """
        # NOTE(review): the base-class __init__ is not invoked (unchanged from
        # the previous version) -- confirm the base wrapper needs no set-up.
        if debug:
            # Truncate X and Y together. Previously only X was shortened,
            # leaving features and targets with different lengths.
            X = X[:10]
            Y = Y[:10]
        self.X = X
        self.Y = Y

    def get_model(self):
        """Return a new, unfitted ``xgboost.XGBRegressor`` (1000 trees, depth 6)."""
        # Alternative kept for reference (hyper-parameter grid search):
        #   GridSearchCV(xgboost.XGBRegressor(n_estimators=100, max_depth=7), cv=5,
        #                param_grid={"n_estimators": [50, 100, 150],
        #                            "max_depth": [3, 7, 10]})
        return xgboost.XGBRegressor(n_estimators=1000, max_depth=6)
    
class KernelRidgeRegression(SKLearnModelWrapper):
    """Kernel ridge regressor exposed through the ``SKLearnModelWrapper`` interface.

    Modified to take the training data (``X``, ``Y``) when initializing; the
    data is stored on the instance as ``self.X`` and ``self.Y``.
    """

    def __init__(self, X, Y, *args, debug=False, **kwargs):
        """Store the training data on the instance.

        Args:
            X: Training features; must support ``[:n]`` slicing.
            Y: Training targets; must support ``[:n]`` slicing.
            *args: Accepted but not used by this class.
            debug: When true, keep only the first 10 rows of ``X`` and ``Y``
                so experiments run on a tiny subset of the data.
            **kwargs: Accepted but not used by this class.
        """
        # NOTE(review): the base-class __init__ is not invoked (unchanged from
        # the previous version) -- confirm the base wrapper needs no set-up.
        if debug:
            # Truncate X and Y together. Previously only X was shortened,
            # leaving features and targets with different lengths.
            X = X[:10]
            Y = Y[:10]
        self.X = X
        self.Y = Y

    def get_model(self):
        """Return a new, unfitted ``KernelRidge`` with a Laplacian kernel."""
        return KernelRidge(kernel='laplacian', alpha=1e-2, gamma=.001)

In [None]:
##########################
## SET UP TRAINING DATA ##
##########################

# Load the merged training table from disk.
df_train = pd.read_csv("merged_train.csv")

# Features: the full value matrix with its last column removed.
# NOTE(review): this presumes the last column is the target `gap` -- confirm.
X_train = df_train.values[:, :-1]

# Target: the `gap` column as a plain array.
Y_train = df_train["gap"].values

In [3]:
# Train the XGBoost wrapper on the training arrays and print the elapsed
# time in seconds.
XGB = BoostRegression(X_train, Y_train)

# perf_counter() is monotonic, so the measured duration cannot be skewed by
# system clock adjustments during a long fit (time.time() can be).
t0 = time.perf_counter()
XGB.build_model()
print(time.perf_counter() - t0)


KeyboardInterrupt: 

In [None]:
######################
## SET UP TEST DATA ##
######################

# Load the merged test table from disk.
df_test = pd.read_csv("merged_test.csv")

# The test columns must equal the training columns (minus the final training
# column), in the same order.
assert list(df_train.columns[:-1]) == list(df_test.columns)

In [None]:
# Extract the test table as a plain array; every column is passed to the model.
X_test = df_test.values

# Predict with XGB, the wrapper created (and built) in the training cell.
XGB_pred = XGB.predict(X_test)

In [None]:
# Save the XGBoost predictions as an `Id,Prediction` CSV via write_to_file.
write_to_file('xgb_preds_tuned.csv', XGB_pred)

In [4]:
## LEARNING CURVE ###
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.model_selection import learning_curve

In [6]:
# Scratch wrapper instance, used by the next cell to inspect the estimator.
x = BoostRegression(X_train, Y_train)

In [7]:
# Display the repr of the estimator that the wrapper constructs.
# NOTE(review): the saved output below reports n_estimators=3000, while
# BoostRegression.get_model builds n_estimators=1000 -- the output looks
# stale; re-run this cell to refresh it.
x.get_model()

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=6, min_child_weight=1, missing=None, n_estimators=3000,
       n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [None]:
# Cross-validated (5-fold) learning curve of a fresh XGBoost estimator,
# scored with negated mean squared error at five absolute training sizes.
(train_sizes_abs, train_scores_xgb, test_scores_xgb) = learning_curve(
    BoostRegression(X_train, Y_train).get_model(),
    X_train,
    Y_train,
    train_sizes=[1000, 5000, 10000, 50000, 80000],
    scoring="neg_mean_squared_error",
    cv=5,
)

In [None]:
# Plot the mean cross-validated error against the training-set size.
plt.figure()
plt.plot(
    train_sizes_abs,
    # Scores were computed as negated MSE, so flip the sign back; axis=1
    # averages each row of the score matrix (one row per training size).
    -test_scores_xgb.mean(axis=1),
    'o-',
    color="b",
    # A label is required for plt.legend() to have an entry to show;
    # without it the legend is empty and matplotlib emits a warning.
    label="XGBoost (cross-validation)",
)
plt.xlabel("Train size")
plt.ylabel("Mean Squared Error")
plt.title('Learning curve')
plt.legend(loc="best")

plt.show()