In [94]:
import pandas as pd
import numpy as np

from sklearn.linear_model import LinearRegression, Ridge, SGDRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.kernel_ridge import KernelRidge
from sklearn.model_selection import GridSearchCV

import xgboost
import time
import math
import random

from model_wrapper import (
    SKLearnModelWrapper,
    ModelWrapper
)

In [72]:
def write_to_file(filename, predictions):
    """Write predictions to ``filename`` as a two-column CSV.

    The first line is the header ``Id,Prediction``. Each following line holds
    the 1-based position of a prediction and its ``str()`` representation.
    An existing file at ``filename`` is overwritten.
    """
    with open(filename, "w") as out:
        out.write("Id,Prediction\n")
        # The generator is consumed lazily, so rows are written in order
        # after the header, one per prediction.
        out.writelines(
            ",".join((str(row_id), str(value))) + "\n"
            for row_id, value in enumerate(predictions, start=1)
        )

class BoostRegression(SKLearnModelWrapper):
    """Model wrapper that supplies a grid-searched XGBoost regressor.

    The training data is stored on the instance as ``self.X`` / ``self.Y``.
    """

    # MODIFIED TO TAKE in X, Y when initializing
    def __init__(self, X, Y, *args, debug=False, **kwargs):
        """Store the training data on the instance.

        Args:
            X: training features; must support slicing when ``debug`` is set.
            Y: training targets, expected to be row-aligned with ``X``.
            *args: accepted for signature compatibility; not used here.
            debug: when True, keep only the first 10 rows of the data.
            **kwargs: accepted for signature compatibility; not used here.
        """
        # NOTE(review): the parent __init__ is not invoked here; confirm that
        # SKLearnModelWrapper does not need it.
        # when debugging only use a tiny subset of the data
        if debug:
            X = X[:10]
            # Truncate the targets together with the features; otherwise X and
            # Y end up with different numbers of rows and fitting fails.
            if Y is not None:
                Y = Y[:10]
        self.X = X
        self.Y = Y

    def get_model(self):
        """Return a new, unfitted 5-fold grid search over an XGBRegressor.

        The grid covers ``n_estimators`` in {50, 100, 150} and ``max_depth``
        in {3, 7, 10}. Both constructor values below are also in the grid, so
        the search sets them for every candidate it evaluates.
        """
        base_estimator = xgboost.XGBRegressor(n_estimators=100, max_depth=7)
        search_space = {
            "n_estimators": [50, 100, 150],
            "max_depth": [3, 7, 10],
        }
        return GridSearchCV(base_estimator, cv=5, param_grid=search_space)
    
class KernelRidgeRegression(SKLearnModelWrapper):
    """Model wrapper that supplies a Laplacian-kernel ridge regressor.

    The training data is stored on the instance as ``self.X`` / ``self.Y``.
    """

    # MODIFIED TO TAKE in X, Y when initializing
    def __init__(self, X, Y, *args, debug=False, **kwargs):
        """Store the training data on the instance.

        Args:
            X: training features; must support slicing when ``debug`` is set.
            Y: training targets, expected to be row-aligned with ``X``.
            *args: accepted for signature compatibility; not used here.
            debug: when True, keep only the first 10 rows of the data.
            **kwargs: accepted for signature compatibility; not used here.
        """
        # NOTE(review): the parent __init__ is not invoked here; confirm that
        # SKLearnModelWrapper does not need it.
        # when debugging only use a tiny subset of the data
        if debug:
            X = X[:10]
            # Truncate the targets together with the features; otherwise X and
            # Y end up with different numbers of rows and fitting fails.
            if Y is not None:
                Y = Y[:10]
        self.X = X
        self.Y = Y

    def get_model(self):
        """Return a new, unfitted KernelRidge with fixed hyper-parameters.

        Uses the ``'laplacian'`` kernel with ``alpha=1e-2`` and ``gamma=.001``.
        """
        return KernelRidge(kernel='laplacian', alpha=1e-2, gamma=.001)

In [22]:
##########################
## SET UP TRAINING DATA ##
##########################

df_train = pd.read_csv("merged_train.csv")

# The feature matrix below is built by dropping the LAST column, while the
# target is selected by name. Guard the assumption that they are the same
# column; otherwise `gap` would stay inside the features.
assert df_train.columns[-1] == "gap", "expected target column 'gap' to be last"

# extract arrays containing data
Y_train = df_train["gap"].values
X_train = df_train.values[:, :-1]  # exclude last column (the target)

# train model
# NOTE(review): the variable is called `XGB` (later cells reference that name),
# but the model built here is a KernelRidgeRegression.
XGB = KernelRidgeRegression(X_train, Y_train)
t0 = time.time()
XGB.build_model()
# elapsed wall-clock seconds spent in build_model()
print(time.time() - t0)

In [7]:
######################
## SET UP TEST DATA ##
######################

df_test = pd.read_csv("merged_test.csv")

# The test columns must equal the training columns minus the last one, with
# the same names in the same order, because both frames are later turned into
# plain arrays where only position identifies a feature.
assert list(df_train.columns[:-1]) == list(df_test.columns)

In [17]:
# extract data
# The whole test frame is converted to an array; its column order was compared
# against the training frame's columns when df_test was loaded.
X_test = df_test.values

# NOTE(review): `XGB` is whatever the training cell last assigned; in the
# visible training cell that is a KernelRidgeRegression despite the name.
XGB_pred = XGB.predict(X_test)

In [18]:
# Write one "Id,Prediction" row per test sample.
# NOTE(review): the file name says "xgb ... tuned", but the visible training
# cell builds a KernelRidgeRegression; confirm which model produced XGB_pred.
write_to_file('xgb_preds_tuned.csv', XGB_pred)