In [1]:
import numpy as np
import pandas as pd
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LassoCV
import matplotlib.pyplot as plt
import xgboost as xgb
from sklearn import linear_model
from sklearn.metrics import r2_score
from sklearn.grid_search import GridSearchCV

data = pd.read_pickle('ml_finalproj_train_vF.pkl')



In [2]:
X_all = data.drop(['weight','y'],axis = 1)
Y = data['y']
weight = data['weight']

def Normalize(data):
    for col in data.columns.values:
        data[col]=(data[col]-np.mean(data[col]))/np.std(data[col])
    return data
X_norm = Normalize(X_all)

def feature_selection(X_norm, Y, data):
    alpha = np.array([0.00002,0.00003,0.00001])
    clf = LassoCV(alphas=alpha)
    sfm = SelectFromModel(clf,threshold=0.000025)
    sfm.fit(X_norm, Y)
    X_selected = X_norm.columns.values[sfm.get_support()]
    X = pd.concat([data[X_selected[0]],data[X_selected[1]]],axis = 1)
    for i in range(2,len(X_selected)):
        X = pd.concat([X,data[X_selected[i]]],axis = 1)
    return X
X_selected = feature_selection(X_norm,Y, X_all)
    


In [3]:
def data_split(X, Y, weight, train_frac):
    index = int(len(Y)*train_frac)
    return X[0:index],Y[0:index],weight[0:index],X[index+1:],Y[index+1:],weight[index+1:]

X_train, Y_train, weight_train, X_test, Y_test, weight_test = data_split(X_selected,Y,weight,4/5)


In [4]:
def test_param(param,param_name,param_list, X_train, Y_train, weight_train, X_test, Y_test, weight_test):
    data_train = xgb.DMatrix(X_train, label=Y_train,weight =weight_train)
    for i in param_list:
        param[param_name] = i
        model = xgb.train(params=param, dtrain=data_train, num_boost_round=10)
        Y_pred_test = model.predict(xgb.DMatrix(X_test))
        Y_pred_train = model.predict(xgb.DMatrix(X_train))
        print('{} :{}, R2_outofsample :{}, R2_insample: {}'.format(param_name,i, r2_score(Y_test,Y_pred_test,weight_test), r2_score(Y_train, Y_pred_train,weight_train ) ))

# min_child_weight, alpha, gamma, lambda have no influence
# max_depth = 2
    
    

In [5]:
param1 = {
    'objective'          :'reg:linear',
    'max_depth'          :6,
    'eta'                :0.3,
    'gamma'              :0,
    'min_child_weight'   :1,
    'subsample'          :1,
    'colsample_bytree'   :1,
    'alpha'              :0,
    'lambda'             :1,
    'base_score'         :np.mean(Y_train)
}

max_depths = [2,3,4,6,8]
test_param(param1,'max_depth',max_depths, X_train, Y_train, weight_train, X_test, Y_test, weight_test)

max_depth :2, R2_outofsample :-0.06986852122018994, R2_insample: 0.008804364992566316
max_depth :3, R2_outofsample :-0.24628118653583098, R2_insample: 0.015091169838153284
max_depth :4, R2_outofsample :-0.27615906790190725, R2_insample: 0.022648744127704123
max_depth :6, R2_outofsample :-0.5132758246869185, R2_insample: 0.04784573459195374
max_depth :8, R2_outofsample :-0.7784434791758914, R2_insample: 0.09257697094749939


In [6]:
param2 = {
    'objective'          :'reg:linear',
    'max_depth'          :2,
    'eta'                :0.3,
    'gamma'              :0,
    'min_child_weight'   :1,
    'subsample'          :1,
    'colsample_bytree'   :1,
    'alpha'              :0,
    'lambda'             :1,
    'base_score'         :np.mean(Y_train)
}
min_child_weights = [1, 20, 50, 100, 200]
test_param(param2,'min_child_weight',min_child_weights, X_train, Y_train, weight_train, X_test, Y_test, weight_test)

min_child_weight :1, R2_outofsample :-0.06986852122018994, R2_insample: 0.008804364992566316
min_child_weight :20, R2_outofsample :-0.06986852122018994, R2_insample: 0.008804364992566316
min_child_weight :50, R2_outofsample :-0.06986852122018994, R2_insample: 0.008804364992566316
min_child_weight :100, R2_outofsample :-0.06986852122018994, R2_insample: 0.008804364992566316
min_child_weight :200, R2_outofsample :-0.06986852122018994, R2_insample: 0.008804364992566316


In [7]:
param3 = {
    'objective'          :'reg:linear',
    'max_depth'          :2,
    'eta'                :0.3,
    'gamma'              :0,
    'min_child_weight'   :1,
    'subsample'          :1,
    'colsample_bytree'   :1,
    'alpha'              :0,
    'lambda'             :1,
    'base_score'         :np.mean(Y_train)
}

gammas = [0.2,0.4,0.6,0.8]
test_param(param3,'gamma',gammas, X_train, Y_train, weight_train, X_test, Y_test, weight_test)

gamma :0.2, R2_outofsample :-0.06986852122018994, R2_insample: 0.008804364992566316
gamma :0.4, R2_outofsample :-0.06986852122018994, R2_insample: 0.008804364992566316
gamma :0.6, R2_outofsample :-0.06986852122018994, R2_insample: 0.008804364992566316
gamma :0.8, R2_outofsample :-0.06986852122018994, R2_insample: 0.008804364992566316


In [10]:
param4 = {
    'objective'          :'reg:linear',
    'max_depth'          :2,
    'eta'                :0.3,
    'gamma'              :0,
    'min_child_weight'   :1,
    'subsample'          :1,
    'colsample_bytree'   :1,
    'alpha'              :0,
    'lambda'             :1,
    'base_score'         :np.mean(Y_train)
}

subsamples = [0.9,0.95,1]
test_param(param4,'subsample',subsamples, X_train, Y_train, weight_train, X_test, Y_test, weight_test)

subsample :0.9, R2_outofsample :-0.12450371269895122, R2_insample: 0.008954268990161163
subsample :0.95, R2_outofsample :-0.09243507261011552, R2_insample: 0.0085624753484973
subsample :1, R2_outofsample :-0.06986852122018994, R2_insample: 0.008804364992566316


In [13]:
param5 = {
    'objective'          :'reg:linear',
    'max_depth'          :2,
    'eta'                :0.3,
    'gamma'              :0,
    'min_child_weight'   :1,
    'subsample'          :1,
    'colsample_bytree'   :1,
    'alpha'              :0,
    'lambda'             :1,
    'base_score'         :np.mean(Y_train)
}

colsample_bytrees=[0.5,0.5,0.55,0.6]
test_param(param5,'colsample_bytree',colsample_bytrees, X_train, Y_train, weight_train, X_test, Y_test, weight_test)

colsample_bytree :0.5, R2_outofsample :-0.01360662956811498, R2_insample: 0.007058049231864816
colsample_bytree :0.5, R2_outofsample :-0.01360662956811498, R2_insample: 0.007058049231864816
colsample_bytree :0.55, R2_outofsample :-0.012106156054300987, R2_insample: 0.00751810738042602
colsample_bytree :0.6, R2_outofsample :-0.013539708073106116, R2_insample: 0.007926859698016875


In [14]:
param6 = {
    'objective'          :'reg:linear',
    'max_depth'          :2,
    'eta'                :0.3,
    'gamma'              :0,
    'min_child_weight'   :1,
    'subsample'          :1,
    'colsample_bytree'   :0.55,
    'alpha'              :0,
    'lambda'             :1,
    'base_score'         :np.mean(Y_train)
}

alphas = [1e-5, 1e-2, 0.1, 1, 100]
test_param(param6,'alpha',alphas, X_train, Y_train, weight_train, X_test, Y_test, weight_test)

alpha :1e-05, R2_outofsample :-0.012106156054300987, R2_insample: 0.00751810738042602
alpha :0.01, R2_outofsample :-0.012106156045035732, R2_insample: 0.007518107383351458
alpha :0.1, R2_outofsample :-0.012106155879013869, R2_insample: 0.007518107306146771
alpha :1, R2_outofsample :-0.012106150267647386, R2_insample: 0.0075181069191639915
alpha :100, R2_outofsample :-0.012105565135511087, R2_insample: 0.0075180670881126455


In [16]:
param7 = {
    'objective'          :'reg:linear',
    'max_depth'          :2,
    'eta'                :0.3,
    'gamma'              :0,
    'min_child_weight'   :1,
    'subsample'          :1,
    'colsample_bytree'   :0.55,
    'alpha'              :0,
    'lambda'             :1,
    'base_score'         :np.mean(Y_train)
}

lambdas = [1e-5, 1e-2, 0.1, 1, 100]
test_param(param6,'lambda',lambdas, X_train, Y_train, weight_train, X_test, Y_test, weight_test)

lambda :1e-05, R2_outofsample :-0.012105565135511087, R2_insample: 0.0075180670881126455
lambda :0.01, R2_outofsample :-0.012105565135511087, R2_insample: 0.0075180670881126455
lambda :0.1, R2_outofsample :-0.012105565135511087, R2_insample: 0.0075180670881126455
lambda :1, R2_outofsample :-0.012105565135511087, R2_insample: 0.0075180670881126455
lambda :100, R2_outofsample :-0.012105563519744011, R2_insample: 0.007518066660217704


In [37]:
def test_eta(param,eta_list,numround_list, X_train, Y_train, weight_train, X_test, Y_test, weight_test):
    data_train = xgb.DMatrix(X_train, label=Y_train,weight =weight_train)
    for i in range(len(eta_list)):
        param['eta'] = eta_list[i]
        for j in range(len(numround_list)):
            model = xgb.train(params=param, dtrain=data_train, num_boost_round=numround_list[j])
            Y_pred_test = model.predict(xgb.DMatrix(X_test))
            Y_pred_train = model.predict(xgb.DMatrix(X_train))
            print('eta: {}, num_boost_round :{}, R2_outofsample :{}, R2_insample: {}'.format(eta_list[i],numround_list[j], r2_score(Y_test,Y_pred_test,weight_test), r2_score(Y_train, Y_pred_train,weight_train ) ))

param8 = {
    'objective'          :'reg:linear',
    'max_depth'          :2,
    'eta'                :0.3,
    'gamma'              :0,
    'min_child_weight'   :1,
    'subsample'          :1,
    'colsample_bytree'   :0.55,
    'alpha'              :0,
    'lambda'             :1,
    'base_score'         :np.mean(Y_train)
}
#eta = 0.2, num_boost_round = 14
#eta = 0.22, num_boost_round = 16
# split:4/5,eta=0.008,num_boost_round = 40 
#[0.3,0.25,0.2,0.15,0.1,0.05]
#[10,15,20,25,30,35]
etas = [0.01,0.008,0.007,0.006]
num_rounds = [30,32,36,40,44,48,52]
test_eta(param8,etas,num_rounds, X_train, Y_train, weight_train, X_test, Y_test, weight_test)

eta: 0.01, num_boost_round :30, R2_outofsample :-5.855333343429692e-05, R2_insample: 0.0014939802671741775
eta: 0.01, num_boost_round :32, R2_outofsample :9.432840054168778e-05, R2_insample: 0.0016556957576743248
eta: 0.01, num_boost_round :36, R2_outofsample :0.00013559345354818841, R2_insample: 0.0018365260514672244
eta: 0.01, num_boost_round :40, R2_outofsample :0.0001239432276725072, R2_insample: 0.0020661043302817594
eta: 0.01, num_boost_round :44, R2_outofsample :2.156142733600319e-05, R2_insample: 0.0022743094785320217
eta: 0.01, num_boost_round :48, R2_outofsample :-0.00020227933289795885, R2_insample: 0.0024761473806974577
eta: 0.01, num_boost_round :52, R2_outofsample :-0.00036889042806653727, R2_insample: 0.002703109856539987
eta: 0.008, num_boost_round :30, R2_outofsample :-6.669229482092476e-05, R2_insample: 0.0012109009514790214
eta: 0.008, num_boost_round :32, R2_outofsample :8.016540224975444e-05, R2_insample: 0.0013235927589041463
eta: 0.008, num_boost_round :36, R2_ou

In [45]:

X1_train, Y1_train, weight1_train, X1_test, Y1_test, weight1_test = data_split(X_selected,Y,weight,4/5)
param_final = {
    'objective'          :'reg:linear',
    'max_depth'          :2,
    'eta'                :0.008,
    'gamma'              :0,
    'min_child_weight'   :1,
    'subsample'          :1,
    'colsample_bytree'   :0.55,
    'alpha'              :0,
    'lambda'             :1,
    'base_score'         :np.mean(Y1_train)
}


data_train1 = xgb.DMatrix(X1_train, label=Y1_train,weight =weight1_train)
model1 = xgb.train(params=param_final, dtrain=data_train1, num_boost_round=40)
Y1_pred_test = model1.predict(xgb.DMatrix(X1_test))
Y1_pred_train = model1.predict(xgb.DMatrix(X1_train))
print(' R2_outofsample :{}, R2_insample: {}'.format( r2_score(Y1_test,Y1_pred_test,weight1_test), r2_score(Y1_train, Y1_pred_train,weight1_train ) ))



 R2_outofsample :0.00022478383096424803, R2_insample: 0.0016725565397176512
