## Importing necessary packages

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
import xgboost as xgb

In [2]:
# read datasets
train = pd.read_csv('../input/train.csv')
test = pd.read_csv('../input/test.csv')

# process columns, apply LabelEncoder to categorical features
for c in train.columns:
    if train[c].dtype == 'object':
        lbl = LabelEncoder() 
        lbl.fit(list(train[c].values) + list(test[c].values)) 
        train[c] = lbl.transform(list(train[c].values))
        test[c] = lbl.transform(list(test[c].values))

# shape        
print('Shape train: {}\nShape test: {}'.format(train.shape, test.shape))

Shape train: (4209, 378)
Shape test: (4209, 377)


In [3]:
from sklearn.decomposition import PCA, FastICA
n_comp = 10

# PCA
pca = PCA(n_components=n_comp, random_state=42)
pca2_results_train = pca.fit_transform(train.drop(["y"], axis=1))
pca2_results_test = pca.transform(test)

# ICA
ica = FastICA(n_components=n_comp, random_state=42)
ica2_results_train = ica.fit_transform(train.drop(["y"], axis=1))
ica2_results_test = ica.transform(test)

# Append decomposition components to datasets
for i in range(1, n_comp+1):
    train['pca_' + str(i)] = pca2_results_train[:,i-1]
    test['pca_' + str(i)] = pca2_results_test[:, i-1]
    
    train['ica_' + str(i)] = ica2_results_train[:,i-1]
    test['ica_' + str(i)] = ica2_results_test[:, i-1]
    
y_train = train["y"]
y_mean = np.mean(y_train)

In [4]:
X, y = train.drop('y', axis=1).values, train.y.values
print(X.shape)

(4209, 397)


In [None]:
model = xgb.XGBRegressor(n_jobs = 1, base_score= y_mean, silent=True, objective='reg:linear')

xgb_params = {
    'n_estimators': [500, 600, 700], 
    'learning_rate': [0.001, 0.005, 0.01],
    'max_depth': [4, 6, 8],
    'subsample': [0.90, 0.95, 0.1]}

clf = GridSearchCV(model, xgb_params, cv = 5, verbose=1, n_jobs = -1)
clf.fit(X, y)
#grid = GridSearchCV(model, xgb_params, cv = 5, verbose=1, n_jobs = -1)
#gridresults = grid.fit(X, y)

In [39]:
print(clf.best_score_)
print(clf.best_params_)

0.541374237439
{'n_estimators': 600, 'subsample': 0.95, 'learning_rate': 0.005, 'max_depth': 4}


In [40]:
means = clf.cv_results_['mean_test_score']
stds = clf.cv_results_['std_test_score']
params = clf.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

0.343037 (0.045156) with: {'n_estimators': 500, 'subsample': 0.9, 'learning_rate': 0.001, 'max_depth': 4}
0.344804 (0.044274) with: {'n_estimators': 500, 'subsample': 0.95, 'learning_rate': 0.001, 'max_depth': 4}
0.340100 (0.040377) with: {'n_estimators': 500, 'subsample': 0.1, 'learning_rate': 0.001, 'max_depth': 4}
0.379539 (0.049579) with: {'n_estimators': 600, 'subsample': 0.9, 'learning_rate': 0.001, 'max_depth': 4}
0.381691 (0.048569) with: {'n_estimators': 600, 'subsample': 0.95, 'learning_rate': 0.001, 'max_depth': 4}
0.376959 (0.044740) with: {'n_estimators': 600, 'subsample': 0.1, 'learning_rate': 0.001, 'max_depth': 4}
0.409626 (0.052951) with: {'n_estimators': 700, 'subsample': 0.9, 'learning_rate': 0.001, 'max_depth': 4}
0.411964 (0.051840) with: {'n_estimators': 700, 'subsample': 0.95, 'learning_rate': 0.001, 'max_depth': 4}
0.407796 (0.048460) with: {'n_estimators': 700, 'subsample': 0.1, 'learning_rate': 0.001, 'max_depth': 4}
0.325724 (0.060135) with: {'n_estimators': 

In [None]:
model = xgb.XGBRegressor(learning_rate = 0.005, n_jobs = -1, base_score= y_mean, silent=True, objective='reg:linear')

xgb_params = {
    'n_estimators': [600, 650, 700], 
    'max_depth': [4, 5, 6],
    'subsample': [0.90, 0.95, 0.1]}

grid = GridSearchCV(model, xgb_params, cv = 5, verbose=1, n_jobs = -1)
gridresults = grid.fit(X, y)

In [16]:
print(gridresults.best_score_)
print(gridresults.best_params_)

0.541645700512
{'n_estimators': 650, 'subsample': 0.95, 'max_depth': 4}


In [17]:
means = gridresults.cv_results_['mean_test_score']
stds = gridresults.cv_results_['std_test_score']
params = gridresults.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

0.532922 (0.067212) with: {'n_estimators': 600, 'subsample': 0.9, 'max_depth': 4}
0.541374 (0.062475) with: {'n_estimators': 600, 'subsample': 0.95, 'max_depth': 4}
0.530717 (0.055391) with: {'n_estimators': 600, 'subsample': 0.1, 'max_depth': 4}
0.530606 (0.068565) with: {'n_estimators': 650, 'subsample': 0.9, 'max_depth': 4}
0.541646 (0.062057) with: {'n_estimators': 650, 'subsample': 0.95, 'max_depth': 4}
0.530052 (0.055427) with: {'n_estimators': 650, 'subsample': 0.1, 'max_depth': 4}
0.529443 (0.069241) with: {'n_estimators': 700, 'subsample': 0.9, 'max_depth': 4}
0.540965 (0.062039) with: {'n_estimators': 700, 'subsample': 0.95, 'max_depth': 4}
0.528706 (0.054314) with: {'n_estimators': 700, 'subsample': 0.1, 'max_depth': 4}
0.503469 (0.097803) with: {'n_estimators': 600, 'subsample': 0.9, 'max_depth': 5}
0.504126 (0.096995) with: {'n_estimators': 600, 'subsample': 0.95, 'max_depth': 5}
0.525316 (0.052777) with: {'n_estimators': 600, 'subsample': 0.1, 'max_depth': 5}
0.499759 (0.

In [None]:
model = xgb.XGBRegressor(learning_rate = 0.005, n_jobs = -1, 
                         base_score= y_mean, silent=True, 
                         objective='reg:linear', max_depth = 4)

xgb_params = {
    'n_estimators': [625, 650, 675], 
    'subsample': [0.90, 0.95, 0.1]}

grid = GridSearchCV(model, xgb_params, cv = 5, verbose=1, n_jobs = -1)
gridresults = grid.fit(X, y)

In [20]:
print(gridresults.best_score_)
print(gridresults.best_params_)

0.541645700512
{'n_estimators': 650, 'subsample': 0.95}


In [21]:
means = gridresults.cv_results_['mean_test_score']
stds = gridresults.cv_results_['std_test_score']
params = gridresults.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

0.532852 (0.067183) with: {'n_estimators': 625, 'subsample': 0.9}
0.541411 (0.062440) with: {'n_estimators': 625, 'subsample': 0.95}
0.531015 (0.055623) with: {'n_estimators': 625, 'subsample': 0.1}
0.530606 (0.068565) with: {'n_estimators': 650, 'subsample': 0.9}
0.541646 (0.062057) with: {'n_estimators': 650, 'subsample': 0.95}
0.530052 (0.055427) with: {'n_estimators': 650, 'subsample': 0.1}
0.530138 (0.069106) with: {'n_estimators': 675, 'subsample': 0.9}
0.541254 (0.061785) with: {'n_estimators': 675, 'subsample': 0.95}
0.529546 (0.054797) with: {'n_estimators': 675, 'subsample': 0.1}


In [28]:
# train model
xgb_model = xgb.XGBRegressor(n_estimators=650, learning_rate=0.005, 
                             max_depth=4, subsample=0.95, objective='reg:linear',
                             base_score=y_mean, n_jobs=-1)
xgb_model.fit(X, y, eval_metric='rmse', verbose=True)

XGBRegressor(base_score=100.669318128, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.005, max_delta_step=0,
       max_depth=4, min_child_weight=1, missing=None, n_estimators=650,
       n_jobs=-1, nthread=-1, objective='reg:linear', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=0, silent=True,
       subsample=0.95)

In [29]:
# check f2-score (to get higher score - increase num_boost_round in previous cell)
from sklearn.metrics import r2_score

print r2_score(y, xgb_model.predict(X))

0.63188961146


In [31]:
from sklearn.externals import joblib

# save model to file
joblib.dump(xgb_model, "xgb_650_GCV0625.joblib.dat")

['xgb_650_GCV0625.joblib.dat']

In [39]:
#xgb_model = joblib.load("xgb_650_GCV0625.joblib.dat")
x_test = np.array(test)
#print x_test
print x_test.shape
print test.shape

(4209, 397)
(4209, 397)


In [37]:
# make predictions and save results
y_pred = xgb_model.predict(x_test)
output = pd.DataFrame({'id': test['ID'].astype(np.int32), 'y': y_pred})
output.to_csv('xgb_650_GCV0625-pca-ica.csv', index=False)

### Trying with random search instead of grid search

In [None]:
from scipy.stats import randint as sp_randint

model = xgb.XGBRegressor(base_score= y_mean, silent=True, objective='reg:linear')

xgb_params = {
    'n_estimators': [600, 625, 650, 675, 700], 
    'learning_rate': [0.005, 0.01],
    'max_depth': [4, 5],
    'subsample': [0.80, 0.85, 0.90, 0.95],
    'gamma': [0, 0.05],
    'colsample_bytree': [0.75, 1],
    'colsample_bylevel': [0.75, 1]}

clf = RandomizedSearchCV(model, xgb_params, cv = 5, verbose=1, 
                         n_iter=50, n_jobs = -1)
clf.fit(X, y)

In [12]:
print(clf.best_score_)
print(clf.best_params_)

means = clf.cv_results_['mean_test_score']
stds = clf.cv_results_['std_test_score']
params = clf.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

0.538127937497
{'colsample_bytree': 0.75, 'colsample_bylevel': 1, 'learning_rate': 0.005, 'n_estimators': 600, 'subsample': 0.9, 'max_depth': 4, 'gamma': 0.05}
0.467630 (0.156224) with: {'colsample_bytree': 0.75, 'colsample_bylevel': 0.75, 'learning_rate': 0.005, 'n_estimators': 650, 'subsample': 0.9, 'max_depth': 5, 'gamma': 0}
0.486489 (0.126675) with: {'colsample_bytree': 0.75, 'colsample_bylevel': 1, 'learning_rate': 0.005, 'n_estimators': 600, 'subsample': 0.8, 'max_depth': 5, 'gamma': 0}
0.502740 (0.094620) with: {'colsample_bytree': 1, 'colsample_bylevel': 1, 'learning_rate': 0.005, 'n_estimators': 625, 'subsample': 0.95, 'max_depth': 5, 'gamma': 0}
0.492220 (0.055937) with: {'colsample_bytree': 1, 'colsample_bylevel': 1, 'learning_rate': 0.01, 'n_estimators': 625, 'subsample': 0.9, 'max_depth': 4, 'gamma': 0.05}
0.483409 (0.069933) with: {'colsample_bytree': 0.75, 'colsample_bylevel': 1, 'learning_rate': 0.01, 'n_estimators': 650, 'subsample': 0.8, 'max_depth': 4, 'gamma': 0}
0

In [None]:
xgb_model = xgb.XGBRegressor(colsample_bytree=0.75, colsample_bylevel=1, 
                             learning_rate=0.005, n_estimators=650, 
                             subsample=0.9, max_depth=4, 
                             gamma= 0.05, objective='reg:linear',
                             base_score=y_mean, n_jobs=-1)

xgb_model.fit(X, y, eval_metric='rmse', verbose=True)

In [18]:
# check f2-score (to get higher score - increase num_boost_round in previous cell)
from sklearn.metrics import r2_score

print r2_score(y, xgb_model.predict(X))

0.6309514157


In [21]:
xgb_model = xgb.XGBRegressor(colsample_bytree=0.75, colsample_bylevel=1, 
                             learning_rate=0.005, n_estimators=650, 
                             subsample=0.95, max_depth=4, 
                             gamma= 0.05, objective='reg:linear',
                             base_score=y_mean, n_jobs=-1)

xgb_model.fit(X, y, eval_metric='rmse', verbose=True)

# check f2-score (to get higher score - increase num_boost_round in previous cell)
from sklearn.metrics import r2_score

print r2_score(y, xgb_model.predict(X))

0.629841004819
