In [2]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.preprocessing import LabelEncoder
import pickle
from sklearn.metrics import r2_score
from sklearn.model_selection import GridSearchCV

In [3]:
# read datasets
train = pd.read_csv('data_fun/data/train.csv')
test = pd.read_csv('data_fun/data/test.csv')

# process columns, apply LabelEncoder to categorical features
for c in train.columns:
    if train[c].dtype == 'object':
        lbl = LabelEncoder() 
        lbl.fit(list(train[c].values) + list(test[c].values)) 
        train[c] = lbl.transform(list(train[c].values))
        test[c] = lbl.transform(list(test[c].values))

# shape        
print('Shape train: {}\nShape test: {}'.format(train.shape, test.shape))

Shape train: (4209, 378)
Shape test: (4209, 377)


In [4]:
# This step is huge!
from sklearn.decomposition import PCA, FastICA 
n_comp = 10

# PCA
pca = PCA(n_components=n_comp, random_state=42)
pca2_results_train = pca.fit_transform(train.drop(["y"], axis=1))
pca2_results_test = pca.transform(test)

# ICA
ica = FastICA(n_components=n_comp, random_state=42)
ica2_results_train = ica.fit_transform(train.drop(["y"], axis=1))
ica2_results_test = ica.transform(test)

# Append decomposition components to datasets
for i in range(1, n_comp+1):
    train['pca_' + str(i)] = pca2_results_train[:,i-1]
    test['pca_' + str(i)] = pca2_results_test[:, i-1]
    
    train['ica_' + str(i)] = ica2_results_train[:,i-1]
    test['ica_' + str(i)] = ica2_results_test[:, i-1]
    
y_train = train["y"]
y_mean = np.mean(y_train)

In [None]:
y_train = train["y"]
y_mean = np.mean(y_train)
()# mmm, xgboost, loved by everyone ^-^
import xgboost as xgb

# prepare dict of params for xgboost to run with
xgb_params = {
    'n_trees': 500, 
    'eta': 0.005,
    'max_depth': 4,
    'subsample': 0.95,
    'objective': 'reg:linear',
    #'eval_metric': 'rmse',
    'base_score': y_mean, # base prediction = mean(target)
    'silent': 1
}

# form DMatrices for Xgboost training
dtrain = xgb.DMatrix(train.drop('y', axis=1), y_train)
dtest = xgb.DMatrix(test)



def r2_metric(preds, dtrain):
    """Self defined evaluation obj"""
    from sklearn.metrics import r2_score
    return 'r2_metric', r2_score(dtrain.get_label(), preds)


# xgboost, cross-validation
cv_result = xgb.cv(xgb_params, 
                   dtrain, 
                   num_boost_round=1000, # increase to have better results (~700)
                   early_stopping_rounds=50,
                   verbose_eval=50, 
                   show_stdv=False,
                   feval = r2_metric,
                   maximize = True
                  )

num_boost_rounds = len(cv_result)
print(num_boost_rounds)

# train model
model = xgb.train(dict(xgb_params, silent=0), dtrain, num_boost_round=num_boost_rounds)

[0]	train-r2_metric:0.005423	test-r2_metric:0.004084
[50]	train-r2_metric:0.234394	test-r2_metric:0.224742
[100]	train-r2_metric:0.3753	test-r2_metric:0.358211
[150]	train-r2_metric:0.462841	test-r2_metric:0.438891
[200]	train-r2_metric:0.517671	test-r2_metric:0.487574
[250]	train-r2_metric:0.552546	test-r2_metric:0.516984
[300]	train-r2_metric:0.575248	test-r2_metric:0.534909
[350]	train-r2_metric:0.592353	test-r2_metric:0.545639
[400]	train-r2_metric:0.606735	test-r2_metric:0.55194
[450]	train-r2_metric:0.618064	test-r2_metric:0.555598
[500]	train-r2_metric:0.6285	test-r2_metric:0.557612
[550]	train-r2_metric:0.637139	test-r2_metric:0.558715
[600]	train-r2_metric:0.644767	test-r2_metric:0.559442
[650]	train-r2_metric:0.651455	test-r2_metric:0.559638
[700]	train-r2_metric:0.657961	test-r2_metric:0.55957


In [26]:
# check f2-score (to get higher score - increase num_boost_round in previous cell)

# now fixed, correct calculation
print(r2_score(dtrain.get_label(), model.predict(dtrain)))

0.633945009942


In [27]:
# make predictions and save results
y_pred = model.predict(dtest)
output = pd.DataFrame({'id': test['ID'].astype(np.int32), 'y': y_pred})
output.to_csv('xgboost-depth{}-pca-ica.csv'.format(xgb_params['max_depth']), index=False)

In [28]:
xgb

<module 'xgboost' from '/usr/local/lib/python3.6/site-packages/xgboost/__init__.py'>

# Paramter Turning

Let's first consider an example for "GridSearch" 

In [133]:
from sklearn import svm, datasets
from sklearn.model_selection import GridSearchCV
iris = datasets.load_iris()
parameters = {'kernel':('linear', 'rbf'), 'C':[1, 10]}
svr = svm.SVC()
clf = GridSearchCV(svr, parameters,verbose= 1)
clf.fit(iris.data, iris.target)
best_parameters, score, _ = max(clf.grid_scores_, key=lambda x: x[1])
print('Best is score is {}'.format(score))
print('It is given by parameters setting:')
for param_name in sorted(best_parameters.keys()):
    print("%s: %r" % (param_name, best_parameters[param_name]))
clf.best_params_
# clf. to see how to extract useful info then

Fitting 3 folds for each of 4 candidates, totalling 12 fits
Best is score is 0.98
It is given by parameters setting:
C: 1
kernel: 'linear'


[Parallel(n_jobs=1)]: Done  12 out of  12 | elapsed:    0.0s finished


{'C': 1, 'kernel': 'linear'}

Now, let's look at our problem. The default parameter is 

In [5]:
xgb_params = {
    'n_estimators': 550, 
    'learning_rate': 0.005,
    'max_depth': 4,
    'subsample': 0.95,
    'objective': 'reg:linear',
    #'eval_metric': 'rmse',
    'base_score': y_mean, # base prediction = mean(target)
    #'silent': 1
}

We use grid search to find good parameters

In [10]:
y_train = train["y"]
y_mean = np.mean(y_train)
()# mmm, xgboost, loved by everyone ^-^
import xgboost as xgb

boost = xgb.XGBRegressor(**xgb_params)


parameters = {
    # n_estimators
    'learning_rate': [0.01],#, 0.015,0.05],
    'gamma': [0],#,0.1, 0.5,0.9],
    'max_depth': [4],#, 9],
    'min_child_weight': [1,5],
    "subsample": [0.6],#,0.8,1],
    'colsample_bytree': [0.6],#,0.8,1],
    'reg_alpha' : [0], 
    'reg_lambda' : [1],
}
reg = GridSearchCV(boost, parameters, n_jobs=1, cv=3, verbose = 2)
reg.fit(train.drop('y', axis=1).as_matrix(), y_train)

best_parameters, score, _ = max(reg.grid_scores_, key=lambda x: x[1])
print(score)
for param_name in sorted(best_parameters.keys()):
    print("%s: %r" % (param_name, best_parameters[param_name]))
    
pickle.dump( reg.best_params_, open("bestpara.p", "wb" ))


dtrain = xgb.DMatrix(train.drop('y', axis=1), y_train)
dtest = xgb.DMatrix(test)


def r2_metric(preds, dtrain):
    """Self defined evaluation obj"""
    from sklearn.metrics import r2_score
    return 'r2_metric', r2_score(dtrain.get_label(), preds)


# xgboost, cross-validation
cv_result = xgb.cv(reg.best_params_, 
                   dtrain, 
                   num_boost_round=1000, # increase to have better results (~700)
                   early_stopping_rounds=50,
                   verbose_eval=50, 
                   show_stdv=False,
                   feval = r2_metric,
                   maximize = True
                  )

num_boost_rounds = len(cv_result)
print(num_boost_rounds)

# train model
model = xgb.train(dict(reg.best_params_, silent=0), dtrain, num_boost_round=num_boost_rounds)


Fitting 3 folds for each of 2 candidates, totalling 6 fits
[CV] reg_alpha=0, colsample_bytree=0.6, learning_rate=0.01, min_child_weight=1, subsample=0.6, reg_lambda=1, max_depth=4, gamma=0 
[CV]  reg_alpha=0, colsample_bytree=0.6, learning_rate=0.01, min_child_weight=1, subsample=0.6, reg_lambda=1, max_depth=4, gamma=0, total=   1.8s
[CV] reg_alpha=0, colsample_bytree=0.6, learning_rate=0.01, min_child_weight=1, subsample=0.6, reg_lambda=1, max_depth=4, gamma=0 
[CV]  reg_alpha=0, colsample_bytree=0.6, learning_rate=0.01, min_child_weight=1, subsample=0.6, reg_lambda=1, max_depth=4, gamma=0, total=   2.5s
[CV] reg_alpha=0, colsample_bytree=0.6, learning_rate=0.01, min_child_weight=1, subsample=0.6, reg_lambda=1, max_depth=4, gamma=0 
[CV]  reg_alpha=0, colsample_bytree=0.6, learning_rate=0.01, min_child_weight=1, subsample=0.6, reg_lambda=1, max_depth=4, gamma=0, total=   2.6s
[CV] reg_alpha=0, colsample_bytree=0.6, learning_rate=0.01, min_child_weight=5, subsample=0.6, reg_lambda=1, m

[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.8s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:   14.9s finished



0.537761432887
colsample_bytree: 0.6
gamma: 0
learning_rate: 0.01
max_depth: 4
min_child_weight: 5
reg_alpha: 0
reg_lambda: 1
subsample: 0.6
[0]	train-r2_metric:-61.2333	test-r2_metric:-61.3924
[50]	train-r2_metric:-22.1245	test-r2_metric:-22.1844
[100]	train-r2_metric:-7.77013	test-r2_metric:-7.79586
[150]	train-r2_metric:-2.49104	test-r2_metric:-2.5084
[200]	train-r2_metric:-0.544446	test-r2_metric:-0.567541
[250]	train-r2_metric:0.179741	test-r2_metric:0.146239
[300]	train-r2_metric:0.454008	test-r2_metric:0.410373
[350]	train-r2_metric:0.563022	test-r2_metric:0.508223
[400]	train-r2_metric:0.609867	test-r2_metric:0.544423
[450]	train-r2_metric:0.633036	test-r2_metric:0.557212
[500]	train-r2_metric:0.647178	test-r2_metric:0.561282
[550]	train-r2_metric:0.657793	test-r2_metric:0.562053
541




In [146]:
pickle.dump( reg.best_params_, open("bestpara.p", "wb" ))

In [5]:
pickle.load( open( "bestpara.p", "rb" ) )

ValueError: unsupported pickle protocol: 3

In [None]:
favorite_color = { "lion": "yellow", "kitty": "red" }
pickle.dump( favorite_color, open( "save.p", "wb" ) )

pickle.load( open( "save.p", "rb" ) )

# read datasets
train = pd.read_csv('data_fun/data/train.csv')
test = pd.read_csv('data_fun/data/test.csv')

# process columns, apply LabelEncoder to categorical features
for c in train.columns:
    if train[c].dtype == 'object':
        lbl = LabelEncoder() 
        lbl.fit(list(train[c].values) + list(test[c].values)) 
        train[c] = lbl.transform(list(train[c].values))
        test[c] = lbl.transform(list(test[c].values))

# shape        
print('Shape train: {}\nShape test: {}'.format(train.shape, test.shape))

# This step is huge!
from sklearn.decomposition import PCA, FastICA 
n_comp = 10

# PCA
pca = PCA(n_components=n_comp, random_state=42)
pca2_results_train = pca.fit_transform(train.drop(["y"], axis=1))
pca2_results_test = pca.transform(test)

# ICA
ica = FastICA(n_components=n_comp, random_state=42)
ica2_results_train = ica.fit_transform(train.drop(["y"], axis=1))
ica2_results_test = ica.transform(test)

# Append decomposition components to datasets
for i in range(1, n_comp+1):
    train['pca_' + str(i)] = pca2_results_train[:,i-1]
    test['pca_' + str(i)] = pca2_results_test[:, i-1]
    
    train['ica_' + str(i)] = ica2_results_train[:,i-1]
    test['ica_' + str(i)] = ica2_results_test[:, i-1]
    
y_train = train["y"]
y_mean = np.mean(y_train)

y_train = train["y"]
y_mean = np.mean(y_train)
()# mmm, xgboost, loved by everyone ^-^
import xgboost as xgb
xgb_params = {
    'n_estimators': 550, 
    'learning_rate': 0.005,
    'max_depth': 4,
    'subsample': 0.95,
    'objective': 'reg:linear',
    #'eval_metric': 'rmse',
    'base_score': y_mean, # base prediction = mean(target)
    #'silent': 1
}

boost = xgb.XGBRegressor(**xgb_params)

parameters = {
    # n_estimators
    'learning_rate': [0.01, 0.015,0.05],
    'gamma': [0,0.1,0.5,0.9],
    'max_depth': [4, 9],
    'min_child_weight': [1,5],
    "subsample": [0.6,0.8,1],
    'colsample_bytree': [0.6,0.8,1],
    'reg_alpha' : [0], 
    'reg_lambda' : [1],
}
reg = GridSearchCV(boost, parameters, n_jobs=1, cv=3, verbose = 1)
reg.fit(train.drop('y', axis=1).as_matrix(), y_train)

best_parameters, score, _ = max(reg.grid_scores_, key=lambda x: x[1])
print(score)
for param_name in sorted(best_parameters.keys()):
    print("%s: %r" % (param_name, best_parameters[param_name]))
    
pickle.dump( reg.best_params_, open("bestpara.p", "wb" ))

# xgboost, cross-validation
cv_result = xgb.cv(reg.best_params_, 
                   dtrain, 
                   num_boost_round=1000, # increase to have better results (~700)
                   early_stopping_rounds=50,
                   verbose_eval=50, 
                   show_stdv=False,
                   feval = r2_metric,
                   maximize = True
                  )

num_boost_rounds = len(cv_result)
print(num_boost_rounds)

# train model
model = xgb.train(dict(reg.best_params_, silent=0), dtrain, num_boost_round=num_boost_rounds)


# check f2-score (to get higher score - increase num_boost_round in previous cell)

# now fixed, correct calculation
print(r2_score(dtrain.get_label(), model.predict(dtrain)))

# make predictions and save results
y_pred = model.predict(dtest)
output = pd.DataFrame({'id': test['ID'].astype(np.int32), 'y': y_pred})
output.to_csv('xgboost-depth{}-pca-ica.csv'.format(xgb_params['max_depth']), index=False)

Shape train: (4209, 378)
Shape test: (4209, 377)
Fitting 3 folds for each of 432 candidates, totalling 1296 fits