In [1]:
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split

In [2]:
cols = [0, 1, 5, 7, 11, 12, 13]+[i for i in range(16, 33)]
train_data = pd.read_csv('train.csv', header=0, usecols=cols)
test_data = pd.read_csv('test.csv', header=0, usecols=cols[:-1])

In [3]:
# replicate data because label 1 only has 1 entry
train_data = train_data.append([train_data[train_data.label == 1]]*9, ignore_index=True)

In [4]:
X = train_data[['num-comments', 'feedback-karma', 'ratings-given', 'ratings-received',
                'num-authors', 'prev-games', 'fun-average', 'innovation-average', 'theme-average',
                'graphics-average', 'audio-average', 'humor-average', 'mood-average', 'fun-rank',
                'innovation-rank', 'theme-rank', 'graphics-rank', 'audio-rank', 'humor-rank', 'mood-rank']]
y = train_data['label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [5]:
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

In [6]:
X2 = test_data[['num-comments', 'feedback-karma', 'ratings-given', 'ratings-received', 
                'num-authors', 'prev-games', 'fun-average', 'innovation-average', 'theme-average', 
                'graphics-average', 'audio-average', 'humor-average', 'mood-average', 'fun-rank', 
                'innovation-rank', 'theme-rank', 'graphics-rank', 'audio-rank', 'humor-rank', 'mood-rank']]

In [7]:
full_dtest = xgb.DMatrix(X2)

In [8]:
# specify parameters via map
param = {
    'max_depth': 2,
    'eta': 1,
    'objective': 'multi:softmax',
    'num_class': 6,
    'eval_metric': 'mlogloss'
}

# specify validations set to watch performance
watchlist = [(dtest, 'eval'), (dtrain, 'train')]
num_boost_round, early_stopping_rounds = 100, 10
bst = xgb.train(param, dtrain, 
                num_boost_round=num_boost_round,
                early_stopping_rounds=early_stopping_rounds,
                evals=watchlist)

[0]	eval-mlogloss:0.43367	train-mlogloss:0.43011
Multiple eval metrics have been passed: 'train-mlogloss' will be used for early stopping.

Will train until train-mlogloss hasn't improved in 10 rounds.
[1]	eval-mlogloss:0.29174	train-mlogloss:0.28105
[2]	eval-mlogloss:0.22180	train-mlogloss:0.21357
[3]	eval-mlogloss:0.19217	train-mlogloss:0.18337
[4]	eval-mlogloss:0.17266	train-mlogloss:0.16392
[5]	eval-mlogloss:0.16230	train-mlogloss:0.15243
[6]	eval-mlogloss:0.15684	train-mlogloss:0.14410
[7]	eval-mlogloss:0.15185	train-mlogloss:0.13872
[8]	eval-mlogloss:0.14856	train-mlogloss:0.13469
[9]	eval-mlogloss:0.14682	train-mlogloss:0.13162
[10]	eval-mlogloss:0.14483	train-mlogloss:0.12896
[11]	eval-mlogloss:0.14402	train-mlogloss:0.12644
[12]	eval-mlogloss:0.14293	train-mlogloss:0.12456
[13]	eval-mlogloss:0.14210	train-mlogloss:0.12295
[14]	eval-mlogloss:0.14105	train-mlogloss:0.12089
[15]	eval-mlogloss:0.14116	train-mlogloss:0.11944
[16]	eval-mlogloss:0.13967	train-mlogloss:0.11823
[17]	ev

In [11]:
# this is prediction
preds = bst.predict(dtest)
labels = dtest.get_label()
print('error=%f' %
      (sum(1 for i in range(len(preds)) if preds[i] != labels[i]) /
       float(len(preds))))

error=0.064663


In [17]:
# specify parameters via map
param = {
    'max_depth': 2,
    'eta': 1,
    'objective': 'multi:softmax',
    'num_class': 6,
    'eval_metric': 'mlogloss'
}

# specify validations set to watch performance
watchlist = [(dtest, 'eval'), (dtrain, 'train')]
num_boost_round, early_stopping_rounds = 200, 10
cv_results = xgb.cv(param, dtrain,
                    num_boost_round=num_boost_round,
                    seed=42, nfold=5,
                    metrics={'mlogloss'},
                    early_stopping_rounds=10
)
cv_results

Unnamed: 0,train-mlogloss-mean,train-mlogloss-std,test-mlogloss-mean,test-mlogloss-std
0,0.426591,0.003592,0.430054,0.004593
1,0.275183,0.000719,0.282285,0.007124
2,0.208027,0.000892,0.217758,0.005539
3,0.177649,0.001519,0.189222,0.004859
4,0.159429,0.001682,0.173555,0.004458
5,0.147278,0.002119,0.163131,0.003508
6,0.138985,0.001722,0.156991,0.004773
7,0.13358,0.001755,0.153485,0.004858
8,0.129715,0.001684,0.15126,0.00539
9,0.126527,0.002013,0.148825,0.00451


In [10]:
loss_function = 'merror'

In [63]:
# You can try wider intervals with a larger step between
# each value and then narrow it down. Here after several
# iteration I found that the optimal value was in the
# following ranges.
gridsearch_params = [
    (max_depth, min_child_weight) for max_depth in range(2, 8) for min_child_weight in range(4, 10)
]
param = {
    'eta': 1,
    'objective': 'multi:softmax',
    'num_class': 6,
    'eval_metric': loss_function
}


# Define initial best params and MAE
min_loss = float("Inf")
best_params = None
for max_depth, min_child_weight in gridsearch_params:
    print(f"CV with max_depth={max_depth}, min_child_weight={min_child_weight}")
    
    # Update our parameters
    param['max_depth'] = max_depth
    param['min_child_weight'] = min_child_weight
    
    # Run CV
    cv_results = xgb.cv(param,dtrain,
                        num_boost_round=num_boost_round,
                        seed=42, nfold=5,
                        metrics={loss_function},
                        early_stopping_rounds=10)

    # Update best loss
    mean_loss = cv_results[f'test-{loss_function}-mean'].min()
    boost_rounds = cv_results[f'test-{loss_function}-mean'].argmin()
    print(f"\t{loss_function} {mean_loss} for {boost_rounds} rounds")
    if mean_loss < min_loss:
        min_loss = mean_loss
        best_params = (max_depth, min_child_weight)
print(f"Best params: {best_params[0]}, {best_params[1]}, {loss_function}: {min_loss}")

CV with max_depth=2, min_child_weight=4
	merror 0.0573298 for 29 rounds
CV with max_depth=2, min_child_weight=5
	merror 0.057955999999999994 for 28 rounds
CV with max_depth=2, min_child_weight=6
	merror 0.0585258 for 21 rounds
CV with max_depth=2, min_child_weight=7
	merror 0.05732980000000001 for 34 rounds
CV with max_depth=2, min_child_weight=8
	merror 0.057842399999999995 for 33 rounds
CV with max_depth=2, min_child_weight=9
	merror 0.058525400000000005 for 17 rounds
CV with max_depth=3, min_child_weight=4
	merror 0.060973599999999996 for 12 rounds
CV with max_depth=3, min_child_weight=5
	merror 0.0610302 for 13 rounds
CV with max_depth=3, min_child_weight=6
	merror 0.05920879999999999 for 19 rounds
CV with max_depth=3, min_child_weight=7
	merror 0.0601764 for 24 rounds
CV with max_depth=3, min_child_weight=8
	merror 0.059778 for 21 rounds
CV with max_depth=3, min_child_weight=9
	merror 0.059892 for 13 rounds
CV with max_depth=4, min_child_weight=4
	merror 0.06256779999999999 for 11

Best params: max_depth: 2, min_child_weight: 7

In [64]:
gridsearch_params = [
    (subsample, colsample)
    for subsample in [i/10. for i in range(7,11)]
    for colsample in [i/10. for i in range(7,11)]
]
param = {
    'max_depth': 2,
    'min_child_weight': 7,
    'eta': 1,
    'objective': 'multi:softmax',
    'num_class': 6,
    'eval_metric': loss_function
}

min_loss = float("Inf")
best_params = None

# We start by the largest values and go down to the smallest
for subsample, colsample in reversed(gridsearch_params):
    print(f"CV with subsample={subsample}, colsample={colsample}")

    # We update our parameters
    param['subsample'] = subsample
    param['colsample_bytree'] = colsample

    # Run CV
    cv_results = xgb.cv(param, dtrain,
                        num_boost_round=num_boost_round,
                        seed=42, nfold=5,
                        metrics={loss_function},
                        early_stopping_rounds=10)

    # Update best score
    mean_loss = cv_results[f'test-{loss_function}-mean'].min()
    boost_rounds = cv_results[f'test-{loss_function}-mean'].argmin()
    print(f"\t{loss_function} {mean_loss} for {boost_rounds} rounds")
    if mean_loss < min_loss:
        min_loss = mean_loss
        best_params = (subsample, colsample)
print(f"Best params: {best_params[0]}, {best_params[1]}, {loss_function}: {min_loss}")

CV with subsample=1.0, colsample=1.0
	merror 0.05732980000000001 for 34 rounds
CV with subsample=1.0, colsample=0.9
	merror 0.0576716 for 22 rounds
CV with subsample=1.0, colsample=0.8
	merror 0.057216 for 41 rounds
CV with subsample=1.0, colsample=0.7
	merror 0.06023339999999999 for 13 rounds
CV with subsample=0.9, colsample=1.0
	merror 0.0574436 for 43 rounds
CV with subsample=0.9, colsample=0.9
	merror 0.05886739999999999 for 29 rounds
CV with subsample=0.9, colsample=0.8
	merror 0.0571592 for 18 rounds
CV with subsample=0.9, colsample=0.7
	merror 0.058013 for 30 rounds
CV with subsample=0.8, colsample=1.0
	merror 0.058297600000000005 for 29 rounds
CV with subsample=0.8, colsample=0.9
	merror 0.06046119999999999 for 16 rounds
CV with subsample=0.8, colsample=0.8
	merror 0.0589812 for 22 rounds
CV with subsample=0.8, colsample=0.7
	merror 0.05818399999999999 for 16 rounds
CV with subsample=0.7, colsample=1.0
	merror 0.058753 for 19 rounds
CV with subsample=0.7, colsample=0.9
	merror 

Best params: subsample: 0.9, colsample: 0.8

In [11]:
%time
param = {
    'max_depth': 2,
    'min_child_weight': 7,
    'subsample': 0.9,
    'colsample_bytree': 0.8,
    'objective': 'multi:softmax',
    'num_class': 6,
    'eval_metric': loss_function
}

min_loss = float("Inf")
best_params = None
for eta in [1, .5, .3, .2, .1, .05, .01, .005]:
    print(f"CV with eta={eta}")
    # We update our parameters
    param['eta'] = eta
    
    # Run and time CV
    %time cv_results = xgb.cv(param, dtrain, \
                              num_boost_round=num_boost_round, \
                              seed=42, nfold=5, \
                              metrics=[loss_function], \
                              early_stopping_rounds=10)

    # Update best score
    mean_loss = cv_results[f'test-{loss_function}-mean'].min()
    boost_rounds = cv_results[f'test-{loss_function}-mean'].argmin()
    print(f"\t{loss_function} {mean_loss} for {boost_rounds} rounds\n")
    if mean_loss < min_loss:
        min_loss = mean_loss
        best_params = eta
print(f"Best params: {best_params}, MLOGLOSS: {min_loss}")

CPU times: user 3 µs, sys: 1e+03 ns, total: 4 µs
Wall time: 6.91 µs
CV with eta=1
CPU times: user 15.5 s, sys: 171 ms, total: 15.7 s
Wall time: 4.28 s
	merror 0.059778 for 25 rounds

CV with eta=0.5
CPU times: user 17.2 s, sys: 123 ms, total: 17.3 s
Wall time: 4.48 s
	merror 0.0575576 for 26 rounds

CV with eta=0.3
CPU times: user 32.3 s, sys: 252 ms, total: 32.5 s
Wall time: 8.58 s
	merror 0.0577284 for 57 rounds

CV with eta=0.2
CPU times: user 46.9 s, sys: 309 ms, total: 47.2 s
Wall time: 12.3 s
	merror 0.056874400000000006 for 96 rounds

CV with eta=0.1
CPU times: user 48.4 s, sys: 506 ms, total: 48.9 s
Wall time: 13.7 s
	merror 0.05915159999999999 for 95 rounds

CV with eta=0.05
CPU times: user 48.2 s, sys: 570 ms, total: 48.8 s
Wall time: 13.5 s
	merror 0.063194 for 99 rounds

CV with eta=0.01
CPU times: user 8.02 s, sys: 94.4 ms, total: 8.11 s
Wall time: 2.19 s
	merror 0.100427 for 5 rounds

CV with eta=0.005
CPU times: user 8.3 s, sys: 88.5 ms, total: 8.39 s
Wall time: 2.22 s
	

Best params: eta: 0.2

In [16]:
param = {
    'max_depth': 2,
    'min_child_weight': 7,
    'subsample': 0.9,
    'colsample_bytree': 0.8,
    'objective': 'multi:softmax',
    'num_class': 6,
    'eval_metric': loss_function,
    'eta': 0.2
}

num_boost_round, early_stopping_rounds = 200, 10
watchlist = [(dtrain, 'train'), (dtest, 'test')]
model = xgb.train(param, dtrain,
                  num_boost_round=num_boost_round,
                  early_stopping_rounds=10,
                  evals=watchlist
)

[0]	train-merror:0.09177	test-merror:0.09335
Multiple eval metrics have been passed: 'test-merror' will be used for early stopping.

Will train until test-merror hasn't improved in 10 rounds.
[1]	train-merror:0.10760	test-merror:0.10974
[2]	train-merror:0.08876	test-merror:0.09039
[3]	train-merror:0.08244	test-merror:0.08515
[4]	train-merror:0.08249	test-merror:0.08333
[5]	train-merror:0.08244	test-merror:0.08288
[6]	train-merror:0.07896	test-merror:0.07832
[7]	train-merror:0.07526	test-merror:0.07650
[8]	train-merror:0.07532	test-merror:0.07605
[9]	train-merror:0.07339	test-merror:0.07514
[10]	train-merror:0.07355	test-merror:0.07514
[11]	train-merror:0.07185	test-merror:0.07468
[12]	train-merror:0.06946	test-merror:0.07240
[13]	train-merror:0.06877	test-merror:0.07218
[14]	train-merror:0.06764	test-merror:0.07195
[15]	train-merror:0.06644	test-merror:0.07081
[16]	train-merror:0.06667	test-merror:0.07104
[17]	train-merror:0.06598	test-merror:0.06967
[18]	train-merror:0.06490	test-merr

In [21]:
full_dtrain = xgb.DMatrix(X, label=y)

param = {
    'max_depth': 2,
    'min_child_weight': 7,
    'subsample': 0.9,
    'colsample_bytree': 0.8,
    'objective': 'multi:softmax',
    'num_class': 6,
    'eval_metric': loss_function,
    'eta': 0.2
}

num_boost_round, early_stopping_rounds = 200, 10
watchlist = [(full_dtrain, 'train')]
model = xgb.train(param, full_dtrain,
                  num_boost_round=num_boost_round,
                  early_stopping_rounds=10,
                  evals=watchlist
)

[0]	train-merror:0.10548
Will train until train-merror hasn't improved in 10 rounds.
[1]	train-merror:0.10361
[2]	train-merror:0.10193
[3]	train-merror:0.08754
[4]	train-merror:0.08467
[5]	train-merror:0.08079
[6]	train-merror:0.08184
[7]	train-merror:0.07911
[8]	train-merror:0.07815
[9]	train-merror:0.07578
[10]	train-merror:0.07478
[11]	train-merror:0.07178
[12]	train-merror:0.06950
[13]	train-merror:0.06895
[14]	train-merror:0.06886
[15]	train-merror:0.06827
[16]	train-merror:0.06754
[17]	train-merror:0.06608
[18]	train-merror:0.06613
[19]	train-merror:0.06458
[20]	train-merror:0.06331
[21]	train-merror:0.06153
[22]	train-merror:0.05984
[23]	train-merror:0.05953
[24]	train-merror:0.05925
[25]	train-merror:0.05871
[26]	train-merror:0.05839
[27]	train-merror:0.05811
[28]	train-merror:0.05770
[29]	train-merror:0.05779
[30]	train-merror:0.05775
[31]	train-merror:0.05679
[32]	train-merror:0.05707
[33]	train-merror:0.05625
[34]	train-merror:0.05588
[35]	train-merror:0.05588
[36]	train-mer

In [22]:
preds = model.predict(dtest)
labels = dtest.get_label()
print('error=%f' %
      (sum(1 for i in range(len(preds)) if preds[i] != labels[i]) /
       float(len(preds))))

error=0.044854


In [23]:
preds_test = model.predict(full_dtest)

In [24]:
result = pd.DataFrame({'id': test_data.id, 'label': preds_test}).round().astype('int32')

In [25]:
result.to_csv('submission_19.csv', index=False)

In [9]:
import numpy as np
A = np.array([[0.91, 0.66], [0.46, 0.33]])
b = np.array([0.25, 0.14])

x_1 = np.array([-0.088, 0.5])
x_2 = np.array([0.99, -1.01])
print(b-A@x_1)
print(b-A@x_2)

[8.000e-05 1.548e-02]
[0.0157 0.0179]
