# Cross Validation with XGBoost

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.cross_validation import cross_val_score
from sklearn.cross_validation import KFold
import xgboost



### Replace missing values with -1

In [2]:
# Read the preprocessed training users and substitute the sentinel -1 for
# every missing value in a single chained expression.
train_users = pd.read_csv('preprocessed/train_users.csv').fillna(-1)

In [3]:
# Pull the target column out of the frame (pop removes it in place), discard
# the user identifier, and expose the remaining feature columns as an array.
y_train = train_users.pop('country_destination')
del train_users['id']
x_train = train_users.values

In [11]:
# Show the column labels currently held by train_users as a quick sanity check
# of the feature set.
train_users.columns

Index(['age', 'age_group', 'day_account_created', 'weekday_account_created',
       'week_account_created', 'month_account_created', 'year_account_created',
       'day_first_active', 'weekday_first_active', 'week_first_active',
       ...
       'most_used_device_Chromebook', 'most_used_device_Linux Desktop',
       'most_used_device_Mac Desktop', 'most_used_device_Opera Phone',
       'most_used_device_Tablet', 'most_used_device_Windows Desktop',
       'most_used_device_Windows Phone', 'most_used_device_iPad Tablet',
       'most_used_device_iPhone', 'most_used_device_iPodtouch'],
      dtype='object', length=736)

In [5]:
# Learn the mapping from destination labels to integer class ids, then apply
# it to the training targets. The fitted encoder is kept for later use.
label_encoder = LabelEncoder().fit(y_train)
encoded_y_train = label_encoder.transform(y_train)

In [6]:
# Bundle the feature matrix and the encoded targets into XGBoost's native
# data container.
train_data = xgboost.DMatrix(data=x_train, label=encoded_y_train)

### I will be using Normalized Discounted Cumulative Gain (NDCG) as the evaluation metric. NDCG scores a ranked list of predictions: a correct prediction placed at a higher rank receives more weight than one placed lower. NDCG@5 computes this gain over the top five predictions only.

In [7]:
def ndcg5_score(preds, dtrain):
    """Custom XGBoost evaluation metric: mean NDCG over the top five classes.

    For every sample the classes are ranked by predicted score (highest
    first) and only the first five are kept. A sample earns
    ``1 / log2(rank + 1)`` (rank counted from 1) if its true label appears at
    that rank, and 0 if it is not among the kept classes. Each sample has
    exactly one true label, so the ideal DCG is 1 and the discounted gain is
    already normalized.

    Parameters
    ----------
    preds : array-like, shape (n_samples, n_classes)
        Per-class scores, one row per sample.
    dtrain : object exposing ``get_label()``
        Source of the true class ids, one per row of ``preds``.

    Returns
    -------
    tuple
        ``('ndcg5', score)`` where ``score`` is the mean per-sample gain.
        An empty ``preds`` yields a score of ``0.0`` instead of raising.

    Raises
    ------
    ValueError
        If ``preds`` is non-empty but not two-dimensional.
    """
    top_k = 5
    labels = np.asarray(dtrain.get_label())
    preds = np.asarray(preds)

    # Nothing to rank: return a defined score rather than failing on indexing.
    if preds.size == 0:
        return 'ndcg5', 0.0
    if preds.ndim != 2:
        raise ValueError(
            'preds must be 2-D (n_samples, n_classes), got %d-D' % preds.ndim)

    # Rank all rows at once: ascending argsort, reversed, truncated to top_k.
    # With fewer than top_k classes every class is kept.
    ranked = np.argsort(preds, axis=1)[:, ::-1][:, :top_k]

    # hits[i, j] is 1 when the true label of sample i sits at rank j.
    hits = (ranked == labels[:, np.newaxis]).astype(int)

    # Positional discounts log2(2), log2(3), ... for ranks 1, 2, ...
    discounts = np.log2(np.arange(2, hits.shape[1] + 2))
    score = np.mean(np.sum(hits / discounts, axis=1))
    return 'ndcg5', score

In [12]:
# Booster parameters for the native XGBoost API (xgboost.cv / xgboost.train).
# The scikit-learn-wrapper-only keys `n_estimators` and `missing` are left out:
# the native API does not read them, and the number of boosting rounds is
# controlled solely by `num_boost_round` below.
param = {
    'max_depth': 10,
    'learning_rate': 0.1,
    'objective': 'multi:softprob',
    # One output column per encoded destination class; derived from the fitted
    # encoder so it always matches the labels stored in train_data.
    'num_class': len(label_encoder.classes_),
    'gamma': 0,
    'min_child_weight': 1,
    'max_delta_step': 0,
    'subsample': 1,
    'colsample_bytree': 1,
    'colsample_bylevel': 1,
    'reg_alpha': 0,
    'reg_lambda': 1,
    'scale_pos_weight': 1,
    'base_score': 0.5,
    'silent': True,
    'nthread': 4,
    'seed': 42
}

# Cross-validate for a fixed number of boosting rounds and score every round
# with the custom NDCG@5 metric.
num_round = 10
result = xgboost.cv(param, train_data, num_boost_round=num_round, metrics=['mlogloss'], feval=ndcg5_score)

Unnamed: 0,test-ndcg5-mean,test-ndcg5-std,train-ndcg5-mean,train-ndcg5-std
0,0.925338,0.00072,0.930782,0.000312
1,0.926033,0.000763,0.930904,0.000282
2,0.926321,0.000806,0.93112,0.000272
3,0.926413,0.000754,0.931317,0.000226
4,0.926593,0.000721,0.931594,0.000276
5,0.926628,0.000762,0.931834,0.000325
6,0.926689,0.000785,0.932162,0.000404
7,0.926689,0.00079,0.932481,0.000439
8,0.926748,0.000775,0.932779,0.000547
9,0.926804,0.00074,0.933165,0.000499


In [None]:
# Leave the cross-validation frame as the cell's last expression so the
# notebook renders it as a rich table instead of plain printed text.
result