In [1]:
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.metrics import accuracy_score
from sklearn.datasets import load_svmlight_file
from sklearn.model_selection import train_test_split, KFold
from sklearn.svm import SVC

In [3]:
def load_data_a9a(path = './'):
    """
        input: path
        return: lib svm files
    """
    X_train, y_train = load_svmlight_file(path+'a9a')
    X_test, y_test = load_svmlight_file(path+'a9a.t')

    y_test[y_test==-1] = 0
    y_train[y_train==-1] = 0
    
    
    return {"Xtr":X_train, "ytr":y_train, "Xtst":X_test, "ytst":y_test}

In [4]:
data = load_data_a9a()

In [12]:
def xgb_classifier(data, xgb_params, K=5):
    """
        input: hyperparameters for the model
        return: predictions
    """
    kf = KFold(n_splits = K, random_state = 3228, shuffle = True)
    xgb_preds = []
    for train_index, cv_index in kf.split(data['Xtr']):
        train_X, valid_X = data['Xtr'][train_index], data['Xtr'][cv_index]
        train_y, valid_y = data['ytr'][train_index], data['ytr'][cv_index]

        dtrain = xgb.DMatrix(train_X, train_y)
        dvalid = xgb.DMatrix(valid_X, valid_y)
        dtest = xgb.DMatrix(data['Xtst'])

        watchlist = [(dtrain, 'train'), (dvalid, 'valid')]

        model = xgb.train(xgb_params, dtrain, 1000, watchlist, early_stopping_rounds=100, maximize=False, verbose_eval=50)
        xgb_pred = model.predict(dtest)
        xgb_preds.append(list(xgb_pred))

    return xgb_preds

In [6]:
def svm_classifier(data, params, K=5):
    """
       input: hyper params, and data
       return: predictions
    """
    kf = KFold(n_splits = K, random_state = 3228, shuffle = True)
    svc_preds = []
    for train_index, cv_index in kf.split(data['Xtr']):
        train_X, valid_X = data['Xtr'][train_index], data['Xtr'][cv_index]
        train_y, valid_y = data['ytr'][train_index], data['ytr'][cv_index]

        model = SVC(**params)
        model.fit(train_X, train_y)
        
        svc_pred = model.predict(data['Xtst'])
        svc_preds.append(list(svc_pred))

    return svc_preds

In [7]:
xgb_pars = {'eta': 0.1, 'colsample_bytree': 0.3, 'max_depth': 6, "min_child_weight":3, "lambda": .001, "alpha": .5,
            'subsample':.6,'nthread': 4, 'objective': 'binary:logistic'}

In [13]:
K=5
preds = xgb_classifier(data, xgb_pars, K = K)

[0]	train-error:0.184314	valid-error:0.193766
Multiple eval metrics have been passed: 'valid-error' will be used for early stopping.

Will train until valid-error hasn't improved in 100 rounds.
[50]	train-error:0.146537	valid-error:0.161523
[100]	train-error:0.140625	valid-error:0.157992
[150]	train-error:0.135826	valid-error:0.157838
Stopping. Best iteration:
[85]	train-error:0.141431	valid-error:0.156149

[0]	train-error:0.203002	valid-error:0.212224
Multiple eval metrics have been passed: 'valid-error' will be used for early stopping.

Will train until valid-error hasn't improved in 100 rounds.
[50]	train-error:0.14634	valid-error:0.160934
[100]	train-error:0.138854	valid-error:0.154484
[150]	train-error:0.135744	valid-error:0.155405
[200]	train-error:0.133057	valid-error:0.155098
Stopping. Best iteration:
[127]	train-error:0.137049	valid-error:0.15387

[0]	train-error:0.204192	valid-error:0.209613
Multiple eval metrics have been passed: 'valid-error' will be used for early stopping

In [14]:
len(preds)
final_preds = []
for i in range(len(preds[0])):
    res = 0
    for j in range(K):
        res += preds[j][i]
    final_preds.append(res*(1/K))
predictions = np.array(final_preds)
predictions[predictions>=.5] = 1
predictions[predictions<.5] = 0

accuracy = accuracy_score(data['ytst'], predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

Accuracy: 85.28%


In [8]:
K=2
svm_pars = {"probability":True, "C":1, "gamma":.005, "kernel":'rbf'}
preds = svm_classifier(data, svm_pars, K = K)

In [9]:
len(preds)
final_preds = []
for i in range(len(preds[0])):
    res = 0
    for j in range(K):
        res += preds[j][i]
    final_preds.append(res*(1/K))
predictions = np.array(final_preds)
predictions[predictions>=.5] = 1
predictions[predictions<.5] = 0

accuracy = accuracy_score(data['ytst'], predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

Accuracy: 84.88%
