In [1]:
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.metrics import accuracy_score
from sklearn.datasets import load_svmlight_file
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.svm import SVC

In [2]:
def load_data_a9a(path = './'):
    """
        input: path
        return: lib svm files
    """
    X_train, y_train = load_svmlight_file(path+'a9a')
    X_test, y_test = load_svmlight_file(path+'a9a.t')

    y_test[y_test==-1] = 0
    y_train[y_train==-1] = 0
    
    
    return {"Xtr":X_train, "ytr":y_train, "Xtst":X_test, "ytst":y_test}

In [3]:
data = load_data_a9a()

In [4]:
def xgb_classifier(data, xgb_params, K=5):
    """
        input: hyperparameters for the model
        return: predictions
    """
    kf = KFold(n_splits = K, random_state = 3228, shuffle = True)
    xgb_preds = []
    cv_preds = []
    cv_labels = []
    train_preds = []
    train_labels = []
    
    for train_index, cv_index in kf.split(data['Xtr']):
        train_X, valid_X = data['Xtr'][train_index], data['Xtr'][cv_index]
        train_y, valid_y = data['ytr'][train_index], data['ytr'][cv_index]

        dtrain = xgb.DMatrix(train_X, train_y)
        dvalid = xgb.DMatrix(valid_X, valid_y)
        dtest = xgb.DMatrix(data['Xtst'])

        watchlist = [(dtrain, 'train'), (dvalid, 'valid')]

        model = xgb.train(xgb_params, dtrain, 1000, watchlist, early_stopping_rounds=100, maximize=False, verbose_eval=50)
        
        #train
        dtrain = xgb.DMatrix(train_X)
        train_preds.append(list(model.predict(dtrain)))
        train_labels.append(train_y) 
        
        #cross validation
        dcv = xgb.DMatrix(valid_X)
        cv_preds.append(list(model.predict(dcv)))
        cv_labels.append(valid_y)
        
        #test
        xgb_pred = model.predict(dtest)
        xgb_preds.append(list(xgb_pred))
        
    return xgb_preds, cv_preds, cv_labels, train_preds, train_labels

In [5]:
def svm_classifier(data, params, K=5):
    """
       input: hyper params, and data
       return: predictions
    """
    kf = KFold(n_splits = K, random_state = 3228, shuffle = True)
    svc_preds = []
    cv_preds = []
    cv_labels = []
    train_preds = []
    train_labels = []
    for train_index, cv_index in kf.split(data['Xtr']):
        train_X, valid_X = data['Xtr'][train_index], data['Xtr'][cv_index]
        train_y, valid_y = data['ytr'][train_index], data['ytr'][cv_index]

        model = SVC(**params)
        model.fit(train_X, train_y)
        
        #train predictions
        train_preds.append(list(model.predict(train_X)))
        train_labels.append(train_y)
        
        #cv predictions
        cv_preds.append(list(model.predict(valid_X)))
        cv_labels.append(valid_y)
        
        #test predictions
        svc_pred = model.predict(data['Xtst'])
        svc_preds.append(list(svc_pred))

    return svc_preds, cv_preds, cv_labels, train_preds, train_labels

In [6]:
xgb_pars = {'eta': 0.1, 'colsample_bytree': 0.3, 'max_depth': 6, "min_child_weight":3, "lambda": .001, "alpha": .5,
            'subsample':.6,'nthread': 4, 'objective': 'binary:logistic'}

In [7]:
K=5
test_preds, cv_preds, cv_labels, train_preds, train_labels = xgb_classifier(data, xgb_pars, K = K)

[0]	train-error:0.184314	valid-error:0.193766
Multiple eval metrics have been passed: 'valid-error' will be used for early stopping.

Will train until valid-error hasn't improved in 100 rounds.
[50]	train-error:0.146537	valid-error:0.161523
[100]	train-error:0.140625	valid-error:0.157992
[150]	train-error:0.135826	valid-error:0.157838
Stopping. Best iteration:
[85]	train-error:0.141431	valid-error:0.156149

[0]	train-error:0.203002	valid-error:0.212224
Multiple eval metrics have been passed: 'valid-error' will be used for early stopping.

Will train until valid-error hasn't improved in 100 rounds.
[50]	train-error:0.14634	valid-error:0.160934
[100]	train-error:0.138854	valid-error:0.154484
[150]	train-error:0.135744	valid-error:0.155405
[200]	train-error:0.133057	valid-error:0.155098
Stopping. Best iteration:
[127]	train-error:0.137049	valid-error:0.15387

[0]	train-error:0.204192	valid-error:0.209613
Multiple eval metrics have been passed: 'valid-error' will be used for early stopping

In [8]:
def merge(preds):
    total_len = 0
    predictions = np.array([])
    for k in range(len(preds)):
        total_len+=len(preds[k])
        predictions = np.concatenate((predictions, np.array(preds[k])), axis = 0)
    predictions.reshape((total_len,))
    predictions[predictions>=.5] = 1
    predictions[predictions<.5] = 0
    return predictions
def mergeKFoldPredictions(preds, K = 5):
    final_preds = []
    for i in range(len(preds[0])):
        res = 0
        for j in range(len(preds)):
            res += preds[j][i]
        final_preds.append(res*(1/len(preds)))
    predictions = np.array(final_preds)
    predictions[predictions>=.5] = 1
    predictions[predictions<.5] = 0
    return predictions

In [9]:
train_predictions = merge(train_preds)
train_labels = merge(train_labels)
accuracy = accuracy_score(train_labels, train_predictions)
print("Accuracy of train Set: %.2f%%" % (accuracy * 100.0))
print("Error of train Set: %.2f%%" % (100 - accuracy * 100.0))

Accuracy of train Set: 86.59%
Error of train Set: 13.41%


In [10]:
cv_predictions = merge(cv_preds)
cv_labels = merge(cv_labels)
accuracy = accuracy_score(cv_labels, cv_predictions)
print("Accuracy of cv Set: %.2f%%" % (accuracy * 100.0))
print("Error of cv Set: %.2f%%" % (100 - accuracy * 100.0))

Accuracy of cv Set: 84.99%
Error of cv Set: 15.01%


In [11]:
test_predictions = mergeKFoldPredictions(test_preds, K)
accuracy = accuracy_score(data['ytst'], test_predictions)
print("Accuracy of test Set: %.2f%%" % (accuracy * 100.0))
print("Error of test Set: %.2f%%" % (100 - accuracy * 100.0))

Accuracy of test Set: 85.28%
Error of test Set: 14.72%


In [12]:
K=2
svm_pars = {"probability":True, "C":1, "gamma":.005, "kernel":'rbf', 'verbose':True}
test_preds2, cv_preds2, cv_labels2, train_preds2, train_labels2 = svm_classifier(data, svm_pars, K = K)

[LibSVM][LibSVM]

In [13]:
train_predictions = merge(train_preds2)
train_labels = merge(train_labels2)
accuracy = accuracy_score(train_labels, train_predictions)
print("Accuracy of train Set: %.2f%%" % (accuracy * 100.0))
print("Error of train Set: %.2f%%" % (100 - accuracy * 100.0))

Accuracy of train Set: 84.44%
Error of train Set: 15.56%


In [14]:
cv_predictions = merge(cv_preds2)
cv_labels = merge(cv_labels2)
accuracy = accuracy_score(cv_labels, cv_predictions)
print("Accuracy of cv Set: %.2f%%" % (accuracy * 100.0))
print("Error of cv Set: %.2f%%" % (100 - accuracy * 100.0))

Accuracy of cv Set: 84.32%
Error of cv Set: 15.68%


In [15]:
test_predictions = mergeKFoldPredictions(test_preds2, K)
accuracy = accuracy_score(data['ytst'], test_predictions)
print("Accuracy of test Set: %.2f%%" % (accuracy * 100.0))
print("Error of test Set: %.2f%%" % (100 - accuracy * 100.0))

Accuracy of test Set: 84.88%
Error of test Set: 15.12%


In [16]:
cv_predictions = mergeKFoldPredictions(cv_preds2, K=2)
accuracy = accuracy_score(data['ytr'], cv_predictions)
print("Accuracy of CV Set: %.2f%%" % (accuracy * 100.0))
print("Error of CV Set: %.2f%%" % (100 - accuracy * 100.0))


test_predictions = mergeKFoldPredictions(test_preds2, K =2)
accuracy = accuracy_score(data['ytst'], test_predictions)
print("Accuracy of test Set: %.2f%%" % (accuracy * 100.0))
print("Error of test Set: %.2f%%" % (100 - accuracy * 100.0))

IndexError: list index out of range