### Import Libraries

In [57]:
import numpy as np
import pandas as pd

import xgboost as xgb
import sklearn as skl

In [58]:
from sklearn.datasets import make_classification
from sklearn.metrics import accuracy_score, precision_score, recall_score
# train_test_split lives in sklearn.model_selection; the old
# sklearn.cross_validation module was deprecated in scikit-learn 0.18 and
# removed in 0.20, so importing from it fails on a fresh, current kernel.
from sklearn.model_selection import train_test_split

### Configuration

In [59]:
# Display the value of every bare expression statement in a cell, not just the
# last one; later cells rely on this to show several results from one cell.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

### Set Random Seed

In [60]:
# Single seed reused for data generation and for the train/test split.
seed = 1234

### Prepare Data

In [61]:
# Synthetic binary-classification data: 200 samples, 5 features (3 informative).
X, y = make_classification(
    n_samples=200,
    n_features=5,
    n_informative=3,
    n_classes=2,
    weights=[0.9, 0.1], # roughly 90% / 10% class proportions, so the data set is imbalanced
    shuffle=True,
    random_state=seed)

# Bare f-string expression: its value is shown as the cell output.
f"There are {y.sum()} positive instances"

'There are 22 positive instances'

In [62]:
# Hold out 33% of the samples for testing. The split is stratified on y so
# that both partitions contain positive instances despite the imbalance.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.33,
    stratify=y,
    random_state=seed,
)

# Both bare f-strings are displayed as cell output.
f"{y_train.sum()} positive train instances"
f"{y_test.sum()} positive test instances"

'15 positive train instances'

'7 positive test instances'

### Baseline Model

In [63]:
# Wrap the splits in XGBoost DMatrix objects. The test matrix is built without
# labels because it is only used for prediction.
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test)

In [64]:
# Booster settings shared by every training run in this notebook.
# NOTE: the "Tune Weights" section adds 'scale_pos_weight' to this dict in
# place, so re-running an earlier training cell afterwards (without re-running
# this cell first) would silently pick that setting up as well.
params = {
    'objective':'binary:logistic',
    'max_depth':1,
    # NOTE(review): 'silent' may be deprecated in favour of 'verbosity' in
    # newer XGBoost releases -- confirm against the installed version.
    'silent':1,
    'eta':1
}

# Number of boosting rounds passed to xgb.train.
num_rounds = 15

In [65]:
# Train the baseline booster (no class re-weighting), then turn its predictions
# into hard 0/1 labels using a 0.5 cutoff.
bst = xgb.train(params, dtrain, num_rounds)
y_test_preds = (bst.predict(dtest) > 0.5).astype('int')

In [66]:
def evaluate(y_test, y_test_preds):
    """Summarise binary-classification predictions.

    Parameters
    ----------
    y_test : array-like
        True labels.
    y_test_preds : array-like
        Predicted labels, in the same order as ``y_test``.

    Returns
    -------
    matrix : pandas.DataFrame
        Cross-tabulation of actual (rows) vs. predicted (columns) labels,
        including the "All" margins.
    scores : list of str
        Accuracy, precision and recall, each formatted to two decimals.
    """
    # Drop any pandas index before cross-tabulating: pd.crosstab aligns Series
    # inputs on their index, so a y_test Series with a non-default index would
    # be paired with the wrong predictions (or dropped), and the matrix would
    # disagree with the scores computed below. For plain arrays this is a no-op.
    actual = pd.Series(np.asarray(y_test), name='Actual')
    predicted = pd.Series(np.asarray(y_test_preds), name='Predicted')
    matrix = pd.crosstab(actual, predicted, margins=True)

    # NOTE(review): "Accurancy" is a misspelling of "Accuracy". The label is
    # kept byte-for-byte so the recorded outputs keep matching the code.
    scores = [f"Accurancy: {accuracy_score(y_test, y_test_preds):.2f}",
              f"Precision: {precision_score(y_test, y_test_preds):.2f}",
              f"Recall: {recall_score(y_test, y_test_preds):.2f}"]

    return matrix, scores

In [67]:
# Baseline results. Both bare expressions below are displayed because
# ast_node_interactivity is set to "all" in the configuration cell.
matrix, scores = evaluate(y_test, y_test_preds)
matrix
scores

Predicted,0,1,All
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,59,0,59
1,4,3,7
All,63,3,66


['Accurancy: 0.94', 'Precision: 1.00', 'Recall: 0.43']

**Precision** represents the ability of the classifier to NOT label a negative sample as positive

**Recall** represents the ability of the classifier to find all the positive samples

This is an example of the accuracy paradox 

If you only look at the accuracy of the model it looks good.

However, the recall (0.43) shows that the model performs poorly on the minority class: it finds only 3 of the 7 positive test samples

### Custom Weights

In [68]:
# Per-sample training weights: positives (label 1) count five times as much as
# negatives (label 0); any other label value would get weight 0.
weights = np.where(y_train == 1, 5.0, np.where(y_train == 0, 1.0, 0.0))

# Rebuild the matrices; only the training matrix carries the sample weights.
dtest = xgb.DMatrix(X_test)
dtrain = xgb.DMatrix(X_train, label=y_train, weight=weights)

In [69]:
# Retrain on the weighted training matrix with the same params and number of
# rounds, then threshold the predictions at 0.5 as before.
bst = xgb.train(params, dtrain, num_rounds)
y_test_preds = (bst.predict(dtest) > 0.5).astype('int')

In [70]:
# Results for the model trained with custom per-sample weights.
matrix, scores = evaluate(y_test, y_test_preds)
matrix
scores

Predicted,0,1,All
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,57,2,59
1,1,6,7
All,58,8,66


['Accurancy: 0.95', 'Precision: 0.75', 'Recall: 0.86']

### Tune Weights

In [71]:
# Rebuild the matrices WITHOUT per-sample weights, replacing the weighted
# dtrain from the previous section.
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test)

In [72]:
# Ratio of negative to positive training labels, read back from the DMatrix.
train_labels = dtrain.get_label()
ratio = float((train_labels == 0).sum()) / (train_labels == 1).sum()

# Adds the key to the shared `params` dict in place, so every later
# xgb.train(params, ...) call sees it.
params.update(scale_pos_weight=ratio)

In [73]:
# Retrain with params that now include scale_pos_weight, then threshold the
# predictions at 0.5 as before.
bst = xgb.train(params, dtrain, num_rounds)
y_test_preds = (bst.predict(dtest) > 0.5).astype('int')

In [74]:
# Results for the model trained with scale_pos_weight.
matrix, scores = evaluate(y_test, y_test_preds)
matrix
scores

Predicted,0,1,All
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,54,5,59
1,1,6,7
All,55,11,66


['Accurancy: 0.91', 'Precision: 0.55', 'Recall: 0.86']