### XGBoost example

Reference: https://machinelearningmastery.com/develop-first-xgboost-model-python-scikit-learn/

#### Install XGBoost for Python first:
`pip install xgboost` (prefix with `sudo` only if you are installing into the system-wide Python)


In [1]:
from numpy import loadtxt
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [3]:
# read the comma-separated Pima Indians diabetes file into a numeric array
dataset = loadtxt(fname='pima-indians-diabetes.data.csv', delimiter=',')

In [6]:
# columns 0-7 are the input features, column 8 is the target label
X, Y = dataset[:, :8], dataset[:, 8]

In [7]:
# hold out 33% of the rows for testing; the fixed seed makes the split repeatable
seed = 7
test_size = 0.33
X_train, X_test, y_train, y_test = train_test_split(
    X, Y,
    random_state=seed,
    test_size=test_size,
)

In [8]:
# fit model on training data, using the classifier's default hyper-parameters
model = XGBClassifier()
# fit() is the last expression of the cell, so its return value is what the
# notebook displays as this cell's output
model.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [9]:
# print the fitted estimator's repr to inspect the hyper-parameters in effect
print(model)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)


In [18]:
# predict a label for every held-out row, then round each prediction
y_pred = model.predict(X_test)
predictions = list(map(round, y_pred))

In [11]:
# compare predictions against the true test labels and report accuracy as a percentage
accuracy = accuracy_score(y_true=y_test, y_pred=predictions)
print("Accuracy: %.2f%%" % (100.0 * accuracy))

Accuracy: 77.95%


In [21]:
# display the training feature matrix produced by the train/test split
X_train

array([[   3.   ,  102.   ,   44.   , ...,   30.8  ,    0.4  ,   26.   ],
       [   1.   ,   77.   ,   56.   , ...,   33.3  ,    1.251,   24.   ],
       [   9.   ,  124.   ,   70.   , ...,   35.4  ,    0.282,   34.   ],
       ..., 
       [   0.   ,   57.   ,   60.   , ...,   21.7  ,    0.735,   67.   ],
       [   1.   ,  105.   ,   58.   , ...,   24.3  ,    0.187,   21.   ],
       [   8.   ,  179.   ,   72.   , ...,   32.7  ,    0.719,   36.   ]])

### XGBoost function for Yelp Dataset:

In [None]:
def XGmodel(X_train, X_test, y_train, y_test):
    """Train a multi-class XGBoost classifier and report its test accuracy.

    Parameters
    ----------
    X_train, X_test : array-like
        Feature matrices for the training and held-out sets.
    y_train, y_test : array-like
        Class labels matching ``X_train`` and ``X_test``.

    Returns
    -------
    float
        Fraction of ``y_test`` labels predicted correctly (the same value
        that is printed as a percentage).
    """
    # Hyper-parameters belong on the constructor: the scikit-learn wrapper's
    # fit() takes (X, y), not a parameter dict or a number of rounds.
    # `num_class` is not passed explicitly; the wrapper derives the number of
    # classes from the labels given to fit().
    # `silent` is not passed either: the estimator repr printed earlier in
    # this notebook shows it already defaults to True.
    model = XGBClassifier(
        objective='multi:softmax',  # softmax multi-class classification
        learning_rate=0.1,          # step-size shrinkage ("eta" in the native API)
        max_depth=6,                # maximum depth of each tree
        n_estimators=5,             # number of boosting rounds
        n_jobs=4,                   # number of parallel threads
    )
    model.fit(X_train, y_train)

    # make predictions for test data; predict() already yields class labels,
    # so no rounding step is required
    y_pred = model.predict(X_test)

    # evaluate predictions
    accuracy = accuracy_score(y_test, y_pred)
    print("Accuracy: %.2f%%" % (accuracy * 100.0))
    return accuracy
    

#### Note that XGBoost does not support categorical features; if your data contains categorical features, load it as a numpy array first and then perform one-hot encoding.