### XGBoost example

Reference: https://machinelearningmastery.com/develop-first-xgboost-model-python-scikit-learn/

#### Install XGBoost for Python first:
`sudo pip install xgboost`


In [39]:
from numpy import loadtxt
from xgboost import XGBClassifier
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [3]:
# read the comma-separated Pima Indians diabetes file into a numeric array
dataset = loadtxt(
    fname='pima-indians-diabetes.data.csv',
    delimiter=",",
)

In [6]:
# the first eight columns are the input features; the ninth column is the target
X, Y = dataset[:, :8], dataset[:, 8]

In [7]:
# hold out 33% of the rows for testing; the fixed seed makes the split repeatable
seed, test_size = 7, 0.33
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=test_size,
    random_state=seed,
)

In [8]:
# fit a default-configured model on the training data
model = XGBClassifier()
# left as the cell's last expression so the fitted estimator is displayed
model.fit(X=X_train, y=y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [9]:
# print the fitted estimator together with its hyperparameters
print(model)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)


In [18]:
# predict a label for every held-out row
y_pred = model.predict(X_test)
# round each predicted value to the nearest integer
predictions = list(map(round, y_pred))

In [11]:
# compare predictions with the true test labels and report accuracy as a percentage
accuracy = accuracy_score(y_true=y_test, y_pred=predictions)
print("Accuracy: %.2f%%" % (100.0 * accuracy))

Accuracy: 77.95%


In [21]:
# inspect the training feature matrix produced by the split
X_train

array([[   3.   ,  102.   ,   44.   , ...,   30.8  ,    0.4  ,   26.   ],
       [   1.   ,   77.   ,   56.   , ...,   33.3  ,    1.251,   24.   ],
       [   9.   ,  124.   ,   70.   , ...,   35.4  ,    0.282,   34.   ],
       ..., 
       [   0.   ,   57.   ,   60.   , ...,   21.7  ,    0.735,   67.   ],
       [   1.   ,  105.   ,   58.   , ...,   24.3  ,    0.187,   21.   ],
       [   8.   ,  179.   ,   72.   , ...,   32.7  ,    0.719,   36.   ]])

### XGBoost function for Yelp Dataset:

In [57]:
def XGmodel(X_train, X_test, y_train, y_test):
    """Train a default XGBClassifier and report its accuracy on the test split.

    Parameters
    ----------
    X_train, y_train
        Training features and labels, passed straight to ``fit``.
    X_test, y_test
        Held-out features to predict on and the labels to score against.

    Returns
    -------
    float
        The value returned by ``accuracy_score`` for the test split. The same
        figure is also printed as a percentage.
    """
    # fit the model on the training data
    model = XGBClassifier()
    model.fit(X_train, y_train)
    # predict() already yields class labels, so no per-element rounding is needed
    predictions = model.predict(X_test)
    # evaluate predictions
    accuracy = accuracy_score(y_test, predictions)
    print("Accuracy: %.2f%%" % (accuracy * 100.0))
    # hand the score back so callers can compare runs instead of parsing stdout
    return accuracy
    

#### Note that XGBoost does not support categorical features; if your data contains categorical features, load it as a NumPy array first and then perform one-hot encoding.
##### Reference: https://machinelearningmastery.com/data-preparation-gradient-boosting-xgboost-python/

In [58]:
# multiclass classification
import pandas
import xgboost
from sklearn import model_selection
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
# NOTE: the old `sklearn.cross_validation` module was removed in scikit-learn 0.20;
# train_test_split is taken from `sklearn.model_selection` (imported above) instead.

# load data (the file has no header row)
data = pandas.read_csv('iris.csv', header=None)
dataset = data.values
# split data into X and y: four feature columns followed by the class column
X = dataset[:,0:4]
Y = dataset[:,4]
# encode string class values as integers; the fitted encoder is kept so the
# integer codes can be mapped back to the original class names later
label_encoder = LabelEncoder()
label_encoded_y = label_encoder.fit_transform(Y)
# hold out 33% of the rows for testing; the fixed seed makes the split repeatable
seed = 7
test_size = 0.33
X_train, X_test, y_train, y_test = model_selection.train_test_split(X, label_encoded_y, test_size=test_size, random_state=seed)


In [59]:
# inspect the integer-encoded training labels
y_train

array([0, 0, 2, 0, 2, 1, 1, 1, 0, 0, 0, 1, 2, 1, 1, 0, 2, 0, 0, 2, 2, 0, 2,
       0, 1, 2, 1, 0, 1, 0, 2, 2, 1, 0, 0, 1, 2, 0, 2, 2, 1, 0, 1, 0, 2, 2,
       0, 0, 2, 1, 2, 2, 1, 0, 0, 2, 0, 0, 1, 2, 2, 1, 1, 0, 2, 0, 0, 1, 1,
       2, 0, 1, 1, 2, 2, 1, 2, 0, 1, 1, 0, 0, 0, 1, 1, 0, 2, 2, 1, 2, 0, 2,
       1, 1, 0, 2, 1, 2, 1, 0])

In [60]:
# train a default classifier on the training data; fit() hands back the estimator
model = xgboost.XGBClassifier().fit(X_train, y_train)
print(model)
# predict a class label for every held-out row
y_pred = model.predict(X_test)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='multi:softprob', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)


In [61]:
# inspect the predicted class labels for the test rows
y_pred

array([2, 1, 0, 1, 1, 0, 1, 1, 0, 1, 2, 1, 0, 2, 0, 2, 2, 2, 0, 0, 1, 2, 1,
       1, 2, 2, 1, 1, 2, 2, 2, 1, 0, 2, 1, 0, 0, 0, 0, 2, 2, 1, 2, 2, 1, 0,
       1, 1, 2, 0])

In [62]:
# round each predicted value to the nearest integer
predictions = list(map(round, y_pred))

In [55]:
# inspect the rounded predictions
predictions

[2,
 1,
 0,
 1,
 1,
 0,
 1,
 1,
 0,
 1,
 2,
 1,
 0,
 2,
 0,
 2,
 2,
 2,
 0,
 0,
 1,
 2,
 1,
 1,
 2,
 2,
 1,
 1,
 2,
 2,
 2,
 1,
 0,
 2,
 1,
 0,
 0,
 0,
 0,
 2,
 2,
 1,
 2,
 2,
 1,
 0,
 1,
 1,
 2,
 0]