# Part 5: Advanced Machine Learning Models
<b>Author</b>: Sterling Cutler
<br>
<b>Date</b>: March 24, 2018

## Boosting, Bagging, and Ensemble Methods
Link: https://quantdare.com/what-is-the-difference-between-bagging-and-boosting/

In [None]:
# Read the modeling data from disk
df <- read.csv("ABI_data.csv")

# Randomly assign 80% of the rows to training; the remaining rows form the test set
n_obs <- nrow(df)
train_ind <- sample(n_obs, size = round(n_obs * 0.8), replace = FALSE)

# Columns 1-6 are used as predictors and column 7 as the target
feature_cols <- 1:6
target_col <- 7
x_train <- data.matrix(df[train_ind, feature_cols])
x_test <- data.matrix(df[-train_ind, feature_cols])
y_train <- df[train_ind, target_col]
y_test <- df[-train_ind, target_col]

# Report the dimensions (rows, columns) of each feature matrix
cat("Train Data Shape:", dim(x_train), "\n")
cat("Test Data Shape:", dim(x_test))

## Gradient Boosting Machine (GBM)
Doc: https://cran.r-project.org/web/packages/gbm/gbm.pdf

Link: https://medium.com/mlreview/gradient-boosting-from-scratch-1e317ae4587d

In [None]:
library(gbm)

# Train a gradient boosted regression model (squared-error loss) on the
# training matrix; each tree is grown on a random half of the rows
gbm <- gbm.fit(
  x = x_train,
  y = y_train,
  distribution = "gaussian",
  n.trees = 1000,
  shrinkage = 0.5,
  bag.fraction = 0.5
)

# Show the fitted model, then the per-predictor summary
print(gbm)
summary(gbm)

In [None]:
# Estimate the optimal number of boosting iterations from the out-of-bag (OOB) error
best_iter <- gbm.perf(object = gbm, method = "OOB")
print(best_iter)

# Summarise the predictors again, this time using only the first best_iter trees
summary(gbm, n.trees = best_iter)

In [None]:
# Marginal effect (partial dependence) plot for each of the six predictors,
# evaluated with the first best_iter trees.
# NOTE: the iteration count is stored in `best_iter` (underscore); the former
# `best.iter` was never defined and made every call fail.
par(mfrow = c(2, 3))
plot(gbm, i.var = 1, n.trees = best_iter)
plot(gbm, i.var = 2, n.trees = best_iter)
plot(gbm, i.var = 3, n.trees = best_iter)
plot(gbm, i.var = 4, n.trees = best_iter)
plot(gbm, i.var = 5, n.trees = best_iter)
plot(gbm, i.var = 6, n.trees = best_iter)

In [None]:
library(Metrics)

# Score the held-out rows using only the first best_iter trees,
# then report the test-set RMSE rounded to four decimals
gbm_preds <- predict(gbm, newdata = x_test, n.trees = best_iter)
gbm_rmse <- round(rmse(y_test, gbm_preds), 4)
paste("RMSE:", gbm_rmse)

## Random Forest
Doc: https://cran.r-project.org/web/packages/randomForest/randomForest.pdf

In [None]:
library(randomForest)

# Fit model to training data (test set supplied so test error is tracked per tree).
# keep.forest = TRUE is required here: when xtest is given, randomForest()
# discards the trees by default, and a later predict(rf, ...) would fail with
# "No forest component in the object".
rf <- randomForest(
  x = x_train,
  y = y_train,
  xtest = x_test,
  ytest = y_test,
  ntree = 500,
  keep.forest = TRUE
)
print(rf)

In [None]:
# 5-fold cross-validation of prediction error as the number of predictors
# is sequentially reduced; prints "<n predictors> <CV error>" pairs
rf_cv <- rfcv(trainx = x_train, trainy = y_train, cv.fold = 5)
paste(rf_cv$n.var, rf_cv$error.cv)

In [None]:
# Dot chart of predictor importance, ordered from most to least important
varImpPlot(x = rf, sort = TRUE)

In [None]:
# Score the held-out rows with the fitted forest,
# then report the test-set RMSE rounded to four decimals
rf_preds <- predict(rf, newdata = x_test)
rf_rmse <- round(rmse(y_test, rf_preds), 4)
paste("RMSE:", rf_rmse)

## XGBOOST
Doc: https://cran.r-project.org/web/packages/xgboost/xgboost.pdf

Link: http://xgboost.readthedocs.io/en/latest/model.html#objective-function-training-loss-regularization

Versus GBM:
- Adds a regularization term to the objective, which helps the model avoid overfitting
- Offers additional hardware/computational benefits (e.g., parallelized tree construction)

In [None]:
library(xgboost)

# Fit model to training data
# xgb.train() needs an xgb.DMatrix, which bundles the features with the label;
# passing the bare feature matrix gives it no target to learn from.
dtrain <- xgb.DMatrix(x_train, label = y_train)
cntrl <- list(eta = 0.3, max_depth = 6)
xgb <- xgb.train(params = cntrl, data = dtrain, nrounds = 1000)

In [None]:
# Estimate out-of-sample error with 5-fold cross validation.
# Only RMSE is tracked: AUC is a classification/ranking metric and is not
# defined for a continuous regression target.
# NOTE(review): "reg:linear" is kept for compatibility with the xgboost version
# this notebook was written for; newer releases name it "reg:squarederror".
cv <- xgb.cv(
  params = list(objective = "reg:linear"),
  data = dtrain,
  nrounds = 3,
  nfold = 5,
  metrics = list("rmse")
)

In [None]:
library(ggplot2)

# Show and plot feature importance
# The booster must be passed as `model =` (the first positional parameter of
# xgb.importance() is not the model in all xgboost versions), and the plotting
# helper takes the resulting importance table rather than the booster itself.
xgb_imp <- xgb.importance(model = xgb)
print(xgb_imp)
xgb.ggplot.importance(importance_matrix = xgb_imp)

In [None]:
# Predict target for test data
# xgboost has no xgb.predict(); predictions come from the generic predict(),
# which takes the fitted booster and the new feature matrix.
xgb_preds <- predict(xgb, x_test)
paste("RMSE:", round(rmse(y_test, xgb_preds), 4))

## Final Note: Distributed Modeling
Sometimes a dataset is too large to model on a single machine in a stable and timely manner.
TODO: discuss distributed modeling with H2O.

## Sources
