In [4]:
import xgboost as xgb
import pandas as pd
import numpy as np
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt

In [5]:
# Read the power-plant data set from the Excel workbook into a DataFrame.
power_plant = pd.read_excel("Folds5x2_pp.xlsx")

# Features: every column except the target "PE".
X = power_plant.drop(columns="PE")
# Target: the "PE" column (net hourly power output) as a NumPy array.
y = power_plant["PE"].values

# Keep 30% of the rows aside for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=1,
)

In [6]:
# Wrap the train and test splits (features plus labels) in XGBoost DMatrix objects.
# NOTE(review): neither object is referenced by the later cells visible here,
# which pass X_train / X_test directly -- confirm they are still needed.
DM_train = xgb.DMatrix(X_train, label=y_train)
DM_test = xgb.DMatrix(X_test, label=y_test)

In [7]:
# Hyper-parameter grid for the search.
# Parameter reference: https://xgboost.readthedocs.io/en/latest/python/python_api.html
gbm_param_grid = dict(
    # five evenly spaced column-subsampling ratios from 0.5 to 0.9 inclusive
    colsample_bytree=np.linspace(0.5, 0.9, 5),
    # number of boosting rounds to try
    n_estimators=[10, 200],
    # maximum tree depths to try
    max_depth=[10, 15, 20, 25],
)

In [8]:
# Base XGBoost regressor with library-default settings; it is the estimator
# handed to the grid search, which supplies the hyper-parameters under test.
gbm = xgb.XGBRegressor()

In [9]:
# Exhaustive search over gbm_param_grid using 5-fold cross-validation.
# Candidates are ranked by negated mean squared error (higher is better).
grid_mse = GridSearchCV(
    estimator=gbm,
    param_grid=gbm_param_grid,
    scoring="neg_mean_squared_error",
    cv=5,
    verbose=1,
)

In [10]:
# Run the search on the training split only.
grid_mse.fit(X_train, y_train)

# best_score_ is a negated MSE (see the scoring option used to build the
# search), so take its magnitude before the square root to report an RMSE.
best_params = grid_mse.best_params_
best_rmse = np.sqrt(np.abs(grid_mse.best_score_))

print("Best parameters found: ",best_params)
print("Lowest RMSE found: ", best_rmse)

Fitting 5 folds for each of 40 candidates, totalling 200 fits


[Parallel(n_jobs=1)]: Done 200 out of 200 | elapsed:  5.6min finished


Best parameters found:  {'colsample_bytree': 0.8, 'max_depth': 15, 'n_estimators': 200}
Lowest RMSE found:  3.192780782426709


In [11]:
# Preview the first rows of the loaded data set.
power_plant.head()

Unnamed: 0,AT,V,AP,RH,PE
0,14.96,41.76,1024.07,73.17,463.26
1,25.18,62.96,1020.04,59.08,444.37
2,5.11,39.4,1012.16,92.14,488.56
3,20.86,57.32,1010.24,76.64,446.48
4,10.82,37.5,1009.23,96.62,473.9


In [None]:
from sklearn.utils import resample

# Regressor fixed at the hyper-parameters reported as best by the grid search.
model = xgb.XGBRegressor(colsample_bytree=0.8, max_depth=15, n_estimators=200)

# Whole data set as a 2-D array; the last column is used as the target below.
values = power_plant.values

# Bootstrap configuration.
n_iterations = 500               # number of bootstrap resamples to draw
n_size = int(len(values) * 1)    # rows per resample: 100% of the data set size

# One seeded generator shared by all draws, so every run produces the same
# sequence of resamples and therefore the same scores.
rng = np.random.RandomState(1)

# The set of distinct rows never changes, so build it once instead of
# re-hashing the full data set on every iteration.
valueset = set(tuple(row) for row in values)

# One out-of-bag score per resample; consumed by the plotting / interval cell.
stats = list()

for _ in range(n_iterations):
    # Training set: n_size rows drawn with replacement.
    train = resample(values, n_samples=n_size, random_state=rng)

    # Test set: the distinct rows that were never drawn into this resample.
    trainset = set(tuple(row) for row in train)
    test = np.array(list(valueset - trainset))

    # Fit on the resample and score on the out-of-bag rows
    # (features = all columns but the last, target = last column).
    model.fit(train[:, :-1], train[:, -1])
    stats.append(model.score(test[:, :-1], test[:, -1]))

In [None]:
from matplotlib import pyplot

# Histogram of the bootstrap scores collected in `stats`.
pyplot.hist(stats)
pyplot.show()

# Percentile interval: with alpha = 0.95 the bounds are the 2.5th and 97.5th
# percentiles of the scores, i.e. 2.5% is left in each tail.
alpha = 0.95
lower_pct = ((1.0-alpha)/2.0) * 100
upper_pct = (alpha+((1.0-alpha)/2.0)) * 100

# The bounds are clamped to the [0, 1] range before being reported.
lower = max(0.0, np.percentile(stats, lower_pct))
upper = min(1.0, np.percentile(stats, upper_pct))

print('%.1f confidence interval %.1f%% and %.1f%%' % (alpha*100, lower*100, upper*100))

In [None]:
# Test-set mean squared error, computed by hand from the fitted search's predictions.
squared_errors = (y_test - grid_mse.predict(X_test)) ** 2
np.mean(squared_errors)

In [None]:
# Predictions for the held-out test split from the fitted grid search.
pred = grid_mse.predict(X_test)

# Root mean squared error on the test split, rounded to two decimals.
test_rmse = np.round(np.sqrt(mean_squared_error(y_test, pred)), 2)
print("Root mean square error for test dataset: {}".format(test_rmse))

In [None]:
# Pair every test-set prediction with its observed value for plotting.
test = pd.DataFrame({"prediction": pred, "observed": y_test.flatten()})

# LOWESS trend of observed (y-axis) against predicted (x-axis).
# statsmodels' lowess takes the response first and the regressor second, and
# returns an array whose column 0 holds the sorted regressor values and whose
# column 1 holds the smoothed response. Passing observed first therefore gives
# a curve in the same orientation as the scatter plot drawn below.
lowess = sm.nonparametric.lowess
z = lowess(y_test.flatten(), pred.flatten())

# Scatter of observed against predicted power output.
test.plot(figsize = [14,8],
          x ="prediction", y = "observed", kind = "scatter", color = 'darkred')
plt.title("Extreme Gradient Boosting: Prediction Vs Test Data", fontsize = 18, color = "darkgreen")
plt.xlabel("Predicted Power Output", fontsize = 18)
plt.ylabel("Observed Power Output", fontsize = 18)

# Overlay the trend: x = predicted (column 0), y = smoothed observed (column 1).
plt.plot(z[:,0], z[:,1], color = "blue", lw= 3)
plt.show()