<a href="https://colab.research.google.com/github/hjn14133/Machine-Learning/blob/main/Bagging_Regression_Yield.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
import numpy as np
import pandas as pd

import sklearn
from sklearn import ensemble, datasets, tree
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
import sys
import warnings

warnings.filterwarnings("ignore")

In [2]:
df = pd.read_csv("FT_join_JH_cleaned.csv")

X = df[['DISTANCE', 'Heading','Elevation', 'AppliedRat']]
y = df['VRYIELDBAL']

# y = df['Loan_Rat']

In [5]:
# Perform data splitting using 80/20 ratio
X_train, X_test, Y_train, Y_test = train_test_split(X, y, test_size=0.2)

In [6]:
from sklearn.ensemble import BaggingRegressor

bag_regressor = BaggingRegressor(random_state=1)
bag_regressor.fit(X_train, Y_train)

BaggingRegressor(base_estimator=None, bootstrap=True, bootstrap_features=False,
                 max_features=1.0, max_samples=1.0, n_estimators=10,
                 n_jobs=None, oob_score=False, random_state=1, verbose=0,
                 warm_start=False)

In [13]:
Y_preds = bag_regressor.predict(X_test)

print(Y_preds[:10])
print(Y_test[:10])

print('Training Coefficient of R^2 : %.3f'%bag_regressor.score(X_train, Y_train))
print('Test Coefficient of R^2 : %.3f'%bag_regressor.score(X_test, Y_test))

bag_regressor.get_params()

[1.68849425 3.74470496 1.1284088  1.61820918 2.74120575 2.49612509
 1.78430113 2.57911151 1.62842032 2.97339184]
10574    1.551293
1975     3.353458
12637    0.221702
9996     1.650577
444      4.225409
4589     2.677815
9365     1.743007
4267     2.735673
8410     1.893007
10590    1.548948
Name: VRYIELDBAL, dtype: float64
Training Coefficient of R^2 : 0.937
Test Coefficient of R^2 : 0.642


{'base_estimator': None,
 'bootstrap': True,
 'bootstrap_features': False,
 'max_features': 1.0,
 'max_samples': 1.0,
 'n_estimators': 10,
 'n_jobs': None,
 'oob_score': False,
 'random_state': 1,
 'verbose': 0,
 'warm_start': False}

In [18]:
# Calculate the absolute errors
errors = abs(Y_preds - Y_test)
# Print out the mean absolute error (mae)
print('Mean Absolute Error:', round(np.mean(errors), 2), '.')


# Calculate mean absolute percentage error (MAPE)
mape = 100 * (errors / Y_test)
# Calculate and display accuracy
accuracy = 100 - np.mean(mape)
print('Accuracy:', round(accuracy, 2), '%.')

Mean Absolute Error: 0.39 .
Accuracy: 74.36 %.


In [23]:
# Get numerical feature importances
importances = list(bag_regressor.best_estimator_.feature_importances_)
feature_list = list(X.columns)
# List of tuples with variable and importance
feature_importances = [(feature, round(importance, 2)) for feature, importance in zip(feature_list, importances)]
# Sort the feature importances by most important first
feature_importances = sorted(feature_importances, key = lambda x: x[1], reverse = True)
# Print out the feature and importances 
[print('Variable: {:20} Importance: {}'.format(*pair)) for pair in feature_importances];

AttributeError: ignored

In [17]:
params = {'base_estimator': [None, LinearRegression(), KNeighborsRegressor()],
          'n_estimators': [5, 10, 100, 500, 1000],
          #'max_samples': [1.0, 2],
          'max_features': [0.5, 1.0, 2],
          'bootstrap': [True],
          'bootstrap_features': [False]}

bagging_regressor_grid = GridSearchCV(BaggingRegressor(random_state=1, n_jobs=-1), param_grid =params, cv=3, n_jobs=-1, verbose=1)
bagging_regressor_grid.fit(X_train, Y_train)

print('Train R^2 Score : %.3f'%bagging_regressor_grid.best_estimator_.score(X_train, Y_train))
print('Test R^2 Score : %.3f'%bagging_regressor_grid.best_estimator_.score(X_test, Y_test))
print('Best R^2 Score Through Grid Search : %.3f'%bagging_regressor_grid.best_score_)
print('Best Parameters : ',bagging_regressor_grid.best_params_)

Fitting 3 folds for each of 45 candidates, totalling 135 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:  3.1min
[Parallel(n_jobs=-1)]: Done 135 out of 135 | elapsed:  6.4min finished


Train R^2 Score : 0.956
Test R^2 Score : 0.670
Best R^2 Score Through Grid Search : 0.657
Best Parameters :  {'base_estimator': None, 'bootstrap': True, 'bootstrap_features': False, 'max_features': 1.0, 'n_estimators': 1000}


In [21]:
Y_preds_grid = bagging_regressor_grid.best_estimator_.predict(X_test)

# Calculate the absolute errors
error_grid = abs(Y_preds_grid - Y_test)
# Print out the mean absolute error (mae)
print('Mean Absolute Error:', round(np.mean(error_grid), 2), '.')


# Calculate mean absolute percentage error (MAPE)
mape_grid = 100 * (error_grid / Y_test)
# Calculate and display accuracy
accuracy_grid = 100 - np.mean(mape_grid)
print('Accuracy:', round(accuracy_grid, 2), '%.')

Mean Absolute Error: 0.37 .
Accuracy: 74.3 %.
