# Gradient Boosting Regression on a Synthetically Generated Dataset

In [9]:
import numpy as np
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_absolute_percentage_error, r2_score

#### Getting Data

In [2]:
# Seed the generator so that a fresh kernel reproduces the same synthetic dataset
# (and therefore the same scores). Without random_state every run draws new data.
# NOTE: outputs saved from an unseeded run will not match the first seeded re-run.
X, y = make_regression(n_samples=5000, n_features=15, n_informative=10, noise=3, random_state=42)

In [3]:
# Sanity check: expect 5000 rows with 15 features, and one target value per row.
X.shape, y.shape

((5000, 15), (5000,))

#### Splitting dataset for Modelling

In [5]:
# Hold out 30% of the rows for testing; the fixed random_state keeps the row assignment repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=101)

In [6]:
# Training partition: 70% of the 5000 generated rows.
X_train.shape, y_train.shape

((3500, 15), (3500,))

In [7]:
# Test partition: the remaining 30% of the rows.
X_test.shape, y_test.shape

((1500, 15), (1500,))

#### Model Building

In [22]:
# Baseline model: 200 boosting stages with squared-error loss; seeded for repeatable fits.
gb_regressor = GradientBoostingRegressor(
    n_estimators=200,
    learning_rate=0.4,
    loss='squared_error',
    random_state=42,
)
gb_regressor.fit(X_train, y_train)

In [13]:
def accuracy(model, X_train, X_test, y_train, y_test, return_scores = 0):
    """Print, and optionally return, the model's train and test scores as percentages.

    Each score is ``model.score(X, y) * 100`` rounded to four decimal places.
    For scikit-learn regressors ``score`` is the coefficient of determination
    (R²), so despite the function's name this is not classification accuracy.

    Parameters
    ----------
    model : fitted estimator exposing ``score(X, y)``.
    X_train, y_train : training features and targets.
    X_test, y_test : held-out features and targets.
    return_scores : truthy to also return the two percentages.

    Returns
    -------
    ``(training_score, test_score)`` when ``return_scores`` is truthy,
    otherwise ``None``.
    """
    def _percent_score(features, target):
        # Scale the raw score to a percentage with four decimal places.
        return round(model.score(features, target) * 100, 4)

    # Both scores are computed before anything is printed.
    training_score = _percent_score(X_train, y_train)
    test_score = _percent_score(X_test, y_test)

    print(f"Training Score: {training_score}%")
    print(f"Test Score: {test_score}%")

    if not return_scores:
        return None
    return training_score, test_score

In [17]:
# Print the train/test scores of the model currently bound to gb_regressor; nothing is returned.
accuracy(gb_regressor, X_train, X_test, y_train, y_test, return_scores = 0)

Training Score: 99.5251%
Test Score: 97.214%


In [18]:
# Sweep n_estimators to see where the test score stops improving.
# Each candidate is bound to its own name so the sweep does not overwrite the
# `gb_regressor` fitted in an earlier cell (which would otherwise end up being the
# 500-estimator model after this loop).
# NOTE(review): candidates are compared on X_test, so any value picked from this
# sweep has been tuned on the test set; consider a validation split or
# cross-validation for the selection and keep X_test for the final estimate.
estimator_grid = [10, 50, 100, 150, 200, 250, 300, 350, 400, 500]
for estimators in estimator_grid:
    candidate = GradientBoostingRegressor(loss='squared_error',
                                          learning_rate=0.4,
                                          n_estimators=estimators,
                                          random_state=42)
    candidate.fit(X_train, y_train)
    print(f"Gradient Boosting Regressor with {estimators} estimators:")
    accuracy(candidate, X_train, X_test, y_train, y_test)
    print("\n")

Gradient Boosting Regressor with 10 estimators:
Training Score: 89.2327%
Test Score: 85.2016%


Gradient Boosting Regressor with 50 estimators:
Training Score: 98.1784%
Test Score: 95.9678%


Gradient Boosting Regressor with 100 estimators:
Training Score: 99.0284%
Test Score: 96.9063%


Gradient Boosting Regressor with 150 estimators:
Training Score: 99.326%
Test Score: 97.1036%


Gradient Boosting Regressor with 200 estimators:
Training Score: 99.5251%
Test Score: 97.214%


Gradient Boosting Regressor with 250 estimators:
Training Score: 99.6643%
Test Score: 97.2999%


Gradient Boosting Regressor with 300 estimators:
Training Score: 99.7482%
Test Score: 97.3484%


Gradient Boosting Regressor with 350 estimators:
Training Score: 99.8078%
Test Score: 97.3792%


Gradient Boosting Regressor with 400 estimators:
Training Score: 99.8517%
Test Score: 97.3863%


Gradient Boosting Regressor with 500 estimators:
Training Score: 99.9087%
Test Score: 97.3948%




#### The Gradient Boosted Regressor with 350 estimators performs well (test score gains are marginal beyond that), so it is picked as the final model

In [23]:
# Final model: same settings as the sweep, fixed at 350 boosting stages.
gb_regressor = GradientBoostingRegressor(
    n_estimators=350,
    learning_rate=0.4,
    loss='squared_error',
    random_state=42,
)
gb_regressor.fit(X_train, y_train)

In [24]:
# Print the train/test scores of the final 350-estimator model; nothing is returned.
accuracy(gb_regressor, X_train, X_test, y_train, y_test, return_scores = 0)

Training Score: 99.8078%
Test Score: 97.3792%


In [25]:
# Predictions from the final model on both partitions, kept for the metric reports below.
y_pred_train = gb_regressor.predict(X_train)
y_pred_test = gb_regressor.predict(X_test)

In [26]:
# Defining function to return the MAPE (Mean Absolute Percent Error)

def get_mape(actual, pred):
    """Return the Mean Absolute Percentage Error (MAPE), expressed in percent.

    Computes ``mean(|actual - pred| / |actual|) * 100`` element-wise.

    Zero targets are guarded: the denominator is clamped to machine epsilon
    (the convention used by ``sklearn.metrics.mean_absolute_percentage_error``),
    so the result is always finite instead of ``nan``/``inf``. A zero target
    that is predicted exactly contributes 0; one that is missed contributes a
    very large value. For non-zero targets the result is unchanged.

    Parameters
    ----------
    actual : array-like of true values (converted to a float array).
    pred : array-like of predicted values (converted to a float array).

    Returns
    -------
    float
        The MAPE as a percentage (e.g. ``10.0`` means 10%).
    """
    actual = np.asarray(actual, dtype=float)
    pred = np.asarray(pred, dtype=float)
    # Clamp |actual| away from 0 so a zero target cannot cause a division by zero.
    denominator = np.maximum(np.abs(actual), np.finfo(np.float64).eps)
    return np.mean(np.abs(actual - pred) / denominator) * 100

In [32]:
# defining a single function to get all required metrics of regression, to avoid writing it again and again

def regression_metrics(actual, predicted, return_results = 0):
    """Print the standard regression metrics and optionally return them.

    Reports MAE, MSE, RMSE (square root of the MSE), MAPE (via ``get_mape``)
    and R², each rounded to two decimal places. Because of that rounding an
    R² of 0.995 or more is shown as 1.0.

    Parameters
    ----------
    actual : true target values.
    predicted : model predictions aligned with ``actual``.
    return_results : truthy to also return the rounded metrics.

    Returns
    -------
    ``(mae, mse, rmse, mape, r_squared)`` when ``return_results`` is truthy,
    otherwise ``None``.
    """
    abs_error = mean_absolute_error(actual, predicted)
    squared_error = mean_squared_error(actual, predicted)
    raw_values = (
        abs_error,
        squared_error,
        np.sqrt(squared_error),
        get_mape(actual, predicted),
        r2_score(actual, predicted),
    )

    # Round everything to two decimals in one pass, keeping the original order.
    mae, mse, rmse, mape, r_squared = (round(value, 2) for value in raw_values)

    report_lines = (
        f'Mean Absolute Error: {mae}',
        f'Mean Squared Error: {mse}',
        f'Root Mean Squared Error: {rmse}',
        f'Mean Absolute Percent Error: {mape}%',
        f'R Squared: {r_squared}',
    )
    for line in report_lines:
        print(line)

    if not return_results:
        return None
    return mae, mse, rmse, mape, r_squared

In [29]:
# Check whether any target (or prediction) is exactly 0, because MAPE divides by the
# actual value and breaks on zeros.
# The comparison must be element-wise: `any(arr) == 0` is only True when *no* element
# is non-zero, so it would print False even for an array that contains zeros.
print(np.any(y_train == 0))
print(np.any(y_test == 0))
print(np.any(y_pred_train == 0))
print(np.any(y_pred_test == 0))

False
False
False
False


In [33]:
# Training-set metrics. A reported "R Squared: 1.0" is a rounding artefact: regression_metrics
# rounds to two decimals, while the unrounded training score printed earlier is 99.8078%.
regression_metrics(y_train, y_pred_train)

Mean Absolute Error: 6.29
Mean Squared Error: 64.84
Root Mean Squared Error: 8.05
Mean Absolute Percent Error: 19.21%
R Squared: 1.0


In [34]:
# Test-set metrics.
# NOTE(review): MAPE comes out large even though R² is high — presumably targets close to
# zero inflate the percentage error; confirm before relying on MAPE for this dataset.
regression_metrics(y_test, y_pred_test)

Mean Absolute Error: 22.16
Mean Squared Error: 808.7
Root Mean Squared Error: 28.44
Mean Absolute Percent Error: 59.23%
R Squared: 0.97
