# Gradient Boosted Locally Weighted Regression
---

Gradient Boosted Locally Weighted Regression (GBLWR) is a supervised learning algorithm that combines two techniques: Gradient Boosting and Locally Weighted Regression.

Gradient Boosting is an ensemble learning method that iteratively trains weak decision trees on the residuals of the previous trees to improve the overall performance of the model.

Locally Weighted Regression is a non-parametric method that fits a regression model to a subset of the data based on the similarity of each observation to a query point.

In GBLWR, the algorithm first uses Locally Weighted Regression to fit a model to a small subset of the training data. Then, Gradient Boosting is used to iteratively improve the performance of the model by fitting additional models to the residuals of the previous models.

The weights used in the Locally Weighted Regression step are updated in each iteration based on the errors of the previous models. This allows the algorithm to give more weight to observations that are difficult to predict, improving the overall performance of the model.

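The final prediction of GBLWR is the sum of the predictions from all the fitted models: the first model supplies the base prediction and each later model adds a correction fitted to the remaining residuals.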

In [1]:
# computational libraries
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.preprocessing import StandardScaler, QuantileTransformer, MinMaxScaler, PolynomialFeatures
from sklearn.decomposition import PCA
from scipy.spatial import Delaunay
from scipy.spatial import distance_matrix, distance
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from sklearn.pipeline import Pipeline
import scipy.stats as stats 
from sklearn.model_selection import train_test_split as tts, KFold, GridSearchCV
from sklearn.metrics import mean_squared_error as mse
from scipy.interpolate import interp1d, RegularGridInterpolator, griddata, LinearNDInterpolator, NearestNDInterpolator
from math import ceil
from scipy import linalg
# the following line(s) are necessary if you want to make SKlearn compliant functions
from sklearn.base import BaseEstimator, RegressorMixin
from sklearn.utils.validation import check_X_y, check_array, check_is_fitted

# Locally Weighted Regression and Boosting functions
---

The functions have been optimized using ChatGPT where appropriate to find faster functions or vectorization. Kernels have now been embedded into the multi-dimensional LOWESS function, and the boosting function has been refactored for faster looping.

In [None]:
def dist(u, v):
  """Return the row-wise Euclidean distance between ``u`` and ``v``.

  The difference ``u - v`` is reduced along axis 1, so the two operands
  must broadcast to an array with at least two dimensions.
  """
  difference = u - v
  return np.linalg.norm(difference, axis=1)

def lw_ag_md(x, y, xnew, f=2/3, iter=3, intercept=True, kernel='Epanechnikov'):
    """Fit a robust locally weighted linear regression and evaluate it at ``xnew``.

    For every training row a weighted, lightly ridge-regularised linear model
    is solved. The weights come from a kernel applied to the distances to the
    other rows, scaled by a per-row bandwidth, and from robustness weights
    that are refreshed after each pass from the residuals. The fitted values
    at the training rows are then interpolated to the query points.

    Parameters
    ----------
    x : ndarray
        Training predictors; a 1-D array is treated as a single column.
    y : ndarray
        Training targets; a 1-D array is treated as a single column.
    xnew : ndarray
        Query points at which predictions are returned.
    f : float
        Fraction of the training rows that defines the local window:
        ``r = ceil(f * n)``, clamped to ``n - 1``.
    iter : int
        Number of fitting passes. The first pass uses unit robustness weights;
        passes stop early once the median absolute residual reaches zero.
    intercept : bool
        When true, a column of ones is prepended to the predictors.
    kernel : str
        One of ``'Epanechnikov'``, ``'Tricubic'``, ``'Quartic'``, ``'Gaussian'``.

    Returns
    -------
    ndarray
        Predictions at ``xnew``. With several predictor columns the result has
        shape ``(len(xnew),)``; with a single column it is whatever the 1-D
        interpolator returns for ``xnew``.

    Raises
    ------
    KeyError
        If ``kernel`` is not one of the supported names.
    """
    kernel_functions = {
        'Epanechnikov': lambda u: np.where(u > 1, 0, 3/4*(1-u**2)),
        'Tricubic': lambda u: np.where(u > 1, 0, 70/81*(1-u**3)**3),
        'Quartic': lambda u: np.where(u > 1, 0, 15/16*(1-u**2)**2),
        'Gaussian': lambda u: np.where(u > 4, 0, np.exp(-0.5*u**2) / np.sqrt(2*np.pi)),
    }
    # Validate the kernel name before doing any of the expensive work.
    try:
        kernel_fn = kernel_functions[kernel]
    except KeyError:
        raise KeyError(
            f"unknown kernel {kernel!r}; expected one of {sorted(kernel_functions)}"
        ) from None

    n = len(x)
    # Column of the sorted distance matrix that sets the bandwidth. Clamped so
    # that f close to 1 cannot index past the last column.
    r = min(int(np.ceil(f * n)), n - 1)
    yest = np.zeros(n)

    y = y.reshape(-1, 1) if len(y.shape) == 1 else y
    x = x.reshape(-1, 1) if len(x.shape) == 1 else x
    x1 = np.column_stack([np.ones((len(x), 1)), x]) if intercept else x

    # Pairwise distances are computed once and reused for the bandwidths.
    dist_x_x = distance_matrix(x, x)
    h = np.sort(dist_x_x, axis=1)[:, r]
    # Duplicate rows can make a bandwidth exactly zero; keep the division
    # below finite instead of producing NaN weights.
    h = np.where(h > 0, h, np.finfo(float).eps)

    w = kernel_fn(np.clip(dist_x_x / h[:, None], 0.0, 1.0))

    delta = np.ones(n)
    ridge = 0.0001 * np.eye(x1.shape[1])

    for _ in range(iter):
        for i in range(n):
            # Scaling the columns of x1.T by the combined weights equals
            # multiplying by the diagonal weight matrix, without building
            # an n-by-n matrix for every row.
            xtw = x1.T * (delta * w[i])
            A = xtw.dot(x1) + ridge
            b = xtw.dot(y)
            beta = np.linalg.solve(A, b)
            yest[i] = np.dot(x1[i], beta.ravel())

        residuals = y.ravel() - yest
        s = np.median(np.abs(residuals))
        if not s > 0:
            # The fit is already exact for at least half of the rows; the
            # rescaling below would divide by zero, so stop reweighting.
            break
        delta = np.clip(residuals / (6.0 * s), -1, 1)
        delta = (1 - delta ** 2) ** 2

    if x.shape[1] == 1:
        # Single predictor: linear interpolation of the fitted values,
        # extrapolating outside the training range.
        interpolate = interp1d(x.flatten(), yest, fill_value='extrapolate')
        return interpolate(xnew)

    # Several predictors: interpolate on a triangulation of the nearest
    # training rows after projecting them onto at most three components.
    n_components = min(3, x.shape[1])
    output = np.zeros(len(xnew))
    for i, query in enumerate(xnew):
        ind = np.argsort(np.sum((x - query) ** 2, axis=1))[:r]
        pca = PCA(n_components=n_components)
        x_pca = pca.fit_transform(x[ind])
        tri = Delaunay(x_pca, qhull_options='QJ Pp')
        interpolate = LinearNDInterpolator(tri, yest[ind])
        # The interpolator returns a length-1 array for a single query row.
        output[i] = interpolate(pca.transform(query.reshape(1, -1)))[0]

    missing = np.isnan(output)
    if missing.any():
        # Queries outside every local triangulation fall back to the fitted
        # value of the nearest training row.
        g = NearestNDInterpolator(x, yest.ravel())
        output[missing] = g(xnew[missing])

    return output

class Lowess_AG_MD:
    """Estimator-style wrapper around the ``lw_ag_md`` function.

    ``fit`` only stores the training data; the locally weighted regression
    itself is computed each time ``predict`` is called.

    Parameters
    ----------
    f : float
        Window fraction forwarded to ``lw_ag_md``.
    iter : int
        Number of fitting passes forwarded to ``lw_ag_md``.
    intercept : bool
        Whether ``lw_ag_md`` adds an intercept column.
    kernel : str
        Default kernel name used by ``predict`` when none is passed.
    """

    def __init__(self, f=1/10, iter=3, intercept=True, kernel='Epanechnikov'):
        self.f = f
        self.iter = iter
        self.intercept = intercept
        self.kernel = kernel

    def fit(self, x, y):
        """Store the training data and return the estimator itself."""
        self.xtrain_ = x
        self.yhat_ = y
        return self

    def predict(self, x_new, kernel=None):
        """Return predictions at ``x_new``.

        ``kernel`` overrides the kernel chosen at construction; when it is
        omitted (or ``None``) the estimator's own ``kernel`` is used.
        """
        check_is_fitted(self)
        if kernel is None:
            kernel = self.kernel
        # Delegate to the module-level LOWESS function.
        return lw_ag_md(
            self.xtrain_, self.yhat_, x_new,
            self.f, self.iter, self.intercept, kernel,
        )

    def get_params(self, deep=True):
        """Return every constructor parameter, including ``kernel``."""
        return {
            "f": self.f,
            "iter": self.iter,
            "intercept": self.intercept,
            "kernel": self.kernel,
        }

    def set_params(self, **parameters):
        """Set the given attributes on the estimator and return it."""
        for parameter, value in parameters.items():
            setattr(self, parameter, value)
        return self

def boosted_lwr(x, y, xnew, f=1/3, iter=5, intercept=True, kernel = 'Epanechnikov'):
    """Predict at ``xnew`` with stage-wise boosted locally weighted regression.

    Each boosting round runs two stages. Every stage fits a ``Lowess_AG_MD``
    model to the residuals left by all earlier stages, adds its prediction at
    ``xnew`` to the output, and then removes its training-set prediction from
    the residuals. With ``iter=1`` this is one base fit plus one correction.

    Parameters
    ----------
    x, y : ndarray
        Training predictors and targets.
    xnew : ndarray
        Query points.
    f : float
        Window fraction passed to every model.
    iter : int
        Number of boosting rounds; the same value is also passed to each
        model as its number of fitting passes.
    intercept : bool
        Whether the local models include an intercept.
    kernel : str
        Kernel name passed to every model.

    Returns
    -------
    ndarray
        Sum of the stage predictions at ``xnew``, shape ``(len(xnew),)``.
    """
    model1 = Lowess_AG_MD(f=f, iter=iter, intercept=intercept, kernel=kernel)
    model2 = Lowess_AG_MD(f=f, iter=iter, intercept=intercept, kernel=kernel)

    # Work on a flat float copy so the caller's targets are never modified
    # and the residual subtraction cannot broadcast to a 2-D array.
    residuals = np.array(y, dtype=float).ravel()
    output = np.zeros(len(xnew))

    n_stages = 2 * iter
    for stage in range(n_stages):
        model = model1 if stage % 2 == 0 else model2
        model.fit(x, residuals)
        output += model.predict(xnew, kernel=kernel)
        # The residuals after the final stage are never used, so skip that
        # (expensive) training-set prediction.
        if stage < n_stages - 1:
            residuals = residuals - model.predict(x, kernel=kernel)

    return output

# Test the implementation
---
To test the implementation we load three datasets: cars, concrete, and housing. For each test we use polynomially engineered features for improved results. For each dataset, k-fold cross-validation is run for each kernel type, a random forest regressor, and an AdaBoost regressor. There is currently no optimization of the hyperparameters in any of the models.

In [None]:
# Import the data using Linux-like file selection
import glob
%cd "/content/drive/MyDrive/Data Science/Data 441/Projects Assignments/Project 3/data"

files = list(glob.glob('*.csv'))

cars, concrete, housing = pd.read_csv(files[0]), pd.read_csv(files[1]), pd.read_csv(files[2])

/content/drive/MyDrive/Data Science/Data 441/Projects Assignments/Project 3/data


In [None]:
# Shared preprocessing objects: a standard scaler, a degree-2 polynomial
# expansion, and a pipeline that applies the scaler first and the expansion second.
scale = StandardScaler()
poly = PolynomialFeatures(degree=2)
pipe = Pipeline(steps=[('zscores', scale), ('Poly', poly)])

## Cars dataset

In [None]:
# Preview the first five rows of the cars table.
cars.head(n=5)

Unnamed: 0,MPG,CYL,ENG,WGT
0,18.0,8,307.0,3504
1,15.0,8,350.0,3693
2,18.0,8,318.0,3436
3,16.0,8,304.0,3433
4,17.0,8,302.0,3449


In [None]:
# Predictors are the CYL..WGT columns; MPG is the target.
x = cars.loc[:, 'CYL':'WGT'].to_numpy()
y = cars['MPG'].to_numpy()

# Hold out 30% of the rows, shuffled with a fixed seed.
xtrain, xtest, ytrain, ytest = tts(
    x, y, test_size=0.3, shuffle=True, random_state=123
)

# Standardise with training-set statistics only, then apply them to the test rows.
scale.fit(xtrain)
xtrain = scale.transform(xtrain)
xtest = scale.transform(xtest)

# One boosting round; f is chosen so the local window covers about 25 training rows.
yhat = boosted_lwr(xtrain, ytrain, xtest, f=25 / len(xtrain), iter=1, intercept=True)

mse(ytest, yhat)

17.778368631794528

## Cars K-Fold Cross-Validation (K=10)

In [None]:
# 10-fold cross-validation on the cars data: boosted locally weighted
# regression (one run per kernel) against a random forest and AdaBoost.
kf = KFold(n_splits=10, shuffle=True, random_state=1234)
kernel_list = ['Epanechnikov', 'Gaussian', 'Quartic', 'Tricubic']
model_list = [
    ('Random Forest', RandomForestRegressor(n_estimators=200, max_depth=5)),
    ('AdaBoost', AdaBoostRegressor(n_estimators=200)),
]

# Per-fold test errors: one list per kernel, plus one list per ensemble model.
mse_lwr = [[] for _ in kernel_list]
mse_rf, mse_ab = [], []

# NOTE(review): the pipeline is fit on every row before the folds are formed,
# so each test fold contributes to the statistics used to transform it.
x_scaled = pipe.fit_transform(x)

for idxtrain, idxtest in kf.split(x):
    xtrain, xtest = x_scaled[idxtrain], x_scaled[idxtest]
    ytrain, ytest = y[idxtrain], y[idxtest]

    # Boosted locally weighted regression, once per kernel.
    for fold_errors, k in zip(mse_lwr, kernel_list):
        yhat_lw = boosted_lwr(xtrain, ytrain, xtest, f=25/len(xtrain), iter=1, intercept=True, kernel=k)
        fold_errors.append(mse(ytest, yhat_lw))

    # Reference ensemble models, refit on the same fold.
    for name, model in model_list:
        model.fit(xtrain, ytrain)
        yhat = model.predict(xtest)
        mse_ = mse(ytest, yhat)
        collector = mse_rf if name == 'Random Forest' else mse_ab
        collector.append(mse_)

print(f'The Cross-validated Mean Squared Error for Locally Weighted Regression (Epanechnikov) is: {np.mean(mse_lwr[0])}')
print(f'The Cross-validated Mean Squared Error for Locally Weighted Regression (Gaussian) is: {np.mean(mse_lwr[1])}')
print(f'The Cross-validated Mean Squared Error for Locally Weighted Regression (Quartic) is: {np.mean(mse_lwr[2])}')
print(f'The Cross-validated Mean Squared Error for Locally Weighted Regression (Tricubic) is: {np.mean(mse_lwr[3])}')
print(f'The Cross-validated Mean Squared Error for Random Forest is: {np.mean(mse_rf)}')
print(f'The Cross-validated Mean Squared Error for AdaBoost is: {np.mean(mse_ab)}')

The Cross-validated Mean Squared Error for Locally Weighted Regression (Epanechnikov) is: 18.216314228704576
The Cross-validated Mean Squared Error for Locally Weighted Regression (Gaussian) is: 16.468630838976356
The Cross-validated Mean Squared Error for Locally Weighted Regression (Quartic) is: 18.493542703425312
The Cross-validated Mean Squared Error for Locally Weighted Regression (Tricubic) is: 18.559283530958634
The Cross-validated Mean Squared Error for Random Forest is: 16.909369362700065
The Cross-validated Mean Squared Error for AdaBoost is: 19.20291102121353


## Concrete dataset 

In [None]:
# Preview the first five rows of the concrete table.
concrete.head(n=5)

Unnamed: 0,cement,slag,ash,water,superplastic,coarseagg,fineagg,age,strength
0,540.0,0.0,0.0,162.0,2.5,1040.0,676.0,28,79.99
1,540.0,0.0,0.0,162.0,2.5,1055.0,676.0,28,61.89
2,332.5,142.5,0.0,228.0,0.0,932.0,594.0,270,40.27
3,332.5,142.5,0.0,228.0,0.0,932.0,594.0,365,41.05
4,198.6,132.4,0.0,192.0,0.0,978.4,825.5,360,44.3


In [None]:
# Predictors are the cement..age columns; strength is the target.
x = concrete.loc[:, 'cement':'age'].to_numpy()
y = concrete['strength'].to_numpy()

# Hold out 30% of the rows, shuffled with a fixed seed.
xtrain, xtest, ytrain, ytest = tts(
    x, y, test_size=0.3, shuffle=True, random_state=123
)

# Standardise with training-set statistics only, then apply them to the test rows.
scale.fit(xtrain)
xtrain = scale.transform(xtrain)
xtest = scale.transform(xtest)

# One boosting round; f is chosen so the local window covers about 25 training rows.
yhat = boosted_lwr(xtrain, ytrain, xtest, f=25 / len(xtrain), iter=1, intercept=True)

mse(ytest, yhat)

56.84140077872742

## Concrete K-Fold Cross-Validation (K=10)

In [None]:
# 10-fold cross-validation on the concrete data: boosted locally weighted
# regression (one run per kernel) against a random forest and AdaBoost.
kf = KFold(n_splits=10, shuffle=True, random_state=1234)
kernel_list = ['Epanechnikov', 'Gaussian', 'Quartic', 'Tricubic']
model_list = [
    ('Random Forest', RandomForestRegressor(n_estimators=200, max_depth=5)),
    ('AdaBoost', AdaBoostRegressor(n_estimators=200)),
]

# Per-fold test errors: one list per kernel, plus one list per ensemble model.
mse_lwr = [[] for _ in kernel_list]
mse_rf, mse_ab = [], []

# NOTE(review): the pipeline is fit on every row before the folds are formed,
# so each test fold contributes to the statistics used to transform it.
x_scaled = pipe.fit_transform(x)

for idxtrain, idxtest in kf.split(x):
    xtrain, xtest = x_scaled[idxtrain], x_scaled[idxtest]
    ytrain, ytest = y[idxtrain], y[idxtest]

    # Boosted locally weighted regression, once per kernel.
    for fold_errors, k in zip(mse_lwr, kernel_list):
        yhat_lw = boosted_lwr(xtrain, ytrain, xtest, f=25/len(xtrain), iter=1, intercept=True, kernel=k)
        fold_errors.append(mse(ytest, yhat_lw))

    # Reference ensemble models, refit on the same fold.
    for name, model in model_list:
        model.fit(xtrain, ytrain)
        yhat = model.predict(xtest)
        mse_ = mse(ytest, yhat)
        collector = mse_rf if name == 'Random Forest' else mse_ab
        collector.append(mse_)

print(f'The Cross-validated Mean Squared Error for Locally Weighted Regression (Epanechnikov) is: {np.mean(mse_lwr[0])}')
print(f'The Cross-validated Mean Squared Error for Locally Weighted Regression (Gaussian) is: {np.mean(mse_lwr[1])}')
print(f'The Cross-validated Mean Squared Error for Locally Weighted Regression (Quartic) is: {np.mean(mse_lwr[2])}')
print(f'The Cross-validated Mean Squared Error for Locally Weighted Regression (Tricubic) is: {np.mean(mse_lwr[3])}')
print(f'The Cross-validated Mean Squared Error for Random Forest is: {np.mean(mse_rf)}')
print(f'The Cross-validated Mean Squared Error for AdaBoost is: {np.mean(mse_ab)}')

## Housing Dataset

In [None]:
# Preview the first five rows of the housing table as loaded.
housing.head(n=5)

In [None]:
from sklearn import preprocessing

# Replace the 'town' and 'river' columns with the integer codes produced by
# LabelEncoder. One encoder object is refit for each column in turn, so
# afterwards it holds the classes of 'river' only.
label_encoder = preprocessing.LabelEncoder()
for column in ('town', 'river'):
    housing[column] = label_encoder.fit_transform(housing[column])

In [None]:
# Preview the first five rows of the housing table after label encoding.
housing.head(n=5)

In [None]:
# Predictors are the town..lstat columns; cmedv is the target.
x = housing.loc[:, 'town':'lstat'].to_numpy()
y = housing['cmedv'].to_numpy()

# Hold out 30% of the rows, shuffled with a fixed seed.
xtrain, xtest, ytrain, ytest = tts(
    x, y, test_size=0.3, shuffle=True, random_state=123
)

# Standardise with training-set statistics only, then apply them to the test rows.
scale.fit(xtrain)
xtrain = scale.transform(xtrain)
xtest = scale.transform(xtest)

# One boosting round; f is chosen so the local window covers about 25 training rows.
yhat = boosted_lwr(xtrain, ytrain, xtest, f=25 / len(xtrain), iter=1, intercept=True)

mse(ytest, yhat)

In [None]:
# 10-fold cross-validation on the housing data: boosted locally weighted
# regression (one run per kernel) against a random forest and AdaBoost.
kf = KFold(n_splits=10, shuffle=True, random_state=1234)
kernel_list = ['Epanechnikov', 'Gaussian', 'Quartic', 'Tricubic']
model_list = [
    ('Random Forest', RandomForestRegressor(n_estimators=200, max_depth=5)),
    ('AdaBoost', AdaBoostRegressor(n_estimators=200)),
]

# Per-fold test errors: one list per kernel, plus one list per ensemble model.
mse_lwr = [[] for _ in kernel_list]
mse_rf, mse_ab = [], []

# NOTE(review): the pipeline is fit on every row before the folds are formed,
# so each test fold contributes to the statistics used to transform it.
x_scaled = pipe.fit_transform(x)

for idxtrain, idxtest in kf.split(x):
    xtrain, xtest = x_scaled[idxtrain], x_scaled[idxtest]
    ytrain, ytest = y[idxtrain], y[idxtest]

    # Boosted locally weighted regression, once per kernel.
    for fold_errors, k in zip(mse_lwr, kernel_list):
        yhat_lw = boosted_lwr(xtrain, ytrain, xtest, f=25/len(xtrain), iter=1, intercept=True, kernel=k)
        fold_errors.append(mse(ytest, yhat_lw))

    # Reference ensemble models, refit on the same fold.
    for name, model in model_list:
        model.fit(xtrain, ytrain)
        yhat = model.predict(xtest)
        mse_ = mse(ytest, yhat)
        collector = mse_rf if name == 'Random Forest' else mse_ab
        collector.append(mse_)

print(f'The Cross-validated Mean Squared Error for Locally Weighted Regression (Epanechnikov) is: {np.mean(mse_lwr[0])}')
print(f'The Cross-validated Mean Squared Error for Locally Weighted Regression (Gaussian) is: {np.mean(mse_lwr[1])}')
print(f'The Cross-validated Mean Squared Error for Locally Weighted Regression (Quartic) is: {np.mean(mse_lwr[2])}')
print(f'The Cross-validated Mean Squared Error for Locally Weighted Regression (Tricubic) is: {np.mean(mse_lwr[3])}')
print(f'The Cross-validated Mean Squared Error for Random Forest is: {np.mean(mse_rf)}')
print(f'The Cross-validated Mean Squared Error for AdaBoost is: {np.mean(mse_ab)}')