In [22]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.preprocessing import StandardScaler, QuantileTransformer, MinMaxScaler, PolynomialFeatures
from sklearn.decomposition import PCA
from scipy.spatial import Delaunay
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
import scipy.stats as stats
from sklearn.model_selection import train_test_split as tts, KFold, GridSearchCV
from sklearn.metrics import mean_squared_error as mse
from scipy.interpolate import interp1d, RegularGridInterpolator, griddata, LinearNDInterpolator, NearestNDInterpolator
from math import ceil
from scipy import linalg

from sklearn.base import BaseEstimator, RegressorMixin
from sklearn.utils.validation import check_X_y, check_array, check_is_fitted


In [23]:
from scipy.spatial.distance import cdist
import xgboost
from sklearn import linear_model

# 1)

### Part a)
Create your class that implements the Gradient Boosting concept, based on the locally weighted regression method (Lowess class), and that allows a user-prescribed number of boosting steps.
Required methods:
 - fit
 - is_fitted
 - predict

----

In [24]:
# Gaussian Kernel
def Gaussian(w):
  return np.where(w>4,0,1/(np.sqrt(2*np.pi))*np.exp(-1/2*w**2))

# Tricubic Kernel
def Tricubic(w):
  return np.where(w>1,0,70/81*(1-w**3)**3)

# Quartic Kernel
def Quartic(w):
  return np.where(w>1,0,15/16*(1-w**2)**2)

# Epanechnikov Kernel
def Epanechnikov(w):
  return np.where(w>1,0,3/4*(1-w**2))

In [25]:
def dist(u,v):
  if len(v.shape)==1:
    v = v.reshape(1,-1)
  d = np.array([np.sqrt(np.sum((u-v[i])**2,axis=1)) for i in range(len(v))])
  return d

In [26]:
def weight_function(u,v,kern=Gaussian,tau=0.5):
    return kern(cdist(u, v, metric='euclidean')/(2*tau))


---


In [27]:
def lw_ag_md(x, y, xnew,f=2/3,iter=3, intercept=True):

  n = len(x)
  r = int(ceil(f * n))
  yest = np.zeros(n)

  if len(y.shape)==1: # here we make column vectors
    y = y.reshape(-1,1)

  if len(x.shape)==1:
    x = x.reshape(-1,1)

  if intercept:
    x1 = np.column_stack([np.ones((len(x),1)),x])
  else:
    x1 = x

  h = [np.sort(np.sqrt(np.sum((x-x[i])**2,axis=1)))[r] for i in range(n)]
  # dist(x,x) is always symmetric
  w = np.clip(dist(x,x) / np.array(h), 0.0, 1.0)
  # note that w is a square matrix and in Python arithmetic operations such as
  # w**3 or 1-w**3 are performed element-wise
  #w = (1-w**3)**3 # a Tricubic kernel
  w = Epanechnikov(w)

  #Looping through all X-points
  delta = np.ones(n)
  for iteration in range(iter):
    for i in range(n):
      W = np.diag(delta).dot(np.diag(w[i,:]))
      # when we multiply two diagonal matrices we get also a diagonal matrix
      b = np.transpose(x1).dot(W).dot(y)
      A = np.transpose(x1).dot(W).dot(x1)
      ##
      A = A + 0.0001*np.eye(x1.shape[1]) # if we want L2 regularization for solving the system
      beta = linalg.solve(A, b)

      #beta, res, rnk, s = linalg.lstsq(A, b)
      yest[i] = np.dot(x1[i],beta.ravel())

    residuals = y.ravel() - yest
    s = np.median(np.abs(residuals))

    delta = np.clip(residuals / (6.0 * s), -1, 1)

    delta = (1 - delta ** 2) ** 2

  # here we are making predictions for xnew by using an interpolation and the predictions we made for the train data
  if x.shape[1]==1:
    f = interp1d(x.flatten(),yest,fill_value='extrapolate')
    output = f(xnew)
  else:
    output = np.zeros(len(xnew))
    for i in range(len(xnew)):
      ind = np.argsort(np.sqrt(np.sum((x-xnew[i])**2,axis=1)))[:r]
      pca = PCA(n_components=3)
      x_pca = pca.fit_transform(x[ind])
      tri = Delaunay(x_pca,qhull_options='QJ Pp')
      f = LinearNDInterpolator(tri,yest[ind])
      output[i] = f(pca.transform(xnew[i].reshape(1,-1)))
      # the output may have NaN's where the data points from xnew are outside the convex hull of X

  if sum(np.isnan(output))>0:
    g = NearestNDInterpolator(x,yest.ravel())
    # output[np.isnan(output)] = g(X[np.isnan(output)])
    output[np.isnan(output)] = g(xnew[np.isnan(output)])
  return output

In [8]:
class MyGradientBoost():
  def __init__(self, kernel = 'Gaussian', tau = .2):
    self.kernel = kernel
    self.tau = tau
    return

  def fit(self, x, y):
    kernel = self.kernel
    tau = self.tau
    self.xtrain_ = x
    self.yhat_ = y

  def is_fitted(self):
    check_is_fitted(self)

  def predict(self, x_new):
    self.is_fitted()
    x = self.xtrain_
    y = self.yhat_
    lm = linear_model.Ridge(alpha=0.001)
    w = weight_function(x,x_new,self.kernel,self.tau)

    if np.isscalar(x_new):
      lm.fit(np.diag(w)@(x.reshape(-1,1)),np.diag(w)@(y.reshape(-1,1)))
      yest = lm.predict([[x_new]])[0][0]
    else:
      n = len(x_new)
      yest_test = np.zeros(n)
      #Looping through all x-points
      for i in range(n):
        lm.fit(np.diag(w[:,i])@x,np.diag(w[:,i])@y)
        yest_test[i] = lm.predict([x_new[i]])
    return yest_test






In [29]:
class GradBoost:
    def __init__(self, kernel = Gaussian, tau=0.05):
        self.kernel = kernel
        self.tau = tau

    def fit(self, x, y):
        kernel = self.kernel
        tau = self.tau
        self.xtrain_ = x
        self.yhat_ = y

    def predict(self, x_new):
        check_is_fitted(self)
        x = self.xtrain_
        y = self.yhat_
        lm = linear_model.Ridge(alpha=0.001)
        w = weight_function(x,x_new,self.kernel,self.tau)

        if np.isscalar(x_new):
          lm.fit(np.diag(w)@(x.reshape(-1,1)),np.diag(w)@(y.reshape(-1,1)))
          yest = lm.predict([[x_new]])[0][0]
        else:
          n = len(x_new)
          yest_test = np.zeros(n)
          #Looping through all x-points
          for i in range(n):
            lm.fit(np.diag(w[:,i])@x,np.diag(w[:,i])@y)
            yest_test[i] = lm.predict([x_new[i]])
        return yest_test

### Part b)
- Show applications with real data for regression
  - For concrete dataset, determine a choice of hyperparameters that yield lower MSEs for your method thean the eXtream Gradient Boosting Library
- 10-fold cross-validation
- Compare effects on different scalers
  - StandardScaler
  - MinMaxScaler
  - QuantileScaler

In [15]:
scaler_s = StandardScaler()
scaler_q = QuantileTransformer()
scaler_m = MinMaxScaler()

In [19]:
scaler_list = [scaler_s, scaler_m, scaler_q]

In [9]:
data = pd.read_csv('/content/drive/MyDrive/W&M/F23/DATA301_Data/concrete.csv')

In [17]:
data.head(3)

Unnamed: 0,cement,slag,ash,water,superplastic,coarseagg,fineagg,age,strength
0,540.0,0.0,0.0,162.0,2.5,1040.0,676.0,28,79.99
1,540.0,0.0,0.0,162.0,2.5,1055.0,676.0,28,61.89
2,332.5,142.5,0.0,228.0,0.0,932.0,594.0,270,40.27


In [12]:
x = data.loc[:,'cement':'age'].values
y = data['strength'].values

In [13]:
xtrain, xtest, ytrain, ytest = tts(x,y,test_size=0.3,shuffle=True,random_state=123)

In [31]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=DeprecationWarning)
warnings.simplefilter(action='ignore', category=UserWarning)

In [32]:
mse_lower_m = []
kf = KFold(n_splits=10,shuffle=True,random_state=1234)
model_rf = RandomForestRegressor(n_estimators=200,max_depth=7)
model_1 = GradBoost(kernel= Gaussian,tau=0.14)
model_2 = GradBoost(kernel= Epanechnikov,tau=0.3)

for scale in scaler_list:
  mse_lwr = []
  for idxtrain, idxtest in kf.split(x):
    xtrain = x[idxtrain]
    ytrain = y[idxtrain].ravel()
    ytest = y[idxtest].ravel()
    xtest = x[idxtest]
    xtrain = scale.fit_transform(xtrain, ytrain)
    xtest = scale.transform(xtest)

    model_1.fit(xtrain,ytrain)
    yhat_train = model_1.predict(xtrain)
    residuals_train = ytrain - yhat_train
    model_2.fit(xtrain, residuals_train)
    residuals_hat = model_2.predict(xtest)
    yhat_lw = model_1.predict(xtest) + model_2.predict(xtest)

    mse_lwr.append(mse(ytest,yhat_lw))
  mse_lower_m.append(np.mean(mse_lwr))


print(f'The Cross-validated Mean Squared Error for Locally Weighted Regression is StandardScaler:\n{mse_lower_m[0]}, \n MinMax:{mse_lower_m[1]}, \n Quantile:{mse_lower_m[2]}')

The Cross-validated Mean Squared Error for Locally Weighted Regression is StandardScaler:
160.46632956977277, 
 MinMax:43.09254217553561, 
 Quantile:20.523452399924093


In [33]:
model_xgboost = xgboost.XGBRFRegressor(n_estimators=200,max_depth=7)

In [34]:
model_xgboost.fit(xtrain,ytrain)
mse(ytest,model_xgboost.predict(xtest))

27.443587005918328

Using the Quantile scaler, my model has a lower MSE compared to XGBoost.

My MSE = 20.523452399924093
XGBoost = 27.443587005918328

#2)
### Based on the Usearch library, create your own class that computes the k_Nearest Neighbors for Regression.

In [35]:
!pip install usearch

Collecting usearch
  Downloading usearch-2.9.1-cp310-cp310-manylinux_2_28_x86_64.whl (2.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.3/2.3 MB[0m [31m10.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: usearch
Successfully installed usearch-2.9.1


In [46]:
import numpy as np
from scipy.spatial.distance import cdist
from usearch.index import search, MetricKind, Matches, BatchMatches, Index

In [38]:
vectors = np.random.rand(10000, 1024).astype(np.float32)
vector = np.random.rand(1024).astype(np.float32)

In [48]:
k = 5

In [50]:
class knn_regression_usearch():
  def __init__(self, k):
    self.k = k

  def fit(self, x, y):
    self.xtrain_ = x
    self.ytrain_ = y


  def ind_vectors(self, vectors):
    many_in_many: BatchMatches = search(vectors, vectors, vectors.shape[0], MetricKind.L2sq, exact=True)
    return many_in_many.to_list()

  def predict(self, x):
    preds_list = []
    for xi in x:
      x_scale = scaler_s.fit_transform.transform([xi])
      one_in_many = search(self.X_train, x_scale, self.k, MetricKind.L2sq, exact=True)
      distances = one_in_many.to_list()

      idx_nearest = np.array(distances)[:, 0].astype('int64')
      nearest_distances = np.array(distances)[:,1].astype(float)

      weights = 1 / nearest_distances
      sum_weight = np.sum(weights)

      if sum_weight != 0:
        pred = np.sum(weights * self.y_train[idx_nearest]) / sum_weight
      preds_list.append(pred)
    return preds_list