In [34]:
pip install numpy usearch



In [103]:
import numpy as np
import pandas as pd
import torch
import xgboost
from sklearn import linear_model
from sklearn.preprocessing import StandardScaler, MinMaxScaler, QuantileTransformer
from scipy.spatial import Delaunay
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import KFold, GridSearchCV, train_test_split
from sklearn.metrics import mean_squared_error as mse
from scipy import linalg
from scipy.interpolate import interp1d, LinearNDInterpolator, NearestNDInterpolator
from sklearn.decomposition import PCA
from usearch.index import search, Index, MetricKind, Matches, BatchMatches
import usearch
from sklearn.neighbors import KNeighborsRegressor

from sklearn.base import BaseEstimator, RegressorMixin
from sklearn.utils.validation import check_X_y, check_array, check_is_fitted

In [36]:
from google.colab import files
uploads = files.upload()

Saving concrete(1).csv to concrete(1) (1).csv


In [70]:
data = pd.read_csv('concrete(1).csv')
#data

In [38]:
x = data.loc[:,'cement':'age'].values
y = data['strength'].values

## Question 1

In [39]:
# Gaussian Kernel
def Gaussian(w):
  return np.where(w>4,0,1/(np.sqrt(2*np.pi))*np.exp(-1/2*w**2))

# Tricubic Kernel
def Tricubic(w):
  return np.where(w>1,0,70/81*(1-w**3)**3)

# Quartic Kernel
def Quartic(w):
  return np.where(w>1,0,15/16*(1-w**2)**2)

# Epanechnikov Kernel
def Epanechnikov(w):
  return np.where(w>1,0,3/4*(1-w**2))

In [40]:
# not sure if these will be needed
def dist(u,v):
  D = []
  if len(v.shape)==1:
    v = v.reshape(1,-1)
  for row in v:
    D.append(np.sqrt(np.sum((u-row)**2,axis=1)))
  return np.array(D).T

In [41]:
def kernel_function(xi,x0,kern, tau):
    return kern(dist(xi,x0)/(2*tau))

In [42]:
def weight_function(u,v,kern=Gaussian,tau=0.5):
    return kern(dist(u,v)/(2*tau))

In [45]:
class Lowess:
    def __init__(self, kernel = Gaussian, tau=0.05):
        self.kernel = kernel
        self.tau = tau

    def fit(self, x, y):
        kernel = self.kernel
        tau = self.tau
        self.xtrain_ = x
        self.yhat_ = y

    def predict(self, x_new):
        check_is_fitted(self)
        x = self.xtrain_
        y = self.yhat_
        lm = linear_model.Ridge(alpha=0.001)
        w = weight_function(x,x_new,self.kernel,self.tau)

        if np.isscalar(x_new):
          lm.fit(np.diag(w)@(x.reshape(-1,1)),np.diag(w)@(y.reshape(-1,1)))
          yest = lm.predict([[x_new]])[0][0]
        else:
          n = len(x_new)
          yest_test = np.zeros(n)
          #Looping through all x-points
          for i in range(n):
            lm.fit(np.diag(w[:,i])@x,np.diag(w[:,i])@y)
            yest_test[i] = lm.predict(x_new[i].reshape(1,-1))
        return yest_test

In [46]:
scaler = QuantileTransformer()

In [47]:
class GradBoost:
      def __init__(self, model=Lowess, scaler = scaler, kernel = Gaussian, tau=0.05, iterations = 1):
        self.kernel = kernel
        self.tau = tau
        self.model = model
        self.iterations = iterations
        self.scaler = scaler

      # do I need an intercept
      def fit(self, x, y):
        if self.scaler:
          self.X = self.scaler.fit_transform(x)
          self.Y = y.reshape(-1,1)
        else:
          self.X = x
          self.Y = y

      def is_fitted(self):
          try:
            if self.X.any() and self.Y.any():
              return True
            else:
              return False
          except:
            raise Exception('Data must be fitted for you to predict')

      def predict(self, x_new, iterations):
        if self.is_fitted():
          pass
        else:
          raise Exception('Data must be fitted for you to predict')
        output = np.zeros((iterations, x_new.shape[0]))
        for i in range(self.iterations):
          model1 = self.model(tau=self.tau,kernel=self.kernel)
          model1.fit(self.X, self.Y)
          model2 = self.model(tau=self.tau,kernel=self.kernel)
          model2.fit(self.X, self.Y)
          residuals1 = self.Y - model1.predict(x_new = self.X).reshape(-1,1)
          model2.fit(self.X,residuals1)
          output[i] = np.add(model1.predict(x_new), model2.predict(x_new))
        return output


In [69]:
mse_gradboost = []
mse_xgboost = []

kf = KFold(n_splits=10,shuffle=True,random_state=1234)
scaler = QuantileTransformer(n_quantiles = 70)
kerneluse = Quartic
iterations = 1

for tau in np.linspace(0.5, 0.625, 5):
  #print('tau is: ', tau)
  gradboost = GradBoost(scaler=scaler, kernel=kerneluse,tau=tau, iterations=iterations)
  model_xgboost = xgboost.XGBRFRegressor(n_estimators=200,max_depth=7)
  for idxtrain, idxtest in kf.split(x):
    xtrain = x[idxtrain]
    ytrain = y[idxtrain]
    ytest = y[idxtest]
    xtest = x[idxtest]
    xtrain = scaler.fit_transform(xtrain)
    xtest = scaler.transform(xtest)


    gradboost.fit(xtrain,ytrain)
    yhat_gb = gradboost.predict(xtest, iterations=1)
    y_test = np.repeat(ytest,iterations)
    mse_gradboost.append(mse(y_test, yhat_gb.flatten()))
    #print('the mse for gradboost is: ', mse(ytest, yhat_gb.flatten()))

    model_xgboost.fit(xtrain,ytrain)
    mse_xgboost.append(mse(ytest,model_xgboost.predict(xtest)))

print('The Average Cross-validated Mean Squared Error for Gradiant Boosted Lowess is : '+str(np.mean(mse_gradboost))+
      ' with a Quartic kernel and a Quantile Transformer and a Tau between 0.5 and 0.625')
print('The Average Cross-validated Mean Squared Error for XGBoost is : '+str(np.mean(mse_xgboost)))

tau is:  0.5
tau is:  0.53125
tau is:  0.5625
tau is:  0.59375
tau is:  0.625
The Average Cross-validated Mean Squared Error for Gradiant Boosted Lowess is : 26.218138327075152with a Quartic kernel and a Quantile Transformer and a Tau between 0.5 and 0.625
The Average Cross-validated Mean Squared Error for XGBoost is : 31.45365385676951


## Question 2

In [130]:
class UsearchKNN:

  def __init__(self, n_neighbors =10, scaler = StandardScaler()):
    self.k = n_neighbors
    self.scaler = scaler

  def fit(self,X,y):
    if self.scaler:
      self.X = self.scaler.fit_transform(X)
      self.y = y

    else:
      self.X = X
      self.y = y

  def is_fitted(self):
    try:
      if self.X.any():
        return True
      else:
        return False
    except:
        raise Exception('Data must be fitted before predicting')

  def n_nearest_points(self):
    n = self.X.shape[0]
    points = {}

    for i, test_point in enumerate(self.X):
      shortest = {}
      distances = search(self.X, test_point, n, MetricKind.L2sq, exact=True).distances
      indices = search(self.X, test_point,n, MetricKind.L2sq, exact=True).keys
      for idx, distance in enumerate(points[1:self.k]):
        shortest[indices[idx+1]] = distance
      points[i] = shortest

    return points

  def predict(self, xnew):
    if self.scaler:
      xnew = self.scaler.fit_transform(xnew)

    n = xnew.shape[0]

    y_pred = np.ndarray((n,1))
    for i in range(len(xnew)):
      for row in xnew:
        distances = search(self.X, row,self.k, MetricKind.L2sq, exact=True).distances
        indices = search(self.X, row, self.k,MetricKind.L2sq, exact=True).keys
        weights = (1/(distances))
        ypred = weights@y[indices].reshape(-1,1)/np.sum(weights)
        y_pred[i] = ypred

    return y_pred

In [135]:
usearchknn = UsearchKNN(n_neighbors = 10, scaler= StandardScaler())
defaultknn = KNeighborsRegressor(n_neighbors=10)
usearchmses = []
defaultmses = []


X_train, X_test, y_train, y_test = train_test_split(x,y, test_size = 0.25, random_state =50)
kfold = KFold(n_splits = 10, shuffle=True, random_state = 50)
for idxtrain, idxtest in kf.split(x):
  X_train = x[idxtrain]
  y_train = y[idxtrain]
  y_test = y[idxtest]
  X_test = x[idxtest]

  usearchknn.fit(X_train, y_train)
  defaultknn.fit(X_train, y_train)
  ypred1 = usearchknn.predict(X_test)
  ypred2 = defaultknn.predict(X_test)

  usearchmses.append(mse(y_test, ypred1))
  defaultmses.append(mse(y_test, ypred2))

print('the mean mse for my Usearch KNN is : ' , np.mean(usearchmses))
print('The mean mse for the default KNN is : ',  np.mean(defaultmses))



the mean mse for my Usearch KNN is :  432.9882368100819
The mean mse for the default KNN is :  89.87481322621359


 (1 point) Host your project on your GitHiub page.