# load the data

In [3]:
import numpy as np
np.set_printoptions(suppress=True)

#import row data
row_data = np.loadtxt('sarcos_inv.csv',delimiter = ',')
np.random.seed(1)
np.random.shuffle(row_data)
X = row_data[:,:21]
Y = row_data[:,21:]

# normalize_x
X = (X - np.mean(X)) / np.std(X)

#split data
split_size = [0.6,0.2,0.2]
index_train = int(X.shape[0] * split_size[0])
index_cv = index_train + int(X.shape[0] * split_size[1])
index_test = index_cv + int(X.shape[0] * split_size[2])
X_train = X[:index_train, :]
Y_train = Y[:index_train, :]
X_test = X[index_cv:index_test, :]
Y_test = Y[index_cv:index_test, :]

# Nearest neighbour

In [4]:
best_k = -100
best_rmse = 10000000000000000000000
for k in range(1,10):
    size1 = Y_test.shape
    X_predicted = np.zeros(size1)
    size2 = X_predicted.shape[0]
    for i in range(size2):
        distance = np.linalg.norm(X_train - X_test[i], axis=1)
        index = np.argpartition(distance, k)[:k]
        weights = 1 / distance[index]**2
        X_predicted[i][0] = np.sum(np.squeeze(Y_train[index]) \
                                   * np.squeeze(weights))/ np.sum(weights)
    rmse = np.sqrt(np.mean(np.square(X_predicted - Y_test)))
    if rmse < best_rmse:
        best_rmse = rmse
        best_k = k 

In [5]:
print("Nearest neighbor:")
print("k:{}".format(best_k))
print("RMSE: {}.".format(rmse))

Nearest neighbor:
k:5
RMSE: 5.027232981806817.


# Linear regression

In [6]:
n = X_train.shape[1]
w = np.random.normal(0, 0.5, size=(n, 1))
b = np.random.normal(0, 0.5)

iters = 10000
learning_rate = 0.1

costed = 100000000000000000000
new_learning_rate = learning_rate

for i in range(iters):
    prediction = np.dot(X_train, w) + b
    cost = np.sum((prediction - Y_train)**2) / (2 * Y_train.shape[0])
    m = prediction.shape[0]
    grad_w = np.dot(X_train.T, (prediction - Y_train)) / m
    grad_b = np.sum((prediction - Y_train)) / m
    w = w - learning_rate * grad_w
    b = b - learning_rate * grad_b      
    if cost >= costed:
        c_learning_rate = c_learning_rate * 0.5
        continue
        
    costed = cost

prediction = np.dot(X_test, w) + b
rmse = np.sqrt(np.mean(np.square(prediction - Y_test)))    

In [7]:
print("Linear regression :")
print("RMSE: {}.".format(rmse))

Linear regression :
RMSE: 5.605214619103697.


# Regression forest

# Gaussian process

In [20]:
class get_raw_data:

    def __init__(self, path):
        data = np.loadtxt(path, delimiter=',')
        np.random.seed(1)
        np.random.shuffle(data)
        self.X = data[:, :21]
        self.Y = data[:, 21:]

    def split_data(self, split, action=False):

        X = self.X
        if action:
            X = get_raw_data.norm_data(self.X)
        index_train = int(self.X.shape[0] * split[0])
        X_train = X[:index_train, :]
        Y_train = self.Y[:index_train, :]
        index_cv = index_train + int(self.X.shape[0] * split[1])
        index_test = index_cv + int(np.round(self.X.shape[0] * split[2]))
        X_test = X[index_cv:index_test, :]
        Y_test = self.Y[index_cv:index_test, :]
        return X_train, Y_train, X_test, Y_test

    def norm_data(X):
        X = (X - np.mean(X, axis=0)) / np.std(X)
        return X

In [21]:
class GaussianProcess:

    def __init__(self, X_train, Y_train):

        self.X_train = X_train
        self.Y_train = Y_train
        
    def test(self, X_test, Y_test, length_scale):

        distance = np.sum(self.X_train**2, axis=1, keepdims=True) + np.sum(self.X_train**2, axis=1) - 2 * np.dot(self.X_train, self.X_train.T)
        K = np.exp(-(1 / (2 * length_scale**2)) * distance)
        L = np.linalg.cholesky(K)
        distance = np.sum(X_test**2, axis=1, keepdims=True) + np.sum(self.X_train**2, axis=1) - 2 * np.dot(X_test, self.X_train.T)
        K_star_ = np.exp(-(1 / (2 * length_scale**2)) * distance)
        v = np.linalg.solve(L, self.Y_train)
        w = np.linalg.solve(L.T, v)
        mu = np.dot(K_star_, w)
        rmse = np.sqrt(np.mean(np.square(mu - Y_test))) 

        return rmse