In [23]:
import argparse

import numpy as np
import sklearn.datasets
import sklearn.model_selection
import sklearn.linear_model

In [5]:
# Command-line options of the assignment.
# These arguments will be set appropriately by ReCodEx, even if you change them.
parser = argparse.ArgumentParser()
parser.add_argument(
    "--recodex",
    action="store_true",
    default=False,
    help="Running in ReCodEx",
)
parser.add_argument(
    "--seed",
    type=int,
    default=42,
    help="Random seed",
)
# `--test_size` accepts either an absolute number of samples (a string made of
# digits only is converted to int) or a fraction (anything else goes to float).
parser.add_argument(
    "--test_size",
    type=lambda text: int(text) if text.isdigit() else float(text),
    default=0.1,
    help="Test size",
)

_StoreAction(option_strings=['--test_size'], dest='test_size', nargs=None, const=None, default=0.1, type=<function <lambda> at 0x137050670>, choices=None, required=False, help='Test size', metavar=None)

In [6]:
# Load the diabetes regression dataset that ships with scikit-learn.
dataset = sklearn.datasets.load_diabetes()

In [13]:
# Print the textual description bundled with the dataset (features, target, size).
print(dataset.DESCR)

.. _diabetes_dataset:

Diabetes dataset
----------------

Ten baseline variables, age, sex, body mass index, average blood
pressure, and six blood serum measurements were obtained for each of n =
442 diabetes patients, as well as the response of interest, a
quantitative measure of disease progression one year after baseline.

**Data Set Characteristics:**

  :Number of Instances: 442

  :Number of Attributes: First 10 columns are numeric predictive values

  :Target: Column 11 is a quantitative measure of disease progression one year after baseline

  :Attribute Information:
      - age     age in years
      - sex
      - bmi     body mass index
      - bp      average blood pressure
      - s1      tc, total serum cholesterol
      - s2      ldl, low-density lipoproteins
      - s3      hdl, high-density lipoproteins
      - s4      tch, total cholesterol / HDL
      - s5      ltg, possibly log of serum triglycerides level
      - s6      glu, blood sugar level

Note: Each of these 1

In [7]:
# Append a constant column of ones to the feature matrix so that the last
# regression coefficient plays the role of the intercept (bias) term.
alpha = np.ones((len(dataset.data), 1))
# `dataset.data` is overwritten in place, so re-running this cell would append
# one more column of ones each time. Only append when the last column is not
# already the constant bias column, which makes the cell safe to re-execute.
if not np.all(dataset.data[:, -1] == 1):
    dataset.data = np.concatenate((dataset.data, alpha), axis=1)

In [8]:
# Inspect the feature matrix; after the previous cell its last column is the constant 1.0.
dataset.data


array([[ 0.03807591,  0.05068012,  0.06169621, ...,  0.01990749,
        -0.01764613,  1.        ],
       [-0.00188202, -0.04464164, -0.05147406, ..., -0.06833155,
        -0.09220405,  1.        ],
       [ 0.08529891,  0.05068012,  0.04445121, ...,  0.00286131,
        -0.02593034,  1.        ],
       ...,
       [ 0.04170844,  0.05068012, -0.01590626, ..., -0.04688253,
         0.01549073,  1.        ],
       [-0.04547248, -0.04464164,  0.03906215, ...,  0.04452873,
        -0.02593034,  1.        ],
       [-0.04547248, -0.04464164, -0.0730303 , ..., -0.00422151,
         0.00306441,  1.        ]])

In [62]:
# Take the split configuration from the argument parser defined above instead of
# repeating its defaults (test_size=0.1, seed=42) as literals. An explicit empty
# argument list is parsed so that the process's own sys.argv is not consumed.
args = parser.parse_args([])
# Hold out part of the data for evaluation; the seed makes the split reproducible.
X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(
    dataset.data, dataset.target, test_size=args.test_size, random_state=args.seed
)

In [63]:
# Ordinary least squares through the normal equations: (X^T X) betas = X^T y.
# The linear system is solved directly instead of forming the explicit inverse
# of X^T X, which is both cheaper and numerically more accurate.
betas = np.linalg.solve(X_train.T @ X_train, X_train.T @ y_train)

In [64]:
# Display the fitted coefficients; the last entry belongs to the appended column of ones.
betas

array([  19.92730105, -262.5526422 ,  509.19584695,  336.09319329,
       -849.30479304,  480.22664735,  120.69064852,  236.72228404,
        716.61278802,   70.41001991,  151.72264646])

In [75]:
# Element-wise products of the coefficients with the first test row, i.e. the
# individual terms whose sum would be that row's prediction (not the prediction itself).
# NOTE(review): `pred` is reassigned by a later cell, so this looks like a leftover
# exploratory step — confirm whether it is still needed.
pred = betas*X_test[0]

In [105]:
# Predict all test samples at once: row i of X_test dotted with the coefficient
# vector is the prediction for sample i, so one matrix-vector product replaces
# the per-row Python loop and yields a 1-D array of len(X_test) predictions.
pred = X_test @ betas

In [114]:
# Root mean squared error of the test-set predictions against the true targets.
# The predictions are read from `pred` (the per-sample prediction vector computed
# above); the mean of the squared residuals is taken before the square root.
rmse = np.sqrt(np.mean((pred - y_test) ** 2))

In [115]:
# Display the test-set RMSE.
rmse

52.382356168443806

In [76]:
# Display the current value of `pred`.
# NOTE(review): the output recorded below has 11 entries (one per coefficient), which
# matches the element-wise product `betas*X_test[0]` rather than per-sample predictions —
# it appears to be stale output from an out-of-order run; re-execute to refresh it.
pred

array([   0.90352342,   11.72077962,   -3.16004607,   -5.37714667,
       -106.17898379,   60.12346436,    2.31569111,    8.12167144,
         23.24141824,   -0.36752653,  151.72264646])