In [1]:
import numpy as np
from sklearn.datasets import load_diabetes

In [3]:
# Load the diabetes dataset directly as a feature matrix X and a target vector y.
X,y = load_diabetes(return_X_y=True)

In [5]:
X.shape

(442, 10)

In [6]:
y.shape

(442,)

In [10]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the samples for testing; the fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=12)

In [11]:
# Show the (rows, columns) shape of the training split, then the test split.
for split in (X_train, X_test):
    print(split.shape)

(353, 10)
(89, 10)


In [12]:
from sklearn.linear_model import LinearRegression
# Reference model: fit scikit-learn's LinearRegression on the training split
# and predict on the held-out test split, to compare against the hand-written
# implementation below.
lr = LinearRegression()
reg = lr.fit(X_train,y_train)
y_pred = reg.predict(X_test)

In [13]:
from sklearn.metrics import r2_score
r2_score(y_test,y_pred)

0.3852693902152976

In [14]:
reg.coef_

array([ -53.9271644 , -228.97924355,  535.29346579,  350.36740549,
       -736.21177951,  427.42399227,   94.8028264 ,  184.23951707,
        721.91554934,  115.46088346])

In [15]:
reg.intercept_

152.89572568540441

# Implementing multiple linear regression like sklearn, from the underlying maths


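$\beta = (X^T X)^{-1} X^T y$, where $X$ is the feature matrix with a leading column of ones (so that $\beta_0$ is estimated as the intercept)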

$y = \beta_0 + \beta_1 x_1 + \beta_2 x_2 + \beta_3 x_3$


$\beta_0$ = intercept

$\beta_1$, $\beta_2$ and $\beta_3$ are the coefficients


So the predict function computes:


$\hat{y} = \beta_0 + \text{np.dot}(X_{\text{test}}, \beta)$, where $\beta$ here is the vector of coefficients without the intercept

In [34]:
class myLR:
    """Ordinary least squares linear regression with a scikit-learn-like API.

    After ``fit`` has been called:

    * ``coef_`` holds the feature weights (beta_1 ... beta_n).
    * ``intercept_`` holds the bias term (beta_0).

    Both are ``None`` until the model is fitted.
    """

    def __init__(self):
        self.coef_ = None
        self.intercept_ = None

    def fit(self, X_train, y_train):
        """Estimate the intercept and coefficients by least squares.

        Parameters
        ----------
        X_train : array-like, one row per sample and one column per feature.
        y_train : array-like of target values, one per row of ``X_train``.

        Returns
        -------
        self : the fitted estimator, so calls can be chained like sklearn's.
        """
        X_train = np.asarray(X_train, dtype=float)
        y_train = np.asarray(y_train, dtype=float)

        # Prepend a column of ones so that beta_0 (the intercept) is estimated
        # together with the feature weights.
        design = np.insert(X_train, 0, 1, axis=1)

        # Solve the least-squares problem directly rather than forming
        # inv(X^T X) X^T y: the result is the same for well-conditioned data,
        # but lstsq stays stable (and returns the minimum-norm solution) when
        # features are collinear and X^T X is singular.
        betas, *_ = np.linalg.lstsq(design, y_train, rcond=None)

        self.intercept_ = betas[0]
        self.coef_ = betas[1:]
        # Show the full parameter vector: beta_0 first, then beta_1 ... beta_n.
        print(betas)
        return self

    def predict(self, X_test):
        """Return predictions ``X_test . coef_ + intercept_`` for each row of ``X_test``."""
        y_pred = np.dot(X_test, self.coef_) + self.intercept_
        return y_pred


In [35]:
# Fit the hand-written model on the same training split; fit() prints the
# estimated parameter vector.
lr = myLR()
lr.fit(X_train, y_train)

# The first value is β0 (the intercept), followed by β1, β2, ... (the coefficients)

[ 152.89572569  -53.9271644  -228.97924355  535.29346579  350.36740549
 -736.21177951  427.42399227   94.8028264   184.23951707  721.91554934
  115.46088346]


In [36]:
X_train

array([[ 0.03807591,  0.05068012, -0.02452876, ..., -0.03949338,
        -0.01599887, -0.02593034],
       [-0.06000263,  0.05068012, -0.0105172 , ...,  0.0158583 ,
        -0.00991877, -0.03421455],
       [ 0.03807591,  0.05068012,  0.00564998, ...,  0.03430886,
         0.01482098,  0.06105391],
       ...,
       [ 0.0090156 ,  0.05068012,  0.06924089, ...,  0.03430886,
         0.10329702,  0.07348023],
       [-0.02730979,  0.05068012,  0.06061839, ...,  0.03430886,
         0.03781053,  0.04862759],
       [ 0.08166637,  0.05068012, -0.02560657, ..., -0.00259226,
        -0.04117617, -0.0052198 ]])

In [37]:
y_pred = lr.predict(X_test)

In [38]:
r2_score(y_test,y_pred)

0.3852693902152978

In [39]:
lr.coef_

array([ -53.9271644 , -228.97924355,  535.29346579,  350.36740549,
       -736.21177951,  427.42399227,   94.8028264 ,  184.23951707,
        721.91554934,  115.46088346])

In [40]:
lr.intercept_

152.89572568540441