In [3]:
import pandas as pd
import numpy as np

In [4]:
# Read the advertising data; the CSV's first column holds the row labels
# and becomes the DataFrame index rather than a regular column.
advertising = pd.read_csv(
    "../datasets/Advertising.csv",
    index_col=0,
)

In [5]:
# Preview the first rows to check that the columns were parsed as expected.
advertising.head()

Unnamed: 0,TV,radio,newspaper,sales
1,230.1,37.8,69.2,22.1
2,44.5,39.3,45.1,10.4
3,17.2,45.9,69.3,9.3
4,151.5,41.3,58.5,18.5
5,180.8,10.8,58.4,12.9


# Multiple Linear Regression

Thanks to matrix algebra, the ordinary least-squares estimate of the coefficients has a closed form:

$\hat{\beta}=(X^TX)^{-1}X^Ty$

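where $X\in \mathbb{R}^{N\times P}$ is the design matrix (including a column of ones for the intercept) and
$y\in \mathbb{R}^{N\times 1}$ is the response vector.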

In [145]:
# Design matrix: a leading column of ones (so the first coefficient acts as
# the intercept) followed by every column of the data except 'sales'.
predictors = advertising.drop(columns='sales')
beta_0 = np.ones((predictors.shape[0], 1))  # intercept column of ones
X = np.hstack((beta_0, predictors))
# Response as an (N, 1) column vector so it lines up with X in the matrix products.
y = advertising['sales'].to_numpy().reshape(-1, 1)

In [146]:
# Sanity check: X should have one more column than there are predictors
# (the prepended column of ones), and y should be a single column.
X.shape, y.shape

((200, 4), (200, 1))

In [92]:
# Least-squares estimate from the normal equations (X^T X) beta = X^T y.
# The system is solved directly rather than forming (X^T X)^{-1} explicitly:
# the result is mathematically the same, but np.linalg.solve is cheaper and
# numerically more stable when X^T X is ill-conditioned (e.g. correlated
# predictors). beta keeps the same (P, 1) column shape as before.
beta = np.linalg.solve(X.T @ X, X.T @ y)

In [93]:
# Flatten the coefficient column to 1-D so it prints as a flat vector.
print("Beta estimates from scratch:", beta.reshape(-1))

Beta estimates from scratch: [ 2.93888937e+00  4.57646455e-02  1.88530017e-01 -1.03749304e-03]


In [109]:
# Fitted values from the estimated coefficients, flattened to 1-D.
y_hat = np.matmul(X, beta).ravel()
# Mean of the observed response.
y_bar = np.mean(y)

In [118]:
# Use the observed response as a flat vector so it subtracts element-wise
# from the 1-D fitted values and the scalar mean.
observed = y.ravel()
# Residual sum of squares: squared error left unexplained by the fit.
RSS = np.square(observed - y_hat).sum()
# Total sum of squares: squared deviation of the response from its mean.
TSS = np.square(observed - y_bar).sum()

In [119]:
# Coefficient of determination: one minus the ratio of residual to total
# sum of squares.
R_squared = 1 - RSS / TSS

In [123]:
# Report the hand-computed coefficient of determination.
print("R_squared from scratch:",R_squared)

R_squared from scratch: 0.8972106381789522


## Validating with a statistical package

In [124]:
from sklearn.linear_model import LinearRegression

In [125]:
# Least-squares estimator from scikit-learn. fit_intercept=True (the default)
# is spelled out because this model is fitted on the predictors alone,
# without a column of ones.
lin_reg = LinearRegression(fit_intercept=True)

In [129]:
# Predictors only (no column of ones) as a plain ndarray for scikit-learn.
# NOTE: this rebinds X, replacing the design matrix with the intercept column
# built earlier; re-run that cell before re-running any from-scratch cell.
X = advertising.drop(columns='sales').to_numpy()

In [130]:
# Fit on the predictor matrix; y is the response column that was reshaped
# to (-1, 1) when it was built.
lin_reg.fit(X,y)

In [142]:
# Put the fitted intercept first, then the slopes, so the vector lines up
# element-for-element with the from-scratch estimate.
sk_intercept = lin_reg.intercept_.ravel()
sk_slopes = lin_reg.coef_.ravel()
print("beta from sklearn:", np.hstack((sk_intercept, sk_slopes)))

beta from sklearn: [ 2.93888937e+00  4.57646455e-02  1.88530017e-01 -1.03749304e-03]


In [144]:
# Score the fitted model on the same data it was trained on, for comparison
# with the hand-computed R_squared.
r2_sklearn = lin_reg.score(X, y)
print("R_squared from sklearn:", r2_sklearn)

R_squared from sklearn: 0.8972106381789522
