In [1]:
import pandas as pd
import numpy as np

In [5]:
# Read the advertising dataset; index_col=0 makes the CSV's first column the row index.
advertising = pd.read_csv(
    "../datasets/Advertising.csv",
    index_col=0,
)

In [6]:
# Preview the first five rows to confirm the columns loaded as expected.
advertising.head(n=5)

Unnamed: 0,TV,radio,newspaper,sales
1,230.1,37.8,69.2,22.1
2,44.5,39.3,45.1,10.4
3,17.2,45.9,69.3,9.3
4,151.5,41.3,58.5,18.5
5,180.8,10.8,58.4,12.9


# Simple Linear Regression

Note that the least-squares estimates of the slope and the intercept are:

$\hat{\beta_1}=\frac{\sum_{i=1}^{n}({x_i-\bar{x}})(y_i-\bar{y})}{\sum_{i=1}^{n}(x_i-\bar{x})^2}$

$\hat{\beta_0}=\bar{y}-\hat{\beta_1}\bar{x}$

In [12]:
# Predictor (TV column) and response (sales column) pulled out as NumPy arrays.
x, y = advertising['TV'].values, advertising['sales'].values

In [14]:
# Sanity check: both arrays should be one-dimensional and of equal length.
(x.shape, y.shape)

((200,), (200,))

In [16]:
# Sample means of the predictor and the response, used by the closed-form estimates.
x_bar, y_bar = x.mean(), y.mean()

In [20]:
# Closed-form least-squares estimates: the slope is the summed cross-deviations
# divided by the summed squared x-deviations; the intercept follows from the means.
b_1 = ((x - x_bar) * (y - y_bar)).sum() / ((x - x_bar) ** 2).sum()
b_0 = y_bar - b_1 * x_bar

In [23]:
# Report the hand-computed intercept (b_0) and slope (b_1).
print("b_0:", b_0)
print("b_1:", b_1)

b_0: 7.0325935491276965
b_1: 0.047536640433019736


In [24]:
# Fitted values from the estimated regression line, one per observation in x.
y_hat = b_0 + b_1 * x

In [27]:
y_hat[:5]  # first five predictions

array([17.97077451,  9.14797405,  7.85022376, 14.23439457, 15.62721814])

$R^2 = 1-\frac{RSS}{TSS}$ where

$RSS=\sum_{i=1}^{n}(y_i-\hat{y_i})^2$

$TSS=\sum_{i=1}^{n}(y_i-\bar{y})^2$

In [28]:
# RSS: squared residuals around the fitted values.
# TSS: squared deviations of the response around its mean.
RSS = ((y - y_hat) ** 2).sum()
TSS = ((y - y_bar) ** 2).sum()

In [29]:
# Fraction of the total sum of squares that is not left in the residuals.
R_squared = 1 - RSS / TSS

In [56]:
# Report the hand-computed coefficient of determination.
print("R^2:", R_squared)

R^2: 0.611875050850071


## Validating with statistical package

In [32]:
from sklearn.linear_model import LinearRegression

In [41]:
# scikit-learn estimators take a 2-D feature matrix, so TV becomes a single column.
# NOTE: this rebinds `x` from shape (n,) to (n, 1); re-running the manual
# cells above after this one would broadcast it against the 1-D `y`.
x = advertising['TV'].values.reshape(-1, 1)

In [43]:
# Linear regression estimator constructed with its default settings.
lin_reg = LinearRegression()

In [44]:
# Fit on the single-column TV matrix against the sales response.
lin_reg.fit(X=x, y=y)

In [51]:
# These should agree with the hand-computed b_0 and b_1; coef_ is printed as
# an array because it holds one slope per feature.
print("b_0:", lin_reg.intercept_)
print("b_1:", lin_reg.coef_)

b_0: 7.0325935491276885
b_1: [0.04753664]


In [52]:
# Predictions from the fitted scikit-learn model on the training inputs.
y_pred = lin_reg.predict(x)

# Make the validation explicit: the library's fitted values must match the
# hand-computed predictions (y_hat) up to floating-point round-off.
# assert_allclose raises AssertionError on mismatch and is silent otherwise.
np.testing.assert_allclose(y_pred, y_hat)

In [57]:
# score() is evaluated on the same data used for fitting, for comparison with
# the manually computed R^2.
print("R_squared from sklearn:", lin_reg.score(x, y))

R_squared from sklearn: 0.611875050850071
