# Least squares

## Setup

In [23]:
import pandas as pd
import altair as alt

from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

## Data

In [24]:
df = pd.DataFrame(
    {'sales': [2500, 4500, 6500, 8500, 10500, 12500, 14500, 16500, 18500, 20500],
      'ads'  : [900, 1400, 3600, 3800, 6200, 5200, 6800, 8300, 9800, 10100]}
)

In [25]:
# Prepare the data
X = df[["ads"]]
y = df[["sales"]]

## Model

### Fit model

In [26]:
# linear regression model
reg = LinearRegression()

# Fit the model
reg.fit(X, y)


### Coefficients

In [27]:
# Intercept
reg.intercept_

array([1181.21008618])

In [28]:
# Slope
reg.coef_

array([[1.83935649]])

### Predictions

In [29]:
# Make predictions
y_pred = reg.predict(X)

In [30]:
y_pred

array([[ 2836.63092797],
       [ 3756.30917341],
       [ 7802.89345334],
       [ 8170.76475151],
       [12585.22032962],
       [10745.86383874],
       [13688.83422414],
       [16447.86896046],
       [19206.90369678],
       [19758.71064404]])

### Evaluation

In [31]:
r2_score(y, y_pred)

0.9648260865785225

In [32]:
chart = alt.Chart(df).mark_point().encode(
    x='ads',
    y='sales'
)

line = alt.Chart(df).mark_line().encode(
         alt.X('ads', axis=alt.Axis(title='Ads (in $)')),
         alt.Y('sales_prediction', axis=alt.Axis(title="Sales (in units)")),
         color=alt.value("#0001F5"))

chart + line

ValueError: sales_prediction encoding field is specified without a type; the type cannot be inferred because it does not match any column in the data.

alt.LayerChart(...)

### Residuals

In [None]:
# calculate squared difference between actual and predicted
df['errors_squared'] = (df['sales'] - df['sales_prediction']) ** 2
df.head()

Unnamed: 0,sales,ads,sales_prediction,diff,diff_squared,errors_squared
0,2500,900,2300,200,40000,40000
1,4500,1400,3300,1200,1440000,1440000
2,6500,3600,7700,-1200,1440000,1440000
3,8500,3800,8100,400,160000,160000
4,10500,6200,12900,-2400,5760000,5760000


#### Sum of squared errors(SSE)

- Calculate the **sum of squared errors** (`SSE`) 
    - SSE is also known as "sum of squared residuals" (SSR) or "residual sum of squares" (RSS)

In [None]:
SSR = df['errors_squared'].sum()
SSR

14520000

> Is this good or bad?

- To get a better idea, let's calculate the variability in the 
sales values by how far they tend to fall from their mean

## Total sum of squares (SST)

In [None]:
# calculate mean of sales
sales_mean = df['sales'].mean()
sales_mean

11500.0

In [None]:
chart = alt.Chart(df).mark_point().encode(
    x='ads',
    y='sales'
)

line = alt.Chart(df).mark_rule(color='red').encode(
    y='mean(sales)'
)

chart + line

In [None]:
# calculate squared difference between actual and mean
df['variability_squared'] = (df['sales'] - sales_mean) ** 2


- We define the sum of the squared differences as the **total sum of squares** (`SST`)

In [None]:
SST = df['variability_squared'].sum()
SST

330000000.0

In [None]:
SSR < SST

True