# Mean squared error

## Setup

In [1]:
import pandas as pd
import altair as alt

## Data

In [2]:
# Toy dataset with ten observations of sales and the matching ad spend.
# Sales rise in even steps of 2000 (2500 ... 20500); ad spend is listed explicitly.
df = pd.DataFrame({
    'sales': list(range(2500, 20501, 2000)),
    'ads': [900, 1400, 3600, 3800, 6200, 5200, 6800, 8300, 9800, 10100],
})

In [3]:
# Show the whole frame; it has only ten rows, so no truncation is needed
df

Unnamed: 0,sales,ads
0,2500,900
1,4500,1400
2,6500,3600
3,8500,3800
4,10500,6200
5,12500,5200
6,14500,6800
7,16500,8300
8,18500,9800
9,20500,10100


In [4]:
# Scatter plot of the raw data: ad spend on the x-axis, sales on the y-axis
alt.Chart(df).mark_point().encode(x=alt.X('ads'), y=alt.Y('sales'))

## Model

In [4]:
# Reuse the simple linear model from before:
#   predicted sales = 500 (intercept) + 2 (slope) * ads
df['sales_prediction'] = df['ads'].mul(2).add(500)

In [7]:
# Preview the first rows to check the newly added sales_prediction column
df.head()

Unnamed: 0,sales,ads,sales_prediction,diff
0,2500,900,2300,200
1,4500,1400,3300,1200
2,6500,3600,7700,-1200
3,8500,3800,8100,400
4,10500,6200,12900,-2400


In [19]:
# Layer 1: observed data points (ads vs. actual sales)
chart = alt.Chart(df).mark_point().encode(x=alt.X('ads'), y=alt.Y('sales'))

# Layer 2: the model's predictions drawn as a blue line over the same x-axis;
# this layer also carries the human-readable axis titles
line = alt.Chart(df).mark_line().encode(
    x=alt.X('ads', axis=alt.Axis(title='Ads (in $)')),
    y=alt.Y('sales_prediction', axis=alt.Axis(title="Sales (in units)")),
    color=alt.value("#0001F5"),
)

# Overlay both layers in one chart (equivalent to `chart + line`)
alt.layer(chart, line)

### Residuals

In [16]:
# Squared error per row: (actual sales - predicted sales) ** 2
df['errors_squared'] = df['sales'].sub(df['sales_prediction']).pow(2)
df.head()

Unnamed: 0,sales,ads,sales_prediction,diff,diff_squared,errors_squared
0,2500,900,2300,200,40000,40000
1,4500,1400,3300,1200,1440000,1440000
2,6500,3600,7700,-1200,1440000,1440000
3,8500,3800,8100,400,160000,160000
4,10500,6200,12900,-2400,5760000,5760000


#### Sum of squared errors (SSE)

- Calculate the **sum of squared errors** (`SSE`) 
    - SSE is also known as "sum of squared residuals" (SSR) or "residual sum of squares" (RSS)

In [17]:
# Sum of squared errors: add up the per-row squared errors
SSR = df['errors_squared'].agg('sum')
SSR

14520000

> Is this good or bad?

- To put this number in context, let's measure the total variability in the
sales values, i.e. how far they tend to fall from their mean

## Total sum of squares (SST)

In [23]:
# Average of the actual sales values; used as the baseline for the total sum of squares
sales_mean = df['sales'].agg('mean')
sales_mean

11500.0

In [24]:
# Layer 1: observed data points (ads vs. actual sales)
chart = alt.Chart(df).mark_point().encode(x=alt.X('ads'), y=alt.Y('sales'))

# Layer 2: a red horizontal rule at the mean of sales
line = alt.Chart(df).mark_rule(color='red').encode(y=alt.Y('mean(sales)'))

# Overlay both layers in one chart (equivalent to `chart + line`)
alt.layer(chart, line)

In [None]:
# Squared deviation of each actual sales value from the mean of sales
df['variability_squared'] = df['sales'].sub(sales_mean).pow(2)


- We define the sum of the squared differences between the actual sales values and their mean as the **total sum of squares** (`SST`)

In [25]:
# Total sum of squares: add up the squared deviations from the mean
SST = df['variability_squared'].agg('sum')
SST

330000000.0

In [28]:
# Check whether the model's sum of squared errors is smaller than the
# total sum of squares around the mean of sales
SSR < SST

True