# Strength of a fit

## Setup

In [None]:
import pandas as pd
import altair as alt

from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

import warnings

# Silence FutureWarning messages so they do not clutter the cell outputs
warnings.simplefilter(action='ignore', category=FutureWarning)
# Turn off Altair's limit on the number of rows a chart's dataset may contain
alt.data_transformers.disable_max_rows()

## Data

### Create data

In [None]:
# Small example dataset: ten paired observations of sales and ads
df = pd.DataFrame(
    data={
        'sales': [
            2500, 4500, 6500, 8500, 10500,
            12500, 14500, 16500, 18500, 20500,
        ],
        'ads': [
            900, 1400, 3600, 3800, 6200,
            5200, 6800, 8300, 9800, 10100,
        ],
    }
)

### Variable lists

In [None]:
# Split the dataframe into the predictor (X) and the response (y).
# Selecting with a list of column names keeps both objects as DataFrames.
X = df.loc[:, ["ads"]]
y = df.loc[:, ["sales"]]

## Model

### Select model

In [None]:
# Instantiate a linear regression estimator with its default settings
reg = LinearRegression()

### Fit model

In [None]:
# Train the model: estimate intercept and slope from ads (X) and sales (y)
reg.fit(X, y)

### Coefficients

In [None]:
# Print the fitted intercept and the slope coefficient for ads
print(f' Intercept: {reg.intercept_} \n Slope: {reg.coef_[0]}')

### Make predictions 

In [None]:
# Make predictions for the same observations the model was fitted on
y_pred = reg.predict(X)

In [None]:
# Add predictions to the dataframe so they sit next to the observed sales
df['y_pred'] = y_pred

In [None]:
# Preview the first rows to confirm the y_pred column was added
df.head()

### Plot model

In [None]:
# Scatter plot of sales against ads
chart = (
    alt.Chart(df)
    .mark_point()
    .encode(x='ads', y='sales')
)

# Overlay a regression line computed by Altair, drawn over ads values 0 to 12000
chart + chart.transform_regression('ads', 'sales', extent=[0, 12000]).mark_line()

### Calculate R-squared

#### Squared errors (SE)

Calculate the squared difference between actual and predicted sales and save it as `se` in the dataframe.

In [None]:
### BEGIN SOLUTION
# Squared error per observation: (actual sales - predicted sales) squared
df['se'] = df['sales'].sub(df['y_pred']).pow(2)
### END SOLUTION

In [None]:
# Preview the first rows to check the new se column
df.head()

In [None]:
# Check your code: the squared error of the first observation must fall in the expected range.
# The column is selected by name rather than by position, so the check does not
# depend on the order in which columns were added to the dataframe.
assert 113310 < df['se'].iloc[0] < 113330

#### Sum of squared errors (SSE)

Calculate the **sum of squared errors**. Call the result `sse`. 

*Note that `sse` is also known as "sum of squared residuals" or "residual sum of squares".*

In [None]:
### BEGIN SOLUTION
# Add up the squared errors of all observations
sse = df['se'].sum()
### END SOLUTION

In [None]:
# Display the sum of squared errors
sse

In [None]:
# Check your code: sse must lie strictly between the two bounds.
# (The original line was missing the second `<`, which made the cell a SyntaxError.)
assert 11607390 < sse < 11607393

> Is this good or bad?

- To get a better idea, let's compare it with the total variability in the sales values, measured by how far they tend to fall from their mean.

#### Total sum of squares (SST)

In [None]:
# Calculate the mean of sales; assigning the scalar repeats it in every row
df['sales_mean'] = df['sales'].mean()

In [None]:
# Scatter plot of sales against ads
chart = (
    alt.Chart(df)
    .mark_point()
    .encode(x='ads', y='sales')
)

# Red horizontal rule at the mean of sales (the aggregate is computed by Altair)
line = (
    alt.Chart(df)
    .mark_rule(color='red')
    .encode(y='mean(sales)')
)

chart + line

In [None]:
# Calculate the squared difference between each actual sales value and the mean of sales.
# Note: despite the column name 'variance', each row holds a squared deviation,
# not the variance itself; the sum of these values is the total sum of squares.
df['variance'] = (df['sales'] - df['sales_mean']) ** 2


- We define the sum of the squared differences between the actual sales and their mean as the **total sum of squares** (`SST`).

In [None]:
# Total sum of squares: add up the squared deviations from the mean
sst = df['variance'].sum()

In [None]:
# Display the total sum of squares
sst

#### R-squared


Calculate $R^2$ and save the result as `r2`:

$$R^2 = (SST - SSE) / SST$$

In [None]:
### BEGIN SOLUTION
# Share of the total variation (sst) that is not left over as error (sse)
r2 = (sst - sse) / sst
### END SOLUTION

In [None]:
# Display R-squared
r2

In [None]:
### Check your code
# R-squared must lie strictly between 0.963 and 0.965
assert 0.963 < r2 < 0.965

### Use function to calculate R2

In [None]:
# We can use scikit-learn's r2_score, passing the actual and the predicted values
r2_score(y, y_pred)

In [None]:
# We can also use the fitted model's .score method, passing the predictors and the actual values
reg.score(X, y)