# Strength of a fit

## Setup

In [1]:
import pandas as pd
import altair as alt

from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

import warnings

# Silence FutureWarning messages so they do not clutter the cell outputs.
warnings.simplefilter('ignore', FutureWarning)
# Turn off Altair's max-rows check; left as the last expression of the cell.
alt.data_transformers.disable_max_rows()

DataTransformerRegistry.enable('default')

## Data

### Create data

In [2]:
# Ten (sales, ads) observations, one tuple per row of the dataframe.
observations = [
    (2500, 900),
    (4500, 1400),
    (6500, 3600),
    (8500, 3800),
    (10500, 6200),
    (12500, 5200),
    (14500, 6800),
    (16500, 8300),
    (18500, 9800),
    (20500, 10100),
]

# Column labels are given in the same order as the tuple fields.
df = pd.DataFrame(observations, columns=['sales', 'ads'])

### Variable lists

In [3]:
# Names of the predictor column(s) and the response column(s)
feature_cols = ["ads"]
target_cols = ["sales"]

# Selecting with a list keeps both X and y as two-dimensional DataFrames
X = df[feature_cols]
y = df[target_cols]

## Model

### Select model

In [4]:
# Create an unfitted linear regression estimator with scikit-learn's default settings
reg = LinearRegression()

### Fit model

In [5]:
# Train the model: estimate the intercept and slope that predict y (sales) from X (ads)
reg.fit(X, y)

### Coefficients

In [6]:
# Intercept and slope of the fitted line.
# Both values print as one-element arrays because y was passed as a single-column
# DataFrame; reg.coef_[0] selects the coefficients of that single target column.
print(f' Intercept: {reg.intercept_} \n Slope: {reg.coef_[0]}')

 Intercept: [1181.21008618] 
 Slope: [1.83935649]


### Make predictions 

In [7]:
# Make predictions for the same observations the model was fitted on (in-sample predictions)
y_pred = reg.predict(X)

In [8]:
# Add predictions to dataframe as column 'y_pred' so they can be compared row by row with 'sales'
df['y_pred'] = y_pred

In [9]:
# Preview the first rows, which now include the 'y_pred' column
df.head()

Unnamed: 0,sales,ads,y_pred
0,2500,900,2836.630928
1,4500,1400,3756.309173
2,6500,3600,7802.893453
3,8500,3800,8170.764752
4,10500,6200,12585.22033


### Plot model

In [10]:
# Scatter plot of the observed (ads, sales) pairs
chart = alt.Chart(df).mark_point().encode(x='ads', y='sales')

# Regression line computed by Altair's own transform (not the scikit-learn model),
# drawn across ads values from 0 to 12000
regression_line = chart.transform_regression(
    'ads', 'sales', extent=[0, 12000]
).mark_line()

# Layer the line on top of the points
chart + regression_line

### Calculate R-squared

#### Squared errors (SE)

In [11]:
# Residual = actual sales minus predicted sales; squaring it gives each row's squared error
residuals = df['sales'] - df['y_pred']
df['se'] = residuals ** 2
df.head()

Unnamed: 0,sales,ads,y_pred,se
0,2500,900,2836.630928,113320.4
1,4500,1400,3756.309173,553076.0
2,6500,3600,7802.893453,1697531.0
3,8500,3800,8170.764752,108395.8
4,10500,6200,12585.22033,4348144.0


#### Sum of squared errors (SSE)

- Calculate the **sum of squared errors** (`SSE`) 
    - SSE is also known as "sum of squared residuals" (SSR) or "residual sum of squares" (RSS)

In [12]:
# Sum of squared errors: add up the per-row squared errors stored in 'se'
sse = df['se'].sum()
sse

11607391.429087555

> Is this good or bad?

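- On its own, the SSE is hard to judge because its size depends on the scale of the data. To get a better idea, let's compare it with the total variability in the sales values, measured by how far they fall from their mean.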

#### Total sum of squares (SST)

In [13]:
# calculate mean of sales; the single mean value is repeated in every row of the new column
df['sales_mean'] = df['sales'].mean()

In [14]:
# Both layers draw from the same dataframe
base = alt.Chart(df)

# Scatter plot of the observed (ads, sales) pairs
chart = base.mark_point().encode(x='ads', y='sales')

# Horizontal red rule at the mean of sales (aggregated by Altair)
line = base.mark_rule(color='red').encode(y='mean(sales)')

chart + line

In [15]:
# calculate the squared difference between each actual value and the mean of sales.
# NOTE: despite the column name, these are per-row squared deviations, not the variance;
# their sum is the total sum of squares (SST).
df['variance'] = (df['sales'] - df['sales_mean']) ** 2


- We define the sum of these squared differences between the actual sales values and their mean as the **total sum of squares** (`SST`)

In [16]:
# Total sum of squares: add up the squared deviations of sales from its mean
sst = df['variance'].sum()
sst

330000000.0

#### R-squared

In [17]:
# R-squared = (SST - SSE) / SST: the share of the total variation in sales
# that remains after subtracting the model's squared errors
r2 = (sst - sse) / sst
r2

0.9648260865785225

### Use function to calculate R2

In [18]:
# we can use r2_score (actual values first, predictions second);
# the result matches the manual calculation above
r2_score(y, y_pred)

0.9648260865785225

In [19]:
# we can also use .score, which predicts from X internally and returns R-squared against y
reg.score(X, y)

0.9648260865785225