# Strength of a fit

## Setup

In [1]:
import pandas as pd
import altair as alt

from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

## Data

In [2]:
# Small in-memory dataset: ten rows with a `sales` column and an `ads` column
df = pd.DataFrame(
    data={
        'sales': [2500, 4500, 6500, 8500, 10500, 12500, 14500, 16500, 18500, 20500],
        'ads': [900, 1400, 3600, 3800, 6200, 5200, 6800, 8300, 9800, 10100],
    }
)

# Split into predictor and response; the list-style selection keeps both
# as single-column DataFrames (two-dimensional) rather than Series
X = df.loc[:, ["ads"]]
y = df.loc[:, ["sales"]]

## Model

### Fit the model

In [3]:
# Create the linear regression estimator and train it in one chained call
# (`fit` hands back the fitted estimator itself)
reg = LinearRegression().fit(X, y)

# In-sample predictions: score the same rows the model was trained on
y_pred = reg.predict(X)

In [4]:
# Add predictions to dataframe.
# The model was fitted on a single-column DataFrame target, so `reg.predict`
# yields a two-dimensional (n, 1) array. Flatten it so the new column is given
# a plain 1-D array instead of relying on pandas accepting a 2-D value for a
# single column; `ravel` is a no-op if the predictions are already 1-D.
df['y_pred'] = y_pred.ravel()
df.head()

Unnamed: 0,sales,ads,y_pred
0,2500,900,2836.630928
1,4500,1400,3756.309173
2,6500,3600,7802.893453
3,8500,3800,8170.764752
4,10500,6200,12585.22033


### Plot model

In [5]:
# Scatter plot of the observations: `ads` on the x-axis, `sales` on the y-axis
chart = (
    alt.Chart(df)
    .mark_point()
    .encode(x='ads', y='sales')
)

# Layer a regression line (computed by Altair from the same two columns)
# on top of the points; `alt.layer` is what the `+` operator resolves to
alt.layer(chart, chart.transform_regression('ads', 'sales').mark_line())

## Calculate R-squared

In [6]:
# Per-row squared residual: (observed sales - predicted sales) raised to the power 2
df['errors_squared'] = df['sales'].sub(df['y_pred']).pow(2)
df.head()

Unnamed: 0,sales,ads,y_pred,errors_squared
0,2500,900,2836.630928,113320.4
1,4500,1400,3756.309173,553076.0
2,6500,3600,7802.893453,1697531.0
3,8500,3800,8170.764752,108395.8
4,10500,6200,12585.22033,4348144.0


### Sum of squared errors (SSE)

- Calculate the **sum of squared errors** (`SSE`) 
    - SSE is also known as "sum of squared residuals" (SSR) or "residual sum of squares" (RSS)

In [7]:
# Add up the squared residuals of all rows into one number
SSE = df.loc[:, 'errors_squared'].sum()
SSE

11607391.42908754

> Is this good or bad?

- To get a better idea, let's measure the variability in the sales values, 
i.e. how far they tend to fall from their mean

### Total sum of squares (SST)

In [8]:
# Average of the sales column, broadcast to every row as a constant column
df['sales_mean'] = df.loc[:, 'sales'].mean()


In [9]:
# Scatter plot of the observations: `ads` on the x-axis, `sales` on the y-axis
chart = (
    alt.Chart(df)
    .mark_point()
    .encode(x='ads', y='sales')
)

# Horizontal red rule positioned at the mean of `sales` (aggregated by Altair)
line = (
    alt.Chart(df)
    .mark_rule(color='red')
    .encode(y='mean(sales)')
)

# Draw the rule on top of the points
alt.layer(chart, line)

In [10]:
# Per-row squared deviation: (observed sales - mean sales) raised to the power 2
df['variability_squared'] = df['sales'].sub(df['sales_mean']).pow(2)


- We define the sum of the squared differences as the **total sum of squares** (`SST`)

In [11]:
# Add up the squared deviations from the mean over all rows
SST = df.loc[:, 'variability_squared'].sum()
SST

330000000.0

### R-squared

In [12]:
# R-squared: the part of the total sum of squares (SST) that is left after
# subtracting the sum of squared errors (SSE), expressed as a fraction of SST.
R2 = (SST - SSE) / SST
R2

0.9648260865785226

## Use built-in functions to calculate R-squared

In [13]:
# scikit-learn's metric function: pass the observed values and the predictions
r2_score(y_true=y, y_pred=y_pred)

0.9648260865785226

In [14]:
# the fitted estimator's own `.score` method takes the features and the target
reg.score(X=X, y=y)

0.9648260865785226