<a href="https://colab.research.google.com/github/jamestheengineer/data-science-from-scratch-Python/blob/master/Chapter_14.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
# Colab bootstrap: clone the repository, change into it, and install import-ipynb.
# The Chapter_NN imports in later cells presumably resolve to notebooks in this
# repository through import_ipynb -- confirm.
# Only do this once per VM, otherwise you'll get multiple clones and nested directories
!git clone https://github.com/jamestheengineer/data-science-from-scratch-Python.git
%cd data-science-from-scratch-Python/
!pip install import-ipynb
import import_ipynb

Cloning into 'data-science-from-scratch-Python'...
remote: Enumerating objects: 83, done.[K
remote: Counting objects:   1% (1/83)[Kremote: Counting objects:   2% (2/83)[Kremote: Counting objects:   3% (3/83)[Kremote: Counting objects:   4% (4/83)[Kremote: Counting objects:   6% (5/83)[Kremote: Counting objects:   7% (6/83)[Kremote: Counting objects:   8% (7/83)[Kremote: Counting objects:   9% (8/83)[Kremote: Counting objects:  10% (9/83)[Kremote: Counting objects:  12% (10/83)[Kremote: Counting objects:  13% (11/83)[Kremote: Counting objects:  14% (12/83)[Kremote: Counting objects:  15% (13/83)[Kremote: Counting objects:  16% (14/83)[Kremote: Counting objects:  18% (15/83)[Kremote: Counting objects:  19% (16/83)[Kremote: Counting objects:  20% (17/83)[Kremote: Counting objects:  21% (18/83)[Kremote: Counting objects:  22% (19/83)[Kremote: Counting objects:  24% (20/83)[Kremote: Counting objects:  25% (21/83)[Kremote: Counting objects:  26% (22

In [0]:
# Linear Regression
def predict(alpha: float, beta: float, x_i: float) -> float:
  """Evaluate the line with intercept alpha and slope beta at the input x_i."""
  slope_term = beta * x_i
  return slope_term + alpha

def error(alpha: float, beta: float, x_i: float, y_i: float) -> float:
  """
  Signed residual at a single data point: the model's prediction
  beta * x_i + alpha minus the observed value y_i
  """
  predicted = predict(alpha, beta, x_i)
  return predicted - y_i

from Chapter_04 import Vector

def sum_of_sqerrors(alpha: float, beta: float, x: Vector, y: Vector) -> float:
  """
  Add up the squared prediction errors over the paired points of x and y.
  Pairs come from zip, so surplus elements of the longer vector are ignored.
  """
  total = 0
  for x_i, y_i in zip(x, y):
    total += error(alpha, beta, x_i, y_i) ** 2
  return total

In [0]:
from typing import Tuple
from Chapter_05 import correlation, standard_deviation, mean

def least_squares_fit(x: Vector, y: Vector) -> Tuple[float, float]:
  """
  Fit the least-squares line y = beta * x + alpha to the paired
  vectors x and y and return the pair (alpha, beta)
  """
  # Slope: the correlation rescaled by the ratio of the standard deviations
  slope = correlation(x, y) * standard_deviation(y) / standard_deviation(x)
  # Intercept: chosen so the line passes through the point (mean(x), mean(y))
  intercept = mean(y) - slope * mean(x)
  return intercept, slope

# Quick test on points that lie exactly on a known line
x = list(range(-100, 110, 10))
y = [3 * x_i - 5 for x_i in x]

# The fit is asserted to recover y = 3 * x - 5 exactly
assert least_squares_fit(x, y) == (-5, 3)

In [0]:
from Chapter_05 import num_friends_good, daily_minutes_good

# Fit the line to the num_friends_good / daily_minutes_good data imported from
# Chapter_05; alpha and beta become notebook-level names that later cells read
alpha, beta = least_squares_fit(num_friends_good, daily_minutes_good)
# Sanity-check that the fitted intercept and slope land in the expected ranges
assert 22.9 < alpha < 23.0
assert 0.9 < beta < 0.905

In [0]:
# Using R-squared is a common way to determine how well we fit
from Chapter_05 import de_mean

def total_sum_of_squares(y: Vector) -> float:
  """Sum of the squared entries of de_mean(y), i.e. the squared variation of y about its mean"""
  total = 0
  for deviation in de_mean(y):
    total += deviation ** 2
  return total

def r_squared(alpha: float, beta: float, x: Vector, y: Vector) -> float:
  """
  The fraction of the variation in y captured by the model, computed as
  1 - (sum of squared errors) / (total sum of squares of y)
  """
  unexplained = sum_of_sqerrors(alpha, beta, x, y)
  total = total_sum_of_squares(y)
  return 1.0 - unexplained / total

In [0]:
# Score whatever alpha and beta are currently bound in the notebook namespace
# against the same data, then check the result falls in the expected range
rsq = r_squared(alpha, beta, num_friends_good, daily_minutes_good)
assert 0.328 < rsq < 0.330

In [0]:
# Use gradient descent
import random
import tqdm
from Chapter_08 import gradient_step

num_epochs = 10000
random.seed(0)

# Start from a random [alpha, beta] guess; the seed above makes runs repeatable
guess = [random.random(), random.random()]

learning_rate = 0.00001

with tqdm.trange(num_epochs) as t:
  for _ in t:
    alpha, beta = guess

    # Compute each point's prediction error once per epoch; both partial
    # derivatives and the loss below are built from these same values
    errors = [error(alpha, beta, x_i, y_i)
              for x_i, y_i in zip(num_friends_good, daily_minutes_good)]

    # Partial derivative of loss with respect to alpha
    grad_a = sum(2 * err for err in errors)

    # Partial derivative of loss with respect to beta
    grad_b = sum(2 * err * x_i
                 for err, x_i in zip(errors, num_friends_good))

    # Compute loss (sum of squared errors) to stick in the tqdm description
    loss = sum(err ** 2 for err in errors)
    t.set_description(f"loss:{loss:.3f}")

    # Finally, update the guess. The step size is passed negated; presumably
    # gradient_step moves guess by step_size * gradient, so this descends --
    # confirm against Chapter_08
    guess = gradient_step(guess, [grad_a, grad_b], -learning_rate)

# We should get pretty much the same results
alpha, beta = guess
assert 22.9 < alpha < 23.0
assert 0.9 < beta < 0.905