<a href="https://colab.research.google.com/github/jamestheengineer/data-science-from-scratch-Python/blob/master/Chapter_15.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Multiple regression


# Only do this once per VM, otherwise you'll get multiple clones and nested directories
!git clone https://github.com/jamestheengineer/data-science-from-scratch-Python.git
%cd data-science-from-scratch-Python/
!pip install import-ipynb
import import_ipynb

In [None]:
from Chapter_04 import dot, Vector

def predict(x: Vector, beta: Vector) -> float:
  """Return the linear model's prediction for one input row: dot(x, beta).

  The caller is expected to put the constant 1 in x[0], so that beta[0]
  acts as the constant term of the model.
  """
  prediction = dot(x, beta)
  return prediction

from typing import List

def error(x: Vector, y: float, beta: Vector) -> float:
  """Return the signed residual for (x, y): predicted value minus actual y."""
  predicted = predict(x, beta)
  return predicted - y

def squared_error(x: Vector, y: float, beta: Vector) -> float:
  """Return the square of the prediction error for the point (x, y)."""
  residual = error(x, y, beta)
  return residual ** 2

# Worked example: one input row, its observed value, and a candidate
# coefficient vector. These module-level names are reused by later checks.
x = [1, 2, 3]
y = 30
beta = [4, 4, 4] # so prediction = 4 + 8 + 12 = 24

# error is prediction minus actual: 24 - 30 = -6, and (-6) ** 2 = 36
assert error(x,y,beta) == -6
assert squared_error(x, y, beta) == 36


In [None]:
def sqerror_gradient(x: Vector, y: float, beta: Vector) -> Vector:
  """Return the gradient of the squared error at (x, y) with respect to beta.

  Component i of the result is 2 * error(x, y, beta) * x[i].
  """
  doubled_err = 2 * error(x, y, beta)
  gradient = []
  for component in x:
    gradient.append(doubled_err * component)
  return gradient

# With x = [1, 2, 3] and error = -6, component i is 2 * -6 * x[i]
assert sqerror_gradient(x, y, beta) == [-12, -24, -36]


In [None]:
import random
import tqdm
from Chapter_04 import vector_mean
from Chapter_08 import gradient_step

def least_squares_fit(xs: List[Vector],
                     ys: List[float],
                     learning_rate: float = 0.001,
                     num_steps: int = 1000,
                     batch_size: int = 1) -> Vector:
    """
    Find the beta that minimizes the sum of squared errors
    assuming the model y = dot(x, beta).

    Starting from a random guess, makes num_steps passes over the data.
    Each pass walks through xs/ys in their given order (no shuffling) in
    consecutive batches of batch_size rows, averages sqerror_gradient over
    the batch, and updates the guess with gradient_step.

    Args:
        xs: input rows; the length of xs[0] determines the length of beta.
        ys: target values, one per row of xs.
        learning_rate: step size; it is negated before being handed to
            gradient_step so the guess moves against the gradient.
        num_steps: number of full passes over the data.
        batch_size: number of rows per gradient update (the final batch of
            a pass may be smaller).

    Returns:
        The coefficient vector reached after the last update.

    Raises:
        ValueError: if xs and ys differ in length, or batch_size < 1.
        IndexError: if xs is empty (there is no row to size beta from).
    """
    # zip() below would silently drop the unmatched tail, so reject
    # mismatched inputs up front instead of fitting on partial data.
    if len(xs) != len(ys):
        raise ValueError(
            f"xs and ys must have the same length, got {len(xs)} and {len(ys)}")
    # A negative batch_size makes range() empty, which would return the
    # untrained random guess; zero makes range() raise mid-run.
    if batch_size < 1:
        raise ValueError(f"batch_size must be at least 1, got {batch_size}")

    # Start with a random guess
    guess = [random.random() for _ in xs[0]]

    for _ in tqdm.trange(num_steps, desc="least squares fit"):
        for start in range(0, len(xs), batch_size):
            batch_xs = xs[start:start+batch_size]
            batch_ys = ys[start:start+batch_size]

            # Average the per-point gradients over the batch
            gradient = vector_mean([sqerror_gradient(x, y, guess)
                                    for x, y in zip(batch_xs, batch_ys)])
            # Negative step size: move against the gradient to reduce error
            guess = gradient_step(guess, gradient, -learning_rate)

    return guess

from Chapter_05 import daily_minutes_good
from Chapter_08 import gradient_step

# Seed the RNG so the random starting guess inside least_squares_fit is
# reproducible from run to run.
random.seed(0)
# Trial and error used to choose the number of steps and the learning rate
# This will run for a while
# learning_rate is also read as a module-level global by estimate_sample_beta.
learning_rate = 0.001
inputs: List[List[float]] = [[1.,49,4,0],[1,41,9,0],[1,40,8,0],[1,25,6,0],[1,21,1,0],[1,21,0,0],[1,19,3,0],[1,19,0,0],[1,18,9,0],[1,18,8,0],[1,16,4,0],[1,15,3,0],[1,15,0,0],[1,15,2,0],[1,15,7,0],[1,14,0,0],[1,14,1,0],[1,13,1,0],[1,13,7,0],[1,13,4,0],[1,13,2,0],[1,12,5,0],[1,12,0,0],[1,11,9,0],[1,10,9,0],[1,10,1,0],[1,10,1,0],[1,10,7,0],[1,10,9,0],[1,10,1,0],[1,10,6,0],[1,10,6,0],[1,10,8,0],[1,10,10,0],[1,10,6,0],[1,10,0,0],[1,10,5,0],[1,10,3,0],[1,10,4,0],[1,9,9,0],[1,9,9,0],[1,9,0,0],[1,9,0,0],[1,9,6,0],[1,9,10,0],[1,9,8,0],[1,9,5,0],[1,9,2,0],[1,9,9,0],[1,9,10,0],[1,9,7,0],[1,9,2,0],[1,9,0,0],[1,9,4,0],[1,9,6,0],[1,9,4,0],[1,9,7,0],[1,8,3,0],[1,8,2,0],[1,8,4,0],[1,8,9,0],[1,8,2,0],[1,8,3,0],[1,8,5,0],[1,8,8,0],[1,8,0,0],[1,8,9,0],[1,8,10,0],[1,8,5,0],[1,8,5,0],[1,7,5,0],[1,7,5,0],[1,7,0,0],[1,7,2,0],[1,7,8,0],[1,7,10,0],[1,7,5,0],[1,7,3,0],[1,7,3,0],[1,7,6,0],[1,7,7,0],[1,7,7,0],[1,7,9,0],[1,7,3,0],[1,7,8,0],[1,6,4,0],[1,6,6,0],[1,6,4,0],[1,6,9,0],[1,6,0,0],[1,6,1,0],[1,6,4,0],[1,6,1,0],[1,6,0,0],[1,6,7,0],[1,6,0,0],[1,6,8,0],[1,6,4,0],[1,6,2,1],[1,6,1,1],[1,6,3,1],[1,6,6,1],[1,6,4,1],[1,6,4,1],[1,6,1,1],[1,6,3,1],[1,6,4,1],[1,5,1,1],[1,5,9,1],[1,5,4,1],[1,5,6,1],[1,5,4,1],[1,5,4,1],[1,5,10,1],[1,5,5,1],[1,5,2,1],[1,5,4,1],[1,5,4,1],[1,5,9,1],[1,5,3,1],[1,5,10,1],[1,5,2,1],[1,5,2,1],[1,5,9,1],[1,4,8,1],[1,4,6,1],[1,4,0,1],[1,4,10,1],[1,4,5,1],[1,4,10,1],[1,4,9,1],[1,4,1,1],[1,4,4,1],[1,4,4,1],[1,4,0,1],[1,4,3,1],[1,4,1,1],[1,4,3,1],[1,4,2,1],[1,4,4,1],[1,4,4,1],[1,4,8,1],[1,4,2,1],[1,4,4,1],[1,3,2,1],[1,3,6,1],[1,3,4,1],[1,3,7,1],[1,3,4,1],[1,3,1,1],[1,3,10,1],[1,3,3,1],[1,3,4,1],[1,3,7,1],[1,3,5,1],[1,3,6,1],[1,3,1,1],[1,3,6,1],[1,3,10,1],[1,3,2,1],[1,3,4,1],[1,3,2,1],[1,3,1,1],[1,3,5,1],[1,2,4,1],[1,2,2,1],[1,2,8,1],[1,2,3,1],[1,2,1,1],[1,2,9,1],[1,2,10,1],[1,2,9,1],[1,2,4,1],[1,2,5,1],[1,2,0,1],[1,2,9,1],[1,2,9,1],[1,2,0,1],[1,2,1,1],[1,2,1,1],[1,2,4,1],[1,1,0,1],[1,1,2,1],[1,1,2,1],[1,1,5,1],[1,1,3,1],[1,1,10,1],[1,1,6,1],[1,1,0,1],[1,1,8,1],[1,1,6,1],[1,1,4,1
],[1,1,9,1],[1,1,9,1],[1,1,4,1],[1,1,2,1],[1,1,9,1],[1,1,0,1],[1,1,8,1],[1,1,6,1],[1,1,1,1],[1,1,1,1],[1,1,5,1]]

# Fit on the full data set: 5000 passes with batches of 25 rows.
# Note this rebinds the module-level name beta used in the earlier example.
beta = least_squares_fit(inputs, daily_minutes_good, learning_rate, 5000, 25)
# Each fitted coefficient is checked against a narrow expected range; the
# trailing comment names the input column it multiplies.
assert 30.50 < beta[0] < 30.70 # constant
assert 0.96 < beta[1] < 1.00 # num_friends
assert -1.89 < beta[2] < -1.85 # work hours per day
assert 0.91 < beta[3] < 0.94 # has PhD
print(beta)


In [None]:
from Chapter_14 import total_sum_of_squares

def multipl_r_squared(xs: List[Vector], ys: Vector, beta: Vector) -> float:
  """Return R-squared for the model beta on the data (xs, ys).

  Computed as 1 - (sum of squared errors) / (total sum of squares of ys).
  """
  residuals = (error(x_i, y_i, beta) for x_i, y_i in zip(xs, ys))
  unexplained = sum(residual ** 2 for residual in residuals)
  r_squared = 1.0 - unexplained / total_sum_of_squares(ys)
  return r_squared

# R-squared of the fitted beta on the full data set should be about 0.68
assert 0.67 < multipl_r_squared(inputs, daily_minutes_good, beta) < 0.68

In [None]:
# Bootstrapping
from typing import TypeVar, Callable

X = TypeVar('X') # Element type of the data being resampled
Stat = TypeVar('Stat') # Type of the value returned by a statistic function

def bootstrap_sample(data: List[X]) -> List[X]:
  """Draw len(data) elements from data at random, with replacement.

  Makes one random.choice(data) call per element of data, so an empty
  input yields an empty list.
  """
  resampled = []
  for _ in data:
    resampled.append(random.choice(data))
  return resampled

def bootstrap_statistic(data: List[X],
                        stats_fn: Callable[[List[X]], Stat],
                        num_samples: int) -> List[Stat]:
    """Apply stats_fn to num_samples bootstrap samples drawn from data.

    Returns the results in the order the samples were drawn.
    """
    results = []
    for _ in range(num_samples):
        resample = bootstrap_sample(data)
        results.append(stats_fn(resample))
    return results

# 101 points all very close to 100
# (each value lies in [99.5, 100.5) because random.random() is in [0, 1))
close_to_100 = [99.5 + random.random() for _ in range(101)]

# 101 points, 50 near 0, 50 near 200
# (the single leading point near 100 is the 101st value)
far_from_100 = ([99.5 + random.random()] + 
                [random.random() for _ in range(50)] +
                [200 + random.random() for _ in range(50)])

from Chapter_05 import median, standard_deviation

# Median of each of 100 bootstrap resamples, for both data sets
medians_close = bootstrap_statistic(close_to_100, median, 100)
medians_far = bootstrap_statistic(far_from_100, median, 100)

print(medians_close, medians_far)

# The resampled medians barely vary for the tightly clustered data, but vary
# widely for the data split between values near 0 and values near 200.
assert standard_deviation(medians_close) < 1
assert standard_deviation(medians_far) > 90
                                           

In [None]:
from typing import Tuple
import datetime

def estimate_sample_beta(pairs: List[Tuple[Vector, float]]):
  """Fit regression coefficients to one sample of (x, y) pairs.

  Splits pairs back into separate input and target lists, runs
  least_squares_fit with the module-level learning_rate, 5000 steps and a
  batch size of 25, prints the fitted coefficients, and returns them.
  """
  x_sample = []
  for features, _ in pairs:
    x_sample.append(features)
  y_sample = []
  for _, target in pairs:
    y_sample.append(target)

  fitted = least_squares_fit(x_sample, y_sample, learning_rate, 5000, 25)
  print("bootstrap sample", fitted)
  return fitted

random.seed(0) # so that we get the same results

# This will take a couple of minutes
# Pair each input row with its target so that resampling keeps them together,
# then refit the model on each of 100 bootstrap samples.
bootstrap_betas = bootstrap_statistic(list(zip(inputs, daily_minutes_good)),
                                     estimate_sample_beta,
                                     100)

# Standard deviation of each coefficient across the bootstrap fits.
# range(4) matches the four entries in each row of inputs.
bootstrap_standard_errors = [
                             standard_deviation([beta[i] for beta in bootstrap_betas])
                             for i in range(4)]

print(bootstrap_standard_errors)

In [None]:
from Chapter_06 import normal_cdf

def p_value(beta_hat_j: float, sigma_hat_j: float) -> float:
  """Return a two-sided p-value for the coefficient estimate beta_hat_j.

  The ratio beta_hat_j / sigma_hat_j is passed to normal_cdf (imported from
  Chapter_06; presumably the standard normal CDF when called with a single
  argument - confirm there). The tail probability on the side the estimate
  falls on is doubled.
  """
  ratio = beta_hat_j / sigma_hat_j
  lower_tail = normal_cdf(ratio)
  # Positive estimate: twice the chance of an even larger value.
  # Otherwise: twice the chance of an even smaller value.
  return 2 * (1 - lower_tail) if beta_hat_j > 0 else 2 * lower_tail

# Each call passes (coefficient estimate, standard error) as literal values.
# The first three coefficients come out with p-values below 0.001; the last
# one has a p-value above 0.4.
assert p_value(30.58, 1.27) < 0.001 # constant term
assert p_value(0.972, 0.103) < 0.001 # num_friends
assert p_value(-1.865, 0.155) < 0.001 # work_hours
assert p_value(0.923, 1.249) > 0.4 # phd