<a href="https://colab.research.google.com/github/jamestheengineer/data-science-from-scratch-Python/blob/master/Chapter_5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
# Chapter 5 - Statistics

# Sample data used throughout this chapter: a list of friend counts, in
# descending order. Copied in wholesale ("cheating") rather than generated.
# Note that the first entry (100.0) is a float while the rest are ints.
num_friends = [100.0,49,41,40,25,21,21,19,19,18,18,16,15,15,15,15,14,14,13,13,13,13,12,12,11,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,8,8,8,8,8,8,8,8,8,8,8,8,8,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]

In [0]:
from collections import Counter
import matplotlib.pyplot as plt

# Tally how many entries share each friend count, then draw one bar for every
# possible count from 0 through 100 (counts that never occur get height 0).
friend_counts = Counter(num_friends)
xs = range(101)
ys = [friend_counts.get(x, 0) for x in xs]

plt.bar(xs, ys)
plt.title("Histogram of Friend Counts")
plt.xlabel("# of friends")
plt.ylabel("# of people")
plt.axis([0, 101, 0, 25])  # [xmin, xmax, ymin, ymax]
plt.show()

In [0]:
# Size and extremes of the data set, computed directly.
num_points = len(num_friends)
smallest_value = min(num_friends)
largest_value = max(num_friends)

# The same kind of facts can also be read off a sorted copy by position.
sorted_values = sorted(num_friends)
smallest_value, second_smallest_value = sorted_values[0], sorted_values[1]
second_largest_value = sorted_values[-2]
print(num_points, smallest_value, largest_value, second_smallest_value, second_largest_value)

In [0]:
from typing import List

# Mean
def mean(xs: List[float]) -> float:
  """Return the arithmetic mean of xs (the sum of its values over their count).

  Raises ZeroDivisionError when xs is empty.
  """
  total = sum(xs)
  return total / len(xs)

In [0]:
# Mean of the friend counts. As the last expression in a notebook cell its
# value is presumably echoed as the cell output -- nothing is assigned.
mean(num_friends)

In [0]:
# Let's write some median functions. If the list has an odd number of elements,
# the median is simply the middle element of the sorted values. If it has an even
# number of elements, the median is the average of the two middle elements.

# The underscores indicate that these are "private" functions, as they're intended
# to be called by our median function but not by other people using our statistics library.
def _median_odd(xs: List[float]) -> float:
  """Odd-length case: the median is the single middle value once sorted."""
  ordered = sorted(xs)
  middle = len(xs) // 2
  return ordered[middle]

def _median_even(xs: List[float]) -> float:
  """Even-length case: the median is the mean of the two central sorted values."""
  ordered = sorted(xs)
  upper = len(xs) // 2  # e.g., length 4 => upper = 2, lower = 1
  lower = upper - 1
  return (ordered[lower] + ordered[upper]) / 2

def median(v: List[float]) -> float:
  """Return the 'middle-most' value of v, choosing the helper by the parity of len(v)."""
  if len(v) % 2:
    return _median_odd(v)
  return _median_even(v)

# One odd-length and one even-length sanity check.
assert median([1, 10, 2, 9, 5]) == 5
assert median([1, 9, 2, 10]) == (2 + 9) / 2

In [0]:
# Print the median of the friend counts.
print(median(num_friends))

In [0]:
# Quantiles are a generalization of the median
def quantile(xs: List[float], p: float) -> float:
  """Return the value of xs at the p-th quantile position.

  The data is sorted and indexed at int(p * len(xs)), so p = 0 yields the
  minimum and p = 1 yields the maximum.

  Args:
    xs: the data; it is not modified (a sorted copy is indexed).
    p: the desired fraction, between 0 and 1 inclusive.

  Returns:
    The element of sorted(xs) at the computed position.

  Raises:
    ValueError: if p is outside [0, 1]. A negative p would otherwise wrap
      around to a negative index and silently return a value from the
      wrong end of the data.
    IndexError: if xs is empty.
  """
  if not 0 <= p <= 1:
    raise ValueError(f"p must be between 0 and 1, got {p!r}")
  last_index = len(xs) - 1
  # int(p * len(xs)) equals len(xs) when p == 1, one past the end of the
  # list, so clamp to the final (largest) element.
  p_index = min(int(p * len(xs)), last_index)
  return sorted(xs)[p_index]

# Spot-check the 10th, 25th, 75th and 90th percentiles of the friend counts.
assert [quantile(num_friends, p) for p in (0.10, 0.25, 0.75, 0.90)] == [1, 3, 9, 13]

In [0]:
# The mode is the most common value. Sometimes of interest
def mode(x: List[float]) -> List[float]:
  """Return every value tied for the highest frequency in x.

  The result is a list because several values can share the top count;
  values appear in the order they are first seen in x.
  Raises ValueError when x is empty.
  """
  tally = Counter(x)
  top = max(tally.values())
  return [value for value in tally if tally[value] == top]

# Two values tie for most common in the friend counts; compare order-insensitively.
assert sorted(mode(num_friends)) == [1, 6]


In [0]:
# Various ways of looking at dispersion

# Range. In Python, range already means something, so we'll name this differently
def data_range(xs: List[float]) -> float:
  """Return the spread of xs: its largest value minus its smallest."""
  highest = max(xs)
  lowest = min(xs)
  return highest - lowest

# Largest entry (100.0) minus smallest entry (1).
assert data_range(num_friends) == 99

In [0]:
# Colab setup: clone the repository and install import-ipynb so that the
# Chapter_4 notebook can be imported below like a regular module.
!git clone https://github.com/jamestheengineer/data-science-from-scratch-Python.git
%cd data-science-from-scratch-Python/
!pip install import-ipynb
import import_ipynb
import Chapter_4

# First, a function that centers our mean at 0
def de_mean(xs: List[float]) -> List[float]:
  """Shift every element of xs by the mean of xs, so the result has mean 0."""
  center = mean(xs)
  return [value - center for value in xs]

# Variance
def variance(xs: List[float]) -> float:
  """Almost the average squared deviation from the mean.

  "Almost" because the sum of squares of the de-meaned values (as computed
  by Chapter_4.sum_of_squares) is divided by n - 1 rather than n.
  Requires at least two elements.
  """
  count = len(xs)
  assert count >= 2, "variance requires at least two elements"
  centered = de_mean(xs)
  squared_total = Chapter_4.sum_of_squares(centered)
  return squared_total / (count - 1)
  
# Sanity check: the variance of the friend counts lies between 81.54 and 81.55.
assert 81.54 < variance(num_friends) < 81.55
