<a href="https://colab.research.google.com/github/jamestheengineer/data-science-from-scratch-Python/blob/master/Chapter_10.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
# Only do this once per VM, otherwise you'll get multiple clones and nested directories
!git clone https://github.com/jamestheengineer/data-science-from-scratch-Python.git
%cd data-science-from-scratch-Python/
!pip install import-ipynb

In [0]:
# Working with and exploring data

# Exploring one-dimensional data 
from typing import List, Dict
from collections import Counter
import math

import matplotlib.pyplot as plt

def bucketize(point: float, bucket_size: float) -> float:
  """Round point down to the closest multiple of bucket_size at or below it"""
  # How many whole buckets fit below the point (negative for negative points)
  bucket_index = math.floor(point / bucket_size)
  return bucket_size * bucket_index

def make_histogram(points: List[float], bucket_size: float) -> Dict[float, int]:
  """Buckets the points and counts how many fall into each bucket"""
  bucket_counts: Dict[float, int] = Counter()
  for value in points:
    # Key each point by the bucket that bucketize assigns it to
    bucket_counts[bucketize(value, bucket_size)] += 1
  return bucket_counts

def plot_histogram(points: List[float], bucket_size: float, title: str = ""):
  """
  Draws a bar chart of how many points fall into each bucket of width
  bucket_size, and sets the chart title.
  Does not call plt.show(); the caller decides when to render.
  """
  histogram = make_histogram(points, bucket_size)
  # Each key is the lower edge of its bucket, so anchor the bars at their left
  # edge. The default (align='center') would draw every bar shifted half a
  # bucket to the left of the range it actually counts.
  plt.bar(list(histogram.keys()), list(histogram.values()),
          width=bucket_size, align='edge')
  plt.title(title)


In [0]:
import random
import import_ipynb
from Chapter_6 import inverse_normal_cdf

In [0]:
# Seed the generator so the samples below are the same on every run
random.seed(0)

# 10,000 draws, uniform between -100 and 100
# (random.random() is in [0, 1), so 200 * r - 100 is in [-100, 100))
uniform = [200 * random.random() - 100 for _ in range(10000)]

# 10,000 draws from a normal distribution with mean 0, standard deviation 57:
# pass a uniform draw through the inverse normal CDF, then scale by 57
# (assumes inverse_normal_cdf defaults to the standard normal -- see Chapter_6)
normal = [57 * inverse_normal_cdf(random.random())
          for _ in range(10000)]

# Plot one data set at a time; swap which line is commented out to compare
#plot_histogram(uniform, 10, "Uniform Histogram")
plot_histogram(normal, 10, "Normal Histogram")

In [0]:
# Two dimensions
def random_normal() -> float:
  """Draws one sample from a standard normal distribution"""
  # Push a uniform draw from [0, 1) through the inverse normal CDF
  uniform_draw = random.random()
  return inverse_normal_cdf(uniform_draw)

In [0]:
# 1,000 draws from random_normal() shared by both series
xs = [random_normal() for _ in range(1000)]
# ys1 follows x and ys2 follows -x; each adds its own random_normal() noise
# scaled down by half
ys1 = [x + random_normal() / 2 for x in xs]
ys2 = [-x + random_normal() / 2 for x in xs]

In [0]:
# Same histogram but different joint distributions: plot ys1 and ys2 against
# the same xs to see how differently each one relates to x
plt.scatter(xs, ys1, marker='.', color='black', label='ys1')
plt.scatter(xs, ys2, marker='.', color='gray', label='ys2')
plt.xlabel('xs')
plt.ylabel('ys')
plt.legend(loc=9)  # loc=9 is matplotlib's location code for 'upper center'
plt.title("Very Different Joint Distributions")
plt.show()

In [0]:
# The difference is also apparent when you look at the correlations
from Chapter_5 import correlation

# ys1 was built as x + noise and ys2 as -x + noise, so these two values
# should come out with opposite signs
print(correlation(xs, ys1))
print(correlation(xs, ys2))

In [0]:
from Chapter_4 import Matrix, Vector, make_matrix

def correlation_matrix(data: List[Vector]) -> Matrix:
  """
  Builds the square matrix, with one row and one column per vector in data,
  whose (i, j)-th entry is the correlation between data[i] and data[j]
  """
  size = len(data)
  # make_matrix calls the entry function once per (i, j) position
  return make_matrix(size, size,
                     lambda i, j: correlation(data[i], data[j]))

# corr_data is a list of four 100-d vectors
# TODO: corr_data is not defined in this notebook, and I couldn't find it in
# Joel's GitHub repo, so the scatterplot-matrix code below stays commented out.
#num_vectors = len(corr_data)
#fig, ax = plt.subplots(num_vectors, num_vectors)

#for i in range(num_vectors):
#  for j in range(num_vectors):
#    #Scatter column_j on the x-axis vs. column_i on the y-axis
#    if i != j: ax[i][j].scatter(corr_data[j], corr_data[i])

    # unless i == j, in which case show the series name
#    else: ax[i][j].annotate("series " + str(i), (0.5, 0.5),
#                            xycoords='axes fraction',
#                            ha="center", va="center")
    
    # Then hide axis labels except left and bottom charts
#    if i < num_vectors - 1: ax[i][j].xaxis.set_visible(False)
#    if j > 0: ax[i][j].yaxis.set_visible(False)

# Fix the bottom-right and top-left axis labels, which are wrong because
# their charts only have text in them
#ax[-1][-1].set_xlim(ax[0][-1].get_xlim())
#ax[0][0].set_ylim(ax[0][1].get_ylim())

#plt.show()

In [0]:
import datetime

# Represent one day's stock price as a plain dict
stock_price = {'closing_price': 102.06,
               'date': datetime.date(2014, 8, 29),
               'symbol': 'AAPL'}
print(stock_price)
# But this can be error prone: the misspelled key below ('cosing_price') is
# deliberate -- the dict silently gains a new entry instead of raising an error
stock_price['cosing_price'] = 103.06
print(stock_price)


In [0]:
# Python has named tuples
from collections import namedtuple

# Functional form: build a tuple type with three named fields
StockPrice = namedtuple('StockPrice', ['symbol', 'date', 'closing_price'])
price = StockPrice('MSFT', datetime.date(2018, 12, 14), 106.03)

# Fields are read by name instead of by position
assert price.symbol == 'MSFT'
assert price.closing_price == 106.03

from typing import NamedTuple

class StockPrice(NamedTuple):
  """A stock symbol, a date and a closing price, as a typed named tuple"""
  symbol: str
  date: datetime.date
  closing_price: float

  # It's a class, so it can carry methods as well as fields
  def is_high_tech(self) -> bool:
    """True when the symbol is one of the hard-coded high-tech tickers"""
    high_tech_symbols = ('MSFT', 'GOOG', 'FB', 'AMZN', 'AAPL')
    return self.symbol in high_tech_symbols

# Construct an instance of the NamedTuple class and check its fields by name,
# plus the method defined on the class
price = StockPrice('MSFT', datetime.date(2018, 12, 14), 106.03)

assert price.symbol == 'MSFT'
assert price.closing_price == 106.03
assert price.is_high_tech()


In [0]:
# Dataclasses in Python are new in Python 3.7
from dataclasses import dataclass

@dataclass
class StockPrice2:
  """A stock symbol, a date and a closing price, as a (mutable) dataclass"""
  symbol: str
  date: datetime.date
  closing_price: float

  # It's a class, so it can carry methods as well as fields
  def is_high_tech(self) -> bool:
    """True when the symbol is one of the hard-coded high-tech tickers"""
    high_tech_symbols = ('MSFT', 'GOOG', 'FB', 'AMZN', 'AAPL')
    return self.symbol in high_tech_symbols

price2 = StockPrice2('MSFT', datetime.date(2018, 12, 14), 102.06)

# Fields and methods are accessed by name, just as on the NamedTuple version
assert price2.symbol == 'MSFT'
assert price2.closing_price == 102.06
assert price2.is_high_tech()

# You can modify a dataclass instance, unlike a NamedTuple
# stock split: halve the closing price in place
price2.closing_price /= 2
# Exact comparison works here: dividing a float by 2 only changes its exponent,
# so 102.06 / 2 equals the literal 51.03
assert price2.closing_price == 51.03

In [0]:
# But you still have the problem of accidentally creating attributes that you
# didn't intend (the misspelling below is deliberate), so we won't use
# dataclasses
price2.cosing_price = 75 # oops