In [None]:
from datascience import *
import numpy as np

%matplotlib inline
import matplotlib.pyplot as plots
plots.style.use('fivethirtyeight')


## Line Graphs
Line graphs show how a continuous variable changes over time or with some other quantity.

https://www.inferentialthinking.com/chapters/07/2/visualizing-numerical-distributions.html

In [None]:
# Source of the age/sex population estimates.
# As of Jan 2017, this census file is online here:
data = 'http://www2.census.gov/programs-surveys/popest/datasets/2010-2015/national/asrh/nc-est2015-agesex-res.csv'

# A copy can be accessed here in case census.gov moves the file:
# data = 'http://inferentialthinking.com/notebooks/nc-est2015-agesex-res.csv'

full_census_table = Table.read_table(data)

# Keep SEX and AGE plus the columns at positions 4 and 9 of the full table;
# those two are relabeled '2010' and '2015' straight away.
partial = full_census_table.select(['SEX', 'AGE', 4, 9])
us_pop = partial.relabeled(2, '2010').relabeled(3, '2015')

# Growth factor from the '2010' column to the '2015' column.
ratio = us_pop.column('2015') / us_pop.column('2010')

census = us_pop.with_columns(
    'Change', us_pop.column('2015') - us_pop.column('2010'),
    'Total Growth', ratio - 1,
    # Five years separate the two columns, hence the fifth root.
    'Annual Growth', ratio ** (1/5) - 1,
)

# Counts are shown as formatted numbers, growth rates as percentages.
census.set_format(['2010', '2015', 'Change'], NumberFormatter)
census.set_format(['Total Growth', 'Annual Growth'], PercentFormatter)

In [None]:
# Population by single year of age for both sexes combined.
# SEX == 0 picks the combined rows; codes 1 and 2 are the per-sex rows that
# the `males` / 'Females' cell further down works with. Filtering on SEX > 0
# would list every age once per sex, and because the SEX column is dropped by
# select() those duplicate rows could not be told apart.
# AGE < 999 removes the rows coded 999 (the all-ages total in this file).
census.where('SEX', are.equal_to(0)).where('AGE', are.below(999)).select('AGE', '2010', '2015')

In [None]:
# Line graph of the 2010 and 2015 populations against age, both sexes combined.
# SEX == 0 picks the combined rows, so each age appears exactly once on the
# horizontal axis. Filtering on SEX > 0 would feed the ages through the plot
# once per sex code, making each line double back from the oldest age to age 0.
# AGE < 999 removes the rows coded 999 (the all-ages total in this file).
census.where('SEX', are.equal_to(0)).where('AGE', are.below(999)).select('AGE', '2010', '2015').plot('AGE')

In [None]:
# The year 68 years before 2010.
# Presumably the birth year of people listed at AGE 68 in the '2010' column — TODO confirm.
2010-68

In [None]:
# The year 68 years before 2015.
# Presumably the birth year of people listed at AGE 68 in the '2015' column — TODO confirm.
2015-68

## Baby boomers: people aged 68 in 2015 were born in 1947, just after 1945 (the end of World War II)

In [None]:
# 2015 figures only, restricted to the per-sex rows (SEX > 0) and to
# individual ages (AGE < 999).
us_pop_2015 = (
    us_pop
    .drop('2010')
    .where('SEX', are.above(0))
    .where('AGE', are.below(999))
)
us_pop_2015

In [None]:
# 2015 values from the SEX == 1 rows, as a plain array.
males = us_pop_2015.where('SEX', 1).column('2015')

# Start from the SEX == 2 rows, rename their '2015' column to 'Females', and
# attach the array above as 'Males'. This relies on both sets of rows listing
# the ages in the same order.
by_sex = (
    us_pop_2015
    .where('SEX', 2)
    .drop('SEX')
    .relabeled('2015', 'Females')
    .with_column('Males', males)
)
by_sex.set_format('Males', NumberFormatter)

In [None]:
# Line graph of 'Females' and 'Males' against age ('AGE' is the first column of by_sex).
by_sex.plot('AGE')

## Scatter Plots
Each point shows the values of two variables for a single observation.

In [None]:
# Load the actors table from the course materials and give the column at
# position 5 the label '#1 Movie Gross'.
actors = Table.read_table(
    'https://raw.githubusercontent.com/data-8/materials-sp18/master/lec/actors.csv'
).relabeled(5, '#1 Movie Gross')
actors

In [None]:
# List the column labels of `actors`, to look up the exact names used in the plots.
actors.labels

In [None]:
# Scatter 'Average per Movie' and 'Gross' against 'Number of Movies'.
(
    actors
    .select('Number of Movies', 'Average per Movie', 'Gross')
    .scatter('Number of Movies')
)

## Distributions

In [None]:
# Load the top-movies table from the course materials repository and display it.
top = Table.read_table('https://github.com/data-8/materials-sp18/raw/master/lec/top_movies.csv')
top

In [None]:
# The first ten rows of `top`, in the order they appear in the table.
top10 = top.take(np.arange(10))
# Horizontal bar chart: column 0 supplies the bar labels, column 2 the bar lengths.
top10.barh(0, 2)

In [None]:
# One row per distinct 'Studio' value in `top`.
# Presumably the second column holds the number of movies per studio — confirm against the displayed table.
studios = top.group('Studio')
# Display the grouped table in full.
studios.show()

In [None]:
# Sort the studios by column 1 in descending order, then draw one horizontal bar per studio (column 0).
studios.sort(1, descending=True).barh(0)


#### Bar for categorical
For a bar chart, the proportion (probability) of each category is shown by the height of its bar, or equivalently by its area, because every bar has the same width.

#### Hist for numerical
For a histogram, the proportion (probability) of each bin is the area of its bar; the height of the bar is the density.


A histogram visualizes a single numerical variable. A histogram of a numerical dataset looks very much like a bar chart, though it has some important differences. 

Two defining properties of Histograms are:

1. The bins are drawn to scale and are contiguous (though some might be empty), because the values on the horizontal axis form a continuous number line.

2. The area of each bar is proportional to the number of entries in the bin.

Histograms are often drawn using the density scale, where the area of a bar is equal to the percent of entries in that bin. The density scale is advantageous because the areas are interpretable, and the histogram areas are drawn to scale even if the widths of the bars are different.

Computing the bar heights uses the fact that a bar is a rectangle:

(area of the bar) = (height of the bar) * (width of the bar).



## Binning

In [None]:
# Years elapsed between each movie's 'Year' and 2017 (the reference year is hard-coded).
age = 2017 - top.column('Year')
# Rebind `top` to the table returned by with_column, which carries the new 'Age' column.
top = top.with_column('Age', age)

In [None]:
# Bin edges for the 'Age' column. The bins have unequal widths: 5, 5, 5, 10, 15, 25 and 35.
my_bins = make_array(0, 5, 10, 15, 25, 40, 65, 100)

In [None]:
# Tabulate how many 'Age' values fall into each bin defined by my_bins.
top.bin('Age', bins = my_bins)

## Good one with area matching the probability

In [None]:
# Histogram of 'Age' over the unequal-width bins, with 'Year' named as the unit of the horizontal axis.
# No normalization argument is passed, so the library's default vertical scale is used.
top.hist('Age', bins = my_bins, unit = 'Year')

## Bad one with area not matching the probability

In [None]:
# Same histogram with normalization switched off. With unequal bin widths the
# bar areas then no longer reflect the share of entries in each bin.
# NOTE(review): `normed` is the older keyword name (newer matplotlib uses
# `density`) — confirm the installed datascience version still accepts it.
top.hist('Age', bins = my_bins, unit = 'Year', normed = False)

## Density

The height of a bar = density = probability / support = area / width

The height measures how crowded the bin is,

i.e. its crowdedness.


In [None]:
# Re-display the bin edges for the worked bar-height calculation in the next cells.
my_bins

In [None]:
# 42 out of 200 = 0.21.
# Presumably the share of movies that fall in the bin from 25 to 40 — TODO confirm against the binned table.
42/200

In [None]:
# Width of the bin whose edges are 25 and 40.
40 - 25

In [None]:
# Bar height = area / width: 21 (percent in the bin) divided by 15 (bin width).
21 / 15

### 1.4 percent per year is the height of the bar for the bin [25, 40): 21% of the entries spread over a width of 15 years

In [None]:
# Load the incomes table from the course materials repository and display it.
incomes = Table.read_table('https://github.com/data-8/materials-sp18/raw/master/lec/incomes.csv')
incomes

In [None]:
# Tabulate column 1 of `incomes` into three unequal-width bins with edges 0, 15, 25 and 85.
incomes.bin(1, bins = [0, 15, 25, 85])

In [None]:
# 9 out of 20 = 0.45.
# Presumably the share of entries that fall in the first bin (0 to 15) — TODO confirm against the binned table.
9/20

In [None]:
# Bar height = area / width: 45 (percent in the bin) divided by 15 (bin width).
45/15

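3.0 percent per million is the height of the bar for the bin [0, 15): 45% of the entries spread over a width of 15. Expressed as a proportion rather than a percent, the density is 3.0/100 = 0.03 per million.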

In [None]:
# Histogram of column 1 of `incomes` over the same three bins, with 'million' named as the unit of the horizontal axis.
incomes.hist(1, bins = [0, 15, 25, 85], unit = 'million')

## Overlaid Histograms

In [None]:
# Load the Galton heights data, keep the columns at positions 1, 2 and 7,
# and label the last of those three 'child'.
height = (
    Table.read_table('https://github.com/data-8/materials-sp18/raw/master/lec/galton.csv')
    .select(1, 2, 7)
    .relabeled(2, 'child')
)
# Preview the first six rows.
height.show(6)

In [None]:
# Histogram of the 'father' column, with 'inch' named as the unit of the horizontal axis.
height.hist('father', unit='inch')

In [None]:
# Histogram of the 'child' column, with 'inch' named as the unit of the horizontal axis.
height.hist('child', unit='inch')

In [None]:
# No column is named, so every column of `height` is drawn.
# All of them share the bin edges 55, 57, ..., 79, which makes the distributions directly comparable.
height.hist(unit='inch', bins=np.arange(55, 80, 2))

The children's heights are more spread out than the parents' heights.

In [None]:
# Scatter the other columns of `height` against 'child' (the column at position 2).
height.scatter('child')

In [None]:
# Add the mean of the two parents' heights as a new column.
height = height.with_column(
    'parent average',
    (height.column('father') + height.column('mother')) / 2,
)
height


In [None]:
# Scatter plot with 'parent average' on the horizontal axis and 'child' on the vertical axis.
height.scatter('parent average', 'child')

In [None]:
# Same scatter plot, with two red vertical lines marking the strip of
# parent averages between 67.5 and 68.5.
height.scatter('parent average', 'child')
for boundary in (67.5, 68.5):
    # Each line runs from y = 50 to y = 85 at the given x position.
    plots.plot([boundary, boundary], [50, 85], color='red', lw=2)

In [None]:
def predict_child(pa, window=0.5):
    """Predict a child's height from a parent-average height.

    The prediction is the mean of the 'child' column over the rows of the
    global `height` table whose 'parent average' is close to `pa`.

    Parameters
    ----------
    pa : number
        Parent-average height to predict for.
    window : number, optional
        Half-width of the neighbourhood around `pa`; rows are selected with
        ``are.between(pa - window, pa + window)``. Defaults to 0.5, the value
        that was previously hard-coded, so existing one-argument calls
        (including ``height.apply(predict_child, ...)``) behave as before.

    Returns
    -------
    number
        Mean 'child' value of the selected rows. If no row falls inside the
        window, this is the mean of an empty column (presumably NaN, as for
        an empty NumPy array).
    """
    close_points = height.where(
        'parent average', are.between(pa - window, pa + window)
    )
    return close_points.column('child').mean()

In [None]:
# Apply predict_child to all the midparent heights, then plot both the actual
# 'child' heights and the predictions against 'parent average'.
(
    height
    .with_column('prediction', height.apply(predict_child, 'parent average'))
    .select('child', 'parent average', 'prediction')
    .scatter('parent average')
)