In [None]:
%matplotlib inline
%load_ext rpy2.ipython
import matplotlib.pyplot as plt
from matplotlib.mlab import csv2rec
import numpy as np
import scipy.stats
from numpy import mean, median, sqrt, std

# stats60 specific
from code.week1 import stylized_density
from code.utils import sample_density
# Default size passed as `figsize=` to most of the figures created in this notebook.
figsize = (8,8)

# Graphical Summaries

### Different types of data

* Categorical data (e.g. what type of high school) - also called *qualitative*.
* Ordinal data (e.g. number of missed lectures) - sometimes called *discrete quantitative*.
* Continuous data (e.g. age of incoming freshmen).

Different types of data lend themselves to different
descriptions and visualizations. 
  

## Pie graph

* Each category has a sector of the pie.
* Area of wedges proportional to percentage.
* If 58 % went to public school, wedge for public school has angle $\frac{58}{100} \times 360^{\circ} \approx 209^{\circ}$

In [None]:
%%capture
# Share of undergrads (in %) by type of high school; the two lists are parallel.
labels = ['International', 'Public', 'Private', 'Home School']
fracs = [10.2, 57.6, 31.8, 0.4]

# One figure holding a single Axes for the pie chart.
pie_fig, pie_plot = plt.subplots(figsize=figsize)
pie_plot.pie(
    fracs,
    labels=labels,
    autopct='%1.0f%%',  # print each wedge's share as a whole-number percentage
)
pie_plot.set_title(
    "Where Stanford undergrads went to high school",
    fontsize=25,
)

In [None]:
# Show the pie chart; the cell that built it ran under %%capture, so nothing was rendered there.
pie_fig

## Summarizing ordinal data

### Missed lectures


Lectures missed | Percentage
--- | ---
0 | 75 %
1 | 7 %
2 | 3 %
3 | 4 %
4 | 6 %
5 | 5 %

This data can also be summarized in a **bar plot**.

In [None]:
%%capture
# Percentage of students who missed 0, 1, ..., 5 lectures (same numbers as the table above).
missed_pct = [75, 7, 3, 4, 6, 5]

bar_fig, bar_plot = plt.subplots(figsize=figsize)
# width=1 with centred alignment makes adjacent bars touch, one bar per count.
bar_plot.bar(
    np.arange(len(missed_pct)),
    missed_pct,
    width=1,
    align='center',
    facecolor='red',
    alpha=0.7,
)
bar_plot.set_xlim([-0.5,5.5])
bar_plot.set_xlabel('Number of lectures missed', fontsize=18)
bar_plot.set_ylabel('Percentage of students (%)', fontsize=18)
bar_plot.set_title('Summary of number of missed lectures', fontsize=20)

In [None]:
# Show the bar plot built in the captured cell above.
bar_fig

## Summarizing continuous data

### California population by age

Age group | Count (1000s)| Percentage
--- | --- | ---
0-20 | 10000 | 29%
20-55 | 17500 | 17500 / 34000 = 52%
55-75 | 4500 | 13%
75+ | 2000 | 6%
**Total** | 34000 | 100 %


## Histogram

We summarize this information graphically with a special
type of bar graph called a **histogram.**

- **Key concept for course!**
- Area of a bar corresponds to percentage.
- Total area (percentage) should be 100%.
- Height of bars is called **density**.
- Area of each bar is
    $$
    \text{area of bar = (height of bar) * (width of bar) }
    $$



In [None]:
%%capture
def CAdensity():
    """Draw the histogram of California's population by age group.

    Every call creates a new figure, so callers can annotate their own copy.

    Returns
    -------
    hist_fig : the matplotlib figure holding the histogram.
    dens : second value returned by `sample_density`; callers evaluate it on
        arrays of ages to get the bar height (presumably the step density --
        confirm in code.utils).
    area : function `area(a, b)` returning `100 * (CDF(b) - CDF(a))` rounded to
        one decimal, where `CDF` is the third value returned by `sample_density`.
    """
    # Bin edges in years; the open-ended "75+" group is closed off at 100.
    bins = [0,20,55,75,100]
    # Percentage of the population in each bin (they sum to 100).
    count = [29,52,13,6]
    # One representative age strictly inside each bin.
    ages = [10, 30, 60, 80.]
    # Synthetic sample of 100 ages: each representative age repeated once per
    # percentage point, so the histogram reproduces `count` exactly.
    data = np.repeat(ages, count)

    hist_fig = plt.figure(figsize=figsize)
    hist_plot, dens, CDF = sample_density(data, bins=bins, alpha=0.5, ax=hist_fig.gca(),
                            facecolor='gray')
    hist_plot.set_ylabel('Percentage per year (%/year)', fontsize=20)
    hist_plot.set_xlabel('Age (years)', fontsize=20)
    hist_plot.set_title('California population by age groups', fontsize=22)

    def area(a, b):
        """Percentage of the population with age between `a` and `b`, to one decimal."""
        return np.round(100*(CDF(b) - CDF(a)), 1)

    return hist_fig, dens, area

# Base histogram plus its density and area helpers, reused by the cells below.
hist_fig, CAdens, CAarea = CAdensity()

In [None]:
# Show the California age histogram built in the captured cell above.
hist_fig

### Interpreting a histogram

#### Percentage in 20-55 age group is

$$\left(1.5 \, \frac{\%}{\text{year}} \right) * \left( 35 \, \text{years} \right) \approx 52 \%$$

In [None]:
%%capture
# Fresh copy of the histogram, so the shading below leaves `hist_fig` untouched.
hist_fig2, dens2, area2 = CAdensity()
ax = hist_fig2.gca()

# 20 ages strictly inside (20, 55); shade between the x-axis and the density there.
interval = np.linspace(20.001, 54.999, num=20)
ax.fill_between(
    interval,
    0*interval,
    dens2(interval),
    hatch='/',
    facecolor='yellow',
)


In [None]:
# Show the histogram with the 20-55 bar shaded.
hist_fig2

In [None]:
# Percentage of the population aged 20 to 55 according to the histogram's CDF (one decimal).
CAarea(20,55)

### Interpreting a histogram

#### Percentage in 20-40 age group is

$$\left(1.5 \, \frac{\%}{\text{year}} \right) * \left( 20 \, \text{years} \right) \approx 30 \%$$


In [None]:
%%capture
# Another fresh copy of the histogram to shade independently of the earlier figures.
hist_fig3, dens3, area3 = CAdensity()
ax = hist_fig3.gca()

# 20 ages from just above 20 up to 40, i.e. only part of the 20-55 bar.
interval = np.linspace(20.001, 40., num=20)
ax.fill_between(
    interval,
    0*interval,
    dens3(interval),
    hatch='/',
    facecolor='green',
)


In [None]:
# Show the histogram with the 20-40 region shaded.
hist_fig3

In [None]:
# Percentage of the population aged 20 to 40 according to the histogram's CDF (one decimal).
CAarea(20,40)

### Interpreting a histogram 

#### Percentage in 40-60 age group is about

$$ \left(1.5 \, \frac{\%}{\text{year}} \right) * \left( 15 \, \text{years} \right) + \left(0.6 \, \frac{\%}{\text{year}} \right) * \left( 5 \, \text{years} \right) \approx 25 \%$$

In [None]:
%%capture
# Fresh copy of the histogram for the 40-60 example.
hist_fig4, dens4, area4 = CAdensity()
ax = hist_fig4.gca()
# This range crosses the bin edge at 55, so a dense grid (501 points) is used to keep
# the drop in bar height sharp in the shaded region.
interval = np.linspace(40.001,60.,501)
# Use this figure's own density (`dens4`) rather than the one left over from the
# previous figure's cell, so the cell does not depend on `dens3` existing.
ax.fill_between(interval, 0*interval, dens4(interval),
                facecolor='blue', hatch='/')


In [None]:
# Show the histogram with the 40-60 region shaded.
hist_fig4

In [None]:
# Percentage of the population aged 40 to 60 according to the histogram's CDF (one decimal).
CAarea(40,60)

## Histograms from a list of numbers

- Given a *sample*, i.e. a list of numbers $X=[X_1,..., X_n]$ 
and a set of break points $[B_1,..., B_k]$. 

- We can form a histogram by
computing percentages for a bin $[B_j,B_{j+1})$

- Each bar's width is $B_{j+1} - B_j$.

- Each bar's height is the percentage of numbers in $X$ greater than or equal to $B_j$ but less than $B_{j+1}$, divided by the width $B_{j+1}-B_j$.


The following is a sample of mothers' and daughters' heights recorded in the early 20th
century in a study by Karl Pearson:

In [None]:
# Load the 'mother' column (mothers' heights) from the Pearson-Lee data set.
# NOTE(review): `csv2rec` was removed from `matplotlib.mlab` in Matplotlib 3.1, so this
# cell needs an older Matplotlib (or a different CSV reader) to run.
mother = csv2rec('data/pearson_lee.csv')['mother']

In [None]:
# Peek at the first ten recorded heights.
mother[:10]

In [None]:
# Sample size: number of recorded heights.
len(mother)

Now, let's make some bins:

In [None]:
# Break points 50, 51, ..., 79, i.e. one-unit-wide bins.
# Materialised as a list so the preview below shows the actual break points on
# Python 3 as well (a bare `range` slice would display as `range(50, 55)`).
binpoints = list(range(50,80))
binpoints[:5]

In [None]:
%%capture
# New figure for the histogram of mothers' heights.
mother_fig = plt.figure(figsize=(7,7))
# Histogram using every break point in `binpoints`.
# No `ax=` is passed here -- presumably sample_density then draws on the current axes,
# which is why the figure is created first; TODO confirm in code.utils.
sample_density(mother, bins=binpoints, facecolor='gray')
ax = mother_fig.gca()
ax.set_xlabel('Height (inches)', fontsize=15)
ax.set_ylabel('Percentage per inch (%/inch)', fontsize=15)

In [None]:
# Show the fine-binned histogram of mothers' heights.
mother_fig

In [None]:
%%capture
# Same data as the previous histogram, drawn on a new figure with coarser bins.
mother_coarse_fig = plt.figure(figsize=(7,7))
# `binpoints[::2]` keeps every other break point, so each bin is twice as wide.
# As above, no `ax=` is passed -- presumably the current axes is used; TODO confirm.
sample_density(mother, bins=binpoints[::2], facecolor='gray')
ax = mother_coarse_fig.gca()
ax.set_xlabel('Height (inches)', fontsize=15)
ax.set_ylabel('Percentage per inch (%/inch)', fontsize=15)

In [None]:
# Show the coarse-binned histogram of mothers' heights.
mother_coarse_fig

## Continuous histograms

- The flat histograms we drew above can be thought of as *approximations* to a **continuous** histogram.

- If the population of people we used gets larger and larger, the histogram might settle down to a curve.


In [None]:
%%capture
# Seed NumPy's global generator so the random figures (and the "Area" values shown
# in later titles) are identical on every Restart & Run All.
np.random.seed(0)
# 100,000 standard normal draws; the following cells histogram growing prefixes of it.
sample = np.random.standard_normal((100000,))
plt.figure(figsize=figsize)
# Drawing options shared by all the histograms of `sample`.
hist_opts = {'facecolor':'gray', 'alpha':0.5, 'histtype':'stepfilled'}
# First 100 draws in 10 bins; keep the axes and the CDF (the middle return value is unused).
hist_coarse, _, CDF_coarse = sample_density(sample[:100], bins=10, **hist_opts)
hist_coarse.set_xlabel('Units', fontsize=20)
hist_coarse.set_ylabel('Density (% per unit)', fontsize=20)

In [None]:
# Show the 100-draw histogram (built under %%capture above).
hist_coarse.figure

In [None]:
%%capture
# New current figure for the next histogram (no `ax=` is passed to sample_density below).
plt.figure(figsize=figsize)
# First 500 draws in 20 bins; keep the axes and the CDF for the area computation later on.
hist_fine, _, CDF_fine = sample_density(sample[:500], bins=20, **hist_opts)
hist_fine.set_xlabel('Units', fontsize=20)
hist_fine.set_ylabel('Density (% per unit change)', fontsize=20)

In [None]:
# Show the 500-draw histogram.
hist_fine.figure

In [None]:
%%capture
# New current figure for the next histogram (no `ax=` is passed to sample_density below).
plt.figure(figsize=figsize)
# First 10,000 draws in 50 bins; keep the axes and the CDF for the area computation later on.
hist_finer, _, CDF_finer = sample_density(sample[:10000], bins=50, **hist_opts)
hist_finer.set_xlabel('Units', fontsize=20)
hist_finer.set_ylabel('Density (% per unit change)', fontsize=20)

In [None]:
# Show the 10,000-draw histogram.
hist_finer.figure

In [None]:
%%capture
# `ndist` (scipy's normal distribution) is used again by later cells to draw the same curve.
from scipy.stats import norm as ndist
# Grid of 101 points on [-4, 4] at which the normal density is evaluated.
X = np.linspace(-4,4,101)
# Overlay the standard normal density on the 10,000-draw histogram.
hist_finer.plot(X, ndist.pdf(X), c='k', linewidth=4)
hist_finer.set_title('With more data, the discrete histogram converges to a curve', fontsize=15, color='red')


In [None]:
# Show the 10,000-draw histogram again, now with the normal curve and title added.
hist_finer.figure

In [None]:
%%capture
# New current figure for the finest histogram (no `ax=` is passed to sample_density below).
plt.figure(figsize=figsize)
# All 100,000 draws in 200 bins; keep the axes and the CDF for the area computation later on.
hist_finest, _, CDF_finest = sample_density(sample, bins=200, **hist_opts)
hist_finest.set_xlabel('Units', fontsize=20)
hist_finest.set_ylabel('Density (% per unit)', fontsize=20)
hist_finest.set_title('Even finer resolution...', fontsize=15, color='red')
# Overlay the standard normal density, reusing the grid `X` and `ndist` from the earlier cell.
hist_finest.plot(X, ndist.pdf(X), c='k', linewidth=4)

In [None]:
# Show the 100,000-draw histogram with the normal curve.
hist_finest.figure

## Area under a continuous histogram

In [None]:
%%capture
# NOTE(review): this mutates the shared `hist_opts` dict in place; re-running one of the
# earlier histogram cells after this one would draw with alpha=0.
hist_opts['alpha'] = 0.
# Region to shade: presumably ((start, stop, number of points), fill options) as
# interpreted by sample_density -- TODO confirm in code.utils.
regions = [((-2,2,501), {'facecolor':'yellow', 'hatch':'/'})]
# Draw again onto the existing coarse axes, this time with `regions`; [0] keeps the returned axes.
ax = sample_density(sample[:100], bins=10, regions=regions, ax=hist_coarse, **hist_opts)[0]
# Title shows 100 * (CDF(2) - CDF(-2)) for the 100-draw histogram, to one decimal.
ax.set_title('Area: %0.1f' % (100 * (CDF_coarse(2) - CDF_coarse(-2))), fontsize=15)

In [None]:
# `ax` is the coarse-histogram axes at this point; show its figure with the shaded region.
ax.figure

In [None]:
%%capture
# Draw again onto the existing 500-draw axes with the same `regions`; [0] keeps the returned axes.
ax = sample_density(sample[:500], bins=20, regions=regions, 
                    ax=hist_fine, **hist_opts)[0]
# Title shows 100 * (CDF(2) - CDF(-2)) for the 500-draw histogram, to one decimal.
ax.set_title('Area: %0.1f' % (100 * (CDF_fine(2) - CDF_fine(-2))), fontsize=15)

In [None]:
# `ax` is the 500-draw histogram axes at this point; show its figure with the shaded region.
ax.figure

In [None]:
%%capture
# Grid of 501 points on [-6, 6] on which the standard normal density is drawn.
interval = np.linspace(-6,6,501)
# Draw again onto the existing 10,000-draw axes with `regions`; [0] keeps the returned axes.
ax = sample_density(sample[:10000], bins=50, regions=regions, ax=hist_finer, **hist_opts)[0]
ax.plot(interval, ndist.pdf(interval), 'k', linewidth=4)
# set_title replaces the title text already on these axes, so no separate clearing
# call is needed. The title shows 100 * (CDF(2) - CDF(-2)), to one decimal.
ax.set_title('Area: %0.1f' % (100 * (CDF_finer(2) - CDF_finer(-2))), fontsize=15)

In [None]:
# `ax` is the 10,000-draw histogram axes at this point; show its figure with the shaded region.
ax.figure

In [None]:
%%capture
# Draw again onto the existing 100,000-draw axes with `regions`; [0] keeps the returned axes.
ax = sample_density(sample, bins=200, regions=regions, ax=hist_finest, **hist_opts)[0]
# Normal density on the `interval` grid defined in the previous area cell.
ax.plot(interval, ndist.pdf(interval), 'k', linewidth=4)
# set_title replaces the title text already on these axes, so no separate clearing
# call is needed. The title shows 100 * (CDF(2) - CDF(-2)), to one decimal.
ax.set_title('Area: %0.1f' % (100 * (CDF_finest(2) - CDF_finest(-2))), fontsize=15)

In [None]:
# `ax` is the 100,000-draw histogram axes at this point; show its figure with the shaded region.
ax.figure

## Area under a continuous histogram

As the width of the bins shrinks and the histogram converges
to a continuous curve, the area also converges:
$$
\begin{aligned}
\text{Area between -2 and 2} &= \sum_{\text{bins in $[-2,2]$}} \text{width(bin) * density(bin)} \\
&\to \int_{-2}^2 f(u) \; du.
\end{aligned}
$$

This last quantity is called an *integral* (covered in, e.g., MATH41).

### Shape of a histogram

The shape of a histogram tells us something about the data.

This histogram is **skewed left (i.e. long left tail)**

In [None]:
%%capture

with plt.xkcd():
    # Pool of two Beta samples (15,000 draws each): Beta(2, 1) followed by Beta(10, 1.5).
    sample = (
        list(np.random.beta(2, 1, size=15000))
        + list(np.random.beta(10, 1.5, size=15000))
    )
    # Figure is created inside the xkcd context so the hand-drawn style applies to it.
    skew_left = plt.figure(figsize=(6, 6))
    stylized_density(sample, ax=skew_left.gca())

In [None]:
# Show the left-skewed example.
skew_left

### Shape of a histogram

This histogram is **skewed right (i.e. long right tail)**

In [None]:
%%capture
with plt.xkcd():
    # Pool of two Beta samples: 25,000 draws of Beta(1, 2.2) followed by 5,000 of Beta(1, 1).
    sample = (
        list(np.random.beta(1, 2.2, size=25000))
        + list(np.random.beta(1, 1, size=5000))
    )
    # Figure is created inside the xkcd context so the hand-drawn style applies to it.
    skew_right = plt.figure(figsize=figsize)
    stylized_density(sample, ax=skew_right.gca())

In [None]:
# Show the right-skewed example.
skew_right

### Shape of a histogram

This histogram is **symmetric**

In [None]:
%%capture
with plt.xkcd():
    # 50,000 draws from a Student t distribution with 40 degrees of freedom.
    sample = np.random.standard_t(40, size=50000)
    # Figure is created inside the xkcd context so the hand-drawn style applies to it.
    symmetric = plt.figure(figsize=figsize)
    stylized_density(sample, ax=symmetric.gca())

In [None]:
# Show the symmetric example.
symmetric

### Shape of a histogram

This histogram **has two peaks.** Statisticians call this **bimodal**.

In [None]:
%%capture
with plt.xkcd():
    # Two pooled Student t samples; the second (20,000 draws) is shifted right by 4
    # to produce the second peak.
    sample = (
        list(np.random.standard_t(40, size=50000))
        + list(np.random.standard_t(30, size=20000) + 4)
    )
    # Figure is created inside the xkcd context so the hand-drawn style applies to it.
    bimodal = plt.figure(figsize=(7, 7))
    stylized_density(sample, ax=bimodal.gca())


In [None]:
# Show the bimodal example.
bimodal

### Shape of a histogram

This histogram is **flat.**

In [None]:
%%capture
with plt.xkcd():
    # 100,000 uniform draws on [0, 1).
    sample = np.random.sample(100000)
    # Figure is created inside the xkcd context so the hand-drawn style applies to it.
    flat = plt.figure(figsize=figsize)
    stylized_density(sample, ax=flat.gca())


In [None]:
# Show the flat example.
flat

## Other important visual summaries

### Scatter plot

A plot with two axes:

- X-axis is the *independent variable*;
- Y-axis is the *dependent variable.*

In [None]:
%%capture
with plt.xkcd():
    # Figure and its single Axes, created inside the xkcd context for the hand-drawn style.
    wage_fig, wage_ax = plt.subplots(figsize=figsize)
    # NOTE(review): `csv2rec` was removed from `matplotlib.mlab` in Matplotlib 3.1.
    wage = csv2rec('data/wage.csv')
    # Independent variable (education) on x, dependent variable (log wage) on y.
    wage_ax.scatter(
        wage['education'],
        wage['logwage'],
        s=50,
        facecolor='red',
    )
    wage_ax.set_title('Scatterplot', fontsize=20)
    wage_ax.set_xlabel('Education (years)', fontsize=20)
    wage_ax.set_ylabel('log wage (log $)', fontsize=20)

In [None]:
# Show the education vs. log-wage scatter plot.
wage_fig

## Time series

- X-axis is *time*
- Y-axis is *dependent variable*.
- Randomness is often **structured** through **(auto)correlation**.

In [None]:
%%capture
# Stock data; the three columns extracted here are reused by the following cells.
# NOTE(review): `csv2rec` was removed from `matplotlib.mlab` in Matplotlib 3.1.
stocks = csv2rec('data/stocks.csv')
dates = stocks['date']
facebook = stocks['facebook']
apple = stocks['apple']

# Time series: date on the x-axis, the "FB" column on the y-axis.
facebook_fig, facebook_ax = plt.subplots(figsize=figsize)
facebook_ax.plot(dates, facebook, 'k', linewidth=3)
facebook_ax.set_xlabel('Date', fontsize=20)
facebook_ax.set_ylabel('Price of "FB"', fontsize=20)

In [None]:
# Show the "FB" time series.
facebook_fig

In [None]:
%%capture
# Time series: date on the x-axis, the "AAPL" column on the y-axis
# (`dates` and `apple` come from the cell that loaded the stock data).
apple_fig, apple_ax = plt.subplots(figsize=figsize)
apple_ax.plot(dates, apple, 'k', linewidth=3)
apple_ax.set_xlabel('Date', fontsize=20)
apple_ax.set_ylabel('Price of "AAPL"', fontsize=20)


In [None]:
# Show the "AAPL" time series.
apple_fig

In [None]:
%%capture
# The two price series plotted against each other: "FB" on x, "AAPL" on y.
scatter_fig, scatter_ax = plt.subplots(figsize=figsize)
scatter_ax.scatter(
    facebook,
    apple,
    s=100,
    color='red',
    edgecolor='gray',
)
scatter_ax.set_ylabel('Price of "AAPL"', fontsize=20)
scatter_ax.set_xlabel('Price of "FB"', fontsize=20)

In [None]:
# Show the "FB" vs. "AAPL" scatter plot.
scatter_fig