# Intro to Plotting

### Sneak peek:

In [None]:
%matplotlib inline
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

pd.options.display.max_rows = 10
sns.set(style='ticks', context='talk')
plt.rcParams['figure.figsize'] = (12, 6)

In [None]:
# Beer reviews: the `time` column is parsed as datetimes and the gzip
# compression is stated explicitly rather than inferred from the extension.
beer = pd.read_csv(
    '../data/random/beer_subset.csv.gz',
    compression='gzip',
    parse_dates=['time'],
)
# Names of the columns whose label begins with "review"
review_cols = [name for name in beer.columns if name.startswith('review')]
beer.head()

In [None]:
# Tall, narrow canvas for horizontal bars, one group per star value.
fig, ax = plt.subplots(figsize=(5, 10))
sns.countplot(
    data=(
        beer[review_cols]
        .stack()                           # wide -> long: one row per (record, review column)
        .rename_axis(['record', 'kind'])   # name the two index levels
        .rename('stars')                   # name the stacked values
        .reset_index()                     # index levels become ordinary columns
    ),
    y='stars',
    hue='kind',
    order=np.arange(0, 5.5, .5),           # 0, 0.5, ..., 5.0
    ax=ax,
)
sns.despine()

## Matplotlib

- Tons of features
- "Low-level" library

Check out [the tutorials](http://matplotlib.org/users/beginner.html)

In [None]:
from IPython import display
# IFrame emits a complete, properly closed <iframe> element; the previous
# hand-written HTML string never closed the tag.
display.IFrame('http://matplotlib.org/users/beginner.html', width=1024, height=500)

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt

In [None]:
plt.plot([1,2,3,4])
plt.ylabel('some numbers')
plt.show()

A single series is interpreted as y values, so x is just the index...

In [None]:
x = np.arange(5)
y = x**2
plt.plot(x, y)

For every x, y pair of arguments, there is an optional third argument which is the format string that indicates the color and line type of the plot. 

In [None]:
plt.plot(x, y, 'ro')

To work on plots in more detail, it's useful to store the "axis" object

In [None]:
fig, ax = plt.subplots()

In [None]:
ax.

In [None]:
ax.plot(x, y, 'ro')

In [None]:
fig

In [None]:
ax.set_xlim([0,5])
ax.set_ylim([0,18])
fig

Lots of `keyword` properties...

In [None]:
fig, ax = plt.subplots()
ax.plot(x, y, linewidth=5, alpha=.3)

#### Overlaying plots

In [None]:
np.random.seed(5)
fig, ax = plt.subplots()
ax.plot(np.arange(10), np.random.rand(10))
ax.plot(np.arange(10), np.random.rand(10))

#### Multiple plots

In [None]:
fig, ax = plt.subplots(nrows=2)

In [None]:
ax

In [None]:
ax[0].plot(x, y)
fig

In [None]:
ax[1].plot(y, x)
fig

In [None]:
fig.tight_layout()
fig

In [None]:
fig.set_size_inches(10,6)
fig

#### Types of axes

In [None]:
fig, ax = plt.subplots()
ax.plot(np.arange(20))

In [None]:
ax.set_yscale('log')
fig

In [None]:
fig, ax = plt.subplots(ncols=2)
ax[0].plot(np.arange(100))
ax[1].plot(np.arange(100))
ax[1].set_yscale('log')
fig.tight_layout()

The best way to learn is [the gallery](http://matplotlib.org/gallery.html)

In [None]:
# IFrame emits a complete, properly closed <iframe> element; the previous
# hand-written HTML string never closed the tag.
display.IFrame('http://matplotlib.org/gallery.html', width=1024, height=500)

### A handful of examples

Scatter plots and "bubble charts"

In [None]:
# Scatter ("bubble") chart of random points with per-point colour and size.
fig, ax = plt.subplots()

# The four draws below consume the global NumPy random stream in this order,
# so reordering them would change the picture. Note that `x` and `y` rebind
# the notebook-level names used by earlier cells.
n = 20
x = np.random.normal(size=n)        # horizontal positions
y = np.random.normal(size=n)        # vertical positions
c = np.random.uniform(size=n)       # per-point values passed as the colour argument
s = np.random.randint(100, size=n)  # per-point marker sizes, integers in [0, 100)
ax.scatter(x, y, c=c, s=s, alpha=0.5)

# Raw strings keep the backslashes intact for the math-text axis labels
ax.set_xlabel(r'$\Delta_i$', fontsize=20)
ax.set_ylabel(r'$\Delta_{i+1}$', fontsize=20)
ax.set_title('Some title')

ax.grid(True)
fig.tight_layout()

#### Bar charts

In [None]:
people = ['Annie', 'Brian', 'Chelsea', 'Derek', 'Elise']
performance = 3 + 10 * np.random.rand(len(people))
error = np.random.rand(len(people))

In [None]:
fig, ax = plt.subplots()
ax.barh(np.arange(len(people)), performance, xerr=error, align='center', alpha=0.4)
ax.set_yticks(np.arange(len(people)))
ax.set_yticklabels(people)
ax.set_xlabel('Score')
ax.set_title('Objective Assessment of Human Worth')

# Exercise 1
Using the data we prepared earlier (`data/processed/age_adjusted.hdf`) try to recreate this plot:

![Exercise 1](../img/exercise-1.png)


_Note_: East Asia is location_id 5

__Bonus__: Add uncertainty intervals (_Hint_: look up `plt.fill_between?`)

In [None]:
df = #
df.head()

In [None]:
east_asia = #
east_asia

In [None]:
fig, ax = #

# Plotting with Pandas

matplotlib is a relatively *low-level* plotting package compared to others. It makes very few assumptions about what constitutes good layout (by design), but has a lot of flexibility to allow the user to completely customize the look of the output.

On the other hand, Pandas includes methods for DataFrame and Series objects that are relatively high-level, and that make reasonable assumptions about how the plot should look.

In [None]:
normals = pd.Series(np.random.normal(size=10))
normals.plot()

In [None]:
normals.cumsum().plot(grid=True)

Similarly, for a DataFrame:

In [None]:
variables = pd.DataFrame({'normal': np.random.normal(size=100), 
                          'gamma': np.random.gamma(1, size=100), 
                          'poisson': np.random.poisson(size=100)})
variables.cumsum(0).plot()

All Pandas plotting commands return `matplotlib` `axis` objects:

In [None]:
ax = variables.cumsum(0).plot()
type(ax)

In [None]:
ax.set_xlabel('My X Axis')
ax.figure

In [None]:
ax.vlines?

In [None]:
ax.get_ylim()

In [None]:
ax.vlines(80, *ax.get_ylim())
ax.figure

As an illustration of the high-level nature of Pandas plots, we can split multiple series into subplots with a single argument for `plot`:

In [None]:
variables.cumsum(0).plot(subplots=True)

Or, we could use a secondary y-axis:

In [None]:
variables.cumsum(0).plot(secondary_y='normal', grid=False)

(Note that ["friends don't let friends use two y-axes"](https://kieranhealy.org/blog/archives/2016/01/16/two-y-axes/), but we're just showing some examples here...)

If we would like a little more control, we can use matplotlib's `subplots` function directly, and manually assign plots to its axes:

In [None]:
fig, ax = plt.subplots(nrows=1, ncols=3, figsize=(12, 4))
for i,var in enumerate(variables.columns):
    variables[var].cumsum(0).plot(ax=ax[i], title=var)
ax[0].set_ylabel('cumulative sum')

### Bar plots

Bar plots are useful for displaying and comparing measurable quantities, such as counts or volumes. In Pandas, we just use the `plot` method with a `kind='bar'` argument.

For this series of examples, let's load up the Titanic dataset:

In [None]:
titanic = pd.read_excel("../data/random/titanic.xls", "titanic")
titanic.head()

In [None]:
titanic.groupby('pclass')['survived'].sum().plot(kind='bar')

In [None]:
titanic.groupby(['sex','pclass'])['survived'].sum().plot(kind='barh')

In [None]:
death_counts = pd.crosstab([titanic['pclass'], titanic['sex']], titanic['survived'].astype(bool))
death_counts.plot(kind='bar', stacked=True, color=['black','gold'], grid=False)

Or if we wanted to see survival _rate_ instead:

In [None]:
death_counts.div(death_counts.sum(1).astype(float), axis=0).plot(kind='barh', stacked=True, color=['black','gold'])

## Histograms

Frequently it is useful to look at the *distribution* of data before you analyze it. Histograms are a sort of bar graph that displays relative frequencies of data values; hence, the y-axis is always some measure of frequency. This can either be raw counts of values or scaled proportions.

For instance, fare distributions aboard the titanic:

In [None]:
titanic['fare'].hist()

In [None]:
titanic['fare'].hist(grid=False)

In [None]:
titanic['fare'].hist(grid=False, bins=30)

In [None]:
titanic['fare'].dropna().plot(kind='kde', xlim=(0,600))

In [None]:
# Normalised histogram with a kernel density estimate drawn over it.
# `density=True` replaces the `normed=True` keyword, which current matplotlib
# releases no longer accept; it scales the bars so the histogram integrates
# to 1 and is therefore comparable with the KDE curve.
titanic['fare'].hist(bins=30, density=True, color='steelblue')
# The KDE cannot handle missing values, hence the dropna()
titanic['fare'].dropna().plot(kind='kde', xlim=(0,600), style='r--')

### Boxplots

A different way of visualizing the distribution of data is the boxplot, which is a display of common quantiles; these are typically the quartiles and the lower and upper 5 percent values.

In [None]:
titanic.boxplot(column='fare', by='pclass')

One way to add additional information to a boxplot is to overlay the actual data; this is generally most suitable with small- or moderate-sized data series.

In [None]:
# Box plot of age per passenger class, with the raw observations overlaid.
# `bp` keeps whatever `boxplot` returns; it is not used again in this cell.
bp = titanic.boxplot(column='age', by='pclass', grid=False)
# Classes are hard-coded as 1, 2, 3; each value doubles as the x position of its box.
# NOTE: `x` and `y` rebind the notebook-level names on every iteration.
for i in [1,2,3]:
    # Ages of the passengers in class `i`, missing values removed
    y = titanic.age[titanic.pclass==i].dropna()
    # Add some random "jitter" to the x-axis
    x = np.random.normal(i, 0.04, size=len(y))
    # pyplot draws on the current axes (presumably the box plot created above)
    plt.plot(x, y.values, 'r.', alpha=0.2)

### Scatter plots

In [None]:
beer.head()

In [None]:
plt.scatter(beer['abv'], beer['review_overall'])
plt.xlabel('ABV')
plt.ylabel('Score')

In [None]:
plt.scatter(beer['abv'], beer['review_overall'], s=np.sqrt(beer['review_palate']*150), alpha=0.3)
plt.xlabel('ABV')
plt.ylabel('Score')

In [None]:
plt.scatter(beer['abv'], beer['review_overall'], alpha=0.3, c=beer.review_palate, cmap='hot')
plt.xlabel('ABV')
plt.ylabel('Score')

In [None]:
# Review scores take few distinct values, so spread them with uniform noise
# in [-0.5, 0.5) to keep overlapping points visible.
jittered_df = beer[review_cols] + (np.random.rand(*beer[review_cols].shape) - 0.5)
# `scatter_matrix` lives in `pandas.plotting`; the top-level `pd.scatter_matrix`
# alias has been removed from pandas.
pd.plotting.scatter_matrix(jittered_df, figsize=(12,8), diagonal='kde')

# Exercise 2
Recreate this figure using `pandas.DataFrame.plot`:
![Exercise 2](../img/exercise-2.png)

In [None]:
ax = #

### Lots more info on Pandas plotting in [the docs](http://pandas.pydata.org/pandas-docs/stable/visualization.html)

## [Seaborn](http://seaborn.pydata.org/)

High-level interface for `matplotlib`

In [None]:
sns.pairplot(jittered_df[review_cols])

In [None]:
sns.pairplot?

In [None]:
sns.pairplot(jittered_df[review_cols], kind='reg', 
             plot_kws={'line_kws':{'color': 'red'}})

In [None]:
sns.heatmap(beer[review_cols].corr())

In [None]:
sns.kdeplot(beer['abv'])

In [None]:
# Pass the two variables by keyword: recent seaborn releases no longer accept
# x and y positionally (older releases accept the keywords as well).
sns.jointplot(x=jittered_df['review_aroma'], y=jittered_df['review_appearance'])

In [None]:
# Keyword x/y for compatibility with recent seaborn releases; the documented
# name of the hexagonal-binning kind is 'hex' ('hexbin' is rejected by
# releases that validate the argument).
sns.jointplot(x=jittered_df['review_aroma'], y=jittered_df['review_appearance'],
              kind='hex')

In [None]:
titanic[['age','fare','pclass','survived']].dropna().head()

In [None]:
sns.pairplot(titanic[['age','fare','pclass','survived']].dropna())

In [None]:
sns.pairplot(titanic[['age','fare','pclass','survived']].dropna(),
            hue='pclass')

Seaborn also returns objects built on `matplotlib`: `pairplot` hands back a `PairGrid`, which exposes the underlying `matplotlib` figure and axes...

In [None]:
ax = sns.pairplot(titanic[['age','fare','pclass','survived']].dropna(),
            hue='pclass')

In [None]:
ax.axes

In [None]:
ax.axes[1,1].set_title('For Example')
ax.fig

# Exercise 3
Load in `data/processed/gbd_1to4_deaths_and_sdi.hdf` and recreate this plot using `seaborn`:
![Exercise 3](../img/exercise-3.png)

In [None]:
kids = #
kids.head()

## Saving figures

In [None]:
ax.fig.savefig('my-beautiful-figure.png', dpi=300)

# Exercise 4
1. Make a function to produce the figure from Exercise 1 for any GBD region
2. Check out `PdfPages`
    * `from matplotlib.backends.backend_pdf import PdfPages`
    * `PdfPages?`
3. Make a PDF where each page is the graph from Exercise 1 for a different GBD region
    * If you need an example, check out [this one](https://matplotlib.org/examples/pylab_examples/multipage_pdf.html)
4. _Bonus_: Try getting location names from `data/raw/locs.csv`

In [None]:
from matplotlib.backends.backend_pdf import PdfPages

In [None]:
age_adjusted = #

In [None]:
def plot_location_id(location_id, dat=age_adjusted):
    """
    Generate a plot of the age-adjusted log death rate over time
    for a given location.

    Args:
        location_id (int): id of the location to plot
        dat (pandas.DataFrame): the dataframe from which to retrieve
            data. The default is whatever `age_adjusted` refers to at
            the moment this `def` is executed, because Python evaluates
            default arguments once, at definition time.

    Returns:
        fig: the figure for the requested location (presumably a
            matplotlib figure, to be created by the exercise code below)

    """

    # TODO (exercise): build the plot for `location_id` from `dat` and bind
    # the resulting figure to `fig`; nothing in this body defines `fig` yet.
    
    return fig

In [None]:
# loop through locations to make a PDF

### [ggplot](http://ggplot.yhathq.com/)

In [None]:
# Import only the names this notebook uses instead of `from ggplot import *`,
# so each name's origin is visible and nothing else in the namespace is
# silently shadowed.
from ggplot import (aes, diamonds, geom_point, ggplot, ggtitle,
                    scale_color_brewer, xlab, ylab)

# Parentheses replace the backslash continuations, which break as soon as
# anything (even a space) follows the backslash.
(ggplot(diamonds, aes(x='carat', y='price', color='cut'))
 + geom_point()
 + scale_color_brewer(type='diverging', palette=4)
 + xlab("Carats") + ylab("Price") + ggtitle("Diamonds"))

In [None]:
p = ggplot(diamonds, aes(x='carat', y='price', color='cut'))
p += geom_point()
p += scale_color_brewer(type='diverging', palette=4)
p += xlab("Carats") 
p += ylab("Price") 
p += ggtitle("Diamonds")
p

In [None]:
p.fig.axes

### [Bokeh](http://bokeh.pydata.org/)

In [None]:
from bokeh.io import push_notebook, show, output_notebook
from bokeh.layouts import row
from bokeh.plotting import figure
from bokeh.palettes import brewer
output_notebook()

# Synthetic data: N integer x positions plus ten random series y0..y9.
N = 20
categories = ['y%d' % i for i in range(10)]

data = {'x': np.arange(N)}
for cat in categories:
    # one draw of N integers in [10, 100) per category, in category order
    data[cat] = np.random.randint(10, 100, size=N)

# Frame indexed by x, with one column per category
df = pd.DataFrame(data).set_index(['x'])

def stacked(df, categories):
    """Build the outline of each band of a stacked-area chart.

    The categories are accumulated in the given order. For every category
    the outline is the running total *before* adding it (reversed),
    followed by the running total *after* adding it.

    Args:
        df: frame with one column per entry of `categories`.
        categories: column names, in stacking order (must be non-empty,
            since the first entry is used to size the zero baseline).

    Returns:
        dict mapping each category to a 1-D array of length 2 * len(df).
    """
    lower = np.zeros(len(df[categories[0]]))  # baseline under the first band
    outlines = {}
    for name in categories:
        upper = lower + df[name]
        # reversed lower edge first, then the upper edge
        outlines[name] = np.hstack((lower[::-1], upper))
        lower = upper
    return outlines

# Outline (reversed lower edge + upper edge) of every category band
areas = stacked(df, categories)

# Palette lookup keyed by the number of bands
colors = brewer["Spectral"][len(areas)]

# x coordinates matching the outlines: the x values reversed, then in order
x2 = np.hstack((data['x'][::-1], data['x']))

# NOTE(review): both ranges are hard-coded. x fits the N = 20 points, but the
# stacked total of ten series drawn from [10, 100) can exceed 800 — confirm
# that occasional clipping at the top is acceptable.
p = figure(x_range=(0, 19), y_range=(0, 800))
p.grid.minor_grid_line_color = '#eeeeee'

# One filled patch per category, all sharing the same x outline
p.patches([x2] * len(areas), [areas[cat] for cat in categories],
          color=colors, alpha=0.8, line_color=None)

# NOTE(review): push_notebook() directly follows show() with no change to `p`
# in between — presumably kept only to demonstrate the call; confirm.
show(p, notebook_handle=True)
push_notebook()

## So many plotting libraries!

In [None]:
# IFrame emits a complete, properly closed <iframe> element; the previous
# hand-written HTML string never closed the tag.
display.IFrame('https://dansaber.wordpress.com/2016/10/02/a-dramatic-tour-through-pythons-data-visualization-landscape-including-ggplot-and-altair/', width=1024, height=500)

## References

Slide materials inspired by and adapted from [Chris Fonnesbeck](https://github.com/fonnesbeck/statistical-analysis-python-tutorial) and [Tom Augspurger](https://github.com/TomAugspurger/pydata-chi-h2t)