In [308]:
import numpy as np
import pandas as pd

import bokeh.plotting
import bokeh.io
import bokeh.layouts
import bokeh_catplot

bokeh.io.output_notebook()

# Exercise 6.1: Axes with logarithmic scale and error bars

In [2]:
!cat data/collins_switch.csv

# Data digitized from Fig. 5a of Gardner, et al., *Nature*, **403**, 339, 2000. The last column gives the standard error of the mean normalized GFP intensity.
[IPTG] (mM),normalized GFP expression (a.u.),sem
0.001000,0.004090,0.003475
0.010000,0.010225,0.002268
0.020000,0.022495,0.004781
0.030000,0.034765,0.003000
0.040000,0.067485,0.006604
0.040000,0.668712,0.087862
0.060000,0.740286,0.045853
0.100000,0.840491,0.058986
0.300000,0.936605,0.026931
0.600000,0.961145,0.093553
1.000000,0.940695,0.037624
3.000000,0.852761,0.059035
6.000000,0.910020,0.051052
10.000000,0.893661,0.042773


a) Now, let’s make a plot of IPTG versus GFP.

1. Load in the data set using Pandas. Make sure you use the `comment` kwarg of pd.read_csv() properly.
2. Make a plot of normalized GFP intensity (y-axis) versus IPTG concentration (x-axis).


In [3]:
df = pd.read_csv('data/collins_switch.csv', comment='#')

df.head()

Unnamed: 0,[IPTG] (mM),normalized GFP expression (a.u.),sem
0,0.001,0.00409,0.003475
1,0.01,0.010225,0.002268
2,0.02,0.022495,0.004781
3,0.03,0.034765,0.003
4,0.04,0.067485,0.006604


In [6]:
p = bokeh.plotting.figure(
    frame_width=400,
    frame_height=300,
    x_axis_label='[IPTG] (mM)',
    y_axis_label='normalized GFP expression (a.u.)',
)

p.circle(
    source=df,
    x='[IPTG] (mM)',
    y='normalized GFP expression (a.u.)',
)

bokeh.io.show(p)

b) Now that you have done that, there are some problems with the plot. It is really hard to see the data points with low concentrations of IPTG. In fact, looking at the data set, the concentration of IPTG varies over four orders of magnitude. When you have data like this, it is wise to plot them on a logarithmic scale. You can specify the x-axis as logarithmic when you instantiate a figure with `bokeh.plotting.figure()` by using the `x_axis_type='log'` kwarg. (The obvious analogous kwarg applied for the y-axis.) For this data set, it is definitely best to have the x-axis on a logarithmic scale. Remake the plot you just did with the x-axis logarithmically scaled.

In [7]:
p = bokeh.plotting.figure(
    frame_width=400,
    frame_height=300,
    x_axis_label='[IPTG] (mM)',
    x_axis_type='log',
    y_axis_label='normalized GFP expression (a.u.)',
)

p.circle(
    source=df,
    x='[IPTG] (mM)',
    y='normalized GFP expression (a.u.)',
)

bokeh.io.show(p)

c) The data set also contains the standard error of the mean, or SEM. The SEM is often displayed on plots as error bars. Now construct the plot with error bars.

1. Add columns `error_low` and `error_high` to the data frame containing the Collins data. These will set the bottoms and tops of the error bars. You should base the values in these columns on the standard error of the mean (`sem`). Assuming a Gaussian model, the 95% confidence interval is ±1.96 times the s.e.m.
2. Make a plot with the measured expression levels and the error bars. Hint: Check out the Bokeh docs and think about what kind of glyph works best for error bars.


In [12]:
# Assume Gaussian model, calculate 95% CI

df['error_low'] = df['normalized GFP expression (a.u.)'] - 1.96 * df['sem']
df['error_high'] = df['normalized GFP expression (a.u.)'] + 1.96 * df['sem']

df.head()

Unnamed: 0,[IPTG] (mM),normalized GFP expression (a.u.),sem,error_low,error_high
0,0.001,0.00409,0.003475,-0.002721,0.010901
1,0.01,0.010225,0.002268,0.00578,0.01467
2,0.02,0.022495,0.004781,0.013124,0.031866
3,0.03,0.034765,0.003,0.028885,0.040645
4,0.04,0.067485,0.006604,0.054541,0.080429


In [51]:
p = bokeh.plotting.figure(
    frame_width=400,
    frame_height=300,
    x_axis_label='[IPTG] (mM)',
    x_axis_type='log',
    y_axis_label='normalized GFP expression (a.u.)',
)

p.circle(
    source=df,
    x='[IPTG] (mM)',
    y='normalized GFP expression (a.u.)',
)

p.segment(
    source=df,
    x0='[IPTG] (mM)',
    y0='error_low',
    x1='[IPTG] (mM)',
    y1='error_high'
)

bokeh.io.show(p)

In [52]:
error_x = []
error_bars = []

for x in df['[IPTG] (mM)']:
    error_x.append((x,x))

for err_low, err_high in zip(df['error_low'], df['error_high']):
    error_bars.append((err_low, err_high))

p = bokeh.plotting.figure(
    frame_width=400,
    frame_height=300,
    x_axis_label='[IPTG] (mM)',
    x_axis_type='log',
    y_axis_label='normalized GFP expression (a.u.)',
)

p.circle(
    source=df,
    x='[IPTG] (mM)',
    y='normalized GFP expression (a.u.)',
)

p.multi_line(
    xs=error_x,
    ys=error_bars
)


bokeh.io.show(p)

In [50]:
from bokeh.models import ColumnDataSource, Whisker

source_error = ColumnDataSource(data=dict(base=df['[IPTG] (mM)'], lower=df['error_low'], upper=df['error_high']))

p = bokeh.plotting.figure(
    frame_width=400,
    frame_height=300,
    x_axis_label='[IPTG] (mM)',
    x_axis_type='log',
    y_axis_label='normalized GFP expression (a.u.)',
)

p.circle(
    source=df,
    x='[IPTG] (mM)',
    y='normalized GFP expression (a.u.)',
)

p.add_layout(
    Whisker(
        source=source_error,
        base='base',
        upper='upper',
        lower='lower'
    )
)

bokeh.io.show(p)

# Exercise 6.2: Automating scatter plots

Write a function that takes as input a tidy data frame and generates a scatter plot based on two columns of the data frame and colors the glyphs according to a third column that contains categorical variables. The minimal (you can add other kwargs if you want) call signature should be

    scatter(data, cat, x, y)

You will of course test out your function while writing it, and the next exercises give you lots of opportunities to use it.

In [53]:
# Import an example data source

df = pd.read_csv('data/gfmt_sleep.csv', na_values='*')
df['insomnia'] = df['sci'] <= 16

df.head()

Unnamed: 0,participant number,gender,age,correct hit percentage,correct reject percentage,percent correct,confidence when correct hit,confidence incorrect hit,confidence correct reject,confidence incorrect reject,confidence when correct,confidence when incorrect,sci,psqi,ess,insomnia
0,8,f,39,65,80,72.5,91.0,90.0,93.0,83.5,93.0,90.0,9,13,2,True
1,16,m,42,90,90,90.0,75.5,55.5,70.5,50.0,75.0,50.0,4,11,7,True
2,18,f,31,90,95,92.5,89.5,90.0,86.0,81.0,89.0,88.0,10,9,3,True
3,22,f,35,100,75,87.5,89.5,,71.0,80.0,88.0,80.0,13,8,20,True
4,27,f,74,60,65,62.5,68.5,49.0,61.0,49.0,65.0,49.0,13,9,12,True


Let's say we want to generate a scatter plot based on the columns `'correct hit percentage'` and `'correct reject percentage'`, and color the glyphs based on the categorical column `'gender'`.

In [316]:
def scatter(data, 
            cat, 
            x, 
            y, 
            legend_location='top_left', 
            x_axis_type='linear', 
            y_axis_type='linear',
           ):
    """Generates a scatter plot from x and y in data.
    Colors the glyphs according to cat."""
    
    colors = bokeh.palettes.Colorblind[8]
    
    p = bokeh.plotting.figure(
        frame_width=400,
        frame_height=300,
        x_axis_label=x,
        y_axis_label=y,
        x_axis_type=x_axis_type,
        y_axis_type=y_axis_type,
    )

    for i, val in enumerate(np.unique(data[cat])):
        p.circle(
            source=data.loc[data[cat]==val],
            x=x,
            y=y,
            legend_label=str(val),
            color=colors[i % len(palette)],
        )
    
    p.legend.title = cat
    p.legend.location = legend_location
    p.legend.click_policy = 'hide'

    bokeh.io.show(p)

In [315]:
# Note from class: colorcet

import colorcet
colorcet.b_glasbey_category10[0:5]

# Another resource for colors: colorbrewer2.org

['#1f77b3', '#ff7e0e', '#2ba02b', '#d62628', '#9367bc']

In [110]:
data = df
cat = 'gender'
x = 'correct hit percentage'
y = 'correct reject percentage'


scatter(data, cat, x, y, legend_location='bottom_left')

In [111]:
data = df
cat = 'insomnia'
x = 'confidence when correct'
y = 'confidence when incorrect'

scatter(data, cat, x, y)

# Exercise 6.3: Long-term trends in hybridization of Darwin finches.

We will investigate their measurements of beak depth (the distance, top to bottom, of a closed beak) and beak length (base to tip on the top) of Darwin’s finches. We will look at data from two species, Geospiza fortis and Geospiza scandens. The Grants provided data on the finches of Daphne for the years 1973, 1975, 1987, 1991, and 2012. I have included the data in the files `grant_1973.csv`, `grant_1975.csv`, `grant_1987.csv`, `grant_1991.csv`, and `grant_2012.csv`. They are in almost exactly the same format is in the Dryad repository; I have only deleted blank entries at the end of the files.

a) Load each of the files into separate Pandas data frames. You might want to inspect the file first to make sure you know what character the comments start with and if there is a header row.

In [114]:
!head -20 data/grant_1973.csv

# Data taken from the book
#   Grant, PR, Grant, BR (2014) 40 years of evolution:
#   Darwin's finches on Daphne Major Island.
#   Princeton: Princeton University Press.
#
# Accessed throug the Dryad data package:
#   Grant PR, Grant BR(2014) Data from: 40 years of evolution.
#   Darwin's finches on Daphne Major Island. Dryad Digital Repository.
#   http://dx.doi.org/10.5061/dryad.g6g3h
#
# The data appear as in the original file
#        Fig. 10-01.csv
#
band,species,yearband,beak length,beak depth
20123,fortis,73,9.25,8.05
20126,fortis,73,11.35,10.45
20128,fortis,73,10.15,9.55
20129,fortis,73,9.95,8.75
20133,fortis,73,11.55,10.15
20136,fortis,73,11.15,9.85


In [117]:
!head -20 data/grant_1975.csv

# Data taken from the book
#   Grant, PR, Grant, BR (2014) 40 years of evolution:
#   Darwin's finches on Daphne Major Island.
#   Princeton: Princeton University Press.
#
# Accessed throug the Dryad data package:
#   Grant PR, Grant BR(2014) Data from: 40 years of evolution.
#   Darwin's finches on Daphne Major Island. Dryad Digital Repository.
#   http://dx.doi.org/10.5061/dryad.g6g3h
#
# The data appear as in the original file
#        Fig. 10-03 data, 75.csv
#
band,species,"Beak length, mm","Beak depth, mm"
2,fortis,9.4,8
9,fortis,9.2,8.3
12,fortis,9.5,7.5
15,fortis,9.5,8
305,fortis,11.5,9.9
307,fortis,11.1,8.6


In [118]:
!head -20 data/grant_1987.csv

# Data taken from the book
#   Grant, PR, Grant, BR (2014) 40 years of evolution:
#   Darwin's finches on Daphne Major Island.
#   Princeton: Princeton University Press.
#
# Accessed throug the Dryad data package:
#   Grant PR, Grant BR(2014) Data from: 40 years of evolution.
#   Darwin's finches on Daphne Major Island. Dryad Digital Repository.
#   http://dx.doi.org/10.5061/dryad.g6g3h
#
# The data appear as in the original file
#        Fig. 10-03 data, 87.csv
#
band,species,"Beak length, mm","Beak depth, mm"
14613,fortis,9.1,7
15487,fortis,9.14,7.12
15187,fortis,9.24,7.21
15284,fortis,9.2,7.3
14983,fortis,8.83,7.32
14913,fortis,9.14,7.41


In [119]:
!head -20 data/grant_1991.csv

# Data taken from the book
#   Grant, PR, Grant, BR (2014) 40 years of evolution:
#   Darwin's finches on Daphne Major Island.
#   Princeton: Princeton University Press.
#
# Accessed throug the Dryad data package:
#   Grant PR, Grant BR(2014) Data from: 40 years of evolution.
#   Darwin's finches on Daphne Major Island. Dryad Digital Repository.
#   http://dx.doi.org/10.5061/dryad.g6g3h
#
# The data appear as in the original file
#        Fig. 10-03 data, 91.csv
#
band,species,blength,bdepth
2639,fortis,10.3,8.95
2666,fortis,12.81,9.3
2753,fortis,10.89,10.35
2776,fortis,11.3,10
4229,fortis,10.05,8.62
4677,fortis,11.02,10.17


In [120]:
!head -20 data/grant_2012.csv

# Data taken from the book
#   Grant, PR, Grant, BR (2014) 40 years of evolution:
#   Darwin's finches on Daphne Major Island.
#   Princeton: Princeton University Press.
#
# Accessed throug the Dryad data package:
#   Grant PR, Grant BR(2014) Data from: 40 years of evolution.
#   Darwin's finches on Daphne Major Island. Dryad Digital Repository.
#   http://dx.doi.org/10.5061/dryad.g6g3h
#
# The data appear as in the original file
#        Fig. 10-03 data, 12.csv
#
band,species,blength,bdepth
19022,fortis,10,8.5
19028,fortis,12.5,8.9
19032,fortis,9.3,7.5
19041,fortis,10.3,9.6
19044,fortis,11,9.2
19048,fortis,10.1,8.2


In [210]:
df_1973 = pd.read_csv('data/grant_1973.csv', comment='#')

df_1973.head()

Unnamed: 0,band,species,yearband,beak length,beak depth
0,20123,fortis,73,9.25,8.05
1,20126,fortis,73,11.35,10.45
2,20128,fortis,73,10.15,9.55
3,20129,fortis,73,9.95,8.75
4,20133,fortis,73,11.55,10.15


In [211]:
df_1975 = pd.read_csv('data/grant_1975.csv', comment='#')

df_1975.head()

Unnamed: 0,band,species,"Beak length, mm","Beak depth, mm"
0,2,fortis,9.4,8.0
1,9,fortis,9.2,8.3
2,12,fortis,9.5,7.5
3,15,fortis,9.5,8.0
4,305,fortis,11.5,9.9


In [212]:
df_1987 = pd.read_csv('data/grant_1987.csv', comment='#')

df_1987.head()

Unnamed: 0,band,species,"Beak length, mm","Beak depth, mm"
0,14613,fortis,9.1,7.0
1,15487,fortis,9.14,7.12
2,15187,fortis,9.24,7.21
3,15284,fortis,9.2,7.3
4,14983,fortis,8.83,7.32


In [213]:
df_1991 = pd.read_csv('data/grant_1991.csv', comment='#')

df_1991.head()

Unnamed: 0,band,species,blength,bdepth
0,2639,fortis,10.3,8.95
1,2666,fortis,12.81,9.3
2,2753,fortis,10.89,10.35
3,2776,fortis,11.3,10.0
4,4229,fortis,10.05,8.62


In [214]:
df_2012 = pd.read_csv('data/grant_2012.csv', comment='#')

df_2012.head()

Unnamed: 0,band,species,blength,bdepth
0,19022,fortis,10.0,8.5
1,19028,fortis,12.5,8.9
2,19032,fortis,9.3,7.5
3,19041,fortis,10.3,9.6
4,19044,fortis,11.0,9.2


b) We would like to merge these all into one data frame. The problem is that they have different header names, and only the 1973 file has a year entry (called `yearband`). This is common with real data. It is often a bit messy and requires some wrangling.

1. First, change the name of the `yearband` column of the 1973 data to year. Also, make sure the year format is four digits, not two!

2. Next, add a `year` column to the other four data frames. You want tidy data, so each row in the data frame should have an entry for the year.

3. Change the column names so that all the data frames have the same column names. I would choose column names

`['band', 'species', 'beak length (mm)', 'beak depth (mm)', 'year']`

4. Concatenate the data frames into a single data frame. Be careful with indices! If you use `pd.concat()`, you will need to use the `ignore_index=True` kwarg. You might also need to use the `axis` kwarg.

In [215]:
years = (1973, 1975, 1987, 1991, 2012)
dfs = (df_1973, df_1975, df_1987, df_1991, df_2012)

# 1

df_dict = dict(zip(years, dfs))

df_dict[1973] = df_dict[1973].rename(columns={'yearband':'year'})
df_dict[1973]['year'] += 1900

df_dict[1973].head()

Unnamed: 0,band,species,year,beak length,beak depth
0,20123,fortis,1973,9.25,8.05
1,20126,fortis,1973,11.35,10.45
2,20128,fortis,1973,10.15,9.55
3,20129,fortis,1973,9.95,8.75
4,20133,fortis,1973,11.55,10.15


In [216]:
# 2

for year in df_dict:
    if year != 1973:
        df_dict[year]['year'] = year
    
df_dict[1991].head()

Unnamed: 0,band,species,blength,bdepth,year
0,2639,fortis,10.3,8.95,1991
1,2666,fortis,12.81,9.3,1991
2,2753,fortis,10.89,10.35,1991
3,2776,fortis,11.3,10.0,1991
4,4229,fortis,10.05,8.62,1991


In [218]:
# 3

for year in df_dict:
    df_dict[year] = df_dict[year].rename(columns={'beak length': 'beak length (mm)',
                                                  'Beak length, mm': 'beak length (mm)',
                                                  'blength': 'beak length (mm)', 
                                                  'beak depth': 'beak depth (mm)',
                                                  'Beak depth, mm': 'beak depth (mm)',
                                                  'bdepth': 'beak depth (mm)'
                                                 }
                                        )
    
df_dict[1991].head()

Unnamed: 0,band,species,beak length (mm),beak depth (mm),year
0,2639,fortis,10.3,8.95,1991
1,2666,fortis,12.81,9.3,1991
2,2753,fortis,10.89,10.35,1991
3,2776,fortis,11.3,10.0,1991
4,4229,fortis,10.05,8.62,1991


In [219]:
df_all = pd.concat(df_dict,
                   ignore_index=True
                  )

df_all.head(10)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  


Unnamed: 0,band,beak depth (mm),beak length (mm),species,year
0,20123,8.05,9.25,fortis,1973
1,20126,10.45,11.35,fortis,1973
2,20128,9.55,10.15,fortis,1973
3,20129,8.75,9.95,fortis,1973
4,20133,10.15,11.55,fortis,1973
5,20136,9.85,11.15,fortis,1973
6,20138,8.85,10.05,fortis,1973
7,20142,10.15,11.25,fortis,1973
8,20143,8.15,9.15,fortis,1973
9,20146,8.55,9.25,fortis,1973


In [220]:
df_all.tail(10)

Unnamed: 0,band,beak depth (mm),beak length (mm),species,year
2294,21288,10.0,14.1,scandens,2012
2295,21289,9.0,12.9,scandens,2012
2296,21290,8.7,13.9,scandens,2012
2297,21291,8.8,14.3,scandens,2012
2298,21292,8.4,13.2,scandens,2012
2299,21295,9.3,14.2,scandens,2012
2300,21297,9.8,13.0,scandens,2012
2301,21340,8.9,14.6,scandens,2012
2302,21342,9.8,13.1,scandens,2012
2303,21347,9.1,15.2,scandens,2012


c) The `band` field gives the number of the band on the bird’s leg that was used to tag it. Are some birds counted twice? Are they counted twice in the same year? Do you think you should drop duplicate birds from the same year? How about different years? My opinion is that you should drop duplicate birds from the same year and keep the others, but I would be open to discussion on that. To practice your Pandas skills, though, **let’s delete only duplicate birds from the same year from the data frame**. When you have made this data frame, save it as a CSV file.

Hint: The data frame methods `duplicated()` and `drop_duplicates()` will be useful.

After doing this work, it is worth saving your tidy data frame in a CSV document. do this using the `to_csv()` method of your data frame. Since the indices are uninformative, you should use the `index=False` kwarg.

In [253]:
df_all = df_all.drop_duplicates(['band', 'year'])

In [238]:
df_all.to_csv('data/grant_wrangled.csv', index=False)

d) Make a plots exploring how beak depth changes over time for each species. Think about what might be effective ways to display the data.

In [309]:
# ECDFs

p1 = bokeh_catplot.ecdf(
    data=df_all.loc[df_all['species'] == 'fortis'],
    cats=['year'],
    val='beak depth (mm)',
    style='staircase',
    title='fortis',
)

p2 = bokeh_catplot.ecdf(
    data=df_all.loc[df_all['species'] == 'scandens'],
    cats=['year'],
    val='beak depth (mm)',
    style='staircase',
    title='scandens',
)

bokeh.io.show(bokeh.layouts.column(p1, p2))

# Note from class: it would be nice to use a quantitative colorscheme for the years, since years are quantities

In [323]:
# Strip-box plots

p1 = bokeh_catplot.strip(
    data=df_all,
    cats=['species', 'year'],
    val='beak depth (mm)',
    horizontal=True,
    jitter=True,
    frame_height=250,
    title='fortis',
)

p1 = bokeh_catplot.box(
    data=df_all,
    cats=['species', 'year'],
    val='beak depth (mm)',
    horizontal=True,
    whisker_caps=True,
    display_points=False,
    box_kwargs=dict(fill_color=None, line_color='gray'),
    median_kwargs=dict(line_color='gray'),
    whisker_kwargs=dict(line_color='gray'),
    p=p1,
)

bokeh.io.show(p1)


In [306]:
# Strip plots over time

scatter(df_all,
        'species',
        'year',
        'beak depth (mm)',
        legend_location='top_right',
)

e) It is informative to plot the measurement of each bird’s beak as a point in the beak depth-beak length plane. For the 1987 data, plot beak depth vs. beak width for *Geospiza fortis* and for *Geospiza scandens*. The function you wrote in Exercise 6.2 will be useful to do this.

In [258]:
scatter(df_all.loc[df_all['year'] == 1987],
        'species', 'beak depth (mm)',
        'beak length (mm)',
        legend_location='bottom_right')

f) Do part (d) again for all years. Hint: To display all of the plots, check out the Bokeh documentation for layouts. In your plots, make sure all plots have the same range on the axes. If you want to set two plots, say `p1` and `p2` to have the same axis ranges, you can do the following.

In [311]:
x = 'beak depth (mm)'
y = 'beak length (mm)'

plots_list = []

colors = bokeh.palettes.Colorblind[8]
    
for year in df_dict:
    p = bokeh.plotting.figure(
        frame_width=400,
        frame_height=300,
        x_axis_label=x,
        y_axis_label=y,
        title = str(year)
    )

    for i, val in enumerate(np.unique(df_all['species'])):
        p.circle(
            source=df_all.loc[(df_all['species'] == val) & (df_all['year'] == year)],
            x=x,
            y=y,
            legend_label=str(val),
            color=colors[i],
        )
    
    p.legend.title = 'species'
    p.legend.location = 'bottom_right'
    p.legend.click_policy = 'hide'
    
    plots_list.append(p)
    
# Make all plots have the same range on the axes

for i, _ in enumerate(plots_list):
    if i > 0:
        plots_list[i].x_range = plots_list[0].x_range
        plots_list[i].y_range = plots_list[0].y_range
    
bokeh.io.show(bokeh.layouts.column(plots_list))

In [307]:
%load_ext watermark
%watermark -v -p numpy,pandas,bokeh,bokeh_catplot,jupyterlab

CPython 3.7.7
IPython 7.13.0

numpy 1.18.1
pandas 0.24.2
bokeh 2.0.2
bokeh_catplot 0.1.7
jupyterlab 1.2.6
