In [1]:
import numpy as np

In [2]:
from bokeh.plotting import figure, show, ColumnDataSource, output_file
from bokeh.layouts import gridplot

In [3]:
# Load the PCA coordinates from a saved NumPy array. Judging by the file
# name this covers 3L:10,000,000-30,000,000 (TODO confirm). The path is
# relative, so the file must sit in the notebook's working directory.
coordinates = np.load("3L_10000000_30000000.coords.npy")
# Show the array shape as the cell output.
coordinates.shape

(765, 10)

In [4]:
# Load the per-component values that accompany the coordinates. They are
# rendered as percentages in the axis labels further down (presumably the
# proportion of variance explained by each PC — TODO confirm).
pve = np.load("3L_10000000_30000000.pve.npy")
# Echo the values as the cell output.
pve

array([ 0.01200335,  0.01070692,  0.00852672,  0.00606027,  0.00494645,
        0.00357033,  0.00341437,  0.00317344,  0.00313177,  0.00299404])

In [5]:
import pandas as pd

In [6]:
# Sample metadata: read the tab-delimited table using its first column as the
# index, then keep only the columns needed for plotting and hover tooltips.
# NOTE(review): this is an absolute, machine-specific path.
df = pd.read_table(
    "/kwiat/vector/ag1000g/release/phase1.AR3/samples/samples.meta.txt",
    index_col=0,
).loc[:, ['ox_code', 'population', 'country', 'year']]

In [7]:
# Preview the first rows of the sample metadata.
df.head()

Unnamed: 0_level_0,ox_code,population,country,year
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,AB0085-C,BFS,Burkina Faso,2012
1,AB0087-C,BFM,Burkina Faso,2012
2,AB0088-C,BFM,Burkina Faso,2012
3,AB0089-C,BFM,Burkina Faso,2012
4,AB0090-C,BFM,Burkina Faso,2012


In [8]:
# Fill colour (hex RGB) for each population code. The insertion order is
# significant: the keys are later passed as the list of populations to draw,
# so it determines the order in which glyphs are added to each figure.
pop_colours = dict(
    BFM='#FF0000',  # red
    GAS='#008000',  # green
    GNS='#00FFFF',  # cyan
    UGS='#90EE90',  # light green
    GWA='#FFA500',  # orange
    AOM='#8B0000',  # dark red
    BFS='#1E90FF',  # dodger blue
    KES='#808080',  # grey
    CMS='#0000FF',  # blue
)

In [9]:
import matplotlib as mpl
def plot_pca_coords(coords, pve, pc1, pc2, fig, populations,
                    metadata=None, colours=None):
    """Scatter two principal components on a Bokeh figure, coloured by population.

    One set of circle glyphs is added per entry of ``populations``, each backed
    by its own ``ColumnDataSource`` holding the metadata rows of that
    population plus ``x``/``y`` columns, so hover tooltips can reference the
    metadata columns.

    Parameters
    ----------
    coords : 2-D numpy array
        PCA coordinates. Column ``pc1`` is plotted on the x axis and column
        ``pc2`` on the y axis. Rows must line up one-to-one, in order, with
        the rows of ``metadata``.
    pve : sequence of float
        Per-component values; ``100 * pve[pc]`` is shown as a percentage in
        the axis labels.
    pc1, pc2 : int
        Zero-based component indices (labels display them one-based).
    fig : bokeh figure
        Figure to draw on; it is modified in place.
    populations : iterable of str
        Population codes to draw, in drawing order. Each must be a key of
        ``colours``.
    metadata : pandas.DataFrame, optional
        Per-sample metadata with a ``population`` column. Defaults to the
        notebook-level ``df``.
    colours : mapping of str to colour, optional
        Fill colour per population code. Defaults to the notebook-level
        ``pop_colours``.

    Returns
    -------
    The same ``fig`` that was passed in.

    Raises
    ------
    KeyError
        If a population has no entry in ``colours``.
    ValueError
        If a colour cannot be parsed by matplotlib, or (raised by pandas) if
        ``coords`` and ``metadata`` have different numbers of rows.
    """
    # Fall back to the notebook globals so existing six-argument calls behave
    # as before; the lookup happens at call time, not at definition time.
    if metadata is None:
        metadata = df
    if colours is None:
        colours = pop_colours

    # Work on a copy so the caller's metadata frame is never mutated.
    plot_df = metadata.assign(x=coords[:, pc1], y=coords[:, pc2])

    for pop in populations:
        # Normalise the colour spec to a '#rrggbb' string. A single to_hex call
        # suffices: in matplotlib >= 2.0 rgb2hex is the same function, so a
        # retry after ValueError could never succeed.
        fill = mpl.colors.to_hex(colours[pop])

        source = ColumnDataSource(
            data=plot_df.query("population == @pop"))

        fig.circle(
            'x', 'y',
            source=source,
            line_color='black',
            line_width=0.5,
            size=6,
            fill_color=fill)

    label = 'PC {0} ({1:.2f}%)'
    fig.xaxis.axis_label = label.format(pc1 + 1, 100 * pve[pc1])
    fig.yaxis.axis_label = label.format(pc2 + 1, 100 * pve[pc2])

    return fig

In [10]:
# Zero-based component index pairs arranged as a 2x2 layout:
# [[[0, 1], [2, 3]], [[4, 5], [6, 7]]] -> one (x, y) pair per grid cell.
components = np.arange(8).reshape(2, 2, 2).tolist()

In [11]:
# Hover tooltip rows as (label, value template) pairs. In Bokeh's tooltip
# syntax "@name" is filled from the data source column of that name, while
# "$x" / "$y" are the cursor position in data coordinates.
TOOLTIPS = [
    ("ox_code", "@ox_code"),
    ("population", "@population"),
    ("collection year", "@year"),
    ("(x, y)", "($x, $y)"),
]

In [12]:
# Send Bokeh output to a standalone HTML file.
output_file("pca_plots.html")

# Build one 400x400 figure per (c1, c2) pair, keeping the nested row/column
# layout of `components`. Every figure draws all populations, in the key
# order of `pop_colours`.
grid = [
    [
        plot_pca_coords(
            coordinates, pve, c1, c2,
            figure(plot_width=400, plot_height=400, tooltips=TOOLTIPS),
            pop_colours.keys())
        for (c1, c2) in row
    ]
    for row in components
]

# Arrange the figures in a grid and render it.
col = gridplot(grid)
show(col)

In [13]:
from IPython.display import HTML

In [14]:
# Embed the HTML file named in the earlier output_file() call in the cell output.
HTML(filename="pca_plots.html")