In [2]:
import sys, os, re
import numpy as np
import pandas as pd
import allel
import zarr
import numcodecs
import warnings
from pathlib import Path

%matplotlib inline
from IPython.display import set_matplotlib_formats
set_matplotlib_formats('retina', 'png')
import matplotlib
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
import seaborn as sns
sns.set()
sns.set_style("white")
sns.set_context("notebook")

## Read in metadata

In [None]:
# Chromosome names are the first whitespace-separated field of each line in the
# chromosome sizes file.  The context manager closes the file handle, and blank
# lines are skipped so that they cannot raise IndexError on split()[0].
with open('../metadata/panu3_chrom_sizes.txt') as chrom_sizes_file:
    chromosomes = [line.split()[0] for line in chrom_sizes_file if line.strip()]
chromosomes[:5]

In [3]:
# Load the sample metadata spreadsheet and preview its first three rows.
# NOTE(review): the preview shows two leftover index columns ('Unnamed: 0.1'
# and 'Unnamed: 0') — confirm whether they are needed downstream.
meta_data = pd.read_excel(
    io='../metadata/Papio-Genomes_JR_120720_MR-CR-KM_geoloc.xlsx',
)
meta_data.head(n=3)

Unnamed: 0.1,Unnamed: 0,PGDP_ID,Provider_ID,Provider,Genus,Species,Origin,Sex,address,longitude,latitude
0,0,PD_0067,1043,Roos,Theropithecus,gelada,captive,M,"SDSU Captive Wildlife Research Facility, Brook...",-96.79328,44.334031
1,1,PD_0199,09SNF1101115,Knauf/Chuma/Roos,Papio,anubis,"Serengeti, Tanzania",F,"Serengeti, Mara, Lake Zone, Tanzania",34.742544,-1.996626
2,2,PD_0200,11SNF1101115,Knauf/Chuma/Roos,Papio,anubis,"Serengeti, Tanzania",F,"Serengeti, Mara, Lake Zone, Tanzania",34.742544,-1.996626


Get baboon sample IDs:

In [33]:
# Keep only the PGDP IDs that start with 'PD', so that the SciAdvPaper samples
# are left out.  The vectorised string test with na=False treats a missing ID
# as a non-match instead of failing on a value that has no .startswith().
baboon_samples = meta_data.PGDP_ID[
    meta_data.PGDP_ID.str.startswith('PD', na=False)
].tolist()
baboon_samples[:3]

## Building the Zarr data structures

One-off generation of the persistent Zarr data structure:

If you had a single VCF file with *all* data and all chromosomes you would need to load it one chromosome at a time using the `region` keyword argument:

## Open Zarr data

In [19]:
# Open the callability zarr group read-only.
callability_masks = zarr.open_group(
    store='../steps/callability.zarr',
    mode='r',
)
callability_masks

<zarr.hierarchy.Group '/' read-only>

In [14]:
# Open the callset zarr group and show a collapsed view of its hierarchy.
# NOTE(review): the group is opened read/write ('r+'); none of the cells shown
# here write to it — confirm whether read-only ('r') would be sufficient.
callset = zarr.open_group(
    store='../steps/callset.zarr',
    mode='r+',
)
callset.tree(expand=False)

Tree(nodes=(Node(disabled=True, name='/', nodes=(Node(disabled=True, name='chr1', nodes=(Node(disabled=True, n…

In [32]:
# List the contents of the current working directory (shell escape).
!ls 

baboon_ranges.ipynb  callability.zarr	 Untitled.ipynb
callability.ipynb    scikit-allel.ipynb  vcf_files.txt


## Playing with scikit-allel

In [40]:
# Handle to the chr1 genotype calls stored in the callset, followed by the
# array's storage summary.
gt_zarr = callset['chr1']['calldata']['GT']
gt_zarr.info

0,1
Name,/chr1/calldata/GT
Type,zarr.core.Array
Data type,int8
Shape,"(8435583, 169, 2)"
Chunk shape,"(65536, 64, 2)"
Order,C
Read-only,False
Compressor,"Blosc(cname='lz4', clevel=5, shuffle=SHUFFLE, blocksize=0)"
Store type,zarr.storage.DirectoryStore
No. bytes,2851227054 (2.7G)


In [22]:
# chr1 variant positions wrapped in a SortedIndex.
pos = allel.SortedIndex(
    callset['chr1']['variants']['POS'],
)
pos

0,1,2,3,4,...,3560094,3560095,3560096,3560097,3560098
5358,5362,5388,5447,5542,...,223615954,223615964,223616040,223616053,223616073


In [23]:
# Index range of the variants whose position lies between 20,000,000 and
# 20,100,000.
loc_region = pos.locate_range(start=20_000_000, stop=20_100_000)
loc_region

slice(375626, 377378, None)

In [24]:
# Read only the loc_region slice of the zarr genotype array and wrap it as a
# GenotypeArray.
gt_region = allel.GenotypeArray(gt_zarr[loc_region])
gt_region

Unnamed: 0,0,1,Unnamed: 3
0,1/1,1/1,
1,1/1,./.,
2,1/1,1/1,
...,...,...,...
1749,1/1,1/1,
1750,1/1,1/1,
1751,1/1,./.,


In [29]:
# Boolean mask over chr1 variants: True where the stored 'numalt' value exceeds 1
# (presumably the number of alternate alleles — confirm against the callset).
multi_allelic = np.greater(callset['chr1/variants/numalt'][:], 1)
multi_allelic

array([False, False, False, ..., False, False, False])

In [30]:
# Select every variant that is not flagged as multi-allelic.  An additional
# filter, `& (afr_af[:, 0] > 0.05)`, was sketched here but is disabled.
loc_variant_selection = np.logical_not(multi_allelic)
loc_variant_selection

array([ True,  True,  True, ...,  True,  True,  True])

In [31]:
# Number of variants kept by the boolean selection mask.
int(loc_variant_selection.sum())


3556332

In [32]:
# Wrap the complete chr1 genotype array (no slicing) as a GenotypeArray.
# NOTE(review): this presumably reads the whole zarr array into memory —
# confirm it fits before running on the full callset.
gt = allel.GenotypeArray(gt_zarr)
gt

Unnamed: 0,0,1,Unnamed: 3
0,0/1,./.,
1,1/1,./.,
2,0/1,./.,
...,...,...,...
3560096,1/1,./.,
3560097,1/1,./.,
3560098,1/1,./.,


In [33]:
# Keep only the rows (axis 0) of gt where loc_variant_selection is True.
gt_variant_selection = gt.compress(loc_variant_selection, axis=0)
gt_variant_selection

Unnamed: 0,0,1,Unnamed: 3
0,0/1,./.,
1,1/1,./.,
2,0/1,./.,
...,...,...,...
3556329,1/1,./.,
3556330,1/1,./.,
3556331,1/1,./.,


In [34]:
# Wrap the same zarr array as a GenotypeDaskArray (presumably evaluated lazily,
# given the explicit .compute() calls where it is used — confirm).
gt_dask = allel.GenotypeDaskArray(gt_zarr)
gt_dask

Unnamed: 0,0,1,Unnamed: 3
0,0/1,./.,
1,1/1,./.,
2,0/1,./.,
...,...,...,...
3560096,1/1,./.,
3560097,1/1,./.,
3560098,1/1,./.,


In [35]:
# The same axis-0 selection, this time expressed on the dask-backed array and
# evaluated with .compute().  Note that this reassigns gt_variant_selection.
gt_variant_selection = gt_dask.compress(loc_variant_selection, axis=0).compute()
gt_variant_selection

Unnamed: 0,0,1,Unnamed: 3
0,0/1,./.,
1,1/1,./.,
2,0/1,./.,
...,...,...,...
3556329,1/1,./.,
3556330,1/1,./.,
3556331,1/1,./.,


In [36]:
# Sample IDs stored alongside the chr1 data, read out of the callset in full.
samples = callset['chr1']['samples'][:]
samples

array(['PD_0216', 'PD_0219'], dtype=object)

In [49]:
# Small hand-written panel assigning each sample to a population label.
# NOTE(review): 'AFR'/'EUR' look like placeholder labels — confirm.
panel = pd.DataFrame({
    'sample': ['PD_0219', 'PD_0216'],
    'super_pop': ['AFR', 'EUR'],
})

In [50]:
# Record, for every panel row, the position of its sample within the callset's
# sample array, so that panel rows can later be used to pick genotype columns.
samples_list = list(samples)
samples_callset_index = list(map(samples_list.index, panel['sample']))
panel['callset_index'] = samples_callset_index
panel.head()

Unnamed: 0,sample,super_pop,callset_index
0,PD_0219,AFR,1
1,PD_0216,EUR,0


In [51]:
# Callset positions of the panel samples labelled 'AFR'.
loc_samples_afr = panel.loc[panel['super_pop'] == 'AFR', 'callset_index'].values
loc_samples_afr

array([1])

In [53]:
# Callset positions of the panel samples labelled 'AFR'.
# NOTE(review): this cell repeats the In [51] computation unchanged — it is a
# candidate for removal.
loc_samples_afr = panel[panel.super_pop == 'AFR'].callset_index.values
loc_samples_afr

array([1])

In [54]:
# Pick the columns (axis 1) listed in loc_samples_afr from the variant-filtered
# genotypes.
gt_afr = gt_variant_selection.take(loc_samples_afr, axis=1)
gt_afr

Unnamed: 0,0,Unnamed: 2
0,./.,
1,./.,
2,./.,
...,...,...
3556329,./.,
3556330,./.,
3556331,./.,


In [55]:
# One-step alternative on the dask-backed array: pass the variant selection and
# the sample selection to subset() together, then evaluate with .compute().
# Note that this reassigns gt_afr.
gt_afr = gt_dask.subset(loc_variant_selection, loc_samples_afr).compute()
gt_afr

Unnamed: 0,0,Unnamed: 2
0,./.,
1,./.,
2,./.,
...,...,...
3556329,./.,
3556330,./.,
3556331,./.,
