In [None]:
# This step-by-step exercise will give you a sense of how populations from different
# continents of origin can be distinguished by a small number of variants. Even though we
# share 99% of our genome, there are sufficient population differences to allow a simple PCA
# to demonstrate the difference. However, the vast majority of these variants are likely not
# biologically or medically informative (see our optional reading materials in this regard).
# Note that this is just using variants from a single chromosome!

In [None]:
# Install Matrix only when it is missing, so that re-running the notebook does
# not download and reinstall the package every time.
if (!requireNamespace("Matrix", quietly = TRUE)) {
    install.packages("Matrix",repos="http://cran.us.r-project.org")
}
# Matrix provides sparseMatrix(), used below to hold the genotype data.
library(Matrix)

#### Let’s read the variant data for chromosome 20 into an R sparse matrix. Note that we only care about the variant number and sample (person) number in this exercise and ignore everything else.

In [None]:
# Shell pipeline that streams the chromosome 20 genotypes as two fields per line.
# The pieces below concatenate to exactly one command string.
p = pipe(paste0(
    # decompress the VCF to stdout
    "zcat ALL.chr20.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz ",
    # drop every header line (lines starting with '#')
    " | sed /^#/d ",
    # keep field 10 onwards
    " | cut  -f '10-'",
    # external helper program, not part of this notebook; presumably a
    # pre-compiled parser that emits integer fields -- verify
    " | ./a.out",
    # keep only the first two fields of the helper's output
    " | cut -f '1-2'"
))

In [None]:
# takes some time
# Read the two fields produced by the pipeline as integer columns; fill=TRUE
# pads short lines instead of failing, row.names=NULL forces plain row numbering.
x = read.table(
    file = p,
    colClasses = rep("integer", 2),
    fill = TRUE,
    row.names = NULL
)

In [None]:
# Peek at the first ten rows of the parsed table
x[seq_len(10), ]

In [None]:
# Build the sparse matrix: column 2 of x supplies the row index and column 1
# the column index; each listed (row, column) position is given the value 1.0.
chr20 = sparseMatrix(i = x[[2]], j = x[[1]], x = 1.0)

In [None]:
# Show a small window of the sparse matrix: rows 461-470, first ten columns
chr20[seq(461, 470), seq_len(10)]

In [None]:
# Install irlba only when it is missing, so that re-running the notebook does
# not download and reinstall the package every time.
if (!requireNamespace("irlba", quietly = TRUE)) {
    install.packages("irlba", repos="http://cran.us.r-project.org")
}

In [None]:
library("irlba")

In [None]:
# Install threejs only when it is missing, so that re-running the notebook does
# not download and reinstall the package every time.
if (!requireNamespace("threejs", quietly = TRUE)) {
    install.packages("threejs", repos="http://cran.us.r-project.org")
}

In [None]:
library("threejs")

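#### The next step computes the first three principal component vectors using the irlba package. The cell after it plots them pairwise as 2-D scatterplots with base R graphics; the threejs package loaded above can instead be used for an interactive 3-D scatterplot (for example `scatterplot3js(p$u)`).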

In [None]:
# takes some time
# irlba starts from a random vector, so fix the RNG seed to make the computed
# components (and the plots that follow) reproducible from run to run.
set.seed(1)
# Column means of the sparse matrix, handed to irlba as the centering vector
# so the decomposition is that of the column-centred data (i.e. a PCA).
cm = colMeans(chr20)
# Truncated SVD keeping 3 left (nu) and 3 right (nv) singular vectors;
# tol=0.1 is a loose convergence tolerance chosen to keep the run time down.
# Note: this rebinds `p`, which earlier held the pipe connection.
p = irlba(chr20, nv=3, nu=3, tol=0.1, center=cm)

In [None]:
# Pairwise scatterplots of the three left singular vectors: (1,2), (1,3), (2,3)
for (pair in list(c(1, 2), c(1, 3), c(2, 3))) {
    plot(x = p$u[, pair[1]], y = p$u[, pair[2]],
         xlab = paste("PC", pair[1]), ylab = paste("PC", pair[2]))
}

#### The data exhibit obvious groups, and those groups correspond to ethnicities. That can be illustrated by loading ancillary data from the 1000 genomes project that identifies the “superpopulation” of each sample.

In [None]:
# Read just the header of the chromosome file to obtain the sample identifiers.
# sed prints the #CHROM line and then quits, so the pipeline stops as soon as
# the header is found instead of decompressing the entire genotype file.
# tr puts each tab-separated header field on its own line, and tail drops the
# first nine fields so only the sample columns (10 onwards) remain.
header_con = pipe("zcat ALL.chr20.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz  | sed -n '/^#CHROM/{p;q;}' | tr '\t' '\n' | tail -n +10")
ids = readLines(header_con)
# readLines() closes but does not destroy a connection it opened itself;
# close() releases it so R does not warn about an unused connection later.
close(header_con)

In [None]:
# Download and parse the superpopulation data for each sample, order by ids
ped = local({
    ped_source = url("ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/working/20130606_sample_info/20130606_g1k.ped")
    # The second column of the file becomes the row names, so rows can be
    # looked up by sample identifier.
    ped_all = read.table(ped_source, sep="\t", header=TRUE, row.names=2)
    # Keep one row per entry of ids (in that order) and only column 6, as a
    # one-column data frame; later cells refer to that column as ped$Population.
    ped_all[ids, 6, drop=FALSE]
})

In [None]:
# Download the subpopulation and superpopulation codes
# WARNING: These links occasionally change. Beware!
pop = read.table("ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/phase3/20131219.populations.tsv",sep="\t",header=TRUE)
# The last rows of pop are summary data or non-relevant, so keep only the
# first 26 rows.
pop = pop[seq_len(26), ]
# Column 3 holds the values and column 2 supplies the names, giving a lookup
# vector from sub-population code to super-population; factor() then reduces
# the levels to the values actually present in those 26 rows.
super = factor(setNames(pop[, 3], pop[, 2]))

In [None]:
# Map sample sub-populations to super-populations
# `super` is named by sub-population code, so indexing it with each sample's
# code (as a character string) looks up that sample's super-population.
ped[["Superpopulation"]] = local({
    subpop_codes = as.character(ped$Population)
    super[subpop_codes]
})

In [None]:
# Number of super-populations and one rainbow colour per factor level,
# computed once and shared by all three plots.
N = nlevels(super)
super_cols = rainbow(N)
# Indexing the palette by the factor uses its integer level codes, so every
# sample gets the colour of its super-population.
sample_cols = super_cols[ped$Superpopulation]
# Pairwise scatterplots of the three components: (1,2), (1,3), (2,3)
for (pair in list(c(1, 2), c(1, 3), c(2, 3))) {
    plot(x = p$u[, pair[1]], y = p$u[, pair[2]], col = sample_cols,
         xlab = paste("PC", pair[1]), ylab = paste("PC", pair[2]))
    # Without a legend the colours cannot be read as super-populations.
    legend("topright", legend = levels(super), col = super_cols, pch = 1,
           title = "Superpopulation", bty = "n")
}

In [None]:
# Record when the notebook was run, plus the R version and the packages in use,
# so the results can be traced back to a specific environment.
Sys.time()
sessionInfo()