# Obtain SNP Data

## Download and install bgenix
### see https://enkre.net/cgi-bin/code/bgen/doc/trunk/doc/wiki/bgenix.md

In [1]:
# Download the bgen release tarball, build it with waf, run the unit tests
# and smoke-test bgenix on the bundled example file.
cd /opt/notebooks
wget http://code.enkre.net/bgen/tarball/release/bgen.tgz
# NOTE: the archive unpacks into a directory that is itself named bgen.tgz
# (the build output reports /opt/notebooks/bgen.tgz as the top directory)
tar xvfz bgen.tgz > /dev/null
cd bgen.tgz/
./waf configure 
./waf 
./build/test/unit/test_bgen
# bgenix options start with a plain ASCII hyphen; the en dash that was used
# here before is not an option prefix, so -list was never requested
./build/apps/bgenix -g example/example.16bits.bgen -list
cd /opt/notebooks

--2022-09-14 12:32:14--  http://code.enkre.net/bgen/tarball/release/bgen.tgz
Resolving code.enkre.net (code.enkre.net)... 91.197.228.37
Connecting to code.enkre.net (code.enkre.net)|91.197.228.37|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://enkre.net/cgi-bin/code/bgen/tarball/release/bgen.tgz [following]
--2022-09-14 12:32:14--  https://enkre.net/cgi-bin/code/bgen/tarball/release/bgen.tgz
Resolving enkre.net (enkre.net)... 91.197.228.37
Connecting to enkre.net (enkre.net)|91.197.228.37|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 25476745 (24M) [application/x-compressed]
Saving to: ‘bgen.tgz’


2022-09-14 12:32:30 (64.8 MB/s) - ‘bgen.tgz’ saved [25476745/25476745]

Setting top to                           : /opt/notebooks/bgen.tgz 
Setting out to                           : /opt/notebooks/bgen.tgz/build 
Checking for 'gcc' (C compiler)          : /usr/bin/gcc 
Checking for 'g++' (C++ compiler)        : /usr/bin/g++ 


## Download and install PLINK2
### see https://www.cog-genomics.org/plink/2.0/
### WARNING: the download link may have to be updated

In [2]:

# Download the PLINK2 binary into /opt/notebooks and confirm that it runs.
# The release file name is defined once, so updating the download link is a
# single edit.
PLINK2_ZIP=plink2_linux_avx2_20220814.zip

cd /opt/notebooks
# -O overwrites an earlier download instead of saving a ".1" copy on re-run;
# the && chain keeps unzip and the version check from running on a failed
# download.
wget -O "${PLINK2_ZIP}" "https://s3.amazonaws.com/plink2-assets/alpha3/${PLINK2_ZIP}" \
  && unzip -o "${PLINK2_ZIP}" \
  && ./plink2 --version

--2022-09-14 12:32:56--  https://s3.amazonaws.com/plink2-assets/alpha3/plink2_linux_avx2_20220814.zip
Resolving s3.amazonaws.com (s3.amazonaws.com)... 52.216.111.61
Connecting to s3.amazonaws.com (s3.amazonaws.com)|52.216.111.61|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 9134949 (8.7M) [application/zip]
Saving to: ‘plink2_linux_avx2_20220814.zip’


2022-09-14 12:32:57 (14.1 MB/s) - ‘plink2_linux_avx2_20220814.zip’ saved [9134949/9134949]

Archive:  plink2_linux_avx2_20220814.zip
  inflating: plink2                  
PLINK v2.00a3.6LM AVX2 Intel (14 Aug 2022)


## Create symlinks to the genotype directories (to avoid problems in Jupyter caused by blanks in the directory names)

In [3]:
# Create blank-free symlinks to the imputed and the genotyped data directories.
# DIR holds a glob pattern (presumably each '*' stands in for a blank in the
# real directory name); it is deliberately left unquoted in the ln call so
# that the shell expands it to the actual path.
# -n makes ln replace an existing link on re-run instead of following it and
# trying to create the new link inside the target directory.
DIR=/mnt/project/Bulk/Imputation/UKB*imputation*from*genotype/
# shellcheck disable=SC2086  # glob expansion is intended
ln -sfn $DIR /opt/notebooks/impu

DIR=/mnt/project/Bulk/Genotype*Results/Genotype*calls/
# shellcheck disable=SC2086  # glob expansion is intended
ln -sfn $DIR /opt/notebooks/geno

# define a function to extract genotyped SNPs [not really needed, just to keep the syntax, we use imputed SNPs below]
#######################################
# Extract a single genotyped SNP with ./plink2 and show the first lines of
# the exported table.
# Arguments:
#   $1 - SNP identifier passed to --snp (default: rs174547)
#   $2 - chromosome number, selects geno/ukb22418_c<CHR>_b0_v2 (default: 11)
# Outputs:
#   a progress line, the PLINK output and the first 5 lines of <SNP>.raw on
#   stdout; PLINK writes its files with the prefix <SNP> in the current
#   directory
# Returns:
#   the PLINK exit status if PLINK fails, otherwise the status of head
#######################################
extract_SNP() {
  # local, so that the session-wide CHR used by the chromosome loop and any
  # global SNP are not overwritten by a call
  local snp=${1-rs174547}
  local chr=${2-11}
  echo "extracting SNP $snp on chromosome $chr"
  # stop here on failure: there is no fresh <SNP>.raw to preview
  ./plink2 --bfile "geno/ukb22418_c${chr}_b0_v2" --snp "$snp" --export A 'include-alt' --out "$snp" || return
  head -5 "${snp}.raw"
}
## test the function on a single SNP (rs780094 on chromosome 2)
extract_SNP rs780094 2


extracting SNP rs780094 on chromosome 2
PLINK v2.00a3.6LM AVX2 Intel (14 Aug 2022)     www.cog-genomics.org/plink/2.0/
(C) 2005-2022 Shaun Purcell, Christopher Chang   GNU General Public License v3
Logging to rs780094.log.
Options in effect:
  --bfile geno/ukb22418_c2_b0_v2
  --export A include-alt
  --out rs780094
  --snp rs780094

Start time: Wed Sep 14 12:33:00 2022
15544 MiB RAM detected; reserving 7772 MiB for main workspace.
Using up to 8 compute threads.
488377 samples (264746 females, 223430 males, 201 ambiguous; 488377 founders)
loaded from geno/ukb22418_c2_b0_v2.fam.
61966 variants loaded from geno/ukb22418_c2_b0_v2.bim.
1 categorical phenotype loaded (488377 values).
--snp: 1 variant remaining.
1 variant remaining after main filters.
--export A pass 1/1: loading... 0writing... 101011111212131314141515161617171818191920202121222223232424252526262727282829293030313132323333343435353636373738383939404041414242434344444545464647474848494950505151525253535454555556565757585859596

In [4]:
# Fetch the list of SNPs to retrieve from the main directory of the project.
# Every entry has to be given as CHR:POS-POS (note that chr 1 needs to be
# written with a leading zero).
dx download -f SNPlist_REGION.tsv
cp SNPlist_REGION.tsv mypos.txt

# preview the first and the last three entries, then count all entries
head -n 3 mypos.txt
printf '%s\n' ...
tail -n 3 mypos.txt
wc -l mypos.txt

01:914852-914852
01:2147162-2147162
01:2326009-2326009
...
22:50746706-50746706
22:50840573-50840573
22:50858813-50858813
1835 mypos.txt


In [5]:
# extract SNPs by position
#
# For every chromosome from CFROM to CTO: pull the ranges listed in mypos.txt
# out of the imputed BGEN file with bgenix, then convert that subset with
# PLINK2 (text export plus --make-bed). Output files are named mypos.<CHR>.*

# genotype data is stored by chromosome, so we loop over these files
CFROM=1
CTO=22

export CHR
for ((CHR = CFROM; CHR <= CTO; CHR++)); do

  echo "working on CHR $CHR"

  # extract from one BGEN file; on failure drop the partial output and skip
  # the chromosome so PLINK is never run on a truncated or empty BGEN file
  if ! /opt/notebooks/bgen.tgz/build/apps/bgenix -g "impu/ukb22828_c${CHR}_b0_v3.bgen" -incl-range mypos.txt > "mypos.${CHR}.bgen"; then
    echo "bgenix failed for CHR $CHR, skipping" >&2
    rm -f -- "mypos.${CHR}.bgen"
    continue
  fi

  # convert to text using PLINK
  if ! ./plink2 --bgen "mypos.${CHR}.bgen" ref-first --sample "impu/ukb22828_c${CHR}_b0_v3.sample" --export A 'include-alt' --out "mypos.${CHR}" --make-bed; then
    echo "plink2 failed for CHR $CHR, skipping" >&2
    continue
  fi
  head -5 "mypos.${CHR}.raw"
  wc -l "mypos.${CHR}.raw"

  # the dot after the chromosome number keeps e.g. CHR 1 from also listing
  # the files of chromosomes 10-19
  ls -ltr "mypos.${CHR}".*

done

working on CHR 1

Welcome to bgenix
(version: 1.1.7, revision )

(C) 2009-2017 University of Oxford

Building query                                              :  (154/?,14.3s,10.8/s)
Processing 154 variants                                     : [******************************] (154/154,39.6s,3.9/s)
bgenix: wrote data for 154 variants to stdout.

Thank you for using bgenix.
PLINK v2.00a3.6LM AVX2 Intel (14 Aug 2022)     www.cog-genomics.org/plink/2.0/
(C) 2005-2022 Shaun Purcell, Christopher Chang   GNU General Public License v3
Logging to mypos.1.log.
Options in effect:
  --bgen mypos.1.bgen ref-first
  --export A include-alt
  --make-bed
  --out mypos.1
  --sample impu/ukb22828_c1_b0_v3.sample

Start time: Wed Sep 14 12:43:38 2022
15544 MiB RAM detected; reserving 7772 MiB for main workspace.
Using up to 8 compute threads.
--bgen: 154 variants detected, format v1.2.
487409 samples imported from .sample file to mypos.1-temporary.psam .
--bgen: mypos.1-temporary.pgen + mypos.1-tempora

In [None]:
# bundle every mypos.* file into one gzip-compressed archive
tar -czvf SNPdata.tgz mypos.*

In [None]:
# upload the SNP archive with the dx command-line client
dx upload SNPdata.tgz