# Task 4 Association analysis

The association analysis tries to identify genetic variants (SNPs) that can be associated with a trait.

To complete this task you need the genotype dosage files 'chr<i>.dose.rsq.0.3.DS.vcf.gz' (one per chromosome, i = 1..22), the fam file 'chr22.dose.for.assoc.fam' updated with phenotype and sex, and the covariates file 'covar_mds.txt' from Task 2.2 (Stratification Analysis).

In [1]:
%load_ext rpy2.ipython

In [2]:
import os

# Make sure the output directory for this task exists before any cell writes to it
path = "/mnt/data/GWAS/output/build37/task4_assoc"
needs_creation = not os.path.exists(path)
if needs_creation:
    # makedirs also creates any missing parent directories
    os.makedirs(path)

In [3]:
# Set an environment variable to hold the path to the output directory
# It is recommended to send the output to the data volume (so that you don't fill up the home directory). You will be able to access it from your host machine.
%env path= /mnt/data/GWAS/output/build37/task4_assoc
%env task3path= /mnt/data/GWAS/output/build37/task3_imputation/imputed_files

env: path=/mnt/data/GWAS/output/build37/task4_assoc
env: task3path=/mnt/data/GWAS/output/build37/task3_imputation/imputed_files


## Association analysis

In [4]:
%%bash
# Perform the association analysis with PLINK (Purcell et al. 2007).
# For every autosome (1-22) the imputed genotype dosages are tested for
# association with the AD case-control status stored in the .fam file, using a
# regression model with the covariates from covar_mds.txt (the MDS dimensions).
# NOTE(review): the model is described as adjusted by sex, but PLINK reports
# only 10 covariates loaded from covar_mds.txt and the --dosage call carries no
# sex modifier -- confirm whether sex is really part of the model.
# Reads:  $task3path/chr<i>.dose.rsq.0.3.DS.vcf.gz
#         $task3path/chr22.dose.for.assoc.fam
# Writes: $path/chr<i>.imputed.dosage.* (results in *.assoc.dosage, plus a .log)
set -euo pipefail
: "${path:?path must be set to the task4 output directory}"
: "${task3path:?task3path must be set to the imputed files directory}"

covar_file=/mnt/data/GWAS/output/build37/task2_QC/covar_mds.txt

for i in {1..22}; do
  # --hide-covar triggers a deprecation note in PLINK 1.9; it is kept as-is so
  # the analysis itself is unchanged.
  plink \
    --fam "${task3path}/chr22.dose.for.assoc.fam" \
    --out "${path}/chr${i}.imputed.dosage" \
    --ci 0.95 \
    --covar "${covar_file}" \
    --hide-covar \
    --dosage "${task3path}/chr${i}.dose.rsq.0.3.DS.vcf.gz" format=1 noheader
done

PLINK v1.90b3.45 64-bit (13 Jan 2017)      https://www.cog-genomics.org/plink2
(C) 2005-2017 Shaun Purcell, Christopher Chang   GNU General Public License v3
Logging to /mnt/data/GWAS/output/build37/task4_assoc/chr1.imputed.dosage.log.
Options in effect:
  --ci 0.95
  --covar /mnt/data/GWAS/output/build37/task2_QC/covar_mds.txt
  --dosage /mnt/data/GWAS/output/build37/task3_imputation/imputed_files/chr1.dose.rsq.0.3.DS.vcf.gz format=1 noheader
  --fam /mnt/data/GWAS/output/build37/task3_imputation/imputed_files/chr22.dose.for.assoc.fam
  --hide-covar
  --out /mnt/data/GWAS/output/build37/task4_assoc/chr1.imputed.dosage

Note: --hide-covar flag deprecated.  Use e.g. '--linear hide-covar'.
257659 MB RAM detected; reserving 128829 MB for main workspace.
496 people (237 males, 259 females) loaded from .fam.
496 phenotype values loaded from .fam.
Using 1 thread (no multithreaded calculations invoked).
--covar: 10 covariates loaded.
496 people pass filters and QC.
Among remaining phenotypes,

In [5]:
%%bash
# Merge the per-chromosome association results into a single file.
# The header line is kept once (from chr1). For chr2..22 a header-less copy
# (*.nh) is written first and then appended after the chr1 results.
# The cell aborts if any per-chromosome result is missing or empty, so an
# incomplete genome-wide file is never produced silently.
set -euo pipefail
: "${path:?path must be set to the task4 output directory}"

for i in {1..22}; do
  if [[ ! -s "${path}/chr${i}.imputed.dosage.assoc.dosage" ]]; then
    printf 'error: missing or empty association result for chr%s\n' "$i" >&2
    exit 1
  fi
done

for i in {2..22}; do
  # Drop the first (header) line of this chromosome
  awk 'NR > 1' "${path}/chr${i}.imputed.dosage.assoc.dosage" \
    > "${path}/chr${i}.imputed.dosage.assoc.dosage.nh"
done

for i in {2..22}; do
  cat "${path}/chr${i}.imputed.dosage.assoc.dosage.nh"
done > "${path}/chr2-22.imputed.dosage.assoc.dosage.nh"

cat "${path}/chr1.imputed.dosage.assoc.dosage" \
  "${path}/chr2-22.imputed.dosage.assoc.dosage.nh" \
  > "${path}/dataset.b37.imputed.dosage.full.assoc.dosage"

In [6]:
%%bash
# Remove missing values.
# The merged PLINK output is converted to tab-separated columns (leading
# padding stripped, runs of spaces turned into one tab) and every row whose
# 8th column (P) is "NA" is dropped. Line counts before and after the filter
# and the first rows of the cleaned file are printed.
set -euo pipefail
: "${path:?path must be set to the task4 output directory}"

full_file="${path}/dataset.b37.imputed.dosage.full.assoc.dosage"
clean_file="${full_file}.clean"

wc -l "${full_file}"
# A single awk pass: rewriting $0 makes awk re-split the fields, so $8 can be
# tested in the same program that does the reformatting.
awk '{
  gsub(/^ +/, "")
  gsub(/ +/, "\t")
  if ($8 != "NA") print
}' "${full_file}" > "${clean_file}"
wc -l "${clean_file}"
head "${clean_file}"

16093211 /mnt/data/GWAS/output/build37/task4_assoc/dataset.b37.imputed.dosage.full.assoc.dosage
7856796 /mnt/data/GWAS/output/build37/task4_assoc/dataset.b37.imputed.dosage.full.assoc.dosage.clean
SNP	A1	A2	FRQ	INFO	OR	SE	P
1:49298:T:C	T	C	0.6621	0.4660	0.9172	0.4038	0.8305
1:54676:C:T	C	T	0.3611	0.4268	1.2771	0.4364	0.5752
1:86028:T:C	T	C	0.0606	0.9142	1.0473	0.5194	0.929
1:91536:G:T	G	T	0.4853	0.5284	0.5885	0.3625	0.1436
1:234313:C:T	C	T	0.0676	0.3908	0.6839	0.7846	0.6283
1:534192:C:T	C	T	0.2262	0.4186	1.3682	0.4805	0.5141
1:534583:C:G	C	G	0.0119	0.6564	35.3463	1.6454	0.03026
1:546697:A:G	A	G	0.8784	0.4792	1.1050	0.6354	0.8752
1:564862:T:C	T	C	0.0380	0.6638	2.6774	0.8295	0.2351


## Add rs ID

In [7]:
%%R
## Add rs ID
## The cleaned association results are joined with the HRC annotation table on
## their shared "SNP" column. all.x=TRUE keeps every association row, including
## those without a matching entry in the annotation table.
path <- "/mnt/data/GWAS/output/build37/task4_assoc"
assoc_file <- file.path(path, "dataset.b37.imputed.dosage.full.assoc.dosage.clean")
annot_file <- "/mnt/data/GWAS/ref_files/HRC.r1-1.GRCh37.minimac4.output.annot.clean.txt"

assoc_results <- read.table(assoc_file, sep = "\t", header = TRUE)
annot <- read.table(annot_file, sep = "\t", header = TRUE)

## Quick look at both inputs before joining
head(assoc_results)
head(annot)

merged <- merge(assoc_results, annot, by = "SNP", all.x = TRUE)
write.table(
  merged,
  file.path(path, "dataset.b37.imputed.dosage.full.assoc.dosage.clean.rs"),
  sep = "\t",
  row.names = FALSE,
  quote = FALSE
)


In [8]:
%%bash
# Add CHR and BP columns in front of the results.
# The SNP identifier in column 1 has the form chr:bp:allele:allele; its first
# two parts become the new CHR and BP columns and the identifier itself is kept
# unchanged as the SNP column. Only column 1 is parsed (and columns are split
# on tabs only), so a colon or a space elsewhere in a row cannot shift columns.
set -euo pipefail
: "${path:?path must be set to the task4 output directory}"

rs_file="${path}/dataset.b37.imputed.dosage.full.assoc.dosage.clean.rs"
chr_bp_file="${rs_file}.chr.bp"

head "${rs_file}"
awk -F'\t' '
  BEGIN {
    OFS = "\t"
    print "CHR", "BP", "SNP", "A1", "A2", "FRQ", "INFO", "OR", "SE", "P", "RS"
  }
  NR > 1 {
    # NR > 1 skips the input header, which is replaced by the one printed above
    split($1, id, ":")
    print id[1], id[2], $1, $2, $3, $4, $5, $6, $7, $8, $9
  }
' "${rs_file}" > "${chr_bp_file}"
head "${chr_bp_file}"

SNP	A1	A2	FRQ	INFO	OR	SE	P	RS
10:100000625:A:G	A	G	0.4496	0.9979	1.2232	0.2799	0.4717	rs7899632
10:100000645:A:C	A	C	0.2238	0.9554	0.6617	0.3298	0.2105	rs61875309
10:100001867:C:T	C	T	0.0105	0.9292	1.2906	1.0324	0.8048	rs150203744
10:100002464:T:C	T	C	0.0112	0.9812	0.7513	1.0186	0.7789	rs111551711
10:100003242:T:G	T	G	0.1401	1.0713	1.9542	0.3874	0.08371	rs12258651
10:100003304:A:G	A	G	0.0362	0.9602	2.6659	0.7874	0.213	rs72828461
10:100003785:T:C	T	C	0.3236	0.9361	1.1277	0.2847	0.6729	rs1359508
10:100004360:G:A	G	A	0.2238	0.9554	0.6617	0.3298	0.2105	rs1048754
10:100004441:G:C	G	C	0.6734	0.9275	0.906	0.2859	0.7298	rs1048757
CHR	BP	SNP	A1	A2	FRQ	INFO	OR	SE	P	RS
10	100000625	10:100000625:A:G	A	G	0.4496	0.9979	1.2232	0.2799	0.4717	rs7899632
10	100000645	10:100000645:A:C	A	C	0.2238	0.9554	0.6617	0.3298	0.2105	rs61875309
10	100001867	10:100001867:C:T	C	T	0.0105	0.9292	1.2906	1.0324	0.8048	rs150203744
10	100002464	10:100002464:T:C	T	C	0.0112	0.9812	0.7513	1.0186	0.7789	rs111551711
10	100003242

## Add nearest genes

In [9]:
%%bash
# Annotate the results with nearby genes.
# PLINK --annotate matches each row against the gene ranges in
# hg19.refGene.plink.txt; --border 200 extends every range (the output name
# refers to this as 200kb). The report is written to <out>.annot.
set -euo pipefail
: "${path:?path must be set to the task4 output directory}"

plink \
  --annotate "${path}/dataset.b37.imputed.dosage.full.assoc.dosage.clean.rs.chr.bp" \
    ranges=/mnt/data/GWAS/ref_files/hg19.refGene.plink.txt \
  --border 200 \
  --out "${path}/dataset.b37.imputed.dosage.full.assoc.dosage.clean.rs.200kb.annotated"


PLINK v1.90b3.45 64-bit (13 Jan 2017)      https://www.cog-genomics.org/plink2
(C) 2005-2017 Shaun Purcell, Christopher Chang   GNU General Public License v3
Logging to /mnt/data/GWAS/output/build37/task4_assoc/dataset.b37.imputed.dosage.full.assoc.dosage.clean.rs.200kb.annotated.log.
Options in effect:
  --annotate /mnt/data/GWAS/output/build37/task4_assoc/dataset.b37.imputed.dosage.full.assoc.dosage.clean.rs.chr.bp ranges=/mnt/data/GWAS/ref_files/hg19.refGene.plink.txt
  --border 200
  --out /mnt/data/GWAS/output/build37/task4_assoc/dataset.b37.imputed.dosage.full.assoc.dosage.clean.rs.200kb.annotated

257659 MB RAM detected; reserving 128829 MB for main workspace.
--annotate ranges: 23967 annotations loaded from
/mnt/data/GWAS/ref_files/hg19.refGene.plink.txt (counting multi-chromosome
annotations once per spanned chromosome).
--annotate: 6506795 out of 7856795 rows annotated; new report written to
/mnt/data/GWAS/output/build37/task4_assoc/dataset.b37.imputed.dosage.full.assoc.dosag

In [10]:
%%bash
# Extract the top association signals (P < 0.0001), sorted by ascending P.
# Column 10 of the annotated report is P. The input header row is dropped by
# the filter (its 10th field is not a number below the threshold) and a fresh
# header is written instead.
# The header lists SE before P, matching the actual column order of the data.
# The result is streamed straight into the output file, so no intermediate
# temp file has to be created and removed.
set -euo pipefail
: "${path:?path must be set to the task4 output directory}"

annot_file="${path}/dataset.b37.imputed.dosage.full.assoc.dosage.clean.rs.200kb.annotated.annot"
tops_file="${path}/dataset.b37.imputed.dosage.assoc.200kb.annot.tops"

{
  printf 'CHR\tBP\tSNP\tA1\tA2\tFRQ\tINFO\tOR\tSE\tP\tRS\tANNOT\n'
  # The gene annotation is separated from RS by a space; turn it into a tab
  sort -gk 10,10 "${annot_file}" \
    | awk '$10 < 0.0001' \
    | sed 's/ /\t/g'
} > "${tops_file}"

wc "${tops_file}"
head "${tops_file}"


  548  6576 84823 /mnt/data/GWAS/output/build37/task4_assoc/dataset.b37.imputed.dosage.assoc.200kb.annot.tops
CHR	BP	SNP	A1	A2	FRQ	INFO	OR	P	SE	RS	ANNOT
2	62514587	2:62514587:C:G	C	G	0.4279	1.0372	5.8969	0.351	4.288e-07	rs12614194	B3GNT2(+62.72kb)|COMMD1(+151.4kb)|MIR5192(+81.54kb)
10	79119844	10:79119844:G:C	G	C	0.0307	0.9511	0.0072	1.0354	1.88e-06	rs118093470	KCNMA1(0)
10	79110632	10:79110632:C:T	C	T	0.0307	0.9515	0.0072	1.0369	1.986e-06	rs144195574	KCNMA1(0)
15	86321907	15:86321907:C:T	C	T	0.5786	0.9746	5.9959	0.3792	2.319e-06	rs2542594	AKAP13(+29.32kb)|KLHL25(0)|MIR1276(+8.098kb)
6	29814513	6:29814513:G:C	G	C	0.0278	0.8949	26.9058	0.7017	2.706e-06	rs13216671	HCG4(+53.66kb)|HCG4B(-77.86kb)|HCG8(-165.4kb)|HCG9(-128.4kb)|HLA-A(-95.73kb)|HLA-F(+119.4kb)|HLA-F-AS1(+97.69kb)|HLA-G(+15.61kb)|HLA-H(-40.87kb)|HLA-J(-159.2kb)|IFITM4P(+95.59kb)|LOC554223(+48.93kb)|MOG(+174.4kb)|ZFP57(+169.6kb)|ZNRD1-AS1(-154.3kb)
19	48578889	19:48578889:A:C	A	C	0.7349	0.9911	0.1667	0.3899	4.345e-06	rs1991722	

In [11]:
%%bash
# Provide the annotated results under the shorter file name that the next step
# of the tutorial expects.
# TODO: write the file under its final name directly instead of copying it.
set -euo pipefail
: "${path:?path must be set to the task4 output directory}"

cp -- "${path}/dataset.b37.imputed.dosage.full.assoc.dosage.clean.rs.200kb.annotated.annot" \
  "${path}/dataset.b37.imputed.assoc.dosage.clean.rs.200kb.annot"

In [12]:
%%bash
# Show the first rows of the final annotated file as a sanity check
: "${path:?path must be set to the task4 output directory}"
head "${path}/dataset.b37.imputed.assoc.dosage.clean.rs.200kb.annot"

CHR	BP	SNP	A1	A2	FRQ	INFO	OR	SE	P	RS ANNOT
10	100000625	10:100000625:A:G	A	G	0.4496	0.9979	1.2232	0.2799	0.4717	rs7899632 HPS1(-175.3kb)|LOXL4(-6.817kb)|MIR1287(-154.3kb)|MIR4685(-190.4kb)|PYROXD2(-142.7kb)|R3HCC1L(0)
10	100000645	10:100000645:A:C	A	C	0.2238	0.9554	0.6617	0.3298	0.2105	rs61875309 HPS1(-175.3kb)|LOXL4(-6.797kb)|MIR1287(-154.3kb)|MIR4685(-190.4kb)|PYROXD2(-142.7kb)|R3HCC1L(0)
10	100001867	10:100001867:C:T	C	T	0.0105	0.9292	1.2906	1.0324	0.8048	rs150203744 HPS1(-174.1kb)|LOXL4(-5.575kb)|MIR1287(-153.1kb)|MIR4685(-189.2kb)|PYROXD2(-141.5kb)|R3HCC1L(0)
10	100002464	10:100002464:T:C	T	C	0.0112	0.9812	0.7513	1.0186	0.7789	rs111551711 HPS1(-173.5kb)|LOXL4(-4.978kb)|MIR1287(-152.5kb)|MIR4685(-188.6kb)|PYROXD2(-140.9kb)|R3HCC1L(0)
10	100003242	10:100003242:T:G	T	G	0.1401	1.0713	1.9542	0.3874	0.08371	rs12258651 HPS1(-172.7kb)|LOXL4(-4.2kb)|MIR1287(-151.7kb)|MIR4685(-187.8kb)|PYROXD2(-140.1kb)|R3HCC1L(0)
10	100003304	10:100003304:A:G	A	G	0.0362	0.9602	2.6659	0.7874	0.213	rs7282846

**For the next step you need the following file:**
- dataset.b37.imputed.assoc.dosage.clean.rs.200kb.annot