# Compatibility assessment for VIDUS population controls
__Author:__ Jesse Marks

This document logs the steps taken to compare genomic inflation factors for the [VIDUS](http://www.cfenet.ubc.ca/research/vidus) population controls. These include:

* [Lung Cancer in Never Smokers](https://www.ncbi.nlm.nih.gov/projects/gap/cgi-bin/study.cgi?study_id=phs000634.v1.p1)
* [Non-Hodgkin Lymphoma Study](https://www.ncbi.nlm.nih.gov/projects/gap/cgi-bin/study.cgi?study_id=phs000801.v2.p1)

The starting point for this analysis uses quality controlled observed genotypes.

## Software and tools
The software and tools used for processing these data are:

* AWS S3 and EC2 instance with Linux AMI
* GNU bash version 4.2.46
* [PLINK v1.9 beta 3.45](https://www.cog-genomics.org/plink/)
* [EIGENSOFT v6.1.4](https://www.hsph.harvard.edu/alkes-price/software/)
* [R v3.3.3](https://www.r-project.org/)

# S3 data retrieval

In [None]:
# EC2 command line #
# Stage the dbGaP genotype and phenotype files for both control studies from
# S3 onto the shared volume, then decompress everything that arrived gzipped.
cd /shared/data/studies

# Copy genotype and phenotype data for LCNS
aws s3 cp s3://rti-common/dbGaP/phs000634_lung_cancer/genotype/original/final/ phs000634_lung_cancer/genotype/original/final --recursive
aws s3 cp s3://rti-common/dbGaP/phs000634_lung_cancer/phenotype/unprocessed/ phs000634_lung_cancer/phenotype --recursive --exclude="*" --include="*Sample.MULTI.txt.gz"
aws s3 cp s3://rti-common/dbGaP/phs000634_lung_cancer/phenotype/unprocessed/ phs000634_lung_cancer/phenotype --recursive --exclude="*" --include="*Subject.MULTI.txt.gz"
aws s3 cp s3://rti-common/dbGaP/phs000634_lung_cancer/phenotype/unprocessed/ phs000634_lung_cancer/phenotype --recursive --exclude="*" --include="*Subject_Phenotypes.GRU-MDS.txt.gz"

#  Copy genotype and phenotype data for NHL
aws s3 cp s3://rti-common/dbGaP/phs000801_lymphoma/genotype/original/final/ phs000801_lymphoma/genotype/original/final --recursive
# NOTE(review): unlike every other pattern here, this one has no .gz suffix —
# confirm the Sample_Attributes object really is stored uncompressed.
aws s3 cp s3://rti-common/dbGaP/phs000801_lymphoma/phenotype/unprocessed/ phs000801_lymphoma/phenotype --recursive --exclude="*" --include="*Sample_Attributes.GRU-NPU-MDS.txt"
aws s3 cp s3://rti-common/dbGaP/phs000801_lymphoma/phenotype/unprocessed/ phs000801_lymphoma/phenotype --recursive --exclude="*" --include="*Subject_Phenotypes.GRU-NPU-MDS.txt.gz"

gunzip phs000634_lung_cancer/genotype/original/final/*gz
gunzip phs000634_lung_cancer/phenotype/*.gz
gunzip phs000801_lymphoma/genotype/original/final/*.gz
# The NHL Subject_Phenotypes file is downloaded as .txt.gz but the phenotype
# processing step reads it as plain .txt, so it has to be decompressed too.
gunzip phs000801_lymphoma/phenotype/*.gz

# EA subject analysis
## Merge test
To determine whether any of the SNPs are flipped between studies, a merge is attempted. If any multi-allelic variants are identified (suggestive of flipping) then an error is raised. In this case, position duplicates were identified, but these will be removed during the SNP intersection step. 

In [None]:
# EC2 command line #
# First-pass merge of the two EA filesets. The merge is only a probe: the
# interesting outcome is whether PLINK reports variants with 3+ alleles.
ancestry="ea"
sandbox=/shared/sandbox/vidus_lambda_comparison
studies_root=/shared/data/studies
plink_bin=/shared/bioinformatics/software/third_party/plink-1.90-beta-4.10-x86_64/plink

mkdir -p "${sandbox}/merge_test"
cd "${sandbox}/"

# Attempt data set merge (LCNS as the base fileset, NHL merged into it)
"${plink_bin}" \
    --noweb \
    --bfile "${studies_root}/phs000634_lung_cancer/genotype/original/final/ea_chr_all" \
    --bmerge "${studies_root}/phs000801_lymphoma/genotype/original/final/ea_chr_all" \
    --make-bed \
    --out merge_test/merged_unflipped
'Error: 9 variants with 3+ alleles present.'

## SNP intersection

In [None]:
# EC2 command line #
# Restrict both studies to the variant IDs they share, dropping the variants
# the merge test flagged, and write one new PLINK fileset per study.
cd /shared/sandbox/vidus_lambda_comparison
mkdir -p intersect   # -p: re-running the cell must not fail on an existing directory

ancestry="ea"
studies=(phs000634_lung_cancer phs000801_lymphoma) # array of study names

# Get first intersection set
file1="/shared/data/studies/${studies[0]}/genotype/original/final/${ancestry}_chr_all.bim"
file2="/shared/data/studies/${studies[1]}/genotype/original/final/${ancestry}_chr_all.bim"
echo -e "\nCalculating intersection between ${file1} and ${file2}...\n"
# Column 2 of a BIM file is the variant ID; comm -12 keeps only the lines
# present in both (sorted, de-duplicated) ID lists.
comm -12 <(cut -f 2 "${file1}" | sort -u) <(cut -f 2 "${file2}" | sort -u) \
    > "intersect/${ancestry}_variant_intersection.txt"

# Make new PLINK binary file sets
for study in "${studies[@]}"; do
    /shared/bioinformatics/software/third_party/plink-1.90-beta-4.10-x86_64/plink \
        --noweb \
        --bfile "/shared/data/studies/${study}/genotype/original/final/${ancestry}_chr_all" \
        --extract "intersect/${ancestry}_variant_intersection.txt" \
        --exclude merge_test/merged_unflipped-merge.missnp \
        --make-bed \
        --out "intersect/${study}_${ancestry}"
done

'--extract: 630048 variants remaining.
--exclude: 630039 variants remaining.'

## Second pass merge
To ensure data set compatibility, a second pass merge is executed.

In [None]:
# EC2 command line #
# Second merge attempt, this time on the intersected per-study filesets.
ancestry="ea"
plink_bin=/shared/bioinformatics/software/third_party/plink-1.90-beta-4.10-x86_64/plink

cd /shared/sandbox/vidus_lambda_comparison

# Re-attempt data set merge
"${plink_bin}" \
    --noweb \
    --bfile intersect/phs000634_lung_cancer_ea \
    --bmerge intersect/phs000801_lymphoma_ea \
    --make-bed \
    --out merge_test/merged_intersect

# Clean-up: the merge outputs were only needed as a compatibility check
rm -r merge_test

## Assign case and controls

In [None]:
# EC2 command line #
cd /shared/sandbox/vidus_lambda_comparison

# Arbitrarily assign one study as case and the other as control

ancestry="ea"
study1=phs000634_lung_cancer
study2=phs000801_lymphoma

# Modify FAM file to include case/control status.
# Column 6 of the FAM file is overwritten: 2 for study1, 1 for study2.
# (In awk, $0 is the whole input line.)
# The explanatory note lives here rather than after the trailing backslash:
# a comment placed after the backslash ends the command on that line, so the
# redirect on the next line would run alone and leave an empty output file.
awk '{ $6=2; print $0 }' "intersect/${study1}_${ancestry}.fam" \
    > "intersect/${study1}_vs_${study2}_${ancestry}.fam"
awk '{ $6=1; print $0 }' "intersect/${study2}_${ancestry}.fam" \
    > "intersect/${study2}_vs_${study1}_${ancestry}.fam"

# EIGENSTRAT
## PLINK file set merge and MAF filter

In [None]:
# EC2 command line #
# Build the combined case/control fileset used for EIGENSTRAT:
#   1. pair each study's genotypes with its case/control FAM file,
#   2. merge the two studies,
#   3. keep variants with MAF > 0.01.
cd /shared/sandbox/vidus_lambda_comparison
mkdir -p eigenstrat   # -p so the cell can be re-run

# Arbitrarily assign one study as case and the other as control
ancestry="ea"
study1=phs000634_lung_cancer
# Must match the study name used when the case/control FAM files were
# written, otherwise none of the study2 paths below exist.
study2=phs000801_lymphoma

# Create temporary file sets
/shared/bioinformatics/software/third_party/plink-1.90-beta-4.10-x86_64/plink \
    --noweb \
    --fam "intersect/${study1}_vs_${study2}_${ancestry}.fam" \
    --bim "intersect/${study1}_${ancestry}.bim" \
    --bed "intersect/${study1}_${ancestry}.bed" \
    --make-bed \
    --out "eigenstrat/${study1}_vs_${study2}_${ancestry}.tmp"
/shared/bioinformatics/software/third_party/plink-1.90-beta-4.10-x86_64/plink \
    --noweb \
    --fam "intersect/${study2}_vs_${study1}_${ancestry}.fam" \
    --bim "intersect/${study2}_${ancestry}.bim" \
    --bed "intersect/${study2}_${ancestry}.bed" \
    --make-bed \
    --out "eigenstrat/${study2}_vs_${study1}_${ancestry}.tmp"

# Merge file sets
/shared/bioinformatics/software/third_party/plink-1.90-beta-4.10-x86_64/plink \
    --noweb \
    --bfile "eigenstrat/${study1}_vs_${study2}_${ancestry}.tmp" \
    --bmerge "eigenstrat/${study2}_vs_${study1}_${ancestry}.tmp" \
    --make-bed \
    --out "eigenstrat/${study1}_vs_${study2}_${ancestry}_merged.tmp"

# MAF > 0.01
/shared/bioinformatics/software/third_party/plink-1.90-beta-4.10-x86_64/plink \
    --noweb \
    --bfile "eigenstrat/${study1}_vs_${study2}_${ancestry}_merged.tmp" \
    --maf 0.01 \
    --make-bed \
    --out "eigenstrat/${study1}_vs_${study2}_${ancestry}"

# Clean up: every intermediate fileset was written with a ".tmp" prefix
rm eigenstrat/*.tmp.*

### Remove high-LD region variants

In [None]:
# EC2 command line #
cd /shared/sandbox/vidus_lambda_comparison/eigenstrat

ancestry="ea"
study1=phs000634_lung_cancer
study2=phs000801_lymphoma

# Generate list of variants in known high-LD regions.
# BIM columns used: $F[0] = chromosome, $F[1] = variant ID, $F[3] = position.
# perl -l already appends a newline to every print, so the ID is printed
# bare; adding "\n" as well would put a blank line after each ID.
perl -lane 'if (($F[0]==5 && $F[3] >= 43964243 && $F[3] <= 51464243) || ($F[0]==6 && $F[3] >= 24892021 && $F[3] <= 33392022) || ($F[0]==8 && $F[3] >= 7962590 && $F[3] <= 11962591) || ($F[0]==11 && $F[3] >= 45043424 && $F[3] <= 57243424)) { print $F[1]; }' "${study1}_vs_${study2}_${ancestry}.bim" \
    > "${study1}_vs_${study2}_${ancestry}.high_ld_regions.remove"

# Remove SNPs in known high-LD regions
/shared/bioinformatics/software/third_party/plink-1.90-beta-4.10-x86_64/plink \
    --noweb \
    --bfile "${study1}_vs_${study2}_${ancestry}" \
    --exclude "${study1}_vs_${study2}_${ancestry}.high_ld_regions.remove" \
    --make-bed \
    --out "${study1}_vs_${study2}_${ancestry}_high_ld_regions_removed"

### Linkage disequilibrium pruning
Linkage disequilibrium (LD) pruning eliminates a large degree of redundancy in the data and reduces the influence of chromosomal artifacts. The objective of LD pruning is to select a subset of variants based on LD such that the variants in the subset are independent. This filtering will not carry forward to the final processed results, but this step improves the quality of EIGENSTRAT calculations. Consequently, the LD pruned data will be used as input for those calculations.

LD pruning is implemented using [PLINK --indep-pairwise](https://www.cog-genomics.org/plink/1.9/ld#indep).

In [1]:
# EC2 command line #
# LD-prune the high-LD-filtered fileset one chromosome at a time, pool the
# retained variant IDs, and extract them into a single pruned fileset.
cd /shared/sandbox/vidus_lambda_comparison/eigenstrat

ancestry="ea"
study1=phs000634_lung_cancer
study2=phs000801_lymphoma
pair_prefix="${study1}_vs_${study2}_${ancestry}"
plink_bin=/shared/bioinformatics/software/third_party/plink-1.90-beta-4.10-x86_64/plink

# One pruning job per chromosome code 1..23
for chr in {1..23}; do
    /shared/bioinformatics/software/scripts/qsub_job.sh \
        --job_name "${study1}_${study2}_${ancestry}_${chr}_ld_prune" \
        --script_prefix "${pair_prefix}_${chr}_ld_prune" \
        --mem 3 \
        --nslots 1 \
        --program "${plink_bin}" \
            --noweb \
            --memory 3000 \
            --bfile "${pair_prefix}_high_ld_regions_removed" \
            --indep-pairwise 1500 150 0.2 \
            --chr "${chr}" \
            --out "${pair_prefix}_chr${chr}_ld_pruned"
done

# Merge *prune.in files
cat "${pair_prefix}"_chr*_ld_pruned.prune.in \
    > "${pair_prefix}_chr_all_ld_pruned.prune.in"

# Create new PLINK filesets with only LD pruned variants
"${plink_bin}" \
    --noweb \
    --bfile "${pair_prefix}" \
    --extract "${pair_prefix}_chr_all_ld_pruned.prune.in" \
    --make-bed \
    --out "${pair_prefix}_ld_pruned"

# Clean up: pruning lists/logs, job scripts, the high-LD intermediates and
# the chr23 .hh file
rm *${ancestry}*ld_pruned.{prune.in,prune.out,log} \
   *${ancestry}*ld_prune*qsub* \
   *${ancestry}*high_ld_regions* \
   *${ancestry}*chr23_ld_pruned.hh

ERROR: Error in parse(text = x, srcfile = src): <text>:8:5: unexpected symbol
7: 
8: for chr
       ^


### Rename BIM/FAM file IDs
BIM file IDs have the potential issue of being too long if encoded using the 1000 Genomes Phase 3 IMPUTE2 format. This will cause smartpca to throw an error. To resolve this the IDs are replaced by numeric values. FAM file IDs are treated similarly.

In [None]:
# EC2 command line #
# Report how many IDs exceed 39 characters, then write copies of the BIM and
# FAM files whose IDs are replaced by "ID_<row number>".
cd /shared/sandbox/vidus_lambda_comparison/eigenstrat

ancestry="ea"
study1=phs000634_lung_cancer
study2=phs000801_lymphoma
pruned="${study1}_vs_${study2}_${ancestry}_ld_pruned"

# Count ID lengths greater than 39 characters
# (FAM: columns 1 and 2 combined; BIM: column 2)
awk 'length($1) + length($2) > 39 { print $2 }' "${pruned}.fam" | wc -l
awk 'length($2) > 39 { print $2 }' "${pruned}.bim" | wc -l

# Rename BIM file IDs (column 2)
awk '{ $2 = "ID_" NR } 1' "${pruned}.bim" > "${pruned}_id_renamed.bim"

# Rename FAM file IDs (column 1)
awk '{ $1 = "ID_" NR } 1' "${pruned}.fam" > "${pruned}_id_renamed.fam"

### Run EIGENSTRAT
__NOTE__: For `smartpca` to run, the BIM and FAM file IDs need to be less than 39 characters in length. Often this is not a problem, but with a BIM file that has IDs renamed to the 1000 Genomes Phase 3 format, the IDs for indels can exceed the character limit. Also, the EIGENSOFT `bin` directory should be part of the PATH variable.

The `smartpca.pl` program documentation is shown below:

DOCUMENTATION of `smartpca.pl` program:

This program calls the smartpca program (see ../POPGEN/README). 
For this to work, the bin directory containing smartpca MUST be in your path. 
See `./example.perl` for a toy example.
```
../bin/smartpca.pl 
-i example.geno  : genotype file in any format (see ../CONVERTF/README)
-a example.snp   : snp file in any format (see ../CONVERTF/README)
-b example.ind   : indiv file in any format (see ../CONVERTF/README)
-k k             : (Default is 10) number of principal components to output
-o example.pca   : output file of principal components.  Individuals removed
                   as outliers will have all values set to 0.0 in this file.
-p example.plot  : prefix of output plot files of top 2 principal components.
                   (labeling individuals according to labels in indiv file)
-e example.eval  : output file of all eigenvalues
-l example.log   : output logfile
-m maxiter       : (Default is 5) maximum number of outlier removal iterations.
                   To turn off outlier removal, set -m 0.
-t topk          : (Default is 10) number of principal components along which 
                   to remove outliers during each outlier removal iteration.
-s sigma         : (Default is 6.0) number of standard deviations which an
                   individual must exceed, along one of topk top principal
           components, in order to be removed as an outlier.

OPTIONAL FLAGS:
-w poplist       : compute eigenvectors using populations in poplist only,
                   where poplist is an ASCII file with one population per line
-y plotlist      : output plot will include populations in plotlist only, 
                   where plotlist is an ASCII file with one population per line
-z badsnpname    : list of SNPs which should be excluded from the analysis
-q YES/NO        : If set to YES, assume that there is a single population and
                   the population field contains real-valued phenotypes.
           (Corresponds to qtmode parameter in smartpca program.)
           The default value for this parameter is NO.

Estimated running time of the smartpca program is 
  2.5e-12 * nSNP * NSAMPLES^2 hours            if not removing outliers.
  2.5e-12 * nSNP * NSAMPLES^2 hours * (1+m)    if m outlier removal iterations.
Thus, under the default of up to 5 outlier removal iterations, running time is 
  up to 1.5e-11 * nSNP * NSAMPLES^2 hours.
```

In [None]:
# EC2 command line #
# Submit smartpca on the LD-pruned fileset; outputs go to results/.
cd /shared/sandbox/vidus_lambda_comparison/eigenstrat

mkdir -p results   # -p so the cell can be re-run

# Study pair being compared
ancestry="ea"
study1=phs000634_lung_cancer
study2=phs000801_lymphoma

# Run smartpca
# -m 0 turns off outlier removal (see the smartpca.pl documentation above).
# NOTE(review): the BIM passed via -a is the ID-renamed copy, but the FAM
# passed via -b is the original one, not *_id_renamed.fam. Confirm this is
# intentional (e.g. to keep the original subject IDs in the .evec output).
/shared/bioinformatics/software/scripts/qsub_job.sh \
    --job_name "${study1}_${study2}_${ancestry}_smartpca" \
    --script_prefix "smartpca_${study1}_vs_${study2}_${ancestry}" \
    --mem 15 \
    --nslots 4 \
    --priority 0 \
    --program /shared/bioinformatics/software/third_party/EIG-6.1.4/bin/smartpca.perl \
        -i "${study1}_vs_${study2}_${ancestry}_ld_pruned.bed" \
        -a "${study1}_vs_${study2}_${ancestry}_ld_pruned_id_renamed.bim" \
        -b "${study1}_vs_${study2}_${ancestry}_ld_pruned.fam" \
        -o "results/${study1}_vs_${study2}_${ancestry}_ld_pruned.pca" \
        -p "results/${study1}_vs_${study2}_${ancestry}_ld_pruned.plot" \
        -e "results/${study1}_vs_${study2}_${ancestry}_ld_pruned.eval" \
        -l "results/${study1}_vs_${study2}_${ancestry}_ld_pruned.pca.log" \
        -m 0

### Extract eigenvectors

In [None]:
# EC2 command line #
# Turn the smartpca .evec file into a headed, space-delimited table.
cd /shared/sandbox/vidus_lambda_comparison/eigenstrat/results

# Grab top 10 eigenvectors from PCA
ancestry="ea"
study1=phs000634_lung_cancer
study2=phs000801_lymphoma
evec_in="${study1}_vs_${study2}_${ancestry}_ld_pruned.pca.evec"
evec_out="${study1}_vs_${study2}_${ancestry}_ld_pruned_top10_eigenvecs.txt"

{
    echo "FID IID EV1 EV2 EV3 EV4 EV5 EV6 EV7 EV8 EV9 EV10"
    # Skip the first line of the .evec file, replace the first ":" with a
    # space, then keep the first 12 whitespace-separated fields.
    tail -n +2 "${evec_in}" \
        | perl -lne 's/:/ /; @F=split; print join(" ", @F[0..11]);'
} > "${evec_out}"

# Phenotype processing
For Non-Hodgkin Lymphoma (NHL), age is given in ranges as follows:
* LESS-25
* 26-30
* 31-35
* 36-40
* 41-45
* 46-50
* 51-55
* 56-60
* 61-65
* 66-70
* 71-75
* 76-80
* 81-85

The Lung Cancer in Never Smokers study used actual ages in years, so the ages for NHL subjects will be converted as follows:
* 23=(LESS-25)
* 28=(26-30)
* 33=(31-35)
* 38=(36-40)
* 43=(41-45)
* 48=(46-50)
* 53=(51-55)
* 58=(56-60)
* 63=(61-65)
* 68=(66-70)
* 73=(71-75)
* 78=(76-80)
* 83=(81-85)
### Retrieve covariates

In [2]:
# EC2 command line #
# Build one "ID SEX AGE" covariate file per study and remap the IDs.
cd /shared/sandbox/vidus_lambda_comparison
mkdir -p phenotype   # -p so the cell can be re-run

# Get subject ID, sex and age variables
# (tail -n +12 skips the first 11 lines of each phenotype file)

# LCNS
tail -n +12 /shared/data/studies/phs000634_lung_cancer/phenotype/Subject_Phenotypes.GRU-MDS.txt | \
    awk '{ print $2,$3,$5}' > phenotype/phs000634_lung_cancer_id_sex_age_covar.txt.copy
    #awk 'BEGIN {FS="\t"; sex_map["M"]=2; sex_map["F"]=1} { print $2,sex_map[$4] }' \
    #> phenotype/phs000634_lung_cancer_id_sex_covar.txt

# NHL: recode sex (MALE=2, FEMALE=1) and replace each age range with the
# single representative age listed in the conversion table
tail -n +12 /shared/data/studies/phs000801_lymphoma/phenotype/Subject_Phenotypes.GRU-NPU-MDS.txt |
    awk 'BEGIN {FS="\t"; age_map["LESS-25"]=23; age_map["26-30"]=28; age_map["31-35"]=33; \
    age_map["36-40"]=38; age_map["41-45"]=43; age_map["46-50"]=48; age_map["51-55"]=53; \
    age_map["56-60"]=58; age_map["61-65"]=63; age_map["66-70"]=68; age_map["71-75"]=73; \
    age_map["76-80"]=78; age_map["81-85"]=83; \
    sex_map["MALE"]=2; sex_map["FEMALE"]=1} \
    { print $2,sex_map[$4],age_map[$5] }' > phenotype/phs000801_lymphoma_id_sex_age_covar.txt.copy

# map the LCNS fam id to the subject ID
# First file builds a lookup (column 2 -> column 5); in the second file,
# column 1 is replaced whenever it appears in that lookup.
# The result is written straight to its final name: the earlier version wrote
# file1.txt and then tried to move a non-existent file3.txt.
# NOTE(review): Subject.MULTI.txt is expected in this phenotype directory —
# confirm it has been copied here from the study's phenotype folder.
cd /shared/sandbox/vidus_lambda_comparison/phenotype
awk 'NR==FNR{_[$2]=$5;next} $1 in _{$1=_[$1]}1' OFS=' ' Subject.MULTI.txt \
    phs000634_lung_cancer_id_sex_age_covar.txt.copy \
    > phs000634_lung_cancer_id_sex_age_covar.txt

# map the NHL fam ids given in the phenotype file to the subject id that is given in the .fam file
# Lookup here is FAM column 1 -> FAM column 2.
awk 'NR==FNR{_[$1]=$2;next}$1 in _{$1=_[$1]}1' OFS=' ' ../eigenstrat/phs000634_lung_cancer_vs_phs000801_lymphoma_ea.fam \
    phs000801_lymphoma_id_sex_age_covar.txt.copy \
    > phs000801_lymphoma_id_sex_age_covar.txt

ERROR: Error in parse(text = x, srcfile = src): <text>:3:7: unexpected symbol
2: cd /shared/sandbox/vidus_lambda_comparison
3: mkdir phenotype
         ^


### Merge eigenvectors and covariates

In [None]:
# R script #

# merge_covariate_data.R
#
# Joins per-subject covariates (sex, age) and the top eigenvectors onto the
# subjects listed in a FAM file and writes one tab-delimited table.
#
# Args:
#   1) FAM file
#   2) Covariate data (no headers) for study 1
#   3) Covariate data (no header) for study 2
#   4) Eigenvector file
#   5) Outfile

# Load data
options(stringsAsFactors=F)
args <- commandArgs(trailingOnly=T)
# Fall back to the project paths only when no arguments are supplied (e.g.
# when the script is pasted into an interactive session). Previously these
# values were assigned unconditionally and silently overrode the command line.
if (length(args) == 0) {
    args <- c("eigenstrat/phs000634_lung_cancer_vs_phs000801_lymphoma_ea.fam",
             "phenotype/phs000634_lung_cancer_id_sex_age_covar.txt",
             "phenotype/phs000801_lymphoma_id_sex_age_covar.txt",
             "eigenstrat/results/phs000634_lung_cancer_vs_phs000801_lymphoma_ea_ld_pruned_top10_eigenvecs.txt",
             "phenotype/phs000634_lung_cancer_vs_phs000801_lymphoma_ea_sex_age_ev.covariates")
}

fam.data <- read.delim(args[1], header=F, sep=" ", colClasses=c(V1="character", V2="character"))
# Only the first two FAM columns are used; name just those instead of
# assigning a 2-element name vector to the whole data frame.
colnames(fam.data)[1:2] <- c("FID", "IID")
cov.data1 <- read.delim(args[2], header=F, sep=" ", colClasses=c(V1="character"))
colnames(cov.data1) <- c("ID", "SEX", "AGE")
cov.data2 <- read.delim(args[3], header=F, sep=" ", colClasses=c(V1="character"))
colnames(cov.data2) <- c("ID", "SEX", "AGE")
cov.data.merged <- rbind(cov.data1, cov.data2)
ev.data <- read.delim(args[4], header=T, sep="", colClasses=c(FID="character", IID="character"))

# Subset covariate data to only subjects in FAM file
# (inner join of FAM IID against covariate ID)
merge1 <- merge(x=fam.data, y=cov.data.merged, by.x="IID", by.y="ID")[,c("FID","IID","SEX", "AGE")]

# Add eigenvector covariates
merge2 <- merge(x=merge1, y=ev.data, by=c("FID","IID"))

# Export data
write.table(x=merge2, file=args[5], quote=F, sep="\t", row.names=F, col.names=T, append=F)

In [None]:
# EC2 command line #
# Drive merge_covariate_data.R for the study pair; argument order is
# FAM, study1 covariates, study2 covariates, eigenvectors, output file.
cd /shared/sandbox/vidus_lambda_comparison

# Merge covariate files
ancestry="ea"
study1=phs000634_lung_cancer
study2=phs000801_lymphoma
pair="${study1}_vs_${study2}_${ancestry}"

# Run covariate data processing script
Rscript merge_covariate_data.R \
    "eigenstrat/${pair}.fam" \
    "phenotype/${study1}_id_sex_age_covar.txt" \
    "phenotype/${study2}_id_sex_age_covar.txt" \
    "eigenstrat/results/${pair}_ld_pruned_top10_eigenvecs.txt" \
    "phenotype/${pair}_sex_age_ev.covariates"

# PCA
We visualize pairwise population differences by looking at the top PCA eigenvectors.

In [None]:
# Load PCA data
# The first column of the .evec file becomes the row names; lines starting
# with "#" are skipped.
options(stringsAsFactors=F)
lung.lymphoma <- read.table("/shared/sandbox/vidus_lambda_comparison/eigenstrat/results/phs000634_lung_cancer_vs_phs000801_lymphoma_ea_ld_pruned.pca.evec", comment.char="#", sep="", row.names=1)

# Function for plotting off-diagonal scatterplots
#   x, y   : the two PCs being compared
#   colors : per-sample point colors (defaults to the global label.colors)
plot.offdiag <- function(x, y, colors=label.colors){
    #Make x and y axes equal
    limits <- c(min(x,y), max(x,y)) * 1.3
    par(usr = c(limits, limits))
    points(x,y, col=colors, pch=16)
}

# Function for plotting diagonal density plots
#   x      : one PC
#   colors : per-sample point colors (defaults to the global label.colors)
plot.diag <- function(x, colors=label.colors){
    d <- density(x, bw=0.01, kernel="gaussian")
    #Update plot axes limits
    par(usr = c(range(d$x), range(d$y)*1.3))
    points(d$x,d$y, type="l")
    #Add PC values, jittered around the mean density height.
    #Use the colors argument so a caller-supplied palette is honored
    #(the global label.colors was previously used regardless).
    points(x=x, y=jitter(rep(mean(d$y),times=d$n),factor=5), col=colors, pch=16)
}

#Columns 1-10 hold the principal components (the PCA itself was run by smartpca)
dspace <- lung.lymphoma[1:10] #transformed dataspace

#Color label samples; column 11 holds the Case/Control label
sample.names <- lung.lymphoma[,11]
label.colors <- rep(NA, length(sample.names))
label.colors[grepl(x=sample.names, pattern="Case", ignore.case=T)] <- rgb(0,80,190,80, maxColorValue=255) #blue
label.colors[grepl(x=sample.names, pattern="Control", ignore.case=T)] <- rgb(200,0,0,80, maxColorValue=255) #red
legend.cols <- c(rgb(0,80,190,255, maxColorValue=255), rgb(255,0,0,255, maxColorValue=255))

options(repr.plot.width=8, repr.plot.height=8)
#Redefine outer margin
par(oma = c(4, 1, 2, 1))
pairs(x=dspace[,1:4], panel=plot.offdiag, diag.panel=plot.diag, 
      labels=paste0("PC",1:4), label.pos=0.9, cex.labels=1.8)
#Overlay an empty full-figure plot to carry the title and the legend
par(fig=c(0, 1, 0, 1), oma=c(0, 0, 0, 0), mar=c(0, 0, 1.5, 0), new=TRUE)
plot(0, 0, type = "n", bty = "n", xaxt = "n", yaxt = "n", main="Lung Cancer vs Lymphoma", cex.main=1)
legend("bottom", c("Lung Cancer", "Lymphoma"), 
       pch=19, col=legend.cols,
       xpd=TRUE, horiz=TRUE, inset=c(0,-0.02), 
       bty="n", cex=1)

# Troubleshooting

## run PCA with 1000G data

In [None]:
# Set up the troubleshooting workspace, copy in the LD-pruned study fileset,
# and submit one job per autosome to rewrite the 1000G BIM variant IDs.
mkdir -p /shared/sandbox/vidus_lambda_comparison/trouble_shooting/1000g
cd /shared/sandbox/vidus_lambda_comparison/trouble_shooting

cp ../eigenstrat/phs000634_lung_cancer_vs_phs000801_lymphoma_ea_ld_pruned.* .
# Column 2 of the BIM file is the variant ID; the list is written in BIM
# order (it is not sorted here).
awk '{print $2}' phs000634_lung_cancer_vs_phs000801_lymphoma_ea_ld_pruned.bim > all_variant_IDs

cd 1000g
# Rename 1000G autosome variant IDs
# Output per chromosome: chr${chr}_renamed.bim in the current directory.
# NOTE(review): ${ancestry} is not assigned anywhere in this cell; unless it
# is still set from an earlier cell in the same shell session, the script
# prefix expands to "_chr${chr}_id_rename" — confirm.
# NOTE(review): the column options (id 1, chr 0, pos 3, a1 4, a2 5) look like
# 0-based BIM column indices — verify against the conversion script.
for chr in {1..22}; do
    /shared/bioinformatics/software/scripts/qsub_job.sh \
        --job_name recode_to_1000g_${chr} \
        --script_prefix ${ancestry}_chr${chr}_id_rename \
        --mem 8 \
        --nslots 1 \
        --priority 0 \
        --program /shared/bioinformatics/software/perl/id_conversion/convert_to_1000g_p3_ids.pl \
        --file_in /shared/data/ref_panels/1000G/2013.05/plink/ALL.chr${chr}.bim \
        --file_out chr${chr}_renamed.bim \
        --legend /shared/data/ref_panels/1000G/2014.10/1000GP_Phase3_chr$chr.legend.gz \
        --file_in_header 0 \
        --file_in_id_col 1 \
        --file_in_chr_col 0 \
        --file_in_pos_col 3 \
        --file_in_a1_col 4 \
        --file_in_a2_col 5 \
        --chr ${chr}
done


# Get subject IDs by ancestry
# For each label, emit the sample ID (column 1) twice, tab-separated, for
# rows whose column 7 equals that label. Labels are processed in order so the
# output keeps African, then East Asian, then European rows.
for superpop in "African" "East Asian" "European"; do
    awk -v pop="${superpop}" 'BEGIN { FS="\t"; OFS="\t" } $7 == pop { print $1, $1 }' \
        /shared/data/ref_panels/1000G/igsr_samples.tsv
done > 1000g_subject_IDs

## combine 1000g data
#touch 1000g_chr_all_renamed.bim
#for chr in {1..22}; do
#    cat chr${chr}_renamed.bim >> 1000g_chr_all_renamed.bim
#done

# Get list of variants from 1000G
# The job writes the sorted, unique variant IDs containing "rs" to
# 1000g_phase3_snps.txt.
# NOTE(review): 1000g_chr_all_renamed.bim must already exist; the loop that
# concatenates the per-chromosome files is commented out just above.
/shared/bioinformatics/software/scripts/qsub_job.sh \
    --job_name merge_1000g_snps \
    --script_prefix merge_1000g_snps \
    --mem 3 \
    --priority 0 \
    --program "cat 1000g_chr_all_renamed.bim | \
        cut -f2,2 | \
        sort -u | \
        grep \"rs\" > 1000g_phase3_snps.txt"



# Get SNP overlap between study data and 1000G
# comm requires both inputs to be sorted. all_variant_IDs is in BIM order, so
# it is sorted on the fly; 1000g_phase3_snps.txt is already sorted by the job
# above.
comm -12 <(sort -u ../all_variant_IDs) 1000g_phase3_snps.txt > study_1000g_shared_snps.txt

# Select 10,000 random SNPs from study and 1000G overlap
# Each ID is prefixed with a random key, the lines are sorted by that key,
# the first 10,000 are kept, and the key is stripped again.
perl -ne 'print rand()."\t".$_' study_1000g_shared_snps.txt | \
    sort -k1,1 | \
    head -10000 | \
    cut -f2 \
    > 10k_snp_random_sample.txt


# Shared locations for both per-chromosome filtering passes below
submit_job=/shared/bioinformatics/software/scripts/qsub_job.sh
plink_bin=/shared/bioinformatics/software/third_party/plink-1.90-beta-4.10-x86_64/plink
kg_plink_dir=/shared/data/ref_panels/1000G/2013.05/plink

# create binary filesets with 10k sample
# Each job keeps the listed 1000G subjects and the randomly sampled SNPs,
# reading variant IDs from the renamed BIM file.
for chr in {1..22}; do
    "${submit_job}" \
        --job_name "chr${chr}_filter" \
        --script_prefix "ancestry_partition_chr${chr}" \
        --mem 8 \
        --priority 0 \
        --program "${plink_bin}" \
            --noweb \
            --memory 10000 \
            --bim "chr${chr}_renamed.bim" \
            --fam "${kg_plink_dir}/ALL.chr${chr}.fam" \
            --bed "${kg_plink_dir}/ALL.chr${chr}.bed" \
            --bfile "${kg_plink_dir}/ALL.chr${chr}" \
            --keep 1000g_subject_IDs \
            --extract 10k_snp_random_sample.txt \
            --make-bed \
            --out "chr${chr}_10k_sample"
done
#-------------------------------------------------------------------------------


# Same filter, but restricted to the SNPs listed in snp_outliers.txt;
# --allow-no-vars lets chromosomes with no matching SNP still finish.
for chr in {1..22}; do
    "${submit_job}" \
        --job_name "chr${chr}_filter" \
        --script_prefix "ancestry_partition_chr${chr}" \
        --mem 8 \
        --priority 0 \
        --program "${plink_bin}" \
            --noweb \
            --memory 10000 \
            --bim "chr${chr}_renamed.bim" \
            --fam "${kg_plink_dir}/ALL.chr${chr}.fam" \
            --bed "${kg_plink_dir}/ALL.chr${chr}.bed" \
            --keep 1000g_subject_IDs \
            --extract snp_outliers.txt \
            --allow-no-vars \
            --make-bed \
            --out "chr${chr}_28_sample"
done


        # Merge the per-chromosome filesets named in autosome_merge_list.txt
        # into all_autosomes_28_snp_sample.
        # NOTE(review): the only place this cell writes autosome_merge_list.txt
        # is further down, and it lists chr*_10k_sample filesets, not the
        # chr*_28_sample ones — confirm which list this job was meant to use.
        /shared/bioinformatics/software/scripts/qsub_job.sh \
            --job_name merge_plink_filesets \
            --script_prefix merge_plink_filesets \
            --mem 4 \
            --priority 0 \
            --program /shared/bioinformatics/software/third_party/plink-1.90-beta-4.10-x86_64/plink \
                --noweb \
                --memory 4000 \
                --merge-list autosome_merge_list.txt \
                --snps-only just-acgt \
                --allow-no-vars \
                --make-bed \
                --out all_autosomes_28_snp_sample


# Subset the LD-pruned study fileset (one directory up) to the SNPs listed in
# snp_outliers.txt.
/shared/bioinformatics/software/third_party/plink-1.90-beta-4.10-x86_64/plink \
    --noweb \
    --memory 4000 \
    --bfile ../phs000634_lung_cancer_vs_phs000801_lymphoma_ea_ld_pruned \
    --extract snp_outliers.txt \
    --make-bed \
    --out study_28_sample


# merge 1000g with study data
# (1000G subset is the base fileset; the study subset is merged into it)
/shared/bioinformatics/software/third_party/plink-1.90-beta-4.10-x86_64/plink \
    --noweb \
    --memory 4000 \
    --bfile all_autosomes_28_snp_sample \
    --bmerge study_28_sample \
    --snps-only just-acgt \
    --make-bed \
    --out study_and_1000g_combined_28_sample

#-------------------------------------------------------------------------------
# (separator is commented out: a bare line of dashes is run as a command)

# create merge list
# Written in one shot with ">" so re-running the cell replaces the list
# instead of appending a second copy of every entry.
printf 'chr%s_10k_sample\n' {1..22} > autosome_merge_list.txt

# merge autosomes
/shared/bioinformatics/software/scripts/qsub_job.sh \
    --job_name merge_plink_filesets \
    --script_prefix merge_plink_filesets \
    --mem 4 \
    --priority 0 \
    --program /shared/bioinformatics/software/third_party/plink-1.90-beta-4.10-x86_64/plink \
        --noweb \
        --memory 4000 \
        --merge-list autosome_merge_list.txt \
        --snps-only just-acgt \
        --make-bed \
        --out all_autosomes_10k_snp_random_sample

cd ../
# subset study data by 10k snp sample
/shared/bioinformatics/software/third_party/plink-1.90-beta-4.10-x86_64/plink \
    --noweb \
    --memory 4000 \
    --bfile phs000634_lung_cancer_vs_phs000801_lymphoma_ea_ld_pruned \
    --extract 1000g/10k_snp_random_sample.txt \
    --make-bed \
    --out study_10k_sample



# merge 1000g with study data
# (1000G 10k subset is the base fileset; the study subset is merged into it)
/shared/bioinformatics/software/third_party/plink-1.90-beta-4.10-x86_64/plink \
    --noweb \
    --memory 4000 \
    --bfile 1000g/all_autosomes_10k_snp_random_sample \
    --bmerge study_10k_sample \
    --snps-only just-acgt \
    --make-bed \
    --out study_and_1000g_combined_10k_sample


## run eigenstrat on 10k sample

In [None]:
# Run smartpca on the combined study + 1000G filesets (10k-SNP sample first,
# then the 28-SNP sample).
mkdir -p /shared/sandbox/vidus_lambda_comparison/trouble_shooting/eigenstrat/results
cd /shared/sandbox/vidus_lambda_comparison/trouble_shooting/eigenstrat

# Overwrite FAM column 6 (phenotype) with -9 for every subject
awk '{$6= "-9"} {print $0}' ../study_and_1000g_combined_10k_sample.fam > ../final.fam

# Run smartpca (-m 0: no outlier removal iterations)
/shared/bioinformatics/software/scripts/qsub_job.sh \
    --job_name study_vs_1000g_smartpca \
    --script_prefix smartpca_study_vs_1000g_ \
    --mem 8 \
    --nslots 2 \
    --priority 0 \
    --program /shared/bioinformatics/software/third_party/EIG-6.1.4/bin/smartpca.perl \
        -i ../study_and_1000g_combined_10k_sample.bed \
        -a ../study_and_1000g_combined_10k_sample.bim \
        -b ../final.fam \
        -o results/study_and_1000g_10k_snp.pca \
        -p results/study_and_1000g_10k_snp.plot \
        -e results/study_and_1000g_10k_snp.eval \
        -l results/study_and_1000g_10k_snp.pca.log \
        -m 0


#--------------------------------------------------------------------------------
# (separator is commented out: a bare line of dashes is run as a command)

# Same phenotype reset for the 28-SNP sample. The awk program previously read
# 'a{$6}= "-9"', which is a syntax error and produced no usable FAM file.
# NOTE(review): these paths are relative to the current directory, whereas
# the 28-sample fileset is written from the 1000g directory — confirm where
# this part is meant to be run.
awk '{$6= "-9"} {print $0}' study_and_1000g_combined_28_sample.fam > final.fam

/shared/bioinformatics/software/scripts/qsub_job.sh \
    --job_name study_vs_1000g_smartpca \
    --script_prefix smartpca_study_vs_1000g_ \
    --mem 8 \
    --nslots 3 \
    --priority 0 \
    --program /shared/bioinformatics/software/third_party/EIG-6.1.4/bin/smartpca.perl \
        -i study_and_1000g_combined_28_sample.bed \
        -a study_and_1000g_combined_28_sample.bim \
        -b final.fam \
        -o results/study_and_1000g_28_snp.pca \
        -p results/study_and_1000g_28_snp.plot \
        -e results/study_and_1000g_28_snp.eval \
        -l results/study_and_1000g_28_snp.pca.log \
        -m 0

## 1000g pca


In [None]:
cd /shared/sandbox/vidus_lambda_comparison/trouble_shooting/eigenstrat/results
# Get subject IDs by ancestry
# One output file per group: <code>_1000g_subject_IDs holds column 1 of every
# row whose column 7 equals the label paired with that code below.
while IFS=: read -r code label; do
    awk -v pop="${label}" 'BEGIN { FS="\t"; OFS="\t" } $7 == pop { print $1 }' \
        /shared/data/ref_panels/1000G/igsr_samples.tsv \
        > "${code}_1000g_subject_IDs"
done <<'EOF'
AFR:African
EAS:East Asian
EUR:European
EOF

In [1]:
# Load PCA data
options(stringsAsFactors=F)
lung.lymphoma <- read.table("/shared/sandbox/vidus_lambda_comparison/trouble_shooting/eigenstrat/results/study_and_1000g_10k_snp.pca.evec", comment.char="#", sep="", row.names=1)

# Function for plotting off-diagonal scatterplots
plot.offdiag <- function(x, y, colors=label.colors){
    #Make x and y axes equal
    limits <- c(min(x,y), max(x,y)) * 1.3
    par(usr = c(limits, limits))
    points(x,y, col=colors, pch=16)
}

# Function for plotting diagonal density plots
plot.diag <- function(x, colors=label.colors){
    d <- density(x, bw=0.01, kernel="gaussian")
    #Update plot axes limits
    par(usr = c(range(d$x), range(d$y)*1.3))
    points(d$x,d$y, type="l")
    #Add PC values
    points(x=x, y=jitter(rep(mean(d$y),times=d$n),factor=5), col=label.colors, pch=16)
}

# Keep the first 10 columns of the eigenvector table (no PCA is run here; the
# plot below labels the first four of them PC1-PC4)
dspace <- lung.lymphoma[1:10]

# 1000G subject ID lists, one ID per entry. They are read from the directory the
# ID files were written to, so the result does not depend on the R working directory.
id.dir <- "/shared/sandbox/vidus_lambda_comparison/trouble_shooting/eigenstrat/results"
AFR = scan(file=file.path(id.dir, "AFR_1000g_subject_IDs"), what=character())
EAS = scan(file=file.path(id.dir, "EAS_1000g_subject_IDs"), what=character())
EUR = scan(file=file.path(id.dir, "EUR_1000g_subject_IDs"), what=character())

# TRUE for every name that contains at least one of `ids`.
# An empty `ids` matches nothing: paste(character(0), collapse="|") is "", and
# the pattern "" would otherwise match (and recolor) every sample.
has.any.id <- function(ids, names){
    if (length(ids) == 0) {
        return(rep(FALSE, length(names)))
    }
    grepl(x=names, pattern=paste(ids, collapse="|"), ignore.case=FALSE)
}

#Color label samples (semi-transparent, alpha 80/255); later assignments override earlier ones
sample.names <- row.names(lung.lymphoma)
label.colors <- rep(NA, length(sample.names))
label.colors[grepl(x=sample.names, pattern="LLP|Mayo|MDA|Wayne", ignore.case=FALSE)] <- rgb(0,0,255,80, maxColorValue=255) #blue
label.colors[grepl(x=sample.names, pattern="TGS", ignore.case=FALSE)] <- rgb(255,0,0,80, maxColorValue=255) #red
label.colors[has.any.id(AFR, sample.names)] <- rgb(0,0, 0,80, maxColorValue=255) #black
label.colors[has.any.id(EAS, sample.names)] <- rgb(0,255,0,80, maxColorValue=255) #green
label.colors[has.any.id(EUR, sample.names)] <- rgb(255,215,0,80, maxColorValue=255) #gold
# Opaque versions of the same colors for the legend, in legend-label order
legend.cols <- c(rgb(0,0,255,255, maxColorValue=255), rgb(255,0,0,255, maxColorValue=255),
                 rgb(0,0,0,255, maxColorValue=255), rgb(0,255,0,255, maxColorValue=255),
                 rgb(255,215,0,255, maxColorValue=255))
#legend.cols <- c(rgb(0,0,255,255, maxColorValue=255),rgb(255,0,0,255, maxColorValue=255))

options(repr.plot.width=8, repr.plot.height=8)
#Redefine outer margin
par(oma = c(4, 1, 2, 1))
# Scatterplot matrix of the first four columns: scatter panels off the diagonal,
# density panels on it
pairs(x=dspace[,1:4], panel=plot.offdiag, diag.panel=plot.diag, 
      labels=paste0("PC",1:4), label.pos=0.9, cex.labels=1.8)
# Overlay an empty full-figure plot that carries only the title and the legend
par(fig=c(0, 1, 0, 1), oma=c(0, 0, 0, 0), mar=c(0, 0, 1.5, 0), new=TRUE)
plot(0, 0, type = "n", bty = "n", xaxt = "n", yaxt = "n", main="Lung Cancer vs Lymphoma vs 1000G", cex.main=1)
legend("bottom", c("Lung Cancer", "Lymphoma", "AFR", "EAS", "EUR"), 
       pch=19, col=legend.cols,
       xpd=TRUE, horiz=TRUE, inset=c(0,-0.02), 
       bty="n", cex=1)
#legend("bottom", c("Lung Cancer", "Lymphoma"), 
#       pch=19, col=legend.cols,
#       xpd=TRUE, horiz=TRUE, inset=c(0,-0.02), 
#       bty="n", cex=1)


"cannot open file '/shared/sandbox/vidus_lambda_comparison/trouble_shooting/eigenstrat/results/study_and_1000g_10k_snp.pca.evec': No such file or directory"

ERROR: Error in file(file, "rt"): cannot open the connection


## Eigenstrat with snpweightoutname
To determine which SNPs are highly associated with PC2, I run the smartpca program from EIGENSOFT with the additional parameter `snpweightoutname`. This parameter names an output file that contains the SNP weightings of each principal component.


The syntax of smartpca is `"../bin/smartpca -p parfile"`.  We illustrate
how parfile works via a toy example (see example.perl in this directory).
This example takes input in EIGENSTRAT format.  The syntax of how to take input
in other formats is analogous to the convertf program, see ../CONVERTF/README.

`
genotypename:     input genotype file (in any format: see ../CONVERTF/README)
snpname:          input snp file      (in any format: see ../CONVERTF/README)
indivname:        input indiv file    (in any format: see ../CONVERTF/README)
evecoutname:      output file of eigenvectors.  See numoutevec parameter below.
evaloutname:      output file of all eigenvalues
numoutlieriter:   maximum number of outlier removal iterations. Default is 5. 
                  To turn off outlier removal, set this parameter to 0.
snpweightoutname: output file containing SNP weightings of each principal component.
                  Note that this output file does not contain entries for
                  monomorphic SNPs from the input .snp file.
numthreads:       Multithreading option. Specify number of CPUs to use. Typically,
                  one will want to use N-1, where N is the number of CPUs of the 
                  compute nodes.
`

__parfile__

`genotypename:   ../phs000634_lung_cancer_vs_phs000801_lymphoma_ea_ld_pruned.bed
snpname:        ../phs000634_lung_cancer_vs_phs000801_lymphoma_ea_ld_pruned_id_renamed.bim
indivname:      ../phs000634_lung_cancer_vs_phs000801_lymphoma_ea_ld_pruned.fam
evecoutname:    results/phs000634_lung_cancer_vs_phs000801_lymphoma_ea_ld_pruned.pca
evaloutname:    results/phs000634_lung_cancer_vs_phs000801_lymphoma_ea_ld_pruned.plot
numoutlieriter: 0
snpweightoutname:       results/phs000634_lung_cancer_vs_phs000801_lymphoma_ea_ld_pruned.snp.weight
numthreads:     3
`

In [None]:
# EC2 command line #
workdir=/shared/sandbox/vidus_lambda_comparison/trouble_shooting/eigenstrat2
mkdir -p "${workdir}/results"
cd "${workdir}/"

# Study pair and ancestry group used to build the job name and script prefix
ancestry="ea"
study1=phs000634_lung_cancer
study2=phs000801_lymphoma

# Run smartpca: submit the binary through the qsub_job.sh wrapper with the
# parameter file ./parfile
smartpca=/shared/bioinformatics/software/third_party/EIG-6.1.4/bin/smartpca
/shared/bioinformatics/software/scripts/qsub_job.sh \
    --job_name "${study1}_${study2}_${ancestry}_smartpca" \
    --script_prefix "smartpca_${study1}_vs_${study2}_${ancestry}" \
    --mem 15 \
    --nslots 4 \
    --priority 0 \
    --program "${smartpca}" -p parfile
        


# Run GWAS

In [None]:
# EC2 command line #
cd /shared/sandbox/vidus_lambda_comparison
# -p keeps the cell re-runnable: plain mkdir errors out when the directory
# already exists from an earlier run
mkdir -p pairwise_gwas

# Run PLINK GWAS
ancestry="ea"
study1=phs000634_lung_cancer
study2=phs000801_lymphoma
pair="${study1}_vs_${study2}_${ancestry}"

# Submit GWAS to SGE job scheduler.
# PLINK runs --logistic on the eigenstrat/ binary fileset with the covariate
# file from phenotype/; job script and results go to pairwise_gwas/.
/shared/bioinformatics/software/scripts/qsub_job.sh \
    --mem 15 \
    --nslots 4 \
    --job_name "${pair}_gwas" \
    --script_prefix "pairwise_gwas/${pair}_gwas" \
    --priority 0 \
    --program /shared/bioinformatics/software/third_party/plink-1.90-beta-4.10-x86_64/plink \
        --memory 15000 \
        --noweb \
        --bfile "eigenstrat/${pair}" \
        --logistic \
        --covar "phenotype/${pair}_sex_age_ev.covariates" \
        --out "pairwise_gwas/${pair}_sex+age+evs"


ancestry="ea"
study1=phs000634_lung_cancer
study2=phs000801_lymphoma
pair="${study1}_vs_${study2}_${ancestry}"

# The output directory has to exist before the redirect below can create the
# reduced covariate file in it
mkdir -p pairwise_gwas_no2

# Copy the covariate file without its 6th tab-delimited field
# (cut keeps fields 1-5 and 7-14)
cut -f 1-5,7-14 "phenotype/${pair}_sex_age_ev.covariates" \
    > "pairwise_gwas_no2/${pair}_sex_age_ev.covariates"

# Submit GWAS to SGE job scheduler.
# Same model as the pairwise_gwas run, but with the reduced covariate file and
# restricted to chromosome 1 (--chr 1); results go to pairwise_gwas_no2/.
/shared/bioinformatics/software/scripts/qsub_job.sh \
    --mem 15 \
    --nslots 4 \
    --job_name "${pair}_gwas" \
    --script_prefix "pairwise_gwas_no2/${pair}_gwas" \
    --priority 0 \
    --program /shared/bioinformatics/software/third_party/plink-1.90-beta-4.10-x86_64/plink \
        --memory 15000 \
        --noweb \
        --bfile "eigenstrat/${pair}" \
        --chr 1 \
        --logistic \
        --covar "pairwise_gwas_no2/${pair}_sex_age_ev.covariates" \
        --out "pairwise_gwas_no2/${pair}_sex+age+evs"


### Results

In [None]:
# EC2 command line #
cd /shared/sandbox/vidus_lambda_comparison/pairwise_gwas

# Retrieve variant info
ancestry="ea"
study1=phs000634_lung_cancer
study2=phs000801_lymphoma
prefix="${study1}_vs_${study2}_${ancestry}_sex+age+evs"

# Print the header line of a PLINK .assoc.logistic file followed by every row
# whose 5th column (TEST) is exactly "ADD".
#   $1 - path to the .assoc.logistic file
# Done in one pass so the header cannot be overwritten by the data rows, and
# matched on the TEST column only so "ADD" inside another field is not picked up.
keep_additive_rows() {
    awk 'NR == 1 || $5 == "ADD"' "$1"
}

# Grab header and 'ADD' lines
keep_additive_rows "${prefix}.assoc.logistic" > "${prefix}_add_only.assoc.logistic"

# Make results table
outfile=${prefix}_add_only.assoc.logistic.table
infile=${prefix}_add_only.assoc.logistic
printf 'VARIANT_ID\tCHR\tPOSITION\tP\tTYPE\n' > "$outfile"
echo "Processing ${infile}"
# Skip the header, then emit SNP, CHR, BP and P (columns 2, 1, 3, 9) plus a type
# flag: "snp" when A1 (column 4) is a single A/C/G/T, otherwise "indel"
tail -n +2 "${infile}" | \
    perl -lane 'if (($F[3] eq "A" || $F[3] eq "C" || $F[3] eq "G" || $F[3] eq "T")) {
                    print join("\t",$F[1],$F[0],$F[2],$F[8],"snp");
                } else {
                    print join("\t",$F[1],$F[0],$F[2],$F[8],"indel");
                }' >> "$outfile"


### Create Manhattan and Q-Q plots

In [None]:
# EC2 command line #
cd /shared/sandbox/vidus_lambda_comparison/pairwise_gwas

# Submit plotting jobs
ancestry="ea"
study1=phs000634_lung_cancer
study2=phs000801_lymphoma
results_stem="${study1}_vs_${study2}_${ancestry}_sex+age+evs"

# Plot all chromosomes selected by `--in_chromosomes autosomal_nonPAR`:
# generate_gwas_plots.R reads the ADD-only results table (columns VARIANT_ID,
# CHR, POSITION, P, TYPE) and is asked for SNP/indel Manhattan and Q-Q plots,
# the latter with --qq_lambda.
/shared/bioinformatics/software/scripts/qsub_job.sh \
    --job_name gwas_plots \
    --script_prefix "${results_stem}.assoc.logistic.plot" \
    --mem 7 \
    --priority 0 \
    --nslots 2 \
    --program Rscript /shared/bioinformatics/software/R/generate_gwas_plots.R \
        --in "${results_stem}_add_only.assoc.logistic.table" \
        --in_chromosomes autosomal_nonPAR \
        --in_header \
        --out "${results_stem}_add_only.assoc.logistic.all_chr" \
        --col_id VARIANT_ID \
        --col_chromosome CHR \
        --col_position POSITION \
        --col_p P \
        --col_variant_type TYPE \
        --generate_snp_indel_manhattan_plot \
        --manhattan_odd_chr_color red3 \
        --manhattan_even_chr_color dodgerblue3 \
        --manhattan_points_cex 1.25 \
        --generate_snp_indel_qq_plot \
        --qq_lines \
        --qq_points_bg black \
        --qq_lambda

### Generate filtered results tables with allele frequencies

In [None]:
# EC2 command line #
cd /shared/sandbox/vidus_lambda_comparison/pairwise_gwas


ancestry="ea"
study1=phs000634_lung_cancer
study2=phs000801_lymphoma
pair="${study1}_vs_${study2}_${ancestry}"

# Allele frequency calculation (case/control stratified; writes ${pair}.frq.cc)
/shared/bioinformatics/software/third_party/plink-1.90-beta-4.10-x86_64/plink \
    --noweb \
    --bfile "../eigenstrat/${pair}" \
    --freq case-control \
    --make-bed \
    --out "${pair}"

# Print, tab-separated, every association row whose P value (column 9) is below
# the threshold, followed by columns 3, 5, 6, 7 and 8 of the frequency row that
# has the same variant ID (column 2).
#   $1 - frequency file (.frq.cc)
#   $2 - association results (.assoc.logistic)
#   $3 - P-value threshold (optional, default 0.00000005)
# Rows whose P is not a plain number (header line, "NA") are skipped. Fields are
# re-joined with tabs inside awk, so rows carry no trailing tab and the output
# ends with a newline regardless of leading whitespace in the input.
join_significant_with_freq() {
    awk -v threshold="${3:-0.00000005}" '
        BEGIN { OFS = "\t" }
        FILENAME == ARGV[1] { freq[$2] = $3 OFS $5 OFS $6 OFS $7 OFS $8; next }
        $9 ~ /^[0-9.]+([eE][-+]?[0-9]+)?$/ && $9 + 0 < threshold + 0 {
            $1 = $1    # rebuild the record so its fields are tab-separated
            print $0, freq[$2]
        }' "$1" "$2"
}

# P-value filter of PLINK regression results
sig_out="${pair}_sex+age+evs_pval_lt_0.00000005_add_only.freq.assoc.logistic"
printf 'CHR\tSNP_ID\tBP\tA1\tTEST\tN_OBS\tOR\tSTAT\tP\tA1\tA1_CASE_FRQ\tA1_CTRL_FRQ\tA1_CASE_N_OBS\tA1_CTRL_N_OBS\n' \
    > "$sig_out"
join_significant_with_freq \
    "${pair}.frq.cc" \
    "${pair}_sex+age+evs_add_only.assoc.logistic" \
    >> "$sig_out"

## Summary


In [None]:
# Run PLINK --freq once per study on its fileset under intersect/, writing the
# results to temp/<label>. Each entry is "<bfile name>:<output label>".
plink_bin=/shared/bioinformatics/software/third_party/plink-1.90-beta-4.10-x86_64/plink
sandbox=/shared/sandbox/vidus_lambda_comparison
for spec in "phs000801_lymphoma_ea:801" "phs000634_lung_cancer_ea:634"; do
    "${plink_bin}" \
        --noweb \
        --memory 3000 \
        --bfile "${sandbox}/intersect/${spec%%:*}" \
        --freq \
        --out "${sandbox}/temp/${spec##*:}"
done





# Read the two per-study tables, treating the first line of each as column names.
# NOTE(review): "801_maf" and "634_maf" are resolved against the R working
# directory, and the step that creates them is not shown here -- presumably they
# are derived from the PLINK --freq output of the two studies; confirm.
eight01 <- read.table(file="801_maf", header=TRUE)
six34 <- read.table(file="634_maf", header=TRUE)





