# Applying Nyholt correction to snp lookup for GSCAN
__Author:__ Jesse Marks

This documentation is relevant to [GitHub Issue #59](https://github.com/RTIInternational/bioinformatics/issues/59#issuecomment-355282624). To correct for multiple testing in the data, one can apply the [Nyholt Correction Method](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC1181954/). The Nyholt Correction Method is a simple correction for multiple testing for single-nucleotide polymorphisms in linkage disequilibrium with each other.

## Nyholt Correction Script
This script was acquired from Nathan Gaddis. The script is currently on MIDAS but will need to be transferred to EC2 at some point in the near future.

### 044

In [None]:
# EC2 command line ##
cd /home/ec2-user/nyholt/044

# Input: whitespace-delimited SNP table. The code below reads column 1 as the
# variant ID, column 2 as the chromosome and column 3 as the position.
inFile=/home/ec2-user/nyholt/044/044_snps.txt
# Output: list of phase 3 variant IDs. Every step below APPENDS to this file,
# so remove it first when re-running this cell.
outFile=/home/ec2-user/nyholt/044/044_snps.final
# Directory holding the per-chromosome 1000G Phase 3 legend files.
legendDir=/shared/data/ref_panels/1000G/2014.10

# Copy variants with phase 3 IDs (IDs containing ":") to final file
perl -lane 'if ($F[0] =~ /\:/) { print $F[0]; }' "$inFile" >> "$outFile"

# Convert non-phase 3 variant IDs to phase 3 IDs based on name.
# ID and chromosome are read from the same input row, so the chromosome can
# never come from an unrelated row that merely contains the ID as a substring.
perl -lane 'print "$F[0] $F[1]" if @F >= 2 && $F[0] !~ /\:/' "$inFile" |
while read -r variant chr; do
    gunzip -c "$legendDir/1000GP_Phase3_chr${chr}.legend.gz" |
        grep -- "${variant}:" |
        perl -lane 'print $F[0];'
done >> "$outFile"

# Convert non-phase 3 variant IDs to phase 3 IDs based on position.
# Chromosome and position are likewise taken from the same row; the previous
# substring grep for the position could match other positions or rs numbers
# and return several chromosomes at once.
perl -lane 'print "$F[1] $F[2]" if @F >= 3 && $F[0] !~ /\:/' "$inFile" |
while read -r chr position; do
    gunzip -c "$legendDir/1000GP_Phase3_chr${chr}.legend.gz" |
        grep -- ":${position}:"
done > /home/ec2-user/nyholt/044/044_variant_list.nonphase3.legend

# Keep only the first ID of each run of consecutive legend lines that share
# the same position (column 2).
perl -lane 'BEGIN { $lastPosition = 0 } if ($F[1] != $lastPosition) { print $F[0]; $lastPosition = $F[1]; }' \
    /home/ec2-user/nyholt/044/044_variant_list.nonphase3.legend >> "$outFile"


### START Extract replication SNPs from 1000G ###

# Extract variants from 1000G panel.
# One qsub_job.sh call per autosome (1-22): each call hands that chromosome's
# IMPUTE2 hap/legend files and the shared sample file to
# convert_reference_panels.pl, restricts it to the IDs in $outFile, and asks
# for PLINK ped and map output.
chr=1
while (( chr < 23 )); do
    panelPrefix=/shared/data/ref_panels/1000G/2014.10/1000GP_Phase3_chr${chr}
    jobArgs=(
        --job_name ALL_1000G
        --script_prefix "/home/ec2-user/nyholt/044/1000G_ALL.chr${chr}.extracted_variants"
        --mem 14
        --priority 0
        --program /shared/bioinformatics/software/perl/file_conversion/convert_reference_panels.pl
        --impute2_hap "${panelPrefix}.hap.gz"
        --impute2_legend "${panelPrefix}.legend.gz"
        --impute2_sample /shared/data/ref_panels/1000G/2014.10/1000GP_Phase3.sample
        --extract "$outFile"
        --out "1000G_ALL.chr${chr}.extracted_variants"
        --generate_plink_ped_file
        --generate_plink_map_file
        --chr "$chr"
    )
    /shared/bioinformatics/software/scripts/qsub_job.sh "${jobArgs[@]}"
    chr=$(( chr + 1 ))
done

### END Extract replication SNPs from 1000G ###


### START Generate correlation matrix ###

# For every autosome, run PLINK on the extracted ped/map fileset with
# --r --matrix and write the result under the ".r" output prefix. Paths are
# relative, so this must run from the study directory.
chr=1
while (( chr < 23 )); do
    prefix=1000G_ALL.chr${chr}.extracted_variants
    /shared/bioinformatics/software/third_party/plink-1.90-beta-4.10-x86_64/plink \
        --noweb \
        --file "${prefix}.plink" \
        --r \
        --matrix \
        --out "${prefix}.r"
    chr=$(( chr + 1 ))
done

# Retained (disabled) workaround from an earlier study (009):
# for chr 2, rs568413:21235475:T:C appears to be causing trouble so remove it and recalculate matrix
#echo rs568413:21235475:T:C > /data/common/data/studies/hiv/imputed/nyholt/009/rs568413
#/data/common/software/plink \
#    --noweb \
#    --file /data/common/data/studies/hiv/imputed/nyholt/009/1000G_ALL.chr2.extracted_variants.plink \
#    --exclude /data/common/data/studies/hiv/imputed/nyholt/009/rs568413 \
#    --recode \
#    --out /data/common/data/studies/hiv/imputed/nyholt/009/1000G_ALL.chr2.extracted_variants.minus_rs568413.plink
#/data/common/software/plink \
#    --noweb \
#    --file /data/common/data/studies/hiv/imputed/nyholt/009/1000G_ALL.chr2.extracted_variants.minus_rs568413.plink \
#    --r \
#    --matrix \
#    --out /data/common/data/studies/hiv/imputed/nyholt/009/1000G_ALL.chr2.extracted_variants.minus_rs568413.r

### END Generate correlation matrix ###


### START Run matSpD analysis ###

nyholtDir=/home/ec2-user/nyholt/044

# Run matSpDlite.R once per chromosome: the chromosome's PLINK matrix is
# copied to correlation.matrix, R is run in batch mode, and matSpDlite.out is
# moved back to the study directory under a per-chromosome name.
# The loop only starts if the cd succeeds, so R never runs in another directory.
cd "$nyholtDir/matSpD" &&
for chr in {1..22}; do
    echo "$chr"
    ldMatrix=../1000G_ALL.chr$chr.extracted_variants.r.ld
    result=../1000G_ALL.chr$chr.extracted_variants.matspdlite
    # If the matrix cannot be copied, correlation.matrix still holds the
    # previous chromosome's data; running R would file that chromosome's
    # result under this one. Skip instead, and drop any stale result file
    # (the mv below would have overwritten it anyway).
    if ! cp "$ldMatrix" correlation.matrix; then
        echo "WARNING: no correlation matrix for chr$chr; skipping matSpD" >&2
        rm -f "$result"
        continue
    fi
    R CMD BATCH matSpDlite.R
    mv matSpDlite.out "$result"
done

# Retained (disabled) chr2 workaround from an earlier study (009):
#cp /data/common/data/studies/hiv/imputed/nyholt/009/1000G_ALL.chr2.extracted_variants.minus_rs568413.r.ld \
#    /data/common/software/matSpD/correlation.matrix
#R CMD BATCH matSpDlite.R
#mv /data/common/software/matSpD/matSpDlite.out /data/common/data/studies/hiv/imputed/nyholt/009/1000G_ALL.chr2.extracted_variants.matspdlite

# Sum the per-chromosome values into veffLi. For each result file the value is
# taken from the line two below the line containing "Equation", with
# surrounding whitespace stripped.
cd "$nyholtDir"
veffLi=0
for chr in {1..22}; do
    chrVeffLi=$(grep -A 2 Equation "$nyholtDir/1000G_ALL.chr$chr.extracted_variants.matspdlite" |
                tail -n +3 | perl -pe 's/^\s+//; s/\s+$//;')
    echo "$chr"
    echo "$chrVeffLi"
    # Only add well-formed numbers: an empty or garbled value would make bc
    # fail and blank out the running total for all later chromosomes.
    if [[ "$chrVeffLi" =~ ^[0-9]+([.][0-9]+)?$ ]]; then
        veffLi=$(echo "$veffLi + $chrVeffLi" | bc)
    else
        echo "WARNING: no numeric value found for chr$chr; not added to total" >&2
    fi
done
echo "$veffLi"

# Recorded result for 044 (value noted after the echo above): 398

### END Run matSpD analysis ###


### 045

In [None]:
# EC2 command line ##
# The cd was previously fused into the comment line above and never executed;
# later steps of this cell use relative paths and need this working directory.
cd /home/ec2-user/nyholt/045

# Input: whitespace-delimited SNP table. The code below reads column 1 as the
# variant ID, column 2 as the chromosome and column 3 as the position.
inFile=/home/ec2-user/nyholt/045/045_snps.txt
# Output: list of phase 3 variant IDs. Every step below APPENDS to this file,
# so remove it first when re-running this cell.
outFile=/home/ec2-user/nyholt/045/045_snps.final
# Directory holding the per-chromosome 1000G Phase 3 legend files.
legendDir=/shared/data/ref_panels/1000G/2014.10

# Copy variants with phase 3 IDs (IDs containing ":") to final file
perl -lane 'if ($F[0] =~ /\:/) { print $F[0]; }' "$inFile" >> "$outFile"

# Convert non-phase 3 variant IDs to phase 3 IDs based on name.
# ID and chromosome are read from the same input row, so the chromosome can
# never come from an unrelated row that merely contains the ID as a substring.
perl -lane 'print "$F[0] $F[1]" if @F >= 2 && $F[0] !~ /\:/' "$inFile" |
while read -r variant chr; do
    gunzip -c "$legendDir/1000GP_Phase3_chr${chr}.legend.gz" |
        grep -- "${variant}:" |
        perl -lane 'print $F[0];'
done >> "$outFile"

# Convert non-phase 3 variant IDs to phase 3 IDs based on position.
# Chromosome and position are likewise taken from the same row; the previous
# substring grep for the position could match other positions or rs numbers
# and return several chromosomes at once.
perl -lane 'print "$F[1] $F[2]" if @F >= 3 && $F[0] !~ /\:/' "$inFile" |
while read -r chr position; do
    gunzip -c "$legendDir/1000GP_Phase3_chr${chr}.legend.gz" |
        grep -- ":${position}:"
done > /home/ec2-user/nyholt/045/045_variant_list.nonphase3.legend

# Keep only the first ID of each run of consecutive legend lines that share
# the same position (column 2).
perl -lane 'BEGIN { $lastPosition = 0 } if ($F[1] != $lastPosition) { print $F[0]; $lastPosition = $F[1]; }' \
    /home/ec2-user/nyholt/045/045_variant_list.nonphase3.legend >> "$outFile"


### START Extract replication SNPs from 1000G ###

# Extract variants from 1000G panel.
# One qsub_job.sh call per autosome (1-22), each with a chromosome-specific
# job name: the call hands that chromosome's IMPUTE2 hap/legend files and the
# shared sample file to convert_reference_panels.pl, restricts it to the IDs
# in $outFile, and asks for PLINK ped and map output.
chr=1
while (( chr < 23 )); do
    panelPrefix=/shared/data/ref_panels/1000G/2014.10/1000GP_Phase3_chr${chr}
    jobArgs=(
        --job_name "ALL_1000G_chr${chr}"
        --script_prefix "/home/ec2-user/nyholt/045/1000G_ALL.chr${chr}.extracted_variants"
        --mem 14
        --priority 0
        --program /shared/bioinformatics/software/perl/file_conversion/convert_reference_panels.pl
        --impute2_hap "${panelPrefix}.hap.gz"
        --impute2_legend "${panelPrefix}.legend.gz"
        --impute2_sample /shared/data/ref_panels/1000G/2014.10/1000GP_Phase3.sample
        --extract "$outFile"
        --out "1000G_ALL.chr${chr}.extracted_variants"
        --generate_plink_ped_file
        --generate_plink_map_file
        --chr "$chr"
    )
    /shared/bioinformatics/software/scripts/qsub_job.sh "${jobArgs[@]}"
    chr=$(( chr + 1 ))
done
### END Extract replication SNPs from 1000G ###


### START Generate correlation matrix ###

# For every autosome, run PLINK on the extracted ped/map fileset with
# --r --matrix and write the result under the ".r" output prefix. PLINK's
# standard output for all chromosomes is appended to cor_mat.log. Paths are
# relative, so this must run from the study directory.
touch cor_mat.log
chr=1
while (( chr < 23 )); do
    prefix=1000G_ALL.chr${chr}.extracted_variants
    /shared/bioinformatics/software/third_party/plink-1.90-beta-4.10-x86_64/plink \
        --noweb \
        --file "${prefix}.plink" \
        --r \
        --matrix \
        --out "${prefix}.r"
    chr=$(( chr + 1 ))
done >> cor_mat.log
### END Generate correlation matrix ###


### START Run matSpD analysis ###
nyholtDir=/home/ec2-user/nyholt/045

# Run matSpDlite.R once per chromosome: the chromosome's PLINK matrix is
# copied to correlation.matrix, R is run in batch mode, and matSpDlite.out is
# moved back to the study directory under a per-chromosome name.
# The loop only starts if the cd succeeds, so R never runs in another directory.
cd "$nyholtDir/matSpD" &&
for chr in {1..22}; do
    echo "$chr"
    ldMatrix=../1000G_ALL.chr$chr.extracted_variants.r.ld
    result=../1000G_ALL.chr$chr.extracted_variants.matspdlite
    # If the matrix cannot be copied, correlation.matrix still holds the
    # previous chromosome's data; running R would file that chromosome's
    # result under this one. Skip instead, and drop any stale result file
    # (the mv below would have overwritten it anyway).
    if ! cp "$ldMatrix" correlation.matrix; then
        echo "WARNING: no correlation matrix for chr$chr; skipping matSpD" >&2
        rm -f "$result"
        continue
    fi
    R CMD BATCH matSpDlite.R
    mv matSpDlite.out "$result"
done

# Sum the per-chromosome values into veffLi. For each result file the value is
# taken from the line two below the line containing "Equation", with
# surrounding whitespace stripped.
cd "$nyholtDir"
veffLi=0
for chr in {1..22}; do
    chrVeffLi=$(grep -A 2 Equation "$nyholtDir/1000G_ALL.chr$chr.extracted_variants.matspdlite" |
                tail -n +3 | perl -pe 's/^\s+//; s/\s+$//;')
    echo "$chr"
    echo "$chrVeffLi"
    # Only add well-formed numbers: an empty or garbled value would make bc
    # fail and blank out the running total for all later chromosomes.
    if [[ "$chrVeffLi" =~ ^[0-9]+([.][0-9]+)?$ ]]; then
        veffLi=$(echo "$veffLi + $chrVeffLi" | bc)
    else
        echo "WARNING: no numeric value found for chr$chr; not added to total" >&2
    fi
done
echo "$veffLi"

# Recorded result for 045 (value noted after the echo above): 409

### END Run matSpD analysis ###

### 046

In [None]:
# EC2 command line ##
# The cd was previously fused into the comment line above and never executed;
# later steps of this cell use relative paths and need this working directory.
cd /home/ec2-user/nyholt/046

# Input: whitespace-delimited SNP table. The code below reads column 1 as the
# variant ID, column 2 as the chromosome and column 3 as the position.
inFile=/home/ec2-user/nyholt/046/046_snps.txt
# Output: list of phase 3 variant IDs. Every step below APPENDS to this file,
# so remove it first when re-running this cell.
outFile=/home/ec2-user/nyholt/046/046_snps.final
# Directory holding the per-chromosome 1000G Phase 3 legend files.
legendDir=/shared/data/ref_panels/1000G/2014.10

# Copy variants with phase 3 IDs (IDs containing ":") to final file
perl -lane 'if ($F[0] =~ /\:/) { print $F[0]; }' "$inFile" >> "$outFile"

# Convert non-phase 3 variant IDs to phase 3 IDs based on name.
# ID and chromosome are read from the same input row, so the chromosome can
# never come from an unrelated row that merely contains the ID as a substring.
perl -lane 'print "$F[0] $F[1]" if @F >= 2 && $F[0] !~ /\:/' "$inFile" |
while read -r variant chr; do
    gunzip -c "$legendDir/1000GP_Phase3_chr${chr}.legend.gz" |
        grep -- "${variant}:" |
        perl -lane 'print $F[0];'
done >> "$outFile"

# Convert non-phase 3 variant IDs to phase 3 IDs based on position.
# Chromosome and position are likewise taken from the same row; the previous
# substring grep for the position could match other positions or rs numbers
# and return several chromosomes at once.
perl -lane 'print "$F[1] $F[2]" if @F >= 3 && $F[0] !~ /\:/' "$inFile" |
while read -r chr position; do
    gunzip -c "$legendDir/1000GP_Phase3_chr${chr}.legend.gz" |
        grep -- ":${position}:"
done > /home/ec2-user/nyholt/046/046_variant_list.nonphase3.legend

# Keep only the first ID of each run of consecutive legend lines that share
# the same position (column 2).
perl -lane 'BEGIN { $lastPosition = 0 } if ($F[1] != $lastPosition) { print $F[0]; $lastPosition = $F[1]; }' \
    /home/ec2-user/nyholt/046/046_variant_list.nonphase3.legend >> "$outFile"


### START Extract replication SNPs from 1000G ###

# Study identifier; used only to build a unique job name below.
study=046
# Extract variants from 1000G panel.
# One qsub_job.sh call per autosome (1-22): each call hands that chromosome's
# IMPUTE2 hap/legend files and the shared sample file to
# convert_reference_panels.pl, restricts it to the IDs in $outFile, and asks
# for PLINK ped and map output.
chr=1
while (( chr < 23 )); do
    panelPrefix=/shared/data/ref_panels/1000G/2014.10/1000GP_Phase3_chr${chr}
    jobArgs=(
        --job_name "ALL_1000G_chr${chr}_study${study}"
        --script_prefix "/home/ec2-user/nyholt/046/1000G_ALL.chr${chr}.extracted_variants"
        --mem 14
        --priority 0
        --program /shared/bioinformatics/software/perl/file_conversion/convert_reference_panels.pl
        --impute2_hap "${panelPrefix}.hap.gz"
        --impute2_legend "${panelPrefix}.legend.gz"
        --impute2_sample /shared/data/ref_panels/1000G/2014.10/1000GP_Phase3.sample
        --extract "$outFile"
        --out "1000G_ALL.chr${chr}.extracted_variants"
        --generate_plink_ped_file
        --generate_plink_map_file
        --chr "$chr"
    )
    /shared/bioinformatics/software/scripts/qsub_job.sh "${jobArgs[@]}"
    chr=$(( chr + 1 ))
done
### END Extract replication SNPs from 1000G ###

### START Generate correlation matrix ###

# For every autosome, run PLINK on the extracted ped/map fileset with
# --r --matrix and write the result under the ".r" output prefix. Paths are
# relative, so this must run from the study directory.
chr=1
while (( chr < 23 )); do
    prefix=1000G_ALL.chr${chr}.extracted_variants
    /shared/bioinformatics/software/third_party/plink-1.90-beta-4.10-x86_64/plink \
        --noweb \
        --file "${prefix}.plink" \
        --r \
        --matrix \
        --out "${prefix}.r"
    chr=$(( chr + 1 ))
done
### END Generate correlation matrix ###


### START Run matSpD analysis ###
nyholtDir=/home/ec2-user/nyholt/046

# Run matSpDlite.R once per chromosome: the chromosome's PLINK matrix is
# copied to correlation.matrix, R is run in batch mode, and matSpDlite.out is
# moved back to the study directory under a per-chromosome name.
# The loop only starts if the cd succeeds, so R never runs in another directory.
cd "$nyholtDir/matSpD" &&
for chr in {1..22}; do
    echo "$chr"
    ldMatrix=../1000G_ALL.chr$chr.extracted_variants.r.ld
    result=../1000G_ALL.chr$chr.extracted_variants.matspdlite
    # If the matrix cannot be copied, correlation.matrix still holds the
    # previous chromosome's data; running R would file that chromosome's
    # result under this one. Skip instead, and drop any stale result file
    # (the mv below would have overwritten it anyway).
    if ! cp "$ldMatrix" correlation.matrix; then
        echo "WARNING: no correlation matrix for chr$chr; skipping matSpD" >&2
        rm -f "$result"
        continue
    fi
    R CMD BATCH matSpDlite.R
    mv matSpDlite.out "$result"
done

# Sum the per-chromosome values into veffLi. For each result file the value is
# taken from the line two below the line containing "Equation", with
# surrounding whitespace stripped.
cd "$nyholtDir"
veffLi=0
for chr in {1..22}; do
    chrVeffLi=$(grep -A 2 Equation "$nyholtDir/1000G_ALL.chr$chr.extracted_variants.matspdlite" |
                tail -n +3 | perl -pe 's/^\s+//; s/\s+$//;')
    echo "$chr"
    echo "$chrVeffLi"
    # Only add well-formed numbers: an empty or garbled value would make bc
    # fail and blank out the running total for all later chromosomes.
    if [[ "$chrVeffLi" =~ ^[0-9]+([.][0-9]+)?$ ]]; then
        veffLi=$(echo "$veffLi + $chrVeffLi" | bc)
    else
        echo "WARNING: no numeric value found for chr$chr; not added to total" >&2
    fi
done
echo "$veffLi"

# Recorded result for 046: none noted yet (the original placeholder was empty).

### END Run matSpD analysis ###