# Wave 3 FTND (Nicotine Dependence) meta-analysis: 1000G phase 3
**Author:** Jesse Marks <br>
**Results**: The results of this meta-analysis are on AWS S3 at: `s3://rti-nd/gwas_meta/categorical_ftnd/results/1df/`<br>
**GitHub Issue:** [#55](https://github.com/RTIInternational/bioinformatics/issues/55)<br>
**Paper**: [B. Quach et al.](https://www.biorxiv.org/content/10.1101/2020.01.15.898858v1)

Here we perform two meta-analyses: (1) cross-ancestry and (2) EUR-specific. In both, we combine UK Biobank (UKB) results with our FTND results.
1. Cross-ancestry: UKB 33,791 current smokers and wave3 FTND N=58,000 subjects for a total of **N=91,791**.
2. EUR-specific: UKB N=31,854 current smokers and wave3 FTND N=46,213 subjects for a total of **N=78,067**.


**All FTND wave3 cohorts**
```
AAND_AFR
ADAA_AFR
COGEND2_AFR
COGEND2_EUR
COGEND_AFR
COGEND_EUR
COHRA1_EUR
COPDGENE1_AFR
COPDGENE1_EUR
COPDGENE2_AFR
COPDGENE2_EUR
DECODE_EUR
EAGLE_EUR
EMERGE_EUR
FINRISK_EUR
FTC_EUR
GAIN_AFR
GAIN_EUR
GERMAN_EUR
JHS_ARIC_AFR
MCTFR_EUR
NELSON_EUR
NONGAIN_EUR
NTR_EUR
S4S_EUR
SAGE_AFR
SAGE_EUR
UW_TTURC_AFR
UW_TTURC_EUR
YALE_PENN_AFR
YALE_PENN_EUR
```

**Data Locations**:<br>
FTND—`s3://rti-nd/gwas/`<br>
UKB cross-ancestry—`s3://rti-nd/gwas/uk_biobank/HSI`<br>
UKB EUR-specific—`s3://rti-nd/gwas/uk_biobank/GWA_003`

## Data Wrangle

In [None]:
# create dir structure
study_dir=/shared/jmarks/projects/nicotine/meta/data

# cohort/ancestry pairs that make up FTND wave 3 (one directory per pair)
cohort_ancestries=(
  aand/afr
  adaa/afr
  cogend2/afr cogend2/eur
  cogend/afr cogend/eur
  cohra1/eur
  copdgene1/afr copdgene1/eur
  copdgene2/afr copdgene2/eur
  decode/eur
  eagle/eur
  emerge/eur
  finrisk/eur
  ftc/eur
  gain/afr gain/eur
  german/eur
  jhs_aric/afr
  mctfr/eur
  nelson/eur
  nongain/eur
  ntr/eur
  s4s/eur
  sage/afr sage/eur
  uw_tturc/afr uw_tturc/eur
  yale_penn/afr yale_penn/eur
)

for pair in "${cohort_ancestries[@]}"; do
  mkdir -p "${study_dir}/${pair}"
done

mkdir -p "${study_dir}"/ukb/{eur,cross}


# Download data
# Only the cohort/ancestry pairs listed above are synced; combinations that
# are not part of wave 3 (e.g. aand/eur) are no longer requested from S3.
for pair in "${cohort_ancestries[@]}"; do
  cohort=${pair%/*}
  ancestry=${pair#*/}
  aws s3 sync \
    "s3://rti-nd/gwas/${cohort}/results/categorical_ftnd/0001/${ancestry}/final_stats/" \
    "${study_dir}/${cohort}/${ancestry}/"
done

aws s3 sync s3://rti-nd/gwas/uk_biobank/HSI/ "${study_dir}/ukb/cross/"
aws s3 sync s3://rti-nd/gwas/uk_biobank/GWA_003/ "${study_dir}/ukb/eur/"

## unzip (make sure you have enough storage!)
# Decompress everything under the data directory explicitly instead of
# whatever the current working directory happens to be. Runs in the
# background; wait for it to finish before starting the conversion steps.
gunzip -r "${study_dir}" &

In [None]:
## convert UKB to 1000G_phase3 IDs
# Cross-ancestry UKB results: rewrite the variant IDs of each autosome file
# using the matching 1000G phase 3 legend. The conversion only runs if the
# data directory can be entered, so a bad path cannot process the wrong files.
if cd /shared/jmarks/projects/nicotine/meta/data/ukb/cross; then
  for chr in {1..22}; do
    inF="ukb.hsi.sex.age.age2.agesex.age2sex.20evs.chr${chr}.maf_gt_0.01_ukb"
    outF="${inF}.1000g_p3"

    /shared/bioinformatics/software/perl/id_conversion/convert_to_1000g_p3_ids.pl \
      --file_in "$inF" \
      --file_out "$outF" \
      --legend "/shared/data/ref_panels/1000G/2014.10/1000GP_Phase3_chr${chr}.legend.gz" \
      --file_in_header 1 \
      --file_in_id_col 1 \
      --file_in_chr_col 0 \
      --file_in_pos_col 2 \
      --file_in_a1_col 3 \
      --file_in_a2_col 4 \
      --chr "$chr"
  done
else
  echo "cannot cd to /shared/jmarks/projects/nicotine/meta/data/ukb/cross" >&2
fi



# EUR
# EUR-specific UKB results: prepend the missing chromosome column and header,
# then submit one ID-conversion job per autosome. Nothing runs unless the data
# directory can be entered.
if cd /shared/jmarks/projects/nicotine/meta/data/ukb/eur/; then

  # add chr column and header (they are missing)
  # The header line is borrowed from the cross-ancestry chr22 file.
  head -n 1 ../cross/ukb.hsi.sex.age.age2.agesex.age2sex.20evs.chr22.maf_gt_0.01_ukb > head.txt
  for chr in {1..22}; do
    inF="ukb.hsi.sex.age.4evs.white.chr${chr}.1df.1df"
    # header first, then every input row prefixed with its chromosome number;
    # written in one step so no shared scratch file is needed
    {
      cat head.txt
      awk -v chr="$chr" '{print chr, $0}' "$inF"
    } > "${inF}.add_chr"
  done


  for chr in {1..22}; do
    inF="ukb.hsi.sex.age.4evs.white.chr${chr}.1df.1df.add_chr"
    outF="ukb.hsi.sex.age.4evs.white.chr${chr}.1df.1df.1000g_p3"

    /shared/bioinformatics/software/scripts/qsub_job.sh \
      --job_name "chr${chr}.convert" \
      --script_prefix "convert.chr${chr}.1000g_p3" \
      --mem 5 \
      --nslots 2 \
      --priority 0 \
      --program /shared/bioinformatics/software/perl/id_conversion/convert_to_1000g_p3_ids.pl \
        --file_in "$inF" \
        --file_out "$outF" \
        --legend "/shared/data/ref_panels/1000G/2014.10/1000GP_Phase3_chr${chr}.legend.gz" \
        --file_in_header 1 \
        --file_in_id_col 1 \
        --file_in_chr_col 0 \
        --file_in_pos_col 2 \
        --file_in_a1_col 3 \
        --file_in_a2_col 4 \
        --chr "$chr"
  done
else
  echo "cannot cd to /shared/jmarks/projects/nicotine/meta/data/ukb/eur/" >&2
fi

# remove intermediary files
# NOTE(review): the conversion jobs are submitted through qsub_job.sh and
# presumably run asynchronously -- only run this once they have finished,
# otherwise their *.add_chr inputs disappear underneath them.
for chr in {1..22}; do
  rm -- "ukb.hsi.sex.age.4evs.white.chr${chr}.1df.1df.add_chr"
done

# apply maf and r2 filters
# Keeps rows whose 7th column is > 0.01 and whose 9th column is > 0.3.
# A textual header line also passes, because awk falls back to string
# comparison for non-numeric fields and letters sort after digits.
# NOTE(review): this reads *.1000g_p3.gz, so the converted files are assumed
# to have been gzipped beforehand -- confirm.
for chr in {1..22}; do
  converted="ukb.hsi.sex.age.4evs.white.chr${chr}.1df.1df.1000g_p3"
  zcat "${converted}.gz" \
    | awk '$7 > 0.01 && $9 > 0.3 {print $0}' \
    > "${converted}.maf_gt_0.01.rsq_gt_0.3.txt"
done

# Results

## EUR-specific

In [1]:
# Load IRdisplay; the display_png() calls below (currently commented out)
# would show the EUR-specific Manhattan and QQ plot images inline.
library("IRdisplay")
#display_png(file="C:/Users/jmarks/OneDrive - Research Triangle Institute/Projects/nicotine/ftnd/meta/0004/nicotine_ftnd_wave3_gwas_meta_analysis_eur.snps+indels.manhattan.png", width=800, height=800)
#display_png(file="C:/Users/jmarks/OneDrive - Research Triangle Institute/Projects/nicotine/ftnd/meta/0004/nicotine_ftnd_wave3_gwas_meta_analysis_eur.snps+indels.qq.png", width=800, height=800)

# Code
Below are the methods to perform the meta-analysis in the AWS environment. The main parts that need to be modified when rerunning are the paths to the cohorts' data (these have already been modified here). For example, the R section at the end on lines 315 and 316.

Also note that we are converting everything to the 1000 Genome Phase 3 format and therefore we will include the script for performing this below.

## methods
Perform meta-analysis with METAL and produce the Manhattan and QQ plots.

In [None]:
### Author: Jesse Marks
### Date: April 30, 2020
### Description: Rerunning the FTND EUR-specific GWAS meta-analysis with the 
###              cohorts removed: Minnesota (MCTFR).
###
###

# Run configuration for the EUR-specific meta-analysis.
version="001"    # results sub-directory under results/eur
df="1df"         # "1df" or "2df"; selects the p-value/scheme names used later
filePrefix="nicotine_ftnd_wave3_gwas_meta_analysis_eur"
outDir="/shared/jmarks/projects/nicotine/meta/results/eur/$version"
dataDir="/shared/jmarks/projects/nicotine/meta/data"
gcList="" # function name of the study(s) for which to apply GC to. if none leave ""
minchr=1  # first chr in sequence
maxchr=22  # last chr in sequence

# processing/ holds intermediate files and job logs, final/ the merged results
mkdir -p "$outDir"/{processing,final}


### functions to create cohort paths. arg1="base/directory/path" arg2="chromosome#"
# Each function prints the path of that cohort's per-chromosome results file.
# The function name (upper-cased) doubles as the cohort label further down, so
# only the cohorts that take part in this analysis are left uncommented.
# The path is quoted so a base directory containing spaces or glob characters
# is printed unchanged.
#aand_afr() { echo $1/aand/afr/aand_cogend2.aa.1000G_p3.chr$2.CAT_FTND~SNP+SEX+AGE+EVs.maf_gt_0.01_afr+aand_cogend2+RSQ.phase3ID; }
#adaa_afr() { echo $1/adaa/afr/adaa.aa.1000G.chr$2.CAT_FTND~SNP+covar+EVs.maf_gt_0.01_afr+adaa+RSQ.phase3ID.flipped.quality_added; }
#cogend_afr() { echo $1/cogend/afr/cogend.aa.1000G_p3.chr$2.CAT_FTND~SNP+AGE_INT+SEX+EVs.maf_gt_0.01_afr+cogend+RSQ.phase3ID; }
cogend_eur() { echo "$1/cogend/eur/cogend.ea.1000G_p3.chr$2.CAT_FTND~SNP+AGE_INT+SEX+EVs.maf_gt_0.01_eur+cogend+RSQ.phase3ID"; }
#cogend2_afr() { echo $1/cogend2/afr/cogend2.aa.1000G_p3.chr$2.CAT_FTND~SNP+SEX+AGE+EVs.maf_gt_0.01_afr+cogend2+RSQ.phase3ID; }
cogend2_eur() { echo "$1/cogend2/eur/cogend2.ea.1000G_p3.chr$2.CAT_FTND~SNP+SEX+AGE+EVs.maf_gt_0.01_eur+cogend2+RSQ.phase3ID"; }
#copdgene1_afr() { echo $1/copdgene1/afr/copdgene1.aa.1000G_p3.chr$2.CAT_FTND~SNP+AGE+SEX+GOLD+EVs.maf_gt_0.01_afr+copdgene1+RSQ.phase3ID; }
copdgene1_eur() { echo "$1/copdgene1/eur/copdgene1.ea.1000G_p3.chr$2.CAT_FTND~SNP+AGE+SEX+GOLD+EVs.maf_gt_0.01_eur+copdgene1+RSQ.phase3ID"; }
#copdgene2_afr() { echo $1/copdgene2/afr/copdgene2.aa.1000G_p3.chr$2.CAT_FTND~SNP+AGE+SEX+GOLD+EVs.maf_gt_0.01_afr+copdgene2+RSQ.phase3ID; }
copdgene2_eur() { echo "$1/copdgene2/eur/copdgene2.ea.1000G_p3.chr$2.CAT_FTND~SNP+AGE+SEX+GOLD+EVs.maf_gt_0.01_eur+copdgene2+RSQ.phase3ID"; }
decode_eur() { echo "$1/decode/eur/decode.ea.1000G.chr$2.1df.maf_gt_0.01_eur_decode.quality_added"; }
dental_caries_eur() { echo "$1/dental_caries/eur/dental_caries.ea.1000G_p3.chr$2.CAT_FTND~SNP+SEX+AGE_INT+EVs.maf_gt_0.01_eur+dental_caries+RSQ.phase3ID"; }
eagle_eur() { echo "$1/eagle/eur/eagle.ea.1000G_p3.chr$2.CAT_FTND~SNP+AGE_INT+SEX+EVs.maf_gt_0.01_eur+eagle+RSQ.phase3ID"; }
emerge_eur() { echo "$1/emerge/eur/emerge.ea.1000G_p3.chr$2.CAT_FTND~SNP+AGE+SEX+PC3+PC7+PC6+PC9.maf_gt_0.01_subject+eur.rsq_gt_0.30.stats.alleles_switched"; }
finrisk_eur() { echo "$1/finrisk/eur/FINRISK.ea.1000G.chr$2.CAT_FTND~1df.maf_gt_0.01_eur_FINRISK_HWEfilter_RSQ.quality_added"; }
ftc_eur() { echo "$1/ftc/eur/finn_twin.ea.1000G.chr$2.CAT_FTND~1df.maf_gt_0.01_eur_finn_twin_exNA_RSQ.quality_added"; }
#gain_afr() { echo $1/gain/afr/schizo_gain.aa.1000G_p3.chr$2.CAT_FTND~SNP+SEX+AGE+EVs.maf_gt_0.01_afr+schizo_gain+RSQ.phase3ID; }
gain_eur() { echo "$1/gain/eur/schizo_gain.ea.1000G_p3.chr$2.CAT_FTND~SNP+SEX+AGE+EVs.maf_gt_0.01_eur+schizo_gain+RSQ.phase3ID"; }
german_eur() { echo "$1/german/eur/${2}_GermanCohort_nofilter_ftn_SNP_age_sex_5pca.maf_gt_0.01_eur+GermanCohort+exMAF0+RSQ.phase3ID"; }
#jhs_aric_afr() { echo $1/jhs_aric/afr/jhs_aric_aa.aa.1000G_p3.chr$2.CAT_FTND~SNP+AGE+SEX+EVs.maf_gt_0.01_afr+jhs_aric_aa+RSQ.phase3ID; }
#minnesota_twins_eur() { echo $1/minnesota_twins/eur/minnesota_twins.ea.1000G_p3.chr$2.CAT_FTND~1df.add.out.maf_gt_0.01_eur+minnesota_twins.RSQ.probabel.quality_added; }
nelson_eur() { echo "$1/nelson/eur/nelson.ea.1000G_p3.chr$2.CAT_FTND~SNP+AGE+SEX+EVs.maf_gt_0.01_subject+eur.RSQ.filtered.probabel.quality_added"; }
nongain_eur() { echo "$1/nongain/eur/schizo_nongain.ea.1000G_p3.chr$2.CAT_FTND~SNP+SEX+AGE+EVs.maf_gt_0.01_eur+schizo_nongain+RSQ.phase3ID"; }
ntr_eur() { echo "$1/ntr/eur/ntr.ea.1000G.chr$2.CAT_FTND~1df.maf_gt_0.01_eur_ntr_info.quality_added"; }
#sage_afr() { echo $1/sage/afr/sage.aa.1000G_p3.chr$2.CAT_FTND~SEX+AGE_INT+ALC_DEP+COC_DEP+EVs.maf_gt_0.01_afr+sage+RSQ.phase3ID; }
sage_eur() { echo "$1/sage/eur/sage.ea.1000G_p3.chr$2.CAT_FTND~SEX+AGE_INT+ALC_DEP+COC_DEP+EVs.maf_gt_0.01_eur+sage+RSQ.phase3ID"; }
s4s_eur() { echo "$1/s4s/eur/s4s.eur.1000G_p3.chr$2.CAT_FTND~1df.maf_gt_0.01.eur+s4s.RSQ.probabel.quality_added"; }
#uw_tturc_afr() { echo $1/uw_tturc/afr/uw-tturc.aa.1000G_p3.chr$2.CAT_FTND~SNP+SEX+AGE_INT+EVs.maf_gt_0.01_afr+uw-tturc+RSQ.phase3ID; }
uw_tturc_eur() { echo "$1/uw_tturc/eur/uw-tturc.ea.1000G_p3.chr$2.CAT_FTND~SNP+SEX+AGE_INT+EVs.maf_gt_0.01_eur+uw-tturc+RSQ.phase3ID"; }
#yale_penn_afr() { echo $1/yale_penn/afr/yale_penn.aa.1000G_p3.chr$2.FTND~SEX+AGE+EVs+CHIP_num.maf_gt_0.01_afr_yale_penn_exNA_RSQ.quality_added; }
yale_penn_eur() { echo "$1/yale_penn/eur/yale_penn.ea.1000G_p3.chr$2.FTND~SEX+AGE+EVs+CHIP_num.maf_gt_0.01_eur_yale_penn_exNA_RSQ.quality_added"; }


# Create capitalized cohort labels from the cohort path functions currently
# defined in this shell.
#
# Only functions whose name ends in _afr or _eur are treated as cohorts, so
# unrelated shell functions (an environment "module" function, or helpers such
# as string_fun / in_file that are defined further down and survive a re-run
# of this cell) can no longer end up in the label list.
#
# Outputs: the upper-cased function names, space separated, on stdout.
list_cohort_labels() {
  local fn
  local -a labels=()
  while IFS= read -r fn; do
    if [[ "$fn" == *_afr || "$fn" == *_eur ]]; then
      labels+=("${fn^^}")
    fi
  done < <(compgen -A function)
  echo "${labels[*]}"
}

cohortLabels=$(list_cohort_labels)
# Check the printed list: it must contain exactly the cohorts of this analysis.
echo "$cohortLabels"
#cohortLabels="COGEND2_EUR COGEND_EUR COPDGENE1_EUR COPDGENE2_EUR DECODE_EUR DENTAL_CARIES_EUR EAGLE_EUR EMERGE_EUR FINRISK_EUR FTC_EUR GAIN_EUR GERMAN_EUR NELSON_EUR NONGAIN_EUR NTR_EUR S4S_EUR SAGE_EUR UW_TTURC_EUR YALE_PENN_EUR"
#################################################################################################################
#################################################################################################################

##name    chrom   position        A1      A2      Freq1   MAF     Quality Rsq     n       Mean_predictor_allele   beta_SNP_addA1  sebeta_SNP_addA1      chi2_SNP_add     chi     p       or_95_percent_ci ##

# Pick the p-value column name (pv) and the METAL scheme name (std) that go
# with the chosen degrees of freedom. An unsupported or empty $df is reported
# instead of silently leaving pv/std unset.
case "${df:-}" in
  1df)
    pv="p"
    std="STDERR"
    ;;
  2df)
    pv="p_2df"
    std="INTERACTION"
    ;;
  *)
    echo "Unsupported df value '${df:-}' (expected 1df or 2df)" >&2
    ;;
esac



### START METAL analysis ###
#for (( chr=${minchr}; chr<=${maxchr}; chr++)); do
#    my_string=""
#    for cohort in ${cohortLabels}; do
#        if [[ $gcList !=  *"${cohort,,}"* ]]; then # if cohort not in GC list
#            string_fun() { echo "--PROCESS" $(${cohort,,} $1 $2); }
#            my_string+=$(string_fun $dataDir $chr)
#            my_string+=" "
#        fi
#    done
#
#    
#    if [ ${#gcList} != 0 ]; then # if there are cohorts to apply GC to
#        my_string+="--GENOMICCONTROL ON  "
#        for cohort in ${gcList}; do
#            string_fun() { echo "--PROCESS" $(${cohort,,} $1 $2); }
#            my_string+=$(string_fun $dataDir $chr)
#            my_string+=" "
#        done 
#    fi
#
#    bash ~/bin/qsub_metal.sh \
#		--chr $chr \
#		--script_prefix metal_chr${chr} \
#        --SCHEME $std \
#        --PVALUE $pv \
#        --MARKER name \
#        --ALLELE A1 A2 \
#        --EFFECT beta_SNP_add \
#        --STDERR sebeta_SNP_add \
#        --GENOMICCONTROL OFF \
#        ${my_string} \
#        --OUTFILE $outDir/processing/${filePrefix}.chr$chr.$df .metal \
#        --ANALYZE HETEROGENEITY \
#        --script_prefix $outDir/processing/${filePrefix}.chr$chr.$df \
#        --mem 3.8 \
#        --priority 0
#done 

###############################################################################
# remove singleton SNP/indels (SNPs found in only one cohort) ###
### python script ###

#minchr = 1 # starting chr
#maxchr = 23 # ending chr plus 1
#filePrefix= "nicotine_ftnd_wave3_gwas_meta_analysis_eur"
#outDir = "/shared/jmarks/projects/nicotine/meta/results/eur/001/processing" # no slash at end
#
#
#for chrom in range(minchr,maxchr):
#    file_name = "{}/nicotine_ftnd_wave3_gwas_meta_analysis_eur.chr{}.1df1.metal".format(outDir,chrom)
#    out_name = "{}/nicotine_ftnd_wave3_gwas_meta_analysis_eur_chr{}_exclude_singletons_stats.txt".format(outDir,chrom)
#
#    print("Processing chromosome " + str(chrom) + "...")
#    inF = open(file_name, "r")
#    outF = open(out_name, "w")
#    firstLine = inF.readline()
#    outF.write(firstLine)
#    line = inF.readline()
#    while(line):
#        tmp = line.split()
#        if(sum([d != "?" for d in tmp[6]]) > 1):
#            outF.write(line)
#        line = inF.readline()
#    inF.close()
#    outF.close()
## end python script
#################################################################################

## START Merge ProbABEL and METAL results ###
# For every chromosome, submit one job that merges the singleton-filtered
# METAL results with the per-cohort ProbABEL files.
# The cohort labels are split into an array once; each label, lower-cased, is
# the name of the cohort path function that yields that cohort's file. Using
# arrays keeps files and labels paired one-to-one and avoids redefining a
# global helper function on every iteration.
read -r -a merge_labels <<< "$cohortLabels"

for (( chr=${minchr}; chr<=${maxchr}; chr++ )); do
  probabel_files=()
  for cohort in "${merge_labels[@]}"; do
    probabel_files+=("$("${cohort,,}" "$dataDir" "$chr")")
  done

  /shared/bioinformatics/software/scripts/qsub_job.sh \
    --job_name "merge_meta_${chr}" \
    --script_prefix "$outDir/processing/${filePrefix}.chr$chr" \
    --mem 10 \
    --nslots 1 \
    --priority 0 \
    --program python ~/bin/merge_probabel_metal_v02.py \
        --metal "$outDir/processing/${filePrefix}_chr${chr}_exclude_singletons_stats.txt" \
        --probabel \
        "${probabel_files[@]}" \
        --probabel_labels "${merge_labels[@]}" \
        --out "$outDir/final/${filePrefix}_chr${chr}_exclude_singletons_stats_merged.txt"
done
### END Merge ProbABEL and METAL results ###

# Check for completion (should return/print out nothing)

# check_qsub_log <log file>
# Prints nothing when the log's last line ends in "Done". Otherwise prints
# "<file> missing", "<file> empty" (zero complete lines), or the file name
# followed by its last line. The file name is only ever used as data, never
# pasted into another program's source code.
check_qsub_log() {
  local file=$1
  local line_count last_line

  if [[ ! -f "$file" ]]; then
    echo "$file missing"
    return
  fi

  line_count=$(wc -l < "$file")
  if (( line_count == 0 )); then
    echo "$file empty"
    return
  fi

  last_line=$(tail -n 1 "$file")
  if [[ "$last_line" != *Done ]]; then
    printf '%s\n%s\n' "$file" "$last_line"
  fi
}

for (( chr=minchr; chr<=maxchr; chr++ )); do
  check_qsub_log "$outDir/processing/${filePrefix}.chr$chr.qsub.log"
done


### START Generate plots ###
# create SNP table for input to plotting script

# snp_table_rows <merged results file>
# Prints one tab-separated row (id, chr, position, p-value, type) per data
# line of the merged file: columns 1-3 and 8, plus "snp" when both alleles
# (columns 4 and 5) are a single a/c/g/t and "indel" otherwise.
# The header line is skipped. Lines with fewer than 8 columns are skipped too,
# rather than being emitted with values left over from an earlier line.
snp_table_rows() {
  awk 'BEGIN { OFS = "\t" }
       NR > 1 && NF >= 8 {
         type = ($4 ~ /^[acgt]$/ && $5 ~ /^[acgt]$/) ? "snp" : "indel"
         print $1, $2, $3, $8, type
       }' "$1"
}

outFile=$outDir/processing/${filePrefix}.table
printf 'VARIANT_ID\tCHR\tPOSITION\tP\tTYPE\n' > "$outFile"
for (( chr=minchr; chr<=maxchr; chr++ )); do
  inFile=$outDir/final/${filePrefix}_chr${chr}_exclude_singletons_stats_merged.txt
  echo "Processing $inFile"
  snp_table_rows "$inFile" >> "$outFile"
done


# generate plots
# Submit one job that runs generate_gwas_plots.R on the SNP table and writes
# the Manhattan and QQ plots under $outDir/final with the $filePrefix prefix.
# Paths are quoted so unusual directory names survive intact.
/shared/bioinformatics/software/scripts/qsub_job.sh \
    --job_name gwas_plots \
    --script_prefix "$outDir/processing/${filePrefix}.plots" \
    --mem 10 \
    --nslots 3 \
    --priority 0 \
    --program Rscript /shared/bioinformatics/software/R/generate_gwas_plots.R \
    --in "$outDir/processing/${filePrefix}.table" \
    --in_chromosomes autosomal_nonPAR \
    --in_header \
    --out "$outDir/final/${filePrefix}" \
    --col_id VARIANT_ID \
    --col_chromosome CHR \
    --col_position POSITION \
    --col_p P \
    --col_variant_type TYPE \
    --generate_snp_indel_manhattan_plot \
    --manhattan_odd_chr_color red \
    --manhattan_even_chr_color blue \
    --manhattan_points_cex 1.5 \
    --manhattan_cex_lab 1.5 \
    --manhattan_cex_axis 1.5 \
    --generate_snp_indel_qq_plot \
    --qq_lines \
    --qq_points_bg black \
    --qq_lambda

## plot with ylim
#/shared/bioinformatics/software/scripts/qsub_job.sh \
#  --job_name gwas_plots_no_singletons_ylim \
#  --script_prefix $outDir/processing/${filePrefix}.exclude_singletons.ylim_9.plots \
#  --mem 10 \
#  --nslots 3 \
#  --priority 0 \
#  --program Rscript /shared/bioinformatics/software/R/generate_gwas_plots.R \
#  --in $outDir/processing/${filePrefix}.exclude_singletons.table \
#  --in_chromosomes autosomal_nonPAR \
#  --in_header \
#  --out $outDir/processing/${filePrefix}.exclude_singletons.ylim_9 \
#  --col_id VARIANT_ID \
#  --col_chromosome CHR \
#  --col_position POSITION \
#  --col_p P \
#  --col_variant_type TYPE \
#  --generate_snp_indel_manhattan_plot \
#  --manhattan_odd_chr_color red \
#  --manhattan_even_chr_color blue \
#  --manhattan_points_cex 1.5 \
#  --manhattan_cex_lab 1.5 \
#  --manhattan_cex_axis 1.5 \
#  --generate_snp_indel_qq_plot \
#  --qq_lines \
#  --qq_points_bg black \
#  --qq_lambda \
#  --manhattan_ylim 9
### END Generate plots ###


#nicotine_ftnd_wave3_gwas_meta_analysis_eur_chr1_exclude_singletons_stats_merged.txt

### START Filter by p-value  < 0.001 ###
outFile=$outDir/processing/${filePrefix}_exclude_singletons.p_lte_0.001.csv
head -n 1 $outDir/final/${filePrefix}_chr1_exclude_singletons_stats_merged.txt > $outFile
for (( chr=${minchr}; chr<=${maxchr}; chr++ )); do
  inFile=$outDir/final/${filePrefix}_chr${chr}_exclude_singletons_stats_merged.txt
  echo Processing $inFile
  tail -n +2 $inFile |
    perl -lane 'if ($F[7] <= 0.001) { print; }' >> $outFile
done
### end p-value filtering ###


#### Sort p-value filtered file (Rscript) ###
#R
#
#outDir <- "/shared/jmarks/hiv/meta/results/024/v02"
#pfile <- "hiv_acq_cross_1df_meta_024_exclude_singletons.p_lte_0.001"
#inData <- read.table(paste(sep="", outDir, "/processing/", pfile), header = TRUE)
#outData <- inData[order(inData$P.value),]
#write.csv(outData, file=paste(sep="", outDir, "/final/", pfile, ".csv"), row.names = FALSE, quote=FALSE)
#### END Filter by p-value ###




####################################################################################################
### Create FUMA file ###
# Chr 
# Position
# MarkerName
# P-value
# Allele1
# Allele2
# Effect
# StdErr

#MarkerName chr position Allele1 Allele2 Effect StdErr P-value Direction HetISq HetChiSq HetDf HetPVal
base_dir=$outDir/final

# in_file <dir> <chr>
# Prints the path of this analysis's merged per-chromosome results file (the
# file written by the merge step above). The earlier version pointed at file
# names from an unrelated HIV project that do not exist under $outDir.
in_file() {
  echo "$1/${filePrefix}_chr${2}_exclude_singletons_stats_merged.txt"
}
outfile=$base_dir/${filePrefix}_exclude_singletons.ALL_CHR.FUMA
echo "$(in_file "$base_dir" 1)"

# print header to file
# Only the first eight columns are kept: MarkerName chr position Allele1
# Allele2 Effect StdErr P-value.
awk '{print $1,$2,$3,$4,$5,$6,$7,$8;exit}' \
    "$(in_file "$base_dir" 1)" > "$outfile"
for (( chr=minchr; chr<=maxchr; chr++ )); do
    echo "processing chr$chr"
    awk 'NR>=2{print $1,$2,$3,$4,$5,$6,$7,$8}' \
        "$(in_file "$base_dir" "$chr")" >> "$outfile"
done

### End FUMA creation ###



## Secondary scripts
These were scripts called in the methods file above.

### merge_metal

In [None]:
##
import gzip, sys, os, operator

# function parses input arguments
# required arguments: --metal
#                     --probabel
#                     --probabel_labels
#                     --out
def parseArguments(args):
        """Parse the command-line arguments (without the program name).

        --metal and --out each take one value; --probabel and
        --probabel_labels take every following argument up to the next one
        starting with '--'.

        Returns (metalFName, outFName, probabelFList, labelsList).
        Exits via sys.exit with a message when an argument is unknown, a
        required argument or its value is missing, or the number of probabel
        files differs from the number of labels.
        """
        metalFName = None
        outFName = None
        probabelFList = None
        labelsList = None

        while(len(args) > 0):
                flag = args[0]

                # single-valued arguments: metal input / merged output (full paths)
                if(flag == '--metal' or flag == '--out'):
                        if(len(args) < 2):
                                sys.exit('Missing value for argument: ' + flag)
                        if(flag == '--metal'):
                                metalFName = args[1]
                        else:
                                outFName = args[1]
                        args = args[2:]

                # list-valued arguments: probabel file names / probabel labels
                elif(flag == '--probabel' or flag == '--probabel_labels'):
                        args = args[1:]
                        values = []
                        while(len(args) > 0 and args[0][:2] != '--'):
                                values.append(args[0])
                                args = args[1:]
                        if(flag == '--probabel'):
                                probabelFList = values
                        else:
                                labelsList = values

                # other arguments unused
                else:
                        sys.exit('Unused arguments: ' + ' '.join(args))

        # check if missing required arguments
        if(metalFName is None or probabelFList is None or labelsList is None or outFName is None):
                sys.exit('Missing at least one required argument. Please specify --metal, --probabel, --probabel_labels, --out')

        # check if length of probabel files equals length of probabel labels
        if(len(probabelFList) != len(labelsList)):
                sys.exit('Length of probabel files does not match length of probabel labels.')

        return (metalFName, outFName, probabelFList, labelsList)

# function sort the input probabel file by position
def maybeSort(fname):
        """Return the name of a position-sorted copy of fname.

        A name already ending in '.sorted' is returned unchanged, and an
        existing '<fname>.sorted' is reused. Otherwise the external `sort`
        program sorts the whole file (header line included) numerically on
        the 'position'/'pos' column into '<fname>.sorted'.

        Exits via sys.exit when the header has no position column or the
        sort fails; a partial output file is removed so that a later run
        does not pick it up as already sorted.
        """
        # local import: subprocess is not among the file-level imports
        import subprocess

        if(fname[-7:] == '.sorted'):
                return fname

        sortedFName = fname + '.sorted'
        # check if a sorted file is available
        if(os.path.exists(sortedFName)):
                print('Using already sorted file: ' + sortedFName)
                return sortedFName

        # find the (1-based) position column from the header
        with open(fname, 'r') as inF:
                header = inF.readline().split()
        if('position' in header):
                posIndex = str(header.index('position') + 1)
        elif('pos' in header):
                posIndex = str(header.index('pos') + 1)
        else:
                sys.exit('No position/pos column found in header of ' + fname)

        # sort by position; the argument list keeps file names with spaces or
        # shell metacharacters intact
        print('Sorting file: ' + fname)
        with open(sortedFName, 'w') as outF:
                status = subprocess.call(['sort', '-k' + posIndex + 'n,' + posIndex, fname], stdout=outF)
        if(status != 0):
                os.remove(sortedFName)
                sys.exit('Sorting failed for ' + fname)
        return sortedFName

# one marker represents a row,
# use .output() function to write to file
class Marker:
        """One row of the METAL results, keyed by marker name.

        chr and pos start as '.' and are filled in from the cohort files via
        addChrPos(); content holds the remaining METAL columns.
        """
        def __init__(self, line):
                # line: one whitespace-separated METAL row, marker name first
                tmp = line.split()
                self.name = tmp[0]
                self.chr = '.'
                self.pos = '.'
                self.content = tmp[1:]
                # number of METAL columns after the marker name
                self.length = len(self.content)

        def addChrPos(self, line, chrIndex, posIndex):
                """Record chromosome and position taken from a cohort row.

                line is a whitespace-separated row; chrIndex and posIndex are
                the 0-based columns to read. The first values seen are kept;
                later values that disagree only trigger a printed warning.
                """
                tmp = line.split()
                newChr = tmp[chrIndex]
                newPos = tmp[posIndex]

                if(self.chr == '.'):
                        self.chr = newChr
                elif(self.chr != newChr):
                        print('Chromosome for marker ' + self.name + ' does not match.')

                if(self.pos == '.'):
                        self.pos = newPos
                elif(self.pos != newPos):
                        print('Position for marker ' + self.name + ' does not match.')

        def output(self):
                """Return 'name chr pos' followed by the METAL columns, space separated."""
                return(' '.join([self.name, self.chr, self.pos]) + ' ' + ' '.join(self.content))
# extract chr and pos information from probabel file, save to metalDict
def extractNameChrPos(metalDict, probabelFName):
        """Copy chromosome/position from a cohort file onto the METAL markers.

        metalDict maps marker name -> object with an
        addChrPos(line, chrIndex, posIndex) method; it is updated for every
        row of probabelFName whose marker name is a key of metalDict.

        Returns the set of all marker names found in the file.
        Exits via sys.exit when the header lacks a name, chromosome or
        position column, instead of carrying on with index -1 (which would
        silently read the last column of every row).
        """
        with open(probabelFName, 'r') as probabelF:
                header = probabelF.readline().split()

                # index of the first accepted column name present in the header, else -1
                def columnIndex(candidates):
                        for candidate in candidates:
                                if(candidate in header):
                                        return header.index(candidate)
                        return -1

                nameIndex = columnIndex(['name', 'marker_name', 'markername'])
                chrIndex = columnIndex(['chrom', 'chr'])
                posIndex = columnIndex(['position', 'pos'])

                if(nameIndex == -1 or chrIndex == -1 or posIndex == -1):
                        sys.exit('Missing name/chr/position column in ' + probabelFName + ': ' + ' '.join(header))

                tmpSet = set()
                for line in probabelF:
                        tmp = line.split()
                        if(len(tmp) == 0):
                                # skip blank lines
                                continue
                        markerName = tmp[nameIndex]
                        tmpSet.add(markerName)
                        if(markerName in metalDict):
                                metalDict[markerName].addChrPos(line, chrIndex, posIndex)
        return tmpSet

# load the METAL results into the dictionary
def addMetalToDict(d, fname):
        """Fill d with one Marker per METAL row and collect the p-values.

        fname is read through gzip when its name ends in 'gz', otherwise as a
        plain file. Every data row is stored as d[marker name] = Marker(row).

        Returns a dict mapping marker name -> float value of the 'P-value'
        column.
        """
        opener = gzip.open if fname[-2:] == 'gz' else open
        inF = opener(fname, 'r')

        pValues = {}
        # column holding the meta-analysis p-value, located from the header
        pColumn = inF.readline().split().index('P-value')

        for row in inF.readlines():
                fields = row.split()
                name = fields[0]
                d[name] = Marker(row)
                pValues[name] = float(fields[pColumn])

        inF.close()

        return pValues

# read the header line of every probabel stream
def extractHeaders(probabelStream):
        """Return the next line read from each stream, in stream order.

        Each stream is advanced by exactly one readline() call.
        """
        return [stream.readline() for stream in probabelStream]

# build the merged-file header from the metal header and the probabel headers
def constructHeader(metalFName, probabelHeaders, labelsList):
        """Return the space-joined header line of the merged output.

        Takes the first line of the METAL file (read through gzip when the
        name ends in 'gz'), inserts 'chr' and 'position' after its first
        column, then appends every cohort column as '<column>.<label>',
        leaving out the marker-name, chromosome and position columns.
        probabelHeaders[i] is the header line that belongs to labelsList[i].
        """
        idColumns = ['name', 'marker_name', 'markername', 'chrom', 'chr', 'position', 'pos']

        metalF = gzip.open(metalFName, 'r') if metalFName[-2:] == 'gz' else open(metalFName, 'r')
        columns = metalF.readline().split()
        metalF.close()

        # two inserts at index 1: 'position' first, so 'chr' ends up before it
        columns.insert(1, 'position')
        columns.insert(1, 'chr')

        for i, label in enumerate(labelsList):
                columns.extend(name + '.' + label
                               for name in probabelHeaders[i].split()
                               if name not in idColumns)
        return ' '.join(columns)

# locate the marker-name and position columns in every probabel header
def extractNameIndexPosIndex(probabelHeaders):
        """Return (nameIndex, posIndex, lengths), one entry per header line.

        nameIndex: 0-based column of 'name', 'marker_name' or 'markername'
                   (first one present, in that order), or -1
        posIndex:  0-based column of 'position' or 'pos', or -1
        lengths:   number of columns that follow the position column
        """
        def locate(fields, aliases):
                # index of the first alias present in fields, else -1
                for alias in aliases:
                        if alias in fields:
                                return fields.index(alias)
                return -1

        nameIndex, posIndex, lengths = [], [], []
        for headerLine in probabelHeaders:
                fields = headerLine.split()
                nameIndex.append(locate(fields, ('name', 'marker_name', 'markername')))
                posAt = locate(fields, ('position', 'pos'))
                posIndex.append(posAt)
                lengths.append(len(fields) - posAt - 1)
        return (nameIndex, posIndex, lengths)

# Build one merged output line for markerName: the METAL part of the row
# followed, for every cohort, by that cohort's columns after its position
# column, or by 'NA' placeholders when the cohort does not have the marker.
#
# markerName       key into metalDict
# metalDict        marker name -> object providing .name and .output()
# probabelStream   list of open cohort files, read forward with readline()
# markerNames      per-cohort collections of marker names (tested with `in`)
# buffer           per-cohort dicts (marker name -> columns) holding rows that
#                  were read before the marker they belong to was requested
# probabelHeaders  per-cohort header lines
# Returns the assembled line as a space-separated string.
def processThisMarker(markerName, metalDict, probabelStream, markerNames, buffer, probabelHeaders):
        # per-cohort name column, position column and number of trailing columns
        nameIndex, posIndex, lengths = extractNameIndexPosIndex(probabelHeaders)
        # content from metal
        thisMarker = metalDict[markerName]
        line = thisMarker.output() # metal content added
        for i in range(len(probabelStream)):
                if(thisMarker.name in markerNames[i]):
                        # first check buffer
                        if(buffer[i].get(thisMarker.name, 'NA') != 'NA'):
                                line = line + ' ' + ' '.join(buffer[i][thisMarker.name])
                                # each buffered row is used once, then dropped
                                del buffer[i][thisMarker.name]
                        # not in buffer, then read in from stream
                        else:
                                newLine = probabelStream[i].readline()
                                tmp = newLine.split()
                                # if the next row is not the one we are looking for, deposit it in the buffer
                                while(newLine and tmp[nameIndex[i]] != thisMarker.name):
                                        buffer[i][tmp[nameIndex[i]]] = tmp[posIndex[i]+1:]
                                        newLine = probabelStream[i].readline()
                                        tmp = newLine.split()

                                # now this is the marker
                                # NOTE(review): if the stream ran out before the marker was
                                # found, tmp is empty here and no columns are appended
                                line = line + ' ' + ' '.join(tmp[posIndex[i]+1:])
                else:
                        # cohort lacks this marker: pad with one NA per cohort column
                        for j in range(lengths[i]):
                                line = line + ' NA'
        return line

def checkOutput(outFName):
        """Check that every row of outFName has as many whitespace-separated columns as its first line.

        Returns the string "Succeed" when all rows match (including when the
        file has no rows after the first line), otherwise "Fail".
        """
        with open(outFName) as handle:
                # the first line defines the expected column count
                expected = len(handle.readline().split())
                consistent = all(len(row.split()) == expected for row in handle)
        if consistent:
                return "Succeed"
        return "Fail"

# the main function
if __name__ == "__main__":
        # Script entry point: merge the metal results with the per-study probabel
        # results into one table, written to outFName in position order.
        metalFName, outFName, probabelFList, labelsList = parseArguments(sys.argv[1:])

        # from here on, use whatever file name maybeSort hands back for each input
        probabelFList = [maybeSort(probabelFName) for probabelFName in probabelFList]

        # read in metal file
        # construct a dictionary to hold all metal results
        # key is the marker name, value is a Marker class object
        metalDict = {}
        markerP = addMetalToDict(metalDict, metalFName) # markerP - meta analyses p-value

        # first loop through all probabel files to add chr and position information
        # at the same time, keep a record of all marker names (one collection per file)
        markerNames = [extractNameChrPos(metalDict, probabelFName) for probabelFName in probabelFList]

        # sort marker by position
        # markers whose position cannot be converted to an integer are deliberately
        # left out of markerPos, and therefore out of the output
        markerPos = {}
        for m in metalDict.keys():
                try:
                        markerPos[m] = int(metalDict[m].pos)
                except Exception:
                        pass
        sortedMarker = sorted(markerPos.items(), key=operator.itemgetter(1))

        # loop through all probabel files second times
        # this time, keep the input stream
        probabelStream = [open(probabelFName) for probabelFName in probabelFList]
        try:
                # Write the output, then verify its column counts; rewrite for as long
                # as the check reports 'Fail' (the first pass always runs).
                # NOTE(review): the streams are opened once and are not reopened between
                # passes, so unless extractHeaders rewinds them a second pass continues
                # from wherever the previous one stopped reading - confirm this is intended.
                while True:
                        with open(outFName, 'w') as outF:
                                # header - first line from stream read
                                probabelHeaders = extractHeaders(probabelStream)
                                outF.write(constructHeader(metalFName, probabelHeaders, labelsList) + '\n')

                                # content - read subsequent lines
                                # a buffer stores markers that are out of order (one dict per file)
                                buffer = [{} for _ in probabelHeaders]
                                for m in sortedMarker:
                                        line = processThisMarker(m[0], metalDict, probabelStream, markerNames, buffer, probabelHeaders)
                                        outF.write(line + '\n')

                        # check output number of columns
                        if checkOutput(outFName) != 'Fail':
                                break
        finally:
                # release the input files no matter how the loop ended
                for stream in probabelStream:
                        stream.close()

        # parenthesised form prints the same text under Python 2 and Python 3
        print('Done')

### qsub_metal_v2.sh
I had to modify lines 70 and 71 of the script to match the parallel-processing (multi-threading) environment available on AWS: symmetric multiprocessing (smp).

In smp, the operating system presents all of the cores as a single machine.

In [None]:
#!/bin/sh
#
# qsub_metal_v2.sh - turn command-line options into a METAL command script,
# wrap it in a grid-engine job script, and submit that job with qsub.
#
# Options copied into the METAL script (with the leading "--" removed):
#   --MARKER | --EFFECT | --PVALUE | --WEIGHT | --GENOMICCONTROL |
#   --PROCESS | --SCHEME | --STDERR  <value>
#   --ALLELE  <value> <value>
#   --OUTFILE <prefix> <suffix>
#   --ANALYZE [HETEROGENEITY]
# Options for this wrapper:
#   --script_prefix <prefix>  prefix of the generated .metal.sh / .metal.qsub.* files
#   --mem <GB>                mem_free request (default 3.8); h_vmem is mem + 2
#   --priority <n>            value handed to qsub -p (default 0)
# Any other argument is ignored; parsing stops at the first empty argument.
#
# Only POSIX sh constructs are used so the script behaves the same whether
# /bin/sh is bash or another shell.

metalScript="#!/shared/bioinformatics/software/third_party/metal_v3.25.2011/bin/metal\n\n"
outFile=""
finalOutFile=""
scriptPrefix=""
mem=3.8
priority=0

while [ "$1" != "" ]; do
        # number of positional parameters used up by the current option
        consumed=1
        case "$1" in
                --MARKER | --EFFECT | --PVALUE | --WEIGHT | --GENOMICCONTROL | --PROCESS | --SCHEME | --STDERR)
                        metalScript="${metalScript}${1#--} $2\n"
                        consumed=2
                        ;;
                --ALLELE | --OUTFILE)
                        metalScript="${metalScript}${1#--} $2 $3\n"
                        if [ "$1" = "--OUTFILE" ]; then
                                # the job later renames <prefix>1<suffix> to <prefix><suffix>
                                outFile="${2}1${3}"
                                finalOutFile="${2}${3}"
                        fi
                        consumed=3
                        ;;
                --ANALYZE)
                        if [ "$2" = "HETEROGENEITY" ]; then
                                metalScript="${metalScript}ANALYZE HETEROGENEITY\n"
                                consumed=2
                        else
                                metalScript="${metalScript}ANALYZE\n"
                        fi
                        ;;
                --script_prefix)
                        scriptPrefix=$2
                        consumed=2
                        ;;
                --mem)
                        mem=$2
                        consumed=2
                        ;;
                --priority)
                        priority=$2
                        consumed=2
                        ;;
        esac
        # never shift past the end: that is a fatal error in some shells
        [ "$consumed" -le "$#" ] || consumed=$#
        shift "$consumed"
done
metalScript="${metalScript}QUIT\n"

# %b expands the literal "\n" sequences collected above; quoting keeps the
# option values free of word splitting and glob expansion
fileMetalScript="${scriptPrefix}.metal.sh"
printf '%b\n' "$metalScript" > "$fileMetalScript"
chmod 775 "$fileMetalScript"

fileQsubScript="${scriptPrefix}.metal.qsub.sh"
fileQsubScriptLog="${scriptPrefix}.metal.qsub.log"
fileQsubScriptError="${scriptPrefix}.metal.qsub.error"

# Job script: "#$" lines are grid-engine directives. "\$" keeps a literal
# dollar sign; the unescaped variables are filled in now, at generation time.
cat > "$fileQsubScript" <<EOF
#\$ -S /bin/bash
#\$ -v LD_LIBRARY_PATH
#
#
#\$ -l h_rt=600:00:00
#
#
# Job name
#\$ -N METAL
#
# Use current working directory
#\$ -cwd
#
# Join stdout and stderr
#\$ -j y
#
# pe request for MPICH2. Set your number of processors here.
# SMP stands for Shared Memory Jobs
#\$ -pe smp 1
#
# Run job through bash shell
#\$ -S /bin/bash
#
# The following is for reporting only. It is not really needed
# to run the job. It will show up in your output file.
echo "Got \$NSLOTS processors."
$fileMetalScript
mv $outFile $finalOutFile
mv $outFile.info $finalOutFile.info
EOF

chmod 775 "$fileQsubScript"

# bc is used because mem may be fractional (the default is 3.8)
vmem=$(echo "$mem + 2" | bc)

# NOTE(review): h_rt=24:00:00 is requested here while the job script embeds
# h_rt=600:00:00; command-line requests normally take precedence over embedded
# directives - confirm which limit is intended.
qsub -q all.q -l "mem_free=${mem}G,h_vmem=${vmem}G,h_rt=24:00:00" -p "$priority" -o "$fileQsubScriptLog" -e "$fileQsubScriptError" "$fileQsubScript"


# Sandbox

In [None]:
# Label every data row of the chr9 stats file as "snp" or "indel".
# Output is tab-separated: fields 1, 2, 3 and 8 of the input row plus the label.
# A row is "snp" only when fields 2 and 3 are each a single lowercase a/c/g/t.
inFile=ftnd3_ukbiobank_gwas_meta_afr_eur_chr9_exclude_singletons_stats.txt
# tail -n +2 drops the header line
tail -n +2 "$inFile" |
    perl -lne '
        # copy the captures right away; rows with fewer than 8 fields are reported
        # and skipped instead of being printed with missing or stale values
        my ($id, $a1, $a2, $val) = /^(\S+)\s+(\S+)\s+(\S+)\s+\S+\s+\S+\s+\S+\s+\S+\s+(\S+)/
            or do { warn "skipping malformed data line $.\n"; next };
        my $type = ($a1 =~ /\A[acgt]\z/ && $a2 =~ /\A[acgt]\z/) ? "snp" : "indel";
        print join("\t", $id, $a1, $a2, $val, $type);
    '