# [FTND] Minnesota 1df processing
**Author**: Jesse Marks

**Date**: July 27, 2018

The following email was sent to Jesse Marks from John Guo
on 20180717. The email subject line reads:
"[FTND] - Minnesota 1df processing"

The new Minnesota results are available at

https://s3.console.aws.amazon.com/s3/buckets/rti-uploads/hyoung/?region=us-east-1&tab=overview



I used script on MIDAS to split the file by chromosome:

/share/nas04/bioinformatics_group/data/studies/minnesota_twins/raw_results/process_raw.20180611.py



Then performed ID conversion, filtering, and plotting using script here:

/share/nas04/bioinformatics_group/data/studies/minnesota_twins/imputed/v1/association_tests/002/_methods.minnesota_twin.imputed.assoc_tests.001.sh



Could you please move the files to an appropriate location on S3 and process it on AWS?

Please let me know if you have any questions.


## Download Data From S3

In [None]:
## EC2 console ##
# Download the uploaded Minnesota results from S3 into the study's
# raw_results directory.
raw_dir=/shared/data/studies/minnesota_twins/raw_results

# Chain with && so the sync only runs once we are inside the target
# directory; otherwise the files would land wherever the shell happens to be.
mkdir -p "$raw_dir" &&
  cd "$raw_dir" &&
  aws s3 sync s3://rti-uploads/hyoung/ .

## Split up by chromosome
These GWAS results are all combined in one file. We will split them apart by chromosome first.

In [None]:
import gzip,sys
import os

# Split the combined, gzipped GWAS results into one tab-delimited file per
# chromosome. Every output row is prefixed with a "Marker" column built as
# CHR:POSITION from the first two columns of the input row.
#
# Written for Python 2, like the original cell: the lines returned by
# gzip.open() are compared against str literals.
BASE_DIR = '/shared/data/studies/minnesota_twins/'
for ethnicity in ['ea']:
    inF = gzip.open(BASE_DIR + 'raw_results/MCTFR_FTND.MetaScore.assoc.gz')
    outF = None
    try:
        # Skip the leading "##" lines. The first line that does not start
        # with two pound signs is treated as the header (one pound sign,
        # then CHROM ...).
        line = inF.readline()
        while line[:2] == "##":
            line = inF.readline()

        # Drop the leading pound sign and split the header into column names.
        header = line[1:].split()
        chrIndex = header.index('CHROM')
        lastChr = ''

        line = inF.readline()  # first data row
        while line:  # readline() returns an empty string at end of file
            fields = line.split()
            chrom = fields[chrIndex]
            # lastChr starts out empty, so the first data row always opens a file.
            if chrom != lastChr:
                # Finish the previous chromosome before starting the next one,
                # so its buffered rows are flushed to disk.
                if outF is not None:
                    outF.close()
                fname = 'minnesota_twins.' + ethnicity + '.1000G.chr' + chrom + '.' + 'CAT_FTND~1df_add.out.txt'
                outDir = BASE_DIR + 'imputed/v1/association_tests/' + '001/' + ethnicity + '/' + 'processing/chr' + chrom + '/'
                # Create the directory here so this cell does not depend on a
                # later cell having been run first.
                if not os.path.isdir(outDir):
                    os.makedirs(outDir)
                # Opened in 'w' mode: rows must be grouped by chromosome, since
                # a chromosome that reappears later would overwrite its file.
                outF = open(outDir + fname, 'w')
                # Header row: the new Marker column followed by the original headers.
                outF.write("Marker" + "\t" + "\t".join(header) + "\n")
                lastChr = chrom
                print('Processing : ' + 'Chr ' + lastChr)
            # Marker name = first column ":" second column.
            # NOTE(review): assumes CHROM and position are columns 0 and 1 - confirm.
            outF.write(fields[0] + ":" + fields[1] + "\t" + line)
            line = inF.readline()
    finally:
        # Always release both handles; without this the last chromosome file
        # can stay unflushed for as long as the notebook kernel is alive.
        if outF is not None:
            outF.close()
        inF.close()


## Perform ID conversion, filtering, and plotting

In [None]:
# Create the per-chromosome processing directories and the final output
# directory for each ancestry group.
for ethnicity in ea; do
  assocDir=/shared/data/studies/minnesota_twins/imputed/v1/association_tests/001/$ethnicity
  for (( chr=1; chr<23; chr++ )); do
    mkdir -p "$assocDir/processing/chr$chr"
  done
  # -p keeps this cell re-runnable: a plain mkdir errors out when final/
  # already exists.
  mkdir -p "$assocDir/final"
done

### Convert ID to phase 3 ID ###
# Submit one job per autosome that runs convert_to_1000g_ids.v2.pl on the
# per-chromosome association results, using that chromosome's 1000G Phase 3
# legend file.
# NOTE(review): --program and --legend still point at /share/nas03 while the
# data lives under /shared - confirm these paths exist on this host.
for ethnicity in ea; do
    # Reference population for the ancestry group (set here, not used by the job).
    case "$ethnicity" in
        aa) group=afr ;;
        *)  group=eur ;;
    esac
    for (( chr=1; chr<=22; chr++ )); do
        chrDir=/shared/data/studies/minnesota_twins/imputed/v1/association_tests/001/$ethnicity/processing/chr$chr
        stem=$chrDir/minnesota_twins.$ethnicity.1000G.chr$chr.CAT_FTND~1df
        converted=${stem}.phase3ID_add.out.txt
        # The job's script prefix is the same path as its output file.
        /shared/bioinformatics/software/scripts/qsub_job.sh \
            --job_name "MTC_${chr}" \
            --script_prefix "$converted" \
            --mem 15 \
            --priority 0 \
            --program /share/nas03/bioinformatics_group/software/perl/convert_to_1000g_ids.v2.pl \
            --file_in "${stem}_add.out.txt" \
            --file_out "$converted" \
            --legend "/share/nas03/bioinformatics_group/data/ref_panels/1000G/2014.10/1000GP_Phase3_chr$chr.legend.gz" \
            --file_in_header 1 \
            --file_in_id_col 0 \
            --file_in_chr_col 1 \
            --file_in_pos_col 2 \
            --file_in_a1_col 3 \
            --file_in_a2_col 4 \
            --chr "$chr"
    done
done

# Check for completion
# Inspect the qsub log of every conversion job and report logs that are
# missing, that do not have exactly 4 lines, or whose last line does not
# start with "Done". Nothing is printed for a log that passes all checks.
for ethnicity in ea; do
    case "$ethnicity" in
        aa) group=afr ;;
        *)  group=eur ;;
    esac
    for (( chr=1; chr<=22; chr++ )); do
        file=/shared/data/studies/minnesota_twins/imputed/v1/association_tests/001/$ethnicity/processing/chr$chr/minnesota_twins.$ethnicity.1000G.chr$chr.CAT_FTND~1df.phase3ID_add.out.txt.qsub.log

        if [ ! -f "$file" ]; then
            echo "$file missing"
            continue
        fi

        # Arithmetic expansion normalises any padding wc puts around the count.
        logLineCount=$(( $(wc -l < "$file") ))
        if (( logLineCount != 4 )); then
            echo "$file line count: $logLineCount"
            continue
        fi

        # Four lines present: the job is only complete if the last one starts with "Done".
        lastLine=$(tail -n 1 "$file")
        case "$lastLine" in
            Done*) ;;
            *) printf '%s\n%s\n' "$file" "$lastLine" ;;
        esac
    done
done



### START Filter ###

# MAF > 0.01 in AFR (AA) or EUR (EA)
# Submit one extract_rows.pl job per chromosome, passing the phase3-ID file as
# the source and the 1000G "maf_lte_0.01" ID list for the matching reference
# population, with --remove (presumably dropping the listed IDs - confirm
# against extract_rows.pl).
# NOTE(review): --program and the ID lists still point at /share/nas03 while
# the data lives under /shared - confirm these paths exist on this host.
for ethnicity in ea; do
  case "$ethnicity" in
    aa) group=afr ;;
    *)  group=eur ;;
  esac
  refDir=/share/nas03/bioinformatics_group/data/ref_panels/1000G/2014.10
  for (( chr=1; chr<=22; chr++ )); do
    # Chromosome 23 would use the chrX non-PAR list; that arm is not reached
    # while the loop stops at 22.
    case "$chr" in
      23) idList=$refDir/1000GP_Phase3_chrX_NONPAR.maf_lte_0.01_$group ;;
      *)  idList=$refDir/1000GP_Phase3_chr$chr.maf_lte_0.01_$group ;;
    esac
    stem=/shared/data/studies/minnesota_twins/imputed/v1/association_tests/001/$ethnicity/processing/chr$chr/minnesota_twins.$ethnicity.1000G.chr$chr.CAT_FTND~1df
    filtered=${stem}.maf_gt_0.01_$group
    # The job's script prefix is the same path as its output file.
    /shared/bioinformatics/software/scripts/qsub_job.sh \
      --job_name "${ethnicity}_${chr}" \
      --script_prefix "$filtered" \
      --mem 3.8 \
      --priority 0 \
      --program /share/nas03/bioinformatics_group/software/perl/extract_rows.pl \
      --source "${stem}.phase3ID_add.out.txt" \
      --id_list "$idList" \
      --out "$filtered" \
      --header 1 \
      --id_column 0 \
      --remove
  done
done

# Gather the reference-MAF-filtered files from every chromosome directory
# into the final/ directory.
for ethnicity in ea; do
  assocDir=/shared/data/studies/minnesota_twins/imputed/v1/association_tests/001/$ethnicity
  # The three-character wildcard matches the population suffix (e.g. eur, afr).
  mv "$assocDir"/processing/chr*/*.maf_gt_0.01_??? "$assocDir/final"
done

# MAF > 0.01 in study
# For every chromosome, copy the header and keep only the rows whose 7th
# whitespace-separated column (Perl $F[6]) is strictly between 0.01 and 0.99.
# NOTE(review): column 7 is presumably the study allele frequency - confirm
# against the file header.
for ethnicity in ea; do
  if [ "$ethnicity" == "aa" ]; then
    group=afr
  else
    group=eur
  fi
  for (( chr=1; chr<23; chr++ )); do
    inFile=/shared/data/studies/minnesota_twins/imputed/v1/association_tests/001/$ethnicity/final/minnesota_twins.$ethnicity.1000G.chr$chr.CAT_FTND~1df.maf_gt_0.01_$group
    outFile=${inFile}_minnesota_twins
    echo "Processing $inFile"
    head -n 1 "$inFile" > "$outFile"
    # Logical && (not bitwise &) states the intended "both bounds hold" test.
    tail -n +2 "$inFile" |
      perl -lane 'if ($F[6] > 0.01 && $F[6] < 0.99) { print; }' >> "$outFile"
  done
done

# RSQ > 0.3 in study
# For every chromosome, copy the header and keep only the rows whose 11th
# whitespace-separated column (Perl $F[10]) is greater than 0.3.
# NOTE(review): column 11 is presumably the imputation RSQ - confirm against
# the file header.
for ethnicity in ea; do
  if [ "$ethnicity" == "aa" ]; then
    group=afr
  else
    group=eur
  fi
  for (( chr=1; chr<23; chr++ )); do
    # Read from /shared/data/..., where the study-MAF step writes its
    # *_minnesota_twins output; the old /share/nas04 path is never populated
    # by this workflow.
    inFile=/shared/data/studies/minnesota_twins/imputed/v1/association_tests/001/$ethnicity/final/minnesota_twins.$ethnicity.1000G.chr$chr.CAT_FTND~1df.maf_gt_0.01_${group}_minnesota_twins
    outFile=${inFile}_RSQ
    echo "Processing $inFile"
    head -n 1 "$inFile" > "$outFile"
    tail -n +2 "$inFile" |
      perl -lane 'if ($F[10] > 0.3) { print; }' >> "$outFile"
  done
done


### END Filter ###


### START Generate plots ###

# Build one plotting table per filter level, submit a plotting job for each
# table, then move the resulting PNGs into final/.
#
# All data paths use /shared/data/..., the tree the earlier steps create and
# write to; the old /share/nas04 data paths are never populated by this workflow.
# NOTE(review): --program still points at /share/nas03 - confirm it exists on
# this host.

# Step 1: collapse the per-chromosome results into a five-column table
# (VARIANT_ID, CHR, POSITION, P, TYPE) for each filter level.
for ethnicity in ea; do
  if [ "$ethnicity" == "aa" ]; then
    group=afr
  else
    group=eur
  fi
  assocDir=/shared/data/studies/minnesota_twins/imputed/v1/association_tests/001/$ethnicity
  for ext in "$group" "${group}_minnesota_twins" "${group}_minnesota_twins_RSQ"; do
    outFile=$assocDir/processing/minnesota_twins.$ethnicity.1000G.CAT_FTND~1df.maf_gt_0.01_$ext.table
    printf 'VARIANT_ID\tCHR\tPOSITION\tP\tTYPE\n' > "$outFile"
    for (( chr=1; chr<24; chr++ )); do
      inFile=$assocDir/final/minnesota_twins.$ethnicity.1000G.chr$chr.CAT_FTND~1df.maf_gt_0.01_$ext
      # The loop allows for chromosome 23, but the filter steps only produce
      # chromosomes 1-22; skip whatever is absent instead of letting tail fail.
      if [ ! -f "$inFile" ]; then
        echo "Skipping missing $inFile" >&2
        continue
      fi
      echo "Processing $inFile"
      # Captures columns 1-5 and column 17 (the value written to the P column).
      # A row is typed "snp" when columns 4 and 5 are both single A/C/G/T
      # letters, otherwise "indel". Rows that do not match the pattern are
      # skipped: Perl keeps the captures of the previous successful match, so
      # printing them would duplicate the preceding row.
      tail -n +2 "$inFile" |
        perl -lne '/^(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)(?:\s+\S+){11}\s+(\S+)/ or next;
                    if (($4 eq "A" || $4 eq "C" || $4 eq "G" || $4 eq "T") && ($5 eq "A" || $5 eq "C" || $5 eq "G" || $5 eq "T")) {
                      print join("\t",$1,$2,$3,$6,"snp");
                    } else {
                      print join("\t",$1,$2,$3,$6,"indel");
                    }' >> "$outFile"
    done
  done
done

# Step 2: submit one generate_gwas_plots.v6.R job per table, requesting the
# SNP/indel Manhattan and QQ plots.
for ethnicity in ea; do
  if [ "$ethnicity" == "aa" ]; then
    group=afr
  else
    group=eur
  fi
  assocDir=/shared/data/studies/minnesota_twins/imputed/v1/association_tests/001/$ethnicity
  for ext in "$group" "${group}_minnesota_twins" "${group}_minnesota_twins_RSQ"; do
    plotStem=$assocDir/processing/minnesota_twins.$ethnicity.1000G.CAT_FTND~1df.maf_gt_0.01_$ext
    /shared/bioinformatics/software/scripts/qsub_job.sh \
      --job_name gwas_plots \
      --script_prefix "$plotStem.plots" \
      --mem 15 \
      --priority 0 \
      --program /share/nas03/bioinformatics_group/software/R/dev/generate_gwas_plots.v6.R \
      --in "$plotStem.table" \
      --in_chromosomes autosomal_nonPAR \
      --in_header \
      --out "$plotStem" \
      --col_id VARIANT_ID \
      --col_chromosome CHR \
      --col_position POSITION \
      --col_p P \
      --col_variant_type TYPE \
      --generate_snp_indel_manhattan_plot \
      --manhattan_odd_chr_color red \
      --manhattan_even_chr_color blue \
      --manhattan_points_cex 1.5 \
      --generate_snp_indel_qq_plot \
      --qq_lines \
      --qq_points_bg black \
      --qq_lambda
  done
done

# Step 3: once the plotting jobs have finished, move the PNGs next to the
# filtered results.
for ethnicity in ea; do
  assocDir=/shared/data/studies/minnesota_twins/imputed/v1/association_tests/001/$ethnicity
  mv "$assocDir"/processing/*.png "$assocDir/final"
done

### END Generate plots ###



### START Filter by p-value ###

# MAF > 0.01 in AFR and EUR
# For the reference-MAF and study-MAF filter levels, collect every row whose
# 17th whitespace-separated column (Perl $F[16]) is <= 0.001 across
# chromosomes 1-22 into one file, under the header taken from chromosome 1.
#
# All data paths use /shared/data/..., the tree the earlier steps create and
# write to; the old /share/nas04 data paths are never populated by this workflow.
for ethnicity in ea; do
  if [ "$ethnicity" == "aa" ]; then
    group=afr
  else
    group=eur
  fi
  assocDir=/shared/data/studies/minnesota_twins/imputed/v1/association_tests/001/$ethnicity
  for ext in "$group" "${group}_minnesota_twins"; do
    outFile=$assocDir/processing/minnesota_twins.$ethnicity.1000G.CAT_FTND~1df.maf_gt_0.01_$ext.p_lte_0.001
    head -n 1 "$assocDir/final/minnesota_twins.$ethnicity.1000G.chr1.CAT_FTND~1df.maf_gt_0.01_$ext" > "$outFile"
    for (( chr=1; chr<23; chr++ )); do
      inFile=$assocDir/final/minnesota_twins.$ethnicity.1000G.chr$chr.CAT_FTND~1df.maf_gt_0.01_$ext
      echo "Processing $inFile"
      tail -n +2 "$inFile" |
        perl -lane 'if ($F[16] <= 0.001) { print; }' >> "$outFile"
    done
  done
done

# Sort
# Order each p-value-filtered table by its PVALUE column and write it to
# final/ as CSV. The R code is fed through a quoted here-document so this cell
# is valid shell and the "$" in inData$PVALUE is left for R to interpret.
R --no-save <<'EOF'
assocDir <- "/shared/data/studies/minnesota_twins/imputed/v1/association_tests/001/ea"
stem <- "minnesota_twins.ea.1000G.CAT_FTND~1df.maf_gt_0.01_"
for (ext in c("eur", "eur_minnesota_twins")) {
  inData <- read.table(file.path(assocDir, "processing", paste0(stem, ext, ".p_lte_0.001")), header = TRUE)
  inData <- inData[order(inData$PVALUE), ]
  write.csv(inData, file = file.path(assocDir, "final", paste0(stem, ext, ".p_lte_0.001.csv")), row.names = FALSE)
}
EOF
