__Run basic contamination screening and FastQC, then perform mapping and alignment following this tutorial: https://www.biostars.org/p/41951/__

In [None]:
#work from the fastq_screen output directory; the relative globs and the filenames list used below are resolved against it
%cd ~/ClownfishGWAS/data/APPC_sequencing/ultraplex_out/fastq_screen/


In [None]:
#make a list of read 2 file basenames (extension stripped) to use for the next loops
!ls *R2.fastq.gz > filenames
#remove the extension; the dots are escaped and the pattern is anchored at end of line so only a literal trailing ".fastq.gz" is stripped
!sed -i'' -e 's/\.fastq\.gz$//' filenames

In [None]:
#for some reason, barcodes are still present at the start of read 2, so use cutadapt to remove the first 6 bases (-u 6) of every read in the read 2 files. run in the python2.7 environment
#each <name>.fastq.gz is written out as <name>.cut.fastq.gz
#NOTE(review): this loop reads filenames_tofinish rather than the filenames list - presumably a subset left over from an interrupted run; confirm
for i in $(cat filenames_tofinish); do cutadapt -u 6 -o $i.cut.fastq.gz $i.fastq.gz; done

In [None]:
#screen for contamination and filter out tagged reads simultaneously (--tag and --filter in one call). filter 1 means keep only reads that mapped exclusively to the reference for A. percula. since this is already done for the read 1 files, just need to do it for the read 2 files that I cut the barcode from
#one fastq_screen call per *.cut.fastq.gz file; results go back into the same fastq_screen directory (--outdir), where the later cells look for *tagged_filter.fastq.gz
!for i in /local/home/katrinac/ClownfishGWAS/data/APPC_sequencing/ultraplex_out/fastq_screen/*.cut.fastq.gz; do fastq_screen --tag --filter 1 --threads 32 --outdir /local/home/katrinac/ClownfishGWAS/data/APPC_sequencing/ultraplex_out/fastq_screen/ $i; done

In [None]:
#run fastqc once over every tagged/filtered file, 6 files at a time (-t 6)
#a single call replaces the per-file loop, which also passed the whole glob on every iteration and so re-ran fastqc on all files once per file
!fastqc -t 6 /local/home/katrinac/ClownfishGWAS/data/APPC_sequencing/ultraplex_out/fastq_screen/*tagged_filter.fastq.gz

In [None]:
#visualize sequencing quality for all samples by aggregating the reports found under the current directory. check that all barcode sequences are removed
#NOTE(review): the original note said "trimmed_reads directory", but the working directory set at the top of the notebook is fastq_screen - confirm which is meant
!multiqc .

In [None]:
#so the read 1 and read 2 files still fail qc because of the A/T barcode overhang; trim 1 bp from each end of every read (-u 1 -u -1) for all read files, check again and then map
!ls *.tagged_filter.fastq.gz  > filenames
#remove the extension; the dots are escaped and the pattern is anchored at end of line so only a literal trailing ".fastq.gz" is stripped
!sed -i'' -e 's/\.fastq\.gz$//' filenames
#NOTE(review): no leading "!" on this loop - presumably it was run from a terminal in the cutadapt environment rather than in this cell; confirm
for i in $(cat filenames); do cutadapt -u 1 -u -1 -o "$i.cut.fastq.gz" "$i.fastq.gz"; done
#delete the unfiltered fastq files from this directory, since they are still in ultraplex out
#remove cut from filenames so names are consistent
!rename '.cut' '' *.fastq.gz 
!rename '.cut' '' *.fastq.gz #second time because the read 2 files have ".cut" in their names twice (they were cut twice)
#run fastqc and multiqc again to check we are ready to map
#a single fastqc call over the glob; the earlier per-file loop re-ran fastqc on every file once per file
!fastqc -t 30 /local/home/katrinac/ClownfishGWAS/data/APPC_sequencing/ultraplex_out/fastq_screen/*tagged_filter.fastq.gz
#make sure to push the multiqc report to github

In [None]:
#okay, looks like there are still weird artifacts of library prep, not barcodes, present in 68 read files up to position 7 bp on the 5' end. move all filtered files to barcodes_cut directory and use cutadapt once more
#reads_to_cut7 holds the affected file names without the .fastq.gz extension (the loop appends it); -u 7 removes the first 7 bases of each read
for i in $(cat reads_to_cut7); do cutadapt -u 7 -o $i.cut.fastq.gz $i.fastq.gz; done
#rename cut files, remove uncut files
#fastqc one more time, multiqc to check, then map

In [None]:
#for some reason CAP10_APPC_34.13_R1.tagged_filter.fastq.gz was not included in the multiqc report, so run fastqc on it directly to check that it is good to map with
%cd ~/ClownfishGWAS/data/APPC_sequencing/ultraplex_out/fastq_screen/barcodes_cut/

!fastqc -t 30 CAP10_APPC_34.13_R1.tagged_filter.fastq.gz
#yes looks fine, can move on to mapping

In [None]:
#collate the names of samples that were in the final cutting round of the first 7 bp, plus the files that were already fine (listed in clean_reads), and move them into a mapping directory
#create the directory if it does not exist yet; -p makes this safe to re-run
!mkdir -p mapping
!for i in $(cat clean_reads); do mv "$i" mapping/; done
%cd ~/ClownfishGWAS/data/APPC_sequencing/ultraplex_out/fastq_screen/barcodes_cut/mapping


In [None]:
#make bwa index for the reference fasta to use in mapping (bwa writes the index files next to the fasta, using its name as prefix)
!bwa index /local/home/katrinac/ClownfishGWAS/data/APPC_sequencing/Genome/reference.fasta


In [None]:
#strip the ".cut" left in the names of the files to map so the names are consistent
#NOTE(review): the original note described removing an underscore after the sample names, but the command below removes ".cut" - confirm which was intended
!rename '.cut' '' *.fastq.gz 
!ls  *R1.tagged_filter.fastq.gz > filenames
#reduce each entry to the sample prefix; dots are escaped and the pattern is anchored so only the literal read 1 suffix is removed
!sed -i'' -e 's/_R1\.tagged_filter\.fastq\.gz$//' filenames

In [None]:
#map using default parameters but add -M for picard compatibility, make sure to use the tagged/filtered files
#one read group per sample (ID and SM are both the sample prefix); PL uses the upper-case ILLUMINA value listed in the SAM specification
#bwa's stderr goes to bwa.<sample>.log; the alignment stream is piped into samtools sort (explicit "-" = read stdin), which writes <sample>-RG.bam and logs to <sample>.bam.log
!for i in $(cat /local/home/katrinac/ClownfishGWAS/data/APPC_sequencing/ultraplex_out/fastq_screen/barcodes_cut/mapping/filenames); do bwa mem -t 34 -M -R "@RG\tID:$i\tSM:$i\tPL:ILLUMINA" /local/home/katrinac/ClownfishGWAS/data/APPC_sequencing/Genome/reference.fasta "$i"_R1.tagged_filter.fastq.gz "$i"_R2.tagged_filter.fastq.gz 2> "bwa.$i.log" | samtools sort -@ 34 -o "$i-RG.bam" - 2> "$i.bam.log"; done

In [None]:
#read1/2 files are no longer ordered the same and so bwa has issues. switch to bowtie2
#NOTE(review): the original note said bowtie doesn't care about read ordering, but the bowtie2 manual requires the -1 and -2 files to correspond read-for-read, so mismatched mates still need to be re-paired first - confirm
#!mkdir /local/home/katrinac/ClownfishGWAS/data/APPC_sequencing/Genome/bowtie
%cd ~/ClownfishGWAS/data/APPC_sequencing/ultraplex_out/fastq_screen/barcodes_cut/mapping
#build the bowtie2 index from the GCA_003047355.2 Nemo v1.1 genome fasta; the index basename APPC_ref is relative, so the index lands in the current mapping directory
!bowtie2-build /local/home/katrinac/ClownfishGWAS/data/APPC_sequencing/Genome/APercula-Reference/GCA_003047355.2_Nemo_v1.1_genomic.fna APPC_ref

In [None]:
#re-pair the read 1/read 2 files with bbtools repair.sh so the mates line up again; reads left without a mate go to the singletons file
#create the output directory first, since nothing else in this notebook makes it (-p keeps this safe to re-run)
!mkdir -p repair
!for i in $(cat filenames); do repair.sh in1="$i"_R1.tagged_filter.fastq.gz in2="$i"_R2.tagged_filter.fastq.gz out1="repair/$i.fixed.R1.fq" out2="repair/$i.fixed.R2.fq" outs="repair/$i.singletons.fq" repair; done

In [None]:
#align against the APPC_ref bowtie2 index using the re-paired reads written by repair.sh (repair/<sample>.fixed.R1.fq and .fixed.R2.fq)
#the unrepaired <sample>_R1/_R2.tagged_filter.fastq.gz files have mismatched mate order, which is why the repair step was run
!for i in $(cat filenames); do bowtie2 -x APPC_ref -1 "repair/$i.fixed.R1.fq" -2 "repair/$i.fixed.R2.fq" -S "$i.sam"; done

In [None]:
#mistake found: cutadapt was not run in paired-end mode, so the read 1/read 2 files no longer match up. start in the ultraplex out directory and figure out what trimming those files need
%cd ~/ClownfishGWAS/data/APPC_sequencing/ultraplex_out/

#run fastqc once over the tagged/filtered files using the absolute path only
#the earlier loop also passed a relative *tagged_filter.fastq.gz glob, which does not resolve from this directory and re-ran every file once per file where it did
!fastqc -t 6 /local/home/katrinac/ClownfishGWAS/data/APPC_sequencing/ultraplex_out/fastq_screen/*tagged_filter.fastq.gz

In [None]:
#make sure to check before running this, can't remember where .sai files are made...
#NOTE(review): bwa sampe consumes the .sai files produced by "bwa aln"; no bwa aln step appears in this notebook, so this loop cannot run as written - confirm
#NOTE(review): the reference path here (fastq_screen/mapping/reference.fasta) differs from the fasta indexed with bwa index (Genome/reference.fasta), and the read files are named <sample>.R1... here but <sample>_R1... in the mapping cells - confirm
#join pairs, convert from sam to bam files in screen -S samsort
#screen -S samsort
for i in $(cat APPC_sequencing/ultraplex_out/fastq_screen/filenames);
do bwa sampe /local/home/katrinac/ClownfishGWAS/data/APPC_sequencing/ultraplex_out/fastq_screen/mapping/reference.fasta $i\.read1.sai $i\.read2.sai $i\.R1.tagged_filter.fastq.gz $i\.R2.tagged_filter.fastq.gz | samtools view -bS - >  ~/ClownfishGWAS/data/APPC_sequencing/DNA/trimmed_reads/mapping/$i\.bam; done



In [None]:
#mark duplicate reads with picard, one sample at a time; run from the mapping directory so the relative bam names resolve
#the input is the sorted, read-group-tagged bam written by the bwa mem step (<sample>-RG.bam); MarkDuplicates reads SAM/BAM, not fastq
#the bare {} placeholders had nothing to fill them in, so loop over the same sample list used for mapping instead
#NOTE(review): as written this only marks duplicates (TAGGING_POLICY=OpticalOnly tags the optical ones); add REMOVE_DUPLICATES=true if they should actually be dropped - confirm
for i in $(cat /local/home/katrinac/ClownfishGWAS/data/APPC_sequencing/ultraplex_out/fastq_screen/barcodes_cut/mapping/filenames); do java -Xms4g -jar /local/home/katrinac/miniconda3/envs/commandline/bin/picard.jar MarkDuplicates I="$i-RG.bam" O="$i-RGmd.bam" M="$i"_dup_metrics.txt OPTICAL_DUPLICATE_PIXEL_DISTANCE=2500 TAGGING_POLICY=OpticalOnly &> "md.$i.log"; done

In [None]:
#January 24, 2022: using fastp to trim adapters because trimmomatic left the barcode in the read 2 file
#-i/-I are the read 1/read 2 inputs and -o/-O the matching outputs, so each pair goes through fastp together
dna_dir=/local/home/katrinac/ClownfishGWAS/data/APPC_sequencing/DNA
#make sure the output directory exists before fastp tries to write into it (-p keeps this safe to re-run)
mkdir -p "$dna_dir/fastp"
for i in $(cat "$dna_dir/filenames"); 
    do fastp -i "$dna_dir/$i.R1.fq.gz" -I "$dna_dir/$i.R2.fq.gz" -o "$dna_dir/fastp/$i.R1.fastp.fq.gz" -O "$dna_dir/fastp/$i.R2.fastp.fq.gz"; 
    done



In [None]:
#January 25: barcodes are still at the beginning of the read 2 files after fastp adapter trimming, so try ultraplex instead.
#gather the bcsplit output of both novaseq runs into the ultraplex input directory
for run in novaseq_2021_03_26_SEQ1 novaseq_2021_04_16_SEQ2; do
    cp ~/ClownfishGWAS/data/APPC_sequencing/$run/bcsplit/*.gz ~/ClownfishGWAS/data/APPC_sequencing/ultraplex_in/
done

#demultiplex every capture pool named in cap_filenames: read-1 is the forward input (-i), read-3 the mate (-i2), 8 threads, shared barcode sheet and output directory
for pool in $(cat /local/home/katrinac/ClownfishGWAS/data/APPC_sequencing/DNA/cap_filenames); do
    ultraplex \
        -i /local/home/katrinac/ClownfishGWAS/data/APPC_sequencing/ultraplex_in/$pool-read-1.fastq.gz \
        -i2 /local/home/katrinac/ClownfishGWAS/data/APPC_sequencing/ultraplex_in/$pool-read-3.fastq.gz \
        -b /local/home/katrinac/ClownfishGWAS/data/APPC_sequencing/multiplexing/APPCBarcodes.csv \
        -t 8 \
        -d /local/home/katrinac/ClownfishGWAS/data/APPC_sequencing/DNA/ultraplex_out
done


In [None]:
#this worked! next steps: do this for all capture pools, rename, move this line to the demultiplex notebook, then redo the fastq, mapping and picard steps, followed by angsd and genotype calling
#CAP1 only, demultiplexed with its own barcode sheet (Cap1_barcodes.csv)
ultraplex \
    -i /local/home/katrinac/ClownfishGWAS/data/APPC_sequencing/ultraplex_in/CAP1-read-1.fastq.gz \
    -i2 /local/home/katrinac/ClownfishGWAS/data/APPC_sequencing/ultraplex_in/CAP1-read-3.fastq.gz \
    -b /local/home/katrinac/ClownfishGWAS/data/APPC_sequencing/multiplexing/Cap1_barcodes.csv \
    -t 8 \
    -d /local/home/katrinac/ClownfishGWAS/data/APPC_sequencing/DNA/ultraplex_out