In [None]:
# Create the directory that will hold the links to the demultiplexed FASTQ
# files, make it the notebook's working directory, and print the resulting
# path as a sanity check. Later %%bash cells run relative to this directory.
%mkdir -p ~/scratch/analysis/all_data
%cd ~/scratch/analysis/all_data
%pwd

Set up links to demux samples

In [None]:
%%bash
# Symlink the demultiplexed FASTQ files (r?.<sample>.fq.gz) from both demux
# directories into the current directory, leaving out the "unmatched" reads.
#
# The cell is safe to re-run: links that already point at the right file are
# left alone, and a name that is already taken is reported and kept (the first
# directory wins, as with a plain `ln -s`).
shopt -s nullglob   # an unmatched glob expands to nothing, not to itself

for demux_dir in ../KNP_demux ../EG_demux
    do
        linked=0
        for fastq in "${demux_dir}"/r?.*.fq.gz
            do
                name=${fastq##*/}
                case "${name}" in
                    *unmatched.fq.gz) continue ;;
                esac

                if [ -L "${name}" ] && [ "$(readlink "${name}")" = "${fastq}" ]; then
                    # Already linked by a previous run of this cell.
                    linked=$((linked + 1))
                    continue
                elif [ -e "${name}" ] || [ -L "${name}" ]; then
                    echo "WARNING: ${name} already exists; not linking ${fastq}" >&2
                    continue
                fi

                ln -s "${fastq}" . && linked=$((linked + 1))
            done

        if [ "${linked}" -eq 0 ]; then
            echo "WARNING: no FASTQ files linked from ${demux_dir}" >&2
        fi
    done

# Drop any "unmatched" links left behind by earlier runs.
rm -f ./*unmatched.fq.gz

In [None]:
# Create the results directory, make it the notebook's working directory, and
# print the resulting path. The relative paths used by the following %%bash
# cells (genome/, trimmed/, th_out/, counts/) are resolved from here.
%mkdir -p ~/scratch/analysis/results
%cd ~/scratch/analysis/results
%pwd

In [None]:
%%bash
# Download the reference sequence (FASTA) and annotation (GFF) into genome/
# as ecoli_w3110.*, decompress them, and build the bowtie2 index
# genome/ecoli_w3110 that the alignment step uses.
#
# Stop at the first failing command so that a failed download is never
# followed by indexing a missing or stale file.
set -euo pipefail

mkdir -p genome

# Common prefix of the two NCBI files; only the extension differs.
# NOTE(review): confirm this FTP location still resolves before relying on it.
NCBI_PREFIX=ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCA_000010245.1_ASM1024v1/GCA_000010245.1_ASM1024v1_genomic

wget --output-document genome/ecoli_w3110.fa.gz --no-verbose "${NCBI_PREFIX}.fna.gz"
wget --output-document genome/ecoli_w3110.gff.gz --no-verbose "${NCBI_PREFIX}.gff.gz"

# -f: overwrite the decompressed files from a previous run instead of
# refusing, which would leave the old copies in place.
gunzip -f genome/ecoli_w3110.fa.gz
gunzip -f genome/ecoli_w3110.gff.gz

bowtie2-build genome/ecoli_w3110.fa genome/ecoli_w3110

ls genome

### Now Everything Together!!
We will now run all the samples, but first we need to generate an adapter file for all the samples. 

#### Generating a full adapter file
We still need to do the copy-and-paste part from the index primer manual, but we will do the reverse complementing automatically.  Let's do that now (if you are feeling lazy, you can use the ready-made `all_neb_adapters.fasta` file in the repo directory, which is what the command below reads) . . .

OK, now that that is out of the way, we need to install a Python library that our reverse-complementing script will use.

Now we can run our script.  By default it only outputs the reverse complemented sequences, but with the --both option, it will also output the original sequence.

In [None]:
%%bash
# Build the adapter FASTA used for trimming. revcomp.py emits the reverse
# complement of every input sequence; --both makes it emit the original
# sequence as well. The result is written to the current directory.
COURSE_REPO=~/bioinf_nb_ngscourse2015

"${COURSE_REPO}/revcomp.py" "${COURSE_REPO}/all_neb_adapters.fasta" \
    --both \
    --output all_neb_adapters_both.fasta

#### Looping over all the samples
Now we can put all of the previous commands into one big loop.  This is probably a good time for copying and pasting.  But we will make a few small changes.

1. We will add all the barcodes to the list of barcodes to iterate over
    * AGTCAA AGTTCC ATGTCA CCGTCC GTCCGC GTGAAA
2. We need to remember to use our full adapter file

In [None]:
%%bash
# Dry run: show how the sample barcode is recovered from a read-1 file name
# of the form r1.<barcode>.fq.gz, using a single file.
# To iterate over every sample instead, loop over:
#   ~/scratch/analysis/all_data/r1.*.fq.gz
for R1 in ~/scratch/analysis/all_data/r1.7A_E.fq.gz
    do
        # Strip the directory, then the "r1." prefix and the ".fq.gz" suffix.
        # Splitting the whole path on "." (IFS + set) breaks as soon as a
        # directory name or the barcode itself contains a dot, and it also
        # overwrites the positional parameters; parameter expansion does not.
        BARCODE=${R1##*/}
        BARCODE=${BARCODE#r1.}
        BARCODE=${BARCODE%.fq.gz}
        echo "${BARCODE}"
    done

In [None]:
%%bash
# Run the whole per-sample pipeline for every read-1 file in FASTQ_DIR:
#   1. fastq-mcf   - adapter and quality trimming of the read pair
#   2. tophat2     - alignment against the genome/ecoli_w3110 index
#   3. samtools    - index the alignment and make a name-sorted copy
#   4. htseq-count - per-gene read counts written to counts/<barcode>.csv
#
# Inputs are r1.<barcode>.fq.gz / r2.<barcode>.fq.gz in FASTQ_DIR, plus
# all_neb_adapters_both.fasta and genome/ in the current directory.
# A sample whose step fails is skipped from that point on; all failed samples
# are listed at the end and the cell then exits non-zero.
mkdir -p trimmed
mkdir -p th_out
mkdir -p counts

# Same location the earlier cells use (~/scratch/analysis/all_data), without
# hard-coding a particular user's home directory.
FASTQ_DIR="${HOME}/scratch/analysis/all_data"

# Process one sample, identified by its barcode ($1).
# Returns non-zero as soon as any step fails so later steps never run on
# missing or partial output.
process_sample() {
    local barcode=$1
    local r1_trimmed="trimmed/r1.${barcode}.trim.fq.gz"
    local r2_trimmed="trimmed/r2.${barcode}.trim.fq.gz"
    local align_dir="th_out/${barcode}"

    fastq-mcf all_neb_adapters_both.fasta \
        "${FASTQ_DIR}/r1.${barcode}.fq.gz" \
        "${FASTQ_DIR}/r2.${barcode}.fq.gz" \
        -q 20 -x 0.5 \
        -o "${r1_trimmed}" \
        -o "${r2_trimmed}" || return 1

    mkdir -p "${align_dir}" || return 1
    tophat2 -G genome/ecoli_w3110.gff \
        --library-type fr-firststrand \
        --output-dir "${align_dir}" \
        --max-intron-length 5 \
        --min-intron-length 4 \
        --transcriptome-max-hits 1 \
        --max-multihits 1 \
        --no-coverage-search \
        --no-novel-juncs \
        --num-threads 2 \
        genome/ecoli_w3110 \
        "${r1_trimmed}" \
        "${r2_trimmed}" || return 1

    # -f: replace the hard link from a previous run; without it `ln` fails and
    # the convenience BAM may still refer to the old alignment.
    ln -f "${align_dir}/accepted_hits.bam" "th_out/${barcode}.bam" || return 1
    samtools index "th_out/${barcode}.bam" || return 1

    # NOTE(review): this is the "sort <in.bam> <out.prefix>" call form; confirm
    # it matches the installed samtools version.
    samtools sort -n "${align_dir}/accepted_hits.bam" \
        "${align_dir}/accepted_hits.name" || return 1

    htseq-count --quiet --order=name --format=bam --stranded=reverse --type=gene \
        --idattr=ID "${align_dir}/accepted_hits.name.bam" \
        genome/ecoli_w3110.gff > "counts/${barcode}.csv" || return 1
}

failed_samples=()

# To test on a single sample, loop over "${FASTQ_DIR}"/r1.7A_E.fq.gz instead.
for R1 in "${FASTQ_DIR}"/r1.*.fq.gz
    do
        # Catches an unmatched glob (left as a literal) and dangling links.
        if [ ! -e "${R1}" ]; then
            echo "Skipping ${R1}: file not found" >&2
            continue
        fi

        # Barcode = file name without directory, "r1." prefix and ".fq.gz"
        # suffix. This does not depend on IFS, so dots elsewhere in the path
        # or in the barcode are harmless.
        BARCODE=${R1##*/}
        BARCODE=${BARCODE#r1.}
        BARCODE=${BARCODE%.fq.gz}
        echo "${BARCODE}"

        if ! process_sample "${BARCODE}"; then
            echo "ERROR: pipeline failed for sample ${BARCODE}" >&2
            failed_samples+=("${BARCODE}")
        fi
    done

if [ "${#failed_samples[@]}" -gt 0 ]; then
    echo "Samples that did not complete: ${failed_samples[*]}" >&2
    exit 1
fi