In [5]:
import subprocess
import shlex
import os

Number of reads:

# 1. Raw data analysis

In [33]:
for filename in os.listdir():
    if filename.endswith('.fastq'):
        command = f"grep '@SRR' {filename} | wc -l"
        proc = subprocess.run(command, capture_output=True, shell=True)
        print(filename, proc.stdout.decode('utf-8'))

ferm30rep1.fastq 1721675

ferm0rep2.fastq 9929568

ferm30rep2.fastq 6172452

ferm0rep1.fastq 9043877



Also run Fastq: high duplication level and adapters are present, but...

- https://academic.oup.com/nargab/article/2/3/lqaa068/5901066
- https://bmcbioinformatics.biomedcentral.com/articles/10.1186/s12859-016-0956-2

...so no filtering is required

# 2. Alignment

In [23]:
genome_index = 'yeast_index'
transcriptome_index = 'yeast_rna_index'

def align_command(index, fastq_file, p='8'):
    name = fastq_file.split('.')[0]
    if 'rna' in index:
        name += '_rna'
    command = f'hisat2 -p {str(p)} -x {index} -U {fastq_file} | samtools sort > {name}.bam'
    return shlex.split(command)

# strange thing happened:
# can't run command form jupyter with -U option: says this option is invalid
# however it runs if -U is replaced with -q
# run both commands (with -U and -q) from terminal and the results were the same, so...

for filename in os.listdir():
    if filename.endswith('.fastq'):
        for index in (genome_index, transcriptome_index):
            align = align_command(index, filename)
            print(align)
            proc = subprocess.run(align, capture_output=True)
            print(proc.stderr.decode('utf-8'))
            print(proc.stdout.decode('utf-8'))
            proc.check_returncode()

['hisat2', '-p', '8', '-x', 'yeast_index', '-q', 'ferm30rep1.fastq', '|', 'samtools', 'sort', '>', 'ferm30rep1.bam']
1721675 reads; of these:
  1721675 (100.00%) were unpaired; of these:
    66371 (3.86%) aligned 0 times
    1507906 (87.58%) aligned exactly 1 time
    147398 (8.56%) aligned >1 times
96.14% overall alignment rate


['hisat2', '-p', '8', '-x', 'yeast_rna_index', '-q', 'ferm30rep1.fastq', '|', 'samtools', 'sort', '>', 'ferm30rep1_rna.bam']
1721675 reads; of these:
  1721675 (100.00%) were unpaired; of these:
    274582 (15.95%) aligned 0 times
    1298302 (75.41%) aligned exactly 1 time
    148791 (8.64%) aligned >1 times
84.05% overall alignment rate


['hisat2', '-p', '8', '-x', 'yeast_index', '-q', 'ferm0rep2.fastq', '|', 'samtools', 'sort', '>', 'ferm0rep2.bam']
9929568 reads; of these:
  9929568 (100.00%) were unpaired; of these:
    511726 (5.15%) aligned 0 times
    8644577 (87.06%) aligned exactly 1 time
    773265 (7.79%) aligned >1 times
94.85% overall alignment

# GFF to GTF & featureCounts & DESeq2
```bash
gffread GCF_000146045.2_R64_genomic.gff -T -o GCF_000146045.2_R64_genomic.gtf
```


In [63]:
reps = [('rep1', 'rep2'), ('rep1_rna', 'rep2_rna')]
for rep in reps:
    rep1, rep2 = rep
    if 'rna' in rep1:
        title = 'rna'
    else:
        title = 'gen'
        
    command = (f'featureCounts -g gene_id -T 8 -a GCF_000146045.2_R64_genomic.gtf -o '
               f'{title}.fc ferm0{rep1}.bam ferm0{rep2}.bam ferm30{rep1}.bam ferm30{rep2}.bam')
    subprocess.run(command, shell=True, check=True)
    proc = subprocess.run(f'cat {title}.fc | cut -f 1,7-10 | R -f deseq2.r', shell=True, capture_output=True)
    subprocess.run(f'head result.txt -n 50 | cut -f 1 > {title}_genes.txt', shell=True)
    proc = subprocess.run("cat norm-matrix-deseq2.txt | R -f draw-heatmap.r", shell=True, capture_output=True, 
                         check=True)
    subprocess.run(f'mv output.pdf {title}.pdf', shell=True)