In [18]:
import subprocess
import shlex
import os

import pandas as pd

Number of reads:

# 1. Raw data analysis

In [33]:
for filename in os.listdir():
    if filename.endswith('.fastq'):
        command = f"grep '@SRR' {filename} | wc -l"
        proc = subprocess.run(command, capture_output=True, shell=True)
        print(filename, proc.stdout.decode('utf-8'))

ferm30rep1.fastq 1721675

ferm0rep2.fastq 9929568

ferm30rep2.fastq 6172452

ferm0rep1.fastq 9043877



Also run Fastq: high duplication level and adapters are present, but...

- https://academic.oup.com/nargab/article/2/3/lqaa068/5901066
- https://bmcbioinformatics.biomedcentral.com/articles/10.1186/s12859-016-0956-2

...so no filtering is required

# 2. Alignment

In [23]:
genome_index = 'yeast_index'
transcriptome_index = 'yeast_rna_index'

def align_command(index, fastq_file, p='8'):
    name = fastq_file.split('.')[0]
    if 'rna' in index:
        name += '_rna'
    command = f'hisat2 -p {str(p)} -x {index} -U {fastq_file} | samtools sort > {name}.bam'
    return shlex.split(command)

# strange thing happened:
# can't run command form jupyter with -U option: says this option is invalid
# however it runs if -U is replaced with -q
# run both commands (with -U and -q) from terminal and the results were the same, so...

for filename in os.listdir():
    if filename.endswith('.fastq'):
        for index in (genome_index, transcriptome_index):
            align = align_command(index, filename)
            print(align)
            proc = subprocess.run(align, capture_output=True)
            print(proc.stderr.decode('utf-8'))
            print(proc.stdout.decode('utf-8'))
            proc.check_returncode()

['hisat2', '-p', '8', '-x', 'yeast_index', '-q', 'ferm30rep1.fastq', '|', 'samtools', 'sort', '>', 'ferm30rep1.bam']
1721675 reads; of these:
  1721675 (100.00%) were unpaired; of these:
    66371 (3.86%) aligned 0 times
    1507906 (87.58%) aligned exactly 1 time
    147398 (8.56%) aligned >1 times
96.14% overall alignment rate


['hisat2', '-p', '8', '-x', 'yeast_rna_index', '-q', 'ferm30rep1.fastq', '|', 'samtools', 'sort', '>', 'ferm30rep1_rna.bam']
1721675 reads; of these:
  1721675 (100.00%) were unpaired; of these:
    274582 (15.95%) aligned 0 times
    1298302 (75.41%) aligned exactly 1 time
    148791 (8.64%) aligned >1 times
84.05% overall alignment rate


['hisat2', '-p', '8', '-x', 'yeast_index', '-q', 'ferm0rep2.fastq', '|', 'samtools', 'sort', '>', 'ferm0rep2.bam']
9929568 reads; of these:
  9929568 (100.00%) were unpaired; of these:
    511726 (5.15%) aligned 0 times
    8644577 (87.06%) aligned exactly 1 time
    773265 (7.79%) aligned >1 times
94.85% overall alignment

# GFF to GTF & featureCounts & DESeq2
```bash
gffread GCF_000146045.2_R64_genomic.gff -T -o GCF_000146045.2_R64_genomic.gtf
```


In [21]:
reps = [('rep1', 'rep2'), ('rep1_rna', 'rep2_rna')]
for rep in reps:
    rep1, rep2 = rep
    if 'rna' in rep1:
        title = 'rna'
    else:
        title = 'gen'
        
    command = (f'featureCounts -g gene_id -T 8 -a GCF_000146045.2_R64_genomic.gtf -o '
               f'{title}.fc ferm0{rep1}.bam ferm0{rep2}.bam ferm30{rep1}.bam ferm30{rep2}.bam')
    subprocess.run(command, shell=True, check=True)
    proc = subprocess.run(f'cat {title}.fc | cut -f 1,7-10 | R -f deseq2.r', shell=True, capture_output=True)
    subprocess.run(f'head result.txt -n 50 | cut -f 1 > {title}_genes.txt', shell=True)
    subprocess.run(f'head result.txt -n 50 > {title}_result.txt', shell=True)
    proc = subprocess.run("cat norm-matrix-deseq2.txt | R -f draw-heatmap.r", shell=True, capture_output=True, 
                         check=True)
    subprocess.run(f'mv output.pdf {title}.pdf', shell=True)

In [17]:
proc = subprocess.run('diff rna_genes.txt gen_genes.txt', shell=True, check=True, capture_output=True)
print(proc.stdout.decode('utf-8')) # no difference




In [10]:
for title in ('rna_genes.txt', 'gen_genes.txt'):
    with open(title, 'r') as f:
        genes = [line.strip('gene-').strip() for line in f.readlines()]
    with open('stripped_' + title, 'w') as f:
        f.write('\n'.join(genes))

# Gene ontology search results 

In [20]:
go_search = pd.read_csv('mapper_genes_15841_slimTab.txt', sep='\t')
go_search

Unnamed: 0,GOID,TERM,NUM_LIST_ANNOTATIONS,LIST_SIZE,CLUSTER_FREQUENCY,TOTAL_NUM_ANNOTATIONS,POPULATION_SIZE,GENOME_FREQUENCY,ANNOTATED_GENES
0,GO:0006364,rRNA processing,13,49,26.53%,366,6443,5.68%,"YDR449C, YEL026W, YER127W, YGR159C, YHR066W, Y..."
1,GO:0042273,ribosomal large subunit biogenesis,9,49,18.37%,130,6443,2.02%,"YCR072C, YDL063C, YEL026W, YHR066W, YIR012W, Y..."
2,GO:0042274,ribosomal small subunit biogenesis,8,49,16.33%,146,6443,2.27%,"YDR449C, YEL026W, YER127W, YGR159C, YHR196W, Y..."
3,GO:0042255,ribosome assembly,7,49,14.29%,79,6443,1.23%,"YCR072C, YGR159C, YHR066W, YIR012W, YLR264W, Y..."
4,GO:0006360,transcription by RNA polymerase I,6,49,12.24%,71,6443,1.10%,"YHR196W, YJL148W, YJR063W, YML043C, YMR093W, Y..."
5,GO:0006811,ion transport,6,49,12.24%,340,6443,5.28%,"YDR536W, YHR094C, YKL120W, YNL065W, YNR060W, Y..."
6,GO:0055085,transmembrane transport,5,49,10.20%,468,6443,7.26%,"YDR536W, YHR094C, YKL120W, YNL065W, YOR271C"
7,GO:0005975,carbohydrate metabolic process,4,49,8.16%,253,6443,3.93%,"YBR105C, YER062C, YKR097W, YOL136C"
8,GO:0055086,nucleobase-containing small molecule metabolic...,4,49,8.16%,220,6443,3.41%,"YBL039C, YMR300C, YNL141W, YOL136C"
9,GO:0006417,regulation of translation,3,49,6.12%,234,6443,3.63%,"YLR264W, YNL112W, YOR359W"


In [23]:
deseq_top50 = pd.read_csv('gen_result.txt', sep='\t')
deseq_top50

Unnamed: 0.1,Unnamed: 0,id,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj
0,gene-YER062C,gene-YER062C,16729.875894,7.899953,0.408404,19.343472,2.313608e-83,1.344669e-79
1,gene-YDR536W,gene-YDR536W,10088.387566,7.874257,0.411571,19.132209,1.361793e-81,3.957371e-78
2,gene-YHR094C,gene-YHR094C,8495.183817,7.881624,0.432049,18.242432,2.376454e-74,4.603984e-71
3,gene-YNL065W,gene-YNL065W,2109.592516,7.785356,0.466923,16.673751,2.03418e-62,2.955664e-59
4,gene-YKL120W,gene-YKL120W,1727.165761,7.269328,0.48244,15.067824,2.636549e-51,3.064725e-48
5,gene-YLR264W,gene-YLR264W,1387.811877,6.48318,0.43216,15.001817,7.143607999999999e-51,6.919775e-48
6,gene-YGR159C,gene-YGR159C,8021.263133,6.76742,0.451849,14.977169,1.0353479999999999e-50,7.521801e-48
7,gene-YJL122W,gene-YJL122W,1291.149172,6.511634,0.434559,14.984451,9.278892e-51,7.521801e-48
8,gene-YNL112W,gene-YNL112W,5328.977598,7.363983,0.493559,14.920173,2.436391e-50,1.573367e-47
9,gene-YOR271C,gene-YOR271C,1052.36038,6.844359,0.472759,14.477488,1.681312e-47,9.771787e-45
