# `DNA`

In [1]:
from src import DNA

In [2]:
# Build a DNA object from a raw nucleotide string. Displaying the object shows
# the per-nucleotide counts, the total length, and the repr.
dna = DNA("AACTGACG")
dna

'A': 3; 'C': 2; 'G': 2; 'T': 1
'length': 8
DNA('AACTGACG')

In [3]:
# Read a single nucleotide by its 0-based position
dna[0]

'A'

In [4]:
# Item assignment overwrites the nucleotide at the given position in place
# (here the first base becomes "T")
dna[0] = "T"

In [5]:
# Display again: the counts and the repr now reflect the modified first base
dna

'T': 2; 'A': 2; 'C': 2; 'G': 2
'length': 8
DNA('TACTGACG')

In [6]:
# Reverse complement of the current sequence (the result displays as a plain string)
dna.reverse_complement()

'CGTCAGTA'

# Loaders

In [7]:
from src import (
    FastaLoader,
    FastqLoader,
)

### **FASTA format**
___
Load an entire genome from a FASTA file (extension `.fa`).

> `lambda_virus.fa` can be downloaded from https://d28rh4a8wq0iu5.cloudfront.net/ads1/data/lambda_virus.fa

In [8]:
# Instantiate the loader, then call it with a file path to read the genome.
# NOTE(review): the path is relative, so this presumably requires running the
# notebook from the project root with the file already downloaded — confirm.
fasta = FastaLoader()
sequence = fasta("data/lambda_virus.fa")
# Preview the first 10 bases and the total length rather than dumping the whole genome
sequence[:10], len(sequence)

('GGGCGGCGAC', 48502)

In [9]:
# Wrap the loaded genome in a DNA object (this rebinds `dna`, replacing the earlier toy example)
dna = DNA(sequence)

In [10]:
# Nucleotide counts and length for the whole genome; the repr shows only a truncated prefix
dna

'G': 12820; 'A': 12334; 'T': 11986; 'C': 11362
'length': 48502
DNA('GGGCGGCGACCTCGCGGGTT...')

### **FASTQ format** Sequencing by synthesis
___
For sequencing reads and their base qualities.

In [11]:
# Calling the loader on a FASTQ file yields two collections:
# the read sequences and their encoded quality strings.
fastq = FastqLoader()
sequences, qualities = fastq("data/SRR835775_1.first1000.fastq")

In [12]:
# Number of reads loaded
len(sequences)

1000

In [13]:
# Number of quality strings loaded; expected to match the number of reads
len(qualities)

1000

In [14]:
# Bases of the first read
sequences[0]

'TAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTCACCCTAACCCTAACCCTAACCGTATCCGTCACCCTAACCCTAAC'

In [15]:
# ASCII-encoded quality string of the first read
qualities[0]

'???B1ADDD8??BB+C?B+:AA883CEE8?C3@DDD3)?D2;DC?8?=BAD=@C@(.6.6=A?=?@##################################'

### Interpretation of `quality`
The quality value $Q$ of a base call is defined by the following equation:

$$Q = -10 \cdot \log_{10} p, \qquad p: \text{probability that the base call is incorrect}$$


| Q   | Chance that the call is incorrect ($p$) |
|-----|-------|
| 10  |  1/10 | 
| 20  |  1/100 |
| 30  |  1/1000 |
| 40  |  1/10000 |


In [16]:
# Decode the first read's quality string: 'phred33_to_Q' maps each
# ASCII-encoded quality character to its integer Q value.
Q_values = list(map(fastq.phred33_to_Q, qualities[0]))

In [17]:
# Q values of the first 10 base calls of the first read
Q_values[:10]

[30, 30, 30, 33, 16, 32, 35, 35, 35, 23]