In [2]:
# Nucleotide counter

def count(s):
    """Tally the four DNA bases in ``s``.

    Args:
        s: Iterable of single-character symbols, normally a DNA string.

    Returns:
        A string with the counts of A, C, G and T, in that order, separated
        by single spaces.

    Raises:
        KeyError: If ``s`` contains a symbol other than A, C, G or T.
    """
    bases = "ACGT"
    tally = dict.fromkeys(bases, 0)
    for symbol in s:
        # Plain indexing (not .get) so unexpected symbols raise KeyError.
        tally[symbol] += 1
    return " ".join(str(tally[base]) for base in bases)


assert count("ATG") == "1 0 1 1"
assert count("AAAAA") == "5 0 0 0"

In [8]:
# Getting data from GenBank
# Query: a genus name, followed by two dates in YYYY/M/D format.
from Bio import Entrez
def search(genus, d1, d2):
    """Count GenBank nucleotide records for a genus published in a date range.

    Args:
        genus: Organism (genus) name placed in the ``[Organism]`` field.
        d1: Start of the publication-date range, in YYYY/M/D format.
        d2: End of the publication-date range, in YYYY/M/D format.

    Returns:
        The ``"Count"`` entry of the parsed esearch result, returned
        unconverted (Biopython presumably yields a string-like value here,
        not an int; confirm before doing arithmetic on it).
    """
    Entrez.email="k.zoltowska@oxfordalumni.org"
    search_term = f"{genus}[Organism] AND ({d1}[PDAT] : {d2}[PDAT])"
    handle = Entrez.esearch(db="nucleotide", term=search_term)
    try:
        record = Entrez.read(handle)
    finally:
        # Release the network handle even when parsing fails.
        handle.close()
    return record["Count"]

In [42]:
# Select the shortest sequence in the set

from Bio import Entrez
from Bio import SeqIO
Entrez.email = "k.zoltowska@oxfordalumni.org"

# GenBank accessions to compare. One list element per accession is the form
# Entrez.efetch documents; the original packed all ten into a single
# space-separated string.
accession_ids = [
    "JX475048", "NM_001003102", "JQ011270", "NM_001131214", "NM_001133698",
    "JX445144", "JX295575", "JQ762396", "NM_001009148", "NM_001172751",
]

handle = Entrez.efetch(db="nucleotide", id=accession_ids, rettype="fasta")
try:
    records = list(SeqIO.parse(handle, "fasta"))
finally:
    # Release the network handle even when parsing fails.
    handle.close()

if not records:
    raise ValueError("Entrez.efetch returned no FASTA records for the requested IDs")

# min() returns the first minimum it meets; scanning in reverse keeps the
# previous tie-break, where the last of several equally short records won.
shortest = min(reversed(records), key=lambda rec: len(rec.seq))

# Emit the shortest record in FASTA layout: header line, then the sequence.
print(f'>{shortest.description}\n{shortest.seq}')


In [None]:
# Converting FASTQ to FASTA
# Create a text file with the FASTQ sequences, named "fastq", in the run directory.
# The converted sequences are saved to a file named "fasta" in the same directory.
import Bio.SeqIO
# Read the file named "fastq" (FASTQ format) and write it to the file named
# "fasta" (FASTA format).
# Use the fully qualified name: `import Bio.SeqIO` binds only `Bio`, so a bare
# `SeqIO` resolves only if another cell has already imported it.
Bio.SeqIO.convert("fastq", "fastq", out_file="fasta", out_format="fasta")

In [None]:
# Parse FASTQ data and determine average quality scores 
# Report number of sequences below the specified quality

import Bio.SeqIO
import statistics

filename="Fastq_in"
# Reads whose mean Phred quality is strictly below this value are counted.
threshold=28

# One mean Phred score per read. The fully qualified Bio.SeqIO is used because
# `import Bio.SeqIO` binds only the name `Bio`.
phreds = [
    statistics.mean(read.letter_annotations["phred_quality"])
    for read in Bio.SeqIO.parse(filename, "fastq")
]

# Compare against `threshold`; the earlier hard-coded 20 ignored the setting
# above. A dedicated name also avoids rebinding `count`, which another cell
# defines as a function.
below_threshold = sum(1 for phred in phreds if phred < threshold)

print(below_threshold)


In [None]:
# Translate a DNA sequence to protein and determine which translation table was used

from Bio.Seq import translate
translations=[]
sequence="ATGTGGATTCGCAAGAGTGACTCACTGGGCATATACGTATATGACATGGACTGGGACCGATTTCGCTTTACGCTTTTGCGAATATTCGAGCAATGGCTATCGCTAGGAAGGTCTGTACAGGTAGAGCGTCGGAGCCTGAGGCCTGTAGCGCGTAACATGTCGAGCGAGAGTGTCAAGGATCATCAATTCCAGACGCATAAAACGCACACCAGAAACTTGTCTAGGTTGGCATGCAATCGAAAATTACCACGGGGGGCCATAATTAGGTGGCATGATCTTTCAAAGCCGAGATTGATCCTCACTCTCTCATCAAGGATTGACTCTGGCCAAAGTCGTTTAGTCAGATGTGTACGCTGTTGGGTAGATGACATTTTAGCATGGCGTAACAATGTGCCAAAGGAGAGCGAATTAGGTCTAGTCATGCACGAGTACGGTCATACAGCCATGTACAATTCCCCCCTGGCCGGGTCTACTCAAGGAGATACTCTAGTGTTAGAAAACGGCTTCATCCTCCCCACCAGATACACAGTAACGTTCGAGTGCTTACTTCGTCGGCTGAAAGCATGTGGTCGCGGCTGTCAACGGGGGGGCCCGCAAGGCCGGCTTCCGAGTCCGGGGCAGAGGGCAACTGCGGTCAGAGTACGGGAACGGATTGCTTGCCTATTGATAGGCCCTAGCACGGACTCCGCTGAGATAGGATACCTCCCCCGGCTTACGAGGAGCAATATACCTCCGGTACAGCCGACATCTCTACAACTGGGAATTCCAACCCCAAACTTTCGGATCAGTCATCCCAAGAAGATGCGATTTTTTAACGCTCCGTTACTCAACTGCAATGCTCATTTCAATCTGATGGGAAATCGTCAAACTAATCCTCATCCTCCACTAACCGGAACATCTTGCATCCCTCCCCCGTCAAGACCGAGATTCCTGGTGACACAGTGTGCTTCACGGACCGGTTCCGGAATCACGGTTAGACCCTCAGCACCGGGGAGTAGCTATTATGACCGTTTCCACGAGACCGGTAGAGTGGCCGCGAACAGAACGCGTATTCGGTTAGCGAAGGGCTCTGAACATGTCCTACTGATTATAGCGTATTTAAGTTCACCGCGCAGGGAATCGCACGAGAGTGAGGTCATAGCGGTAATCCAGTATCCTGTCCATAGAACCAGGGACGTCGCGAGCGAGGAAACCACGACAAGGTCGCGGTGTCCGAATTCACAATGGGGGCGTTACCGCGAAGGCGTCTTTATTGCCTGGTGTATTATTTCGACCTTGCCCCTAGGCCTCGACCGCGTCGGAATCATGGGGAGGTCAAACATCGCGGTCTACACAGTTTTGGCTCTTGGACCAAACTATGACAAGACGGTCTTTTGGATCGGCAGAACGCGGTCGCTAAAACTCTGGCTCACCCTTCTAAACAATAGAATCACCCGAGCGTTCTTACTAAGGAATTGGATCGGTAGGCACTGGAATCCGTATACCGCGTCTAGGAAACAAAACCCGCGAGCAGGCCGATTTCCTCTTGGAACAGTCCTAGTAGTCCCAAGAAGCAGTCTCCGCACTTCTGCCCCCGGTCCCAATTCCGCCTCCGAGCTCAAGTTGGACGAACACCCTACAACAGCGTCAGGTCTACCGATCCACGATCGTGCTTTATCAGGCCCACGCAAGATGGAATCGGATACTCTCTTTAAAGCTCACATCGTCCTACATTGGCACATTCATAATGATAAGTGGAGATCAACTTACTCTGAACGGAAAGCAAGATCTAGATGCGAAAAACCTAACGTGCGAGATGGACACCGATGGACATCCACTAACAACAAACAAAACAGCGACAGCGGCCAACCCGCCTGCTCTATTGTTTCGAAAAATTTGCTAAATAGCGGACCTAGCTTTCGAGGGCTAGGACCCCCCCAATCTAGCCTATGCGCAGATACGATGCCACGTGGCTGTAATGAACAGGCTATCAATATAGTTAAGCATCCCG
TTCACGGTGCGGTAATGTTTCTTGAAAATAGGCAGCCCTCGAGTATGCCGTCTCTTAGAGCGTTAAGAAGCCACTGTACTTCCCATGGAGTGAGCCCTTTTCGCACGTATTGGCGGGATCGAACGCTCGGGGAATTGACCATTTCACACGTCTCAAGCGCCGGAACAATTTCCTTGTATGCTAATGTTAATTTGGGTGTGGCGGTCATGTCCCCACGCCGTAATACCGAGCGAACGCAGAGCGCTCACCTATACAAGGCTGCGGATGAACAGCCGTATCCTGGCATACAGACCGTCGTTTCCATATTCTGCCTTCCGACCAGTAACCCAACTGCCGAGGCATTTGTTCGCTCTGTAGAGACTAACCATCCGAAAGTGTGTAGGCTCGAGGGCCCGAAACTATTAGTCAATTTTAACCATAGCCGGAATTTAGTTCTAGTTTTTGTCTGCCTGATGGAATATCAGCAAGTGAACATACTCCGTTGTTGGCCGGTCGAGCGGGAAAGCTCAAGCAGTCAGCAAACGTATCACCCAACTCGGGCAACCTATCTACGGGGACTGCCAGCTTCAGCACACATCATGGGTATCGCTATAACGGAGGGCCAATCGAACCGCCCCACTTTCATCAATACGCCGCAGAGCGACCACTTGAACGCGGCTGGTACTTCAGTAGACTGTAACTGGCGAAACATGGCGTGCAGGAGGAACTTCGCGAGCCACGCCGAAAGTCAGTGCATCACGATAAGACCAAACAGTTATGCGCGACGTGGGGTACGCTGGTCACTTCGTATAGGACTTCAAATCGAATTTGAGCCGTCAGGGATTAGACTACACCACGCTCTCTCACATAACTCAACCCAGAAGGAGAGCCTTTACGGCAGTAACAGTTCTCCTACGGTAATCCAGCGCCAGTCCCTCACCAGGACCTGCTGGCTACTTTTGACCCACTTTAGCAGGGGTGTCCACCATACAAAAAGCTCCGCCCGTGAGGGTGCGGGTACTAGCGGGATCCGCAAGTCGTCAAGGCTTACTCAGATATGCTCAGCAACTCTATACTGTCGCCTCGTCGGGCCCGCTCATCTAAGCAACTATAGTGAAAGAAACTGGTCCCGAGCCGTGTCCGTGAGGCACTGGGTGTGTACGCATCGCCCCATGTGCTGTGGGTGGATCGCAGGAGCGGATCGCAGAACCTGCGTAAACGGGGAGTACCTTTTGATCTGGCCTGTACAAGCTAGATATAGGCCAATGTGCAAACACCAATATACGTTTCTCTTCTACTGCCCGAACGACGCCTCCCGTGCACCCACGAAGCATACAGTCGAGATGGCGTACGATGCTACATGGAGGATGATGTTTCGGTCCACAGTCTGGACGGTCAGAGCTCGGATAAAGCACTTGAGGCACTATCGGCCGATTAGCCAACTACGCCGCTCCTTCCGGCGAGCTTACCTTAGTTGCCTACTCGTCACGATTGACGATCTGGATTTTGCGCCACTGGACGTTCAATACCGGTCGCGTGCGTCGTGGGAACCCAAGTCTAATGAATTTATTAGGGCCGCCCAACATCCTGAGATTCCCGTCGACACGCCGTGTTTCCGACGCCGAGATCGATCCAAGCGGCCTCCAGAGGGGTGCGTAGCTGACAACCCCGCAGAAGGAGGTTGCTCACACATCGGTATCGGTGGCCCAACTGTTTCCTGCGAAGAGCGGAGGATGGTGGGGCGGCTCAGAAAAGACGAGCTTGCAAGAAGTTCCGCGACGCGGGAGAATATACGTTCTTCATTAGCTACTTCTGGCCCCGATTGGCTGCAAATTCACTCTGTTCACCAATGCCCCTCTAAAACAACTAATAACGATTGTCGTATGCTACAAGATTTTATTTTAGAGCTATGCCCTTCGGGCGGGGCCTACAGAGCGCACCAATTTACGCTAACCAAAGACGATAAGAGCGATTACGCTGCGGCTGCGGGGTTTTCTATACTAGCAACCCGCGCCCGAAAT
TATTGTTTGGTGCCACATCGGTGTCAGTTTCCGCAGTCAACTGGAGTAAGGAACCTGCATCGAGCGAGTGTACATAGACCATTAGTAGTGCGTGTTGTACGTATAGCTTTACGCATGATTATAGGATATTGGGCCGCTACACAGACCAGTAATGTTACATACCCCTCGGGTGGCGTCGCAATCGTTGCCCTGTGGGGTCCGGGAGCTGCTCCTCATTCTGGAGTACCATGGAGTGAGGATGGTTGGACCCTTCAGAAACTTGAGGTGCACTACTCCGTGACGCGATTGTCACGGATGGGCCGGAACGCTACAAGGGATCGAGAGAAGGTAGAGTCTGGTGCTGAGTACGCTGTGGATACCACAGCGTCAAGCGCGGGAGGGTGTCGGACGTACACGCCTTTGTATGCGAGGACGAACTATAATGTTATGAAGTTGATGTACACGGTACAAGGAATGTGGGATTCGCGCCGAAAAAGTTTCGTAAGGGATTGGCGCCAATTCGAATCCCTCAGGGGCCATGTTACCTGCCAGTTGATCAACTTTGCGCTACGAACGATGCAATCCGCAAGTCCTGCCATAGCTAGCCGCATCCTTGCACTGAGATCCTCCCGCAGTCCCCGGACGTTGGGGCTCGCCTGGCGCACGCAAGCGAAATCAGACAGTGACCATTCACTGACTTTATGGAGCCGTATGGCGCGGAATAAAGCACACTCGGAATACATGAGCCGATCCCGGGGGTTAGTAGGCTGTTGTTGTGGCACAGGCAGTCGCGTGCGTCGCATATTCTGTATGAGGGCAGCTGCAGTCGGCTCAGACCTCAAACCACACGGTGCAAACCAGACTTATGGCGCCGGATCTCCTCACGCCTCAGCTTGTGTCAGTGCTGGGCCTGAAAGGAGCTATCGCGTCTATAGATTGAGAGTCGCAGGTGGAGACACCTGTGACGCGGCCATCCTAGACACGCTTTCAAGATCCCTTATCCGTTCCTGGGTTTGTGCTGGCCGACGCCGAGAACTCATCCCACGCCTGCCTAAGCTTGCCTCACGTTCAAGAGCGGGCCTAATAACCAATTCTATCTACAAAGCGCCCACTGCGGCACGAGCTGTGCGGTCTACGGCACGCGCTAGGTCGAGTAGGACCTTCCCATTGCCACTGACTCTTGCGTCCGTTTGTCTAGCGATTGATATCCGTGGCAAAGATCGCAGACGTAGAAGCACTGAAGACGTTCACTTGCAGACCATGAACATTCTTCATCCGGCTTATATAAAAATAGTAGGGCTGCCCTTACACTCAGATACTCAACTTCGTTCAAGATCCAGAAGATTCTATTGGAAAGCAGAAAAGGCTAGTAAGTATGTTAGTCGAGATCCGTATGTAATATTCGACCGAGCCTTCTCGATACAAGGAGCGTCTAATGTCGTAACGTCTTATTGCCCTGCCCCTGAGATGGGTAAGACGCTTTACATTGAAGGTGAGGATGCACGAAGTAGTGTTGAGCTGCTAATTCGGGGTCTGACCCGAATACTGCCACGCCTGCGAACGGGGCAGTCTCGACCAGCACGGGATTGCAACCAGCTAACTACGGTAGTGCGGGGACCTCACAGTAGGCAAGAGTTCGACTTAGGAAAGCGTATTCAAATCGTTAAGCGTTTAGGGAGATGGCGGTACGTACTCAACTGTCACGTGCAATCGTGCGTATGTGCACGCCTGAGGAGTAGTGATAGATGGGACTCGAGGACTAAGAAGCGGAGGTTTAGGCCAGATGTCGGCAACCTTAAGGCCTCTTGGGGAACACAATGGCACTCCGCGCATAGTATGGAGTATCACGAAGAGCTAGCATGCCCCGCCAGAGTAGTTCCCGCAATCTCTCGACGGTGGGATGCTGGACGTCTGGCCGTGGGTAGGTCGCAAACCTGCGTCTCACACGAGAAGACTTTAGCAAATTCATGCGAGGCGCGTGTAGACCCTCTTAGCGGGACAAACAGTACCGTTAAATTGTT
GAAAGTTGACGAGAGCCACACAACAATTGAGACTCGTGGTAACTTAAAAGTAGTTGGTGCGCGGCTTGTTCTACCCCCTCTCGATGATCTCCGAGGCGGCTCCGGCGGTGTCTTATTTGCAAGCCGTGTCCTAGACATTCCCAAAGTGGGCGGGGTAGTAAGCGCGCTCCATCATCGGTCGCCACGAAAATACGTTGGACAGGGCTGGCGTCCCTCGGTGGGAGTAATCATTTCGAGCGAATACATTGTAGGCATTGCTGTTTTCTACACTAACGTATACTTCGCAACGTCTTACGGTCAACACCAGACCCAGGTCCCGATTAGTCGTGTATTATGTGACGGTTATGCTCTCGGGACCTTAAGTCTATGTGCTGTAACAGACCGCTTCGGTGGTAGTACACACTCGCCAGTACGCATCGCCGGTAGTGTACAGCCCACCCTGATAGGCGGCGAACGAGACCGGTTACATAGTCGTTTATATGTCGAGCCAACAAGATATTTTTTCAGTTGCCTCAGCTCGGGCACGGGAATGACAACCAATCGTCCTCAAACTAGCGGCCGGTGCTATCTGAGTAAGGAACATAATGGTTCGACGGATCATAGCACCATCAGAGTCTGCCAGGAGTCCAAGAACGATATTGGTAGGTGCCTGAATCGAGGTTCGAGATATATGTCAGCAGTCCGGTCTAGGAGTTTTCACAGCTACCTCTTAAGGAACTGTTCTTACTTTTTAACCGGTAGAGCACCAAGAGTTGCCGCACATCGATTTGATTGCCATAAGTCTACTATGTGCTTCCCGACATTGCGGCACACAACAGGGGTTTTGGCAGTCAGGAGCCCGCTATATTTGACAGGGTCACGAACCAATGACAATTCAGGACTTTTGGCGGGTATAGCAAGATACCCACAGTCGGGTCACACTCGAGTGGACGTACATGCAAAGGGTAATGTCCACAGAAATGCGATGATTAGTAAGCACCACGGGCGTGTCGGTGGCGCAGTCGACAAATTGACAAGTAGGCCGGGTTACGGGATGACGGAGCATACCGTCGGCCTTGAGGAATCCAAAAACCCCGACAATAGTATTTACGAGCAATTTACTTCACGATTCGTATGCGACACGGCTTTCGACCAGTGGTCCAATGACGTTAGACTTTCTGCAGGCTGCCACTTCATATTAGTTTCCCACAACTTGATGGGCGGGGGAAAGATGCCTGGTAATACGAACCCAGCGGGGATCACGGCCTGCCGCTATGGAAATATTTATGCTTCAGGTCCTACCAGTCGCGGGTTCAGAGCTACACGACTTCAAGCGGAGGGTCGCCGCACAAGAGCTTTTGGTCCCGCACAGTTTGATTATTTCCAGACCTTGCAGTTTATCCTAAAGAGAGCGTTTGTGTCCCGACGTACAACCAGAAACCGTAAGGCCCAGGCATTCGATTTATATGCGAGCCCAGATTTCGGGTTCCCGGTTAGACATTCATTCCATGCACGGACTGTAGAAGGATCCTTACGTGAACCGGTGCACCGGTTTATCAGCCGCCCACCGTATGCATTAGCTTTACGTTCTTTCCGGCCATCGAGGGAGTCTAGTGTTCTGGAGGATAAAGTGGCTCACAACCCGCTAGGGTATGCTTCTGACAAGCACCCTCGGGTGACCATGGCTGGCATGTTAGGAATTGTGGCCCAACGGTGGGGACCCTCCCCCATGGTTGTTGATTTCGGAGAACCGTGTGACAGCGCCCCGACTGGAAACCCCTGGGCCCAAAGTTCGTTGACTATCAGTTGTGCCCACTGTCCCAAGTGGATTCGTCGGGTGATGGGGTGTTCGTTTTCGTATAAGTGGTATGTTGGTGTAGTTTTGGTGGAGGGCGTGCACTGCCTCTTAGCTGCGCGTTTCGTCGCTCACCCGAGTGACCAACGTGCAGAGCACCCAGGACATCAATCAGGCTACTGCTCACATGTTGATGGACAGACGATAGTAGACCAGCAGTTACAAC
TAAGAATTCCTACTTAG"
output="MWIRKSDSSGIYVYDMDWDRFRFTLLRIFEQWLSLGRSVQVERRSSRPVARNMSSESVKDHQFQTHKTHTRNLSRLACNRKLPRGAIIRWHDLSKPRLILTLSSRIDSGQSRLVRCVRCWVDDILAWRNNVPKESELGLVMHEYGHTAMYNSPSAGSTQGDTLVLENGFILPTRYTVTFECLLRRSKACGRGCQRGGPQGRLPSPGQRATAVRVRERIACLLIGPSTDSAEIGYLPRLTRSNIPPVQPTSLQSGIPTPNFRISHPKKMRFFNAPLLNCNAHFNSMGNRQTNPHPPLTGTSCIPPPSRPRFSVTQCASRTGSGITVRPSAPGSSYYDRFHETGRVAANRTRIRLAKGSEHVLSIIAYLSSPRRESHESEVIAVIQYPVHRTRDVASEETTTRSRCPNSQWGRYREGVFIAWCIISTLPLGLDRVGIMGRSNIAVYTVLALGPNYDKTVFWIGRTRSLKLWLTLLNNRITRAFLLRNWIGRHWNPYTASRKQNPRAGRFPLGTVLVVPRSSLRTSAPGPNSASELKLDEHPTTASGLPIHDRALSGPRKMESDTLFKAHIVLHWHIHNDKWRSTYSERKARSRCEKPNVRDGHRWTSTNNKQNSDSGQPACSIVSKNLLNSGPSFRGLGPPQSSLCADTMPRGCNEQAINIVKHPVHGAVMFLENRQPSSMPSLRALRSHCTSHGVSPFRTYWRDRTLGELTISHVSSAGTISLYANVNLGVAVMSPRRNTERTQSAHLYKAADEQPYPGIQTVVSIFCLPTSNPTAEAFVRSVETNHPKVCRLEGPKLLVNFNHSRNLVLVFVCSMEYQQVNILRCWPVERESSSSQQTYHPTRATYLRGSPASAHIMGIAITEGQSNRPTFINTPQSDHLNAAGTSVDCNWRNMACRRNFASHAESQCITIRPNSYARRGVRWSLRIGLQIEFEPSGIRLHHALSHNSTQKESLYGSNSSPTVIQRQSLTRTCWLLLTHFSRGVHHTKSSAREGAGTSGIRKSSRLTQICSATLYCRLVGPAHLSNYSERNWSRAVSVRHWVCTHRPMCCGWIAGADRRTCVNGEYLLIWPVQARYRPMCKHQYTFLFYCPNDASRAPTKHTVEMAYDATWRMMFRSTVWTVRARIKHLRHYRPISQLRRSFRRAYLSCLLVTIDDSDFAPSDVQYRSRASWEPKSNEFIRAAQHPEIPVDTPCFRRRDRSKRPPEGCVADNPAEGGCSHIGIGGPTVSCEERRMVGRLRKDELARSSATRENIRSSLATSGPDWSQIHSVHQCPSKTTNNDCRMLQDFILELCPSGGAYRAHQFTLTKDDKSDYAAAAGFSILATRARNYCLVPHRCQFPQSTGVRNSHRASVHRPLVVRVVRIALRMIIGYWAATQTSNVTYPSGGVAIVASWGPGAAPHSGVPWSEDGWTLQKLEVHYSVTRLSRMGRNATRDREKVESGAEYAVDTTASSAGGCRTYTPLYARTNYNVMKLMYTVQGMWDSRRKSFVRDWRQFESLRGHVTCQLINFALRTMQSASPAIASRILASRSSRSPRTLGLAWRTQAKSDSDHSSTLWSRMARNKAHSEYMSRSRGLVGCCCGTGSRVRRIFCMRAAAVGSDLKPHGANQTYGAGSPHASACVSAGPERSYRVYRLRVAGGDTCDAAILDTLSRSLIRSWVCAGRRRELIPRSPKLASRSRAGLITNSIYKAPTAARAVRSTARARSSRTFPLPSTLASVCLAIDIRGKDRRRRSTEDVHLQTMNILHPAYIKIVGSPLHSDTQLRSRSRRFYWKAEKASKYVSRDPYVIFDRAFSIQGASNVVTSYCPAPEMGKTLYIEGEDARSSVESLIRGSTRISPRSRTGQSRPARDCNQLTTVVRGPHSRQEFDLGKRIQIVKRLGRWRYVLNCHVQSCVCARSRSSDRWDSRTKKRRFRPDVGNLKASWGTQWHSAHSMEYHEELACPARVVPAISRRWDAGRSAVGRSQTCVSHEKTLANSCEARVDPLSGTNS
TVKLLKVDESHTTIETRGNLKVVGARLVLPPLDDLRGGSGGVLFASRVLDIPKVGGVVSALHHRSPRKYVGQGWRPSVGVIISSEYIVGIAVFYTNVYFATSYGQHQTQVPISRVLCDGYALGTLSLCAVTDRFGGSTHSPVRIAGSVQPTSIGGERDRLHSRLYVEPTRYFFSCLSSGTGMTTNRPQTSGRCYSSKEHNGSTDHSTIRVCQESKNDIGRCSNRGSRYMSAVRSRSFHSYLLRNCSYFLTGRAPRVAAHRFDCHKSTMCFPTLRHTTGVLAVRSPLYLTGSRTNDNSGLLAGIARYPQSGHTRVDVHAKGNVHRNAMISKHHGRVGGAVDKLTSRPGYGMTEHTVGLEESKNPDNSIYEQFTSRFVCDTAFDQWSNDVRLSAGCHFILVSHNLMGGGKMPGNTNPAGITACRYGNIYASGPTSRGFRATRLQAEGRRTRAFGPAQFDYFQTLQFILKRAFVSRRTTRNRKAQAFDLYASPDFGFPVRHSFHARTVEGSLREPVHRFISRPPYALALRSFRPSRESSVSEDKVAHNPLGYASDKHPRVTMAGMLGIVAQRWGPSPMVVDFGEPCDSAPTGNPWAQSSLTISCAHCPKWIRRVMGCSFSYKWYVGVVLVEGVHCLLAARFVAHPSDQRAEHPGHQSGYCSHVDGQTIVDQQLQLRIPT"

# NCBI genetic-code table ids to try.
table_ids = [1,2,3,4,5,6,9,10,11,12,13,14,15,16,21,22,23,24,25,26,27,28,29,30,31]

# Translate the full sequence (stops kept as "*") once per table.
translations = [
    (translate(sequence, stop_symbol="*", to_stop=False, table=table_id), table_id)
    for table_id in table_ids
]

# Score each table by how many positions agree with the expected protein.
counts = []
for translation, table_id in translations:
    # zip() stops at the shorter string, so a translation shorter than
    # `output` no longer raises IndexError; positions past len(output) are
    # ignored exactly as before.
    # NOTE(review): a "*" in the translation is scored as a match, as in the
    # original logic. That credits tables that turn a residue into a stop;
    # confirm this is intended.
    matches = sum(
        1
        for residue, expected in zip(translation, output)
        if residue == "*" or residue == expected
    )
    # A dedicated name avoids rebinding `count`, which another cell defines
    # as a function.
    counts.append((table_id, matches))
print(counts)


In [None]:
# Filtering bad quality reads from fastq files
q=19
p=83
from Bio import SeqIO
import statistics
records=SeqIO.parse("fastq_file", format="fastq")

# Number of reads in which at least p percent of the bases have quality >= q.
counter=0
for record in records:
    qualities = record.letter_annotations["phred_quality"]
    length = len(record.seq)
    if length == 0:
        # An empty read has no bases to assess; skip it rather than divide by zero.
        continue
    passing = sum(1 for score in qualities if score >= q)
    # Cross-multiplied form of passing/length*100 >= p. It avoids the float
    # rounding of the division (e.g. 57/100*100 == 56.99999999999999), which
    # could reject a read sitting exactly on the threshold.
    if passing * 100 >= p * length:
        counter=counter+1

print(counter)
        

In [21]:
# Counting sequences that are equal to their own reverse complement

from Bio.Seq import Seq
from Bio import SeqIO

# Materialise the parser output: SeqIO.parse returns a one-shot iterator, so a
# later cell looping over `record` would otherwise see nothing after this one.
# NOTE(review): absolute, machine-specific path; consider a path relative to
# the notebook or a configurable data directory.
record=list(SeqIO.parse("/home/kasia/BioPython_projects/Bioinformatics_Armory/data/input",format="fasta"))

# Count the records whose sequence equals its own reverse complement.
# A dedicated name avoids rebinding `count`, which another cell defines as a
# function.
palindrome_count=0
for r in record:
    if r.seq==r.seq.reverse_complement():
        palindrome_count=palindrome_count+1

print(palindrome_count)


1


In [19]:
# Print every item held in `record`, one per print call.
# NOTE(review): `record` is created in another cell; if it is still a parser
# iterator that has already been consumed, this loop prints nothing.
for fasta_record in record:
    print(fasta_record)