In [29]:
# 1
'''
Let's remeber how to use dictionaries.
Task: return a dictionary where 
    * keys are IDs of seqs from an input fasta file (prot.fasta),
    * key's values are seqs itself. 
'''

#!pip install Bio


from Bio import SeqIO



def my_own_fasta_parser(inFile):
  sequences = {}
  with open(inFile, "r") as content:       
    for seq_record in SeqIO.parse(content, "fasta"):
      sequences[seq_record.id]=seq_record.seq
    return sequences

my_own_fasta_parser("prot.fasta")





{'seq0': Seq('FQTWEEFSRAAEKLYLADPMKVRVVLKYRHVDGNLCIKVTDDLVCLVYRTDQAQ...EKF'),
 'seq1': Seq('KYRTWEEFTRAAEKLYQADPMKVRVVLKYRHCDGNLCIKVTDDVVCLLYRTDQA...TLM'),
 'seq10': Seq('FDSWDEFVSKSVELFRNHPDTTRYVVKYRHCEGKLVLKVTDNHECLKFKTDQAQ...MEK'),
 'seq2': Seq('EEYQTWEEFARAAEKLYLTDPMKVRVVLKYRHCDGNLCMKVTDDAVCLQYKTDQ...HGK'),
 'seq3': Seq('MYQVWEEFSRAVEKLYLTDPMKVRVVLKYRHCDGNLCIKVTDNSVCLQYKTDQAQDVK'),
 'seq4': Seq('EEFSRAVEKLYLTDPMKVRVVLKYRHCDGNLCIKVTDNSVVSYEMRLFGVQKDN...SLL'),
 'seq5': Seq('SWEEFAKAAEVLYLEDPMKCRMCTKYRHVDHKLVVKLTDNHTVLKYVTDMAQDV...LMR'),
 'seq6': Seq('FTNWEEFAKAAERLHSANPEKCRFVTKYNHTKGELVLKLTDDVVCLQYSTNQLQ...RSI'),
 'seq7': Seq('SWEEFVERSVQLFRGDPNATRYVMKYRHCEGKLVLKVTDDRECLKFKTDQAQDA...IFF'),
 'seq8': Seq('SWDEFVDRSVQLFRADPESTRYVMKYRHCDGKLVLKVTDNKECLKFKTDQAQEA...TLM'),
 'seq9': Seq('KNWEDFEIAAENMYMANPQNCRYTMKYVHSKGHILLKMSDNVKCVQYRAENMPDLKK')}

In [26]:
# 2
'''
Another super easy task.

We have the same fasta file (prot.fasta).
Now we want to get a list with the ids of protein seqs that have 
a relative frequency higher than a given threshold for a given residue.

And don't forget to use my_own_fasta_parser function from a previous task!
'''
from collections import Counter
def my_own_residue_abundance(input_file, residue, threshold=0.2):
    
    seq_ids = []
    sequences = my_own_fasta_parser(input_file)

    for seq_id, sequence in sequences.items():
      if (Counter(sequence)[residue]/len(sequence)>=threshold):
        seq_ids.append(seq_id)
    return seq_ids

my_own_residue_abundance("prot.fasta", "N", 0.1)


['seq9']

In [97]:
# 3
'''
Let's practice in Bio package using.
Task:
1. read fasta file containing several DNA seqs (nucl.fasta)
2. subset seqs that have GC content > 45 and coding protein with aromaticity > 0.01
3. write a new fasta file with those protein(!) seqs
4. return the percentage of seqs that passed your filter
Hint: Bio.SeqIO, Bio.SeqRecord, Bio.SeqUtils
'''

from Bio import SeqIO
from Bio.SeqUtils.ProtParam import ProteinAnalysis
from Bio.SeqUtils import GC
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord


def my_own_filtering(input_file, output_file, filt_gc=45, filt_arom=0.125):  
    sequences = {}
    c = 0   
    with open(input_file, "r") as content:     
        for record in SeqIO.parse(content, "fasta"):
            c+=1
            calc_gc=GC(record.seq)
            protein=ProteinAnalysis(str(record.seq.translate()))
            calc_arom=protein.aromaticity()
            if calc_gc >= filt_gc and calc_arom >= filt_arom:
              sequences[record.id]=record.seq
    

    records = []
    for seq_id, seq in sequences.items():
        rec = SeqRecord(Seq(seq), id=seq_id, description="")
        records.append(rec)
    handle = open(output_file,"w")
    SeqIO.write(records, handle, "fasta")
    handle.close()
    print (len(records)*100/c)


my_own_filtering("nucl.fasta","output.fasta")


1
14.285714285714286




KeyboardInterrupt: ignored

In [90]:
# 4
"""
Continue practicing in Bio package using:
Task:
complete the following code that should be able to return 
the best alignment of two amino acid seqs (pairwise2.align.globalds)
based on BLOSUM62 matrix from Bio.SubsMat.MatrixInfo.
http://rosalind.info/glossary/blosum62/
"""

from Bio.Align import Applications
from Bio import pairwise2
from Bio.SubsMat import MatrixInfo as matlist
from Bio.pairwise2 import format_alignment


def balign(first_seq, second_seq, gap_open, gap_extend):

    matrix = matlist.blosum62

    alns = pairwise2.align.globalds("ACCGT", "ACG",matrix,gap_open, gap_extend)

    top_aln =format_alignment(*alns[0])

    print(top_aln)


balign("ACCTAGC", "ACC", -1, -1)

ACCGT
| || 
A-CG-
  Score=17



In [117]:
# 5
""" You have some DNA sequence: AGTACTAGAGCATTCTATGGAG.
Find out which peptides could be created from it and sort them by their length.
Use as much Biopython modules as possible.
"""

import Bio
from Bio.Seq import Seq


def proteins(seq):
  proteins = []
  seqRC = seq.reverse_complement()
  proteins.append(str(seq.translate(to_stop = True)))
  proteins.append(str(seqRC.translate(to_stop = True)))
  return(sorted(proteins, key=len))

seq=Seq("AGTACTAGAGCATTCTATGGAG")
proteins(seq)




['LHRML', 'STRAFYG']

In [77]:
# 6
""" TASK: Try to create one-line function (without (!!!) using Bio package) 
that returns reverse complementary to a given sequence. 
Hint: using dictionaty & list comprehensions might be helpful.
"""

def rev_compl_one_line(seq):
    return ''.join([{'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A'}[base] for base in seq[::-1]])

seq= Seq("AGTACACTGGT")


print(rev_compl_one_line(seq))

#from Bio.Seq import Seq
#print(seq.reverse_complement())


ACCAGTGTACT
ACCAGTGTACT
