# Day 2: Programming Styles exercises

## 1. Functional Programming


Exercise 1: Implement a function gc_content that takes a DNA sequence as a string and returns the GC content using a list comprehension.


In [None]:
def gc_content(seq):
    """Return the GC content of a sequence.

    Args:
        seq: DNA sequence as a string. Matching is case-insensitive.

    Returns:
        Fraction of bases that are G or C, as a float between 0 and 1.
        An empty sequence yields 0.0 instead of raising ZeroDivisionError.
    """
    if not seq:
        return 0.0
    # List comprehension (as the exercise asks): keep only the G/C bases.
    gc_bases = [base for base in seq.upper() if base in ('G', 'C')]
    return len(gc_bases) / len(seq)

Exercise 2: Implement a function codon_usage that takes a DNA sequence as a string and a codon as a string, and returns the frequency of the codon in the sequence using map and filter.

In [None]:
def codon_usage(seq, codon):
    """Return the frequency of a codon in a sequence.

    The sequence is read in frame 0 as consecutive, non-overlapping triplets;
    a trailing partial triplet is ignored. The frequency is the number of
    triplets equal to ``codon`` divided by the total number of triplets, so
    out-of-frame occurrences are not counted.

    Args:
        seq: DNA sequence as a string. Matching is case-insensitive.
        codon: The three-letter codon to look for.

    Returns:
        Fraction of codons equal to ``codon`` (float between 0 and 1), or 0.0
        when the sequence contains no complete codon.
    """
    seq = seq.upper()
    codons = [seq[start:start + 3] for start in range(0, len(seq) - 2, 3)]
    if not codons:
        return 0.0
    return codons.count(codon.upper()) / len(codons)

#same function as above but using map and filter
def codon_usage2(seq, codon):
    """Return the frequency of a codon in a sequence, using map and filter.

    The sequence is split into frame-0 triplets with ``map`` (a trailing
    partial triplet is ignored) and the matching triplets are selected with
    ``filter``. Filtering the raw string would compare single characters
    against the three-letter codon and never match.

    Args:
        seq: DNA sequence as a string. Matching is case-insensitive.
        codon: The three-letter codon to look for.

    Returns:
        Fraction of codons equal to ``codon`` (float between 0 and 1), or 0.0
        when the sequence contains no complete codon.
    """
    seq = seq.upper()
    target = codon.upper()
    codons = list(map(lambda start: seq[start:start + 3],
                      range(0, len(seq) - 2, 3)))
    if not codons:
        return 0.0
    matches = filter(lambda triplet: triplet == target, codons)
    return len(list(matches)) / len(codons)

Exercise 3: Implement a function orf_finder that takes a DNA sequence as a string and finds all open reading frames (ORFs) using filter and a helper function.

In [None]:
def orf_finder(seq):
    """Find all open reading frames in a sequence.

    Scans the three forward reading frames of the upper-cased sequence. Every
    in-frame ``ATG`` is treated as a start; the ORF runs up to and including
    the first in-frame stop codon (``TAA``, ``TAG`` or ``TGA``) after it.
    Starts with no in-frame stop are skipped.

    Returns:
        List of ORF strings, ordered by frame and then by start position.
    """
    dna = seq.upper()
    length = len(dna)
    stop_codons = {'TAA', 'TAG', 'TGA'}
    found = []
    for frame in (0, 1, 2):
        start = frame
        while start < length:
            if dna[start:start + 3] == 'ATG':
                # Walk codon by codon until a stop codon or the end of the sequence.
                end = start
                while end < length and dna[end:end + 3] not in stop_codons:
                    end += 3
                if end < length:
                    found.append(dna[start:end + 3])
            start += 3
    return found

#same function but with list comprehension
def orf_finder2(seq):
    """Find all open reading frames in a sequence.

    Scans the three forward reading frames of the upper-cased sequence. Each
    in-frame ``ATG`` starts an ORF that runs up to and including the first
    in-frame stop codon after it. The stop positions for a start are collected
    with a list comprehension.

    A start codon with no in-frame stop is skipped (indexing ``[0]`` on the
    empty list used to raise IndexError), and every start in a frame is
    examined rather than only the first one.

    Returns:
        List of ORF strings, ordered by frame and then by start position.
    """
    seq = seq.upper()
    stop_codons = ('TAA', 'TAG', 'TGA')
    orfs = []
    for i in range(3):
        for j in range(i, len(seq), 3):
            if seq[j:j+3] != 'ATG':
                continue
            stops = [k for k in range(j, len(seq), 3) if seq[k:k+3] in stop_codons]
            if stops:
                # Only the nearest stop codon terminates this ORF.
                orfs.append(seq[j:stops[0]+3])
    return orfs

def orf_finder3(seq):
    """Find all open reading frames in a sequence.

    Comprehension-based variant. For every in-frame ``ATG`` in the three
    forward frames, the ORF ends at the *first* in-frame stop codon. Pairing a
    start with every later stop codon would yield sequences that read through
    a stop.

    Returns:
        List of ORF strings, ordered by frame and then by start position.
        Starts without an in-frame stop codon are omitted.
    """
    seq = seq.upper()
    stop_codons = ('TAA', 'TAG', 'TGA')

    def first_stop(start):
        # Position of the nearest in-frame stop codon, or None if there is none.
        return next((k for k in range(start, len(seq), 3)
                     if seq[k:k+3] in stop_codons), None)

    spans = [(j, first_stop(j)) for i in range(3)
             for j in range(i, len(seq), 3) if seq[j:j+3] == 'ATG']
    return [seq[j:k+3] for j, k in spans if k is not None]

from itertools import chain
def orf_finder4(seq):
    """Find all open reading frames in a sequence using filter and map.

    Start positions are selected with ``filter`` in each of the three forward
    frames. Each start is paired with its first in-frame stop codon, and the
    pairs are mapped to the ORF sequence from the start codon through the stop
    codon. The result is the ORFs themselves, not the stop codons, and only
    the nearest stop is used.

    Returns:
        List of ORF strings, ordered by frame and then by start position.
        Starts without an in-frame stop codon are omitted.
    """
    seq = seq.upper()
    stop_codons = ('TAA', 'TAG', 'TGA')

    def is_start(pos):
        return seq[pos:pos+3] == 'ATG'

    def is_stop(pos):
        return seq[pos:pos+3] in stop_codons

    def first_stop(start):
        # Nearest in-frame stop position at or after `start`, or None.
        return next(filter(is_stop, range(start, len(seq), 3)), None)

    # Start positions, frame by frame.
    starts = chain.from_iterable(
        filter(is_start, range(frame, len(seq), 3)) for frame in range(3))

    # (start, stop) pairs, dropping starts that never reach a stop codon.
    spans = filter(lambda span: span[1] is not None,
                   map(lambda start: (start, first_stop(start)), starts))

    # Map each span to its sequence, stop codon included.
    return list(map(lambda span: seq[span[0]:span[1]+3], spans))

## 2. OOP: Inheritance vs Composition

Exercise 1: Create a simple DNA class, then create an RNA class that inherits from DNA and adds an extra method to convert the DNA sequence to RNA.

In [None]:
class DNA:
    """Mutable holder for a DNA sequence; its operations rewrite ``sequence`` in place."""

    def __init__(self, sequence):
        self.sequence = sequence

    def reverse(self):
        """Replace the stored sequence with its reverse."""
        flipped = self.sequence[::-1]
        self.sequence = flipped

    def complement(self):
        """Replace every base with its Watson-Crick partner.

        Raises:
            KeyError: if the sequence contains anything other than A, T, C or G.
        """
        partner = dict(zip('ATCG', 'TAGC'))
        self.sequence = ''.join(map(partner.__getitem__, self.sequence))

#create RNA that inherits from DNA and has a method to transcribe DNA to RNA
class RNA(DNA):
    """DNA subclass that can rewrite its sequence as RNA."""

    def transcribe(self):
        """Replace every ``T`` in the stored sequence with ``U``, in place."""
        transcribed = self.sequence.replace('T', 'U')
        self.sequence = transcribed



Exercise 2: Implement a Gene class that uses a DNA instance (composition) and adds additional gene-specific properties and methods.

In [None]:
class Gene:
    """A named gene with its nucleotide sequence and basic sequence utilities.

    Attributes are plain instance fields: ``name`` (label of the gene) and
    ``sequence`` (the nucleotide string). ``reverse``, ``complement`` and
    ``transcribe`` modify ``sequence`` in place.
    """

    def __init__(self, name, sequence):
        self.name = name
        self.sequence = sequence

    def __len__(self):
        """Return the number of bases in the sequence."""
        return len(self.sequence)

    def __repr__(self):
        return f'Gene({self.name}, {self.sequence})'

    def __str__(self):
        return f'Gene {self.name} with sequence {self.sequence}'

    def __eq__(self, other):
        """Compare by sequence only; the name is ignored.

        Returns NotImplemented for objects without a ``sequence`` attribute so
        that comparisons such as ``gene == 5`` evaluate to False instead of
        raising AttributeError.
        """
        if not hasattr(other, 'sequence'):
            return NotImplemented
        return self.sequence == other.sequence

    def reverse(self):
        """Reverse the sequence in place."""
        self.sequence = self.sequence[::-1]

    def complement(self):
        """Replace each base with its complement in place.

        Raises:
            KeyError: if the sequence contains a character other than A, T, C, G.
        """
        complement = {'A': 'T', 'T': 'A', 'C': 'G', 'G': 'C'}
        self.sequence = ''.join([complement[base] for base in self.sequence])

    def transcribe(self):
        """Replace every T with U in place."""
        self.sequence = self.sequence.replace('T', 'U')

    def gc_content(self):
        """Return the fraction of G and C bases, or 0.0 for an empty sequence."""
        if not self.sequence:
            return 0.0
        return (self.sequence.count('G') + self.sequence.count('C')) / len(self.sequence)

    def gc_content_subsec(self, k=20):
        """Return the GC fraction of each consecutive window of ``k`` bases.

        Windows do not overlap; a trailing window shorter than ``k`` is dropped.
        """
        gc = []
        for i in range(0, len(self.sequence) - k + 1, k):
            subsec = self.sequence[i:i+k]
            gc.append((subsec.count('G') + subsec.count('C')) / k)
        return gc

    def codons(self):
        """Split the sequence into frame-0 triplets; the last one may be shorter."""
        return [self.sequence[i:i+3] for i in range(0, len(self.sequence), 3)]

Exercise 3: Refactor your Gene class so that it inherits from DNA and compare this design with the composition version.

In [None]:
#again Gene class, but this time inheriting from DNA
class Gene(DNA):
    """A named gene built on DNA; sequence storage is delegated to the base class.

    Adds a ``name`` attribute plus length, comparison and GC-content helpers
    on top of whatever the DNA base class provides.
    """

    def __init__(self, name, sequence):
        super().__init__(sequence)
        self.name = name

    def __len__(self):
        """Return the number of bases in the sequence."""
        return len(self.sequence)

    def __repr__(self):
        return f'Gene({self.name}, {self.sequence})'

    def __str__(self):
        return f'Gene {self.name} with sequence {self.sequence}'

    def __eq__(self, other):
        """Compare by sequence only; the name is ignored.

        Returns NotImplemented for objects without a ``sequence`` attribute so
        that comparisons such as ``gene == 5`` evaluate to False instead of
        raising AttributeError.
        """
        if not hasattr(other, 'sequence'):
            return NotImplemented
        return self.sequence == other.sequence

    def gc_content(self):
        """Return the fraction of G and C bases, or 0.0 for an empty sequence."""
        if not self.sequence:
            return 0.0
        return (self.sequence.count('G') + self.sequence.count('C')) / len(self.sequence)

    def gc_content_subsec(self, k=20):
        """Return the GC fraction of each consecutive window of ``k`` bases.

        Windows do not overlap; a trailing window shorter than ``k`` is dropped.
        """
        gc = []
        for i in range(0, len(self.sequence) - k + 1, k):
            subsec = self.sequence[i:i+k]
            gc.append((subsec.count('G') + subsec.count('C')) / k)
        return gc

    def codons(self):
        """Split the sequence into frame-0 triplets; the last one may be shorter."""
        return [self.sequence[i:i+3] for i in range(0, len(self.sequence), 3)]

## 3. OOP: Polymorphism and Abstract Base Classes

Exercise 1: Create an abstract base class Sequence with an abstract method gc_content. Create DNA and RNA classes that both inherit from Sequence and implement gc_content.

In [None]:
from abc import ABC, abstractmethod

class Sequence(ABC):
    """Abstract base for sequence types; subclasses must supply ``gc_content``."""

    def __init__(self, sequence):
        # Stored exactly as given; no validation happens here.
        self.sequence = sequence

    @abstractmethod
    def gc_content(self):
        """Return the GC content of the sequence (implemented by subclasses)."""
        return None

class DNA(Sequence):
    """Concrete Sequence for DNA that implements ``gc_content``."""

    def __init__(self, sequence):
        super().__init__(sequence)

    def gc_content(self):
        """Return the fraction of G and C characters in the sequence.

        Raises:
            ZeroDivisionError: if the sequence is empty.
        """
        bases = self.sequence
        gc_total = bases.count('G') + bases.count('C')
        return gc_total / len(bases)

class RNA(Sequence):
    """Concrete Sequence for RNA that implements ``gc_content``."""

    def __init__(self, sequence):
        super().__init__(sequence)

    def gc_content(self):
        """Return the fraction of G and C characters in the sequence.

        Raises:
            ZeroDivisionError: if the sequence is empty.
        """
        gc_total = sum(self.sequence.count(base) for base in ('G', 'C'))
        return gc_total / len(self.sequence)


Exercise 2: Add a transcribe method to the Sequence abstract base class. Implement it in the DNA class but not in the RNA class.

In [2]:
from abc import ABC, abstractmethod

class Sequence(ABC):
    """Abstract base for sequence types.

    Subclasses must implement both ``gc_content`` and ``transcribe`` before
    they can be instantiated.
    """

    def __init__(self, sequence):
        # Stored exactly as given; no validation happens here.
        self.sequence = sequence

    @abstractmethod
    def gc_content(self):
        """Return the GC content of the sequence (implemented by subclasses)."""
        return None

    @abstractmethod
    def transcribe(self):
        """Return the transcribed sequence (implemented by subclasses)."""
        return None

class DNA(Sequence):
    """Concrete Sequence for DNA: supports GC content and transcription."""

    def __init__(self, sequence):
        super().__init__(sequence)

    def gc_content(self):
        """Return the fraction of G and C characters in the sequence.

        Raises:
            ZeroDivisionError: if the sequence is empty.
        """
        bases = self.sequence
        gc_total = bases.count('G') + bases.count('C')
        return gc_total / len(bases)

    def transcribe(self):
        """Return a copy of the sequence with every ``T`` replaced by ``U``.

        The stored sequence itself is left unchanged.
        """
        rna_text = self.sequence.replace('T', 'U')
        return rna_text
    

class RNA(Sequence):
    """Concrete Sequence for RNA: GC content is supported, transcription is not."""

    def __init__(self, sequence):
        super().__init__(sequence)

    def gc_content(self):
        """Return the fraction of G and C characters in the sequence.

        Raises:
            ZeroDivisionError: if the sequence is empty.
        """
        gc_total = sum(self.sequence.count(base) for base in ('G', 'C'))
        return gc_total / len(self.sequence)

    def transcribe(self):
        """Always raise NotImplementedError: RNA cannot be transcribed."""
        reason = "Transcription does not apply to RNA sequences"
        raise NotImplementedError(reason)



Exercise 3: Extend your Sequence hierarchy with a Protein class and make sure that it's impossible to call transcribe on a Protein object.

In [None]:
class Protein(Sequence):
    """Concrete Sequence for proteins; both nucleotide operations are rejected."""

    def __init__(self, sequence):
        super().__init__(sequence)

    def gc_content(self):
        """Always raise NotImplementedError: GC content is undefined for proteins."""
        reason = "GC Content does not apply to protein sequences"
        raise NotImplementedError(reason)

    def transcribe(self):
        """Always raise NotImplementedError: proteins cannot be transcribed."""
        reason = "Transcription does not apply to protein sequences"
        raise NotImplementedError(reason)

## 4. Instance, Class and Static Methods:

Exercise 1: Add an instance method count_amino_acids to a Protein class that counts the number of each amino acid in a protein.

Exercise 2: Add a class method from_pdb to your Protein class that constructs a Protein object from a PDB file.

Exercise 3: Add a static method validate to your Protein class that checks whether a string is a valid amino acid sequence.


In [None]:
from Bio.PDB import PDBParser

class Protein:
    """A protein stored as a string of one-letter amino acid codes."""

    # Standard residue names (as found in PDB files) -> one-letter codes.
    _THREE_TO_ONE = {
        'ALA': 'A', 'ARG': 'R', 'ASN': 'N', 'ASP': 'D', 'CYS': 'C',
        'GLN': 'Q', 'GLY': 'G', 'GLU': 'E', 'HIS': 'H', 'ILE': 'I',
        'LEU': 'L', 'LYS': 'K', 'MET': 'M', 'PHE': 'F', 'PRO': 'P',
        'SER': 'S', 'THR': 'T', 'TRP': 'W', 'TYR': 'Y', 'VAL': 'V',
    }

    def __init__(self, sequence):
        self.sequence = sequence

    def count_amino_acids(self):
        """Return a dict mapping each residue in the sequence to its count."""
        counts = {}
        for res in self.sequence:
            counts[res] = counts.get(res, 0) + 1
        return counts

    @classmethod
    def read_pdb_sequence(cls, pdb_file):
        """Read the amino acid sequence from a PDB file.

        Residues from every chain of the first model are concatenated in file
        order and converted to one-letter codes, which is the representation
        ``count_amino_acids`` iterates over. Residues that are not one of the
        20 standard amino acids (waters, ligands, ...) are skipped.

        Args:
            pdb_file: Path (or handle) accepted by ``PDBParser.get_structure``.

        Returns:
            The sequence as a string; empty if the structure has no standard
            residues.
        """
        parser = PDBParser(QUIET=True)  # Quiet mode to avoid warnings
        structure = parser.get_structure('PDB', pdb_file)
        # Use the first model only, so multi-model files do not repeat the sequence.
        first_model = next(iter(structure), None)
        if first_model is None:
            return ""
        letters = []
        for pdb_chain in first_model:
            for residue in pdb_chain:
                letter = cls._THREE_TO_ONE.get(residue.get_resname())
                if letter is not None:
                    letters.append(letter)
        return "".join(letters)

    @classmethod
    def from_pdb(cls, pdb_file):
        """Alternate constructor: build a Protein from the sequence in a PDB file."""
        return cls(cls.read_pdb_sequence(pdb_file))

    @staticmethod
    def validate_protein_sequence(sequence):
        """Check that every element of ``sequence`` is a standard amino acid.

        Accepts a string of one-letter codes, or an iterable of one-letter or
        three-letter codes. Comparing the characters of a string against
        three-letter names only would reject every string.

        Returns:
            True when all elements are valid.

        Raises:
            ValueError: naming the first element that is not a valid amino acid.
        """
        three_letter = Protein._THREE_TO_ONE
        one_letter = set(three_letter.values())
        for aa in sequence:
            if aa not in one_letter and aa not in three_letter:
                raise ValueError(f"{aa} is not a valid amino acid")
        return True

    # Short name requested by the exercise; same behavior as the long one.
    validate = validate_protein_sequence
    

## 5. Context Managers

Exercise 1: Write a context manager to open and automatically close a PDB file.


In [None]:
class PDBManager:
    """Context manager that opens a PDB file on entry and closes it on exit.

    NOTE(review): an exception raised inside the ``with`` body is printed and
    then suppressed, because ``__exit__`` returns True in that case. Confirm
    that callers really want errors swallowed.
    """

    def __init__(self, pdb_file):
        self.pdb_file = pdb_file

    def __enter__(self):
        """Open the file in the default text mode and hand back the file object."""
        handle = open(self.pdb_file)
        self.file = handle
        return handle

    def __exit__(self, exc_type, exc_value, exc_traceback):
        """Close the file; report and suppress any exception from the body."""
        self.file.close()
        if not exc_type:
            return None
        report = (
            f"Exception of type {exc_type} occurred",
            f"Exception value: {exc_value}",
            f"Exception traceback: {exc_traceback}",
        )
        for line in report:
            print(line)
        return True

Exercise 2: Write a context manager that prints the progress of reading a directory of PDB files in terms of the percentage of the files processed.

In [None]:
import tqdm
import os

class PDBProgress:
    """Context manager that shows a tqdm progress bar for reading a PDB file.

    The bar's total is the file size in bytes; inside the ``with`` block the
    caller advances it with ``bar.update(n_bytes)``. The bar is closed on exit
    and exceptions from the body propagate unchanged.

    Args:
        pdb_file: Path of the file whose size sets the bar's total.
    """

    def __init__(self, pdb_file):
        self.pdb_file = pdb_file

        # `import tqdm` binds the module, so the progress-bar class is
        # tqdm.tqdm; calling the module itself raises TypeError.
        self.pbar = tqdm.tqdm(total=os.path.getsize(self.pdb_file), unit='B', unit_scale=True, desc=self.pdb_file)

    def __enter__(self):
        """Return the progress bar so the caller can update it."""
        return self.pbar

    def __exit__(self, exc_type, exc_value, exc_traceback):
        """Close the progress bar; never suppress exceptions."""
        self.pbar.close()
        return False
    

Exercise 3: Write a context manager that temporarily changes the working directory to a specified path and then changes it back, regardless of whether an error occurred within the context block. Test this with some PDB file operations.

## 6. Type Hints



Exercise 1: Add type hints to your Protein class from the previous exercises.


Exercise 2: Write a function read_pdb that takes a file path as input and returns an object of class Protein. Use type hints for the function signature.

Exercise 3: Add type hints to the context managers you wrote in the previous exercises, using the ContextManager generic class from the typing module.