In [1]:
from typing import Sequence
import pandas as pd
import itertools
from extend_msa import msa
from nw import nw
from read_fasta import read_fasta


Parameters:

In [2]:
# Substitution costs (lower = more similar), generated from the rule behind the table:
#   * 0 when both symbols are the same nucleotide or both are gaps ('A', 'C', 'G', 'T', '-')
#   * 2 for the A<->G and C<->T substitutions
#   * 5 for everything else, including every comparison that involves 'N', 'R' or 'S'
#     (even against themselves)
scoreMatrix = {
    row: {
        col: 0 if row == col and row in 'ACGT-'
        else 2 if {row, col} in ({'A', 'G'}, {'C', 'T'})
        else 5
        for col in 'ACGT-NRS'
    }
    for row in 'ACGT-NRS'
}

# Gap cost handed to the pairwise aligner.
gapCost = 5

### 2-approximation algorithm implementation

In [5]:
def _center_first_columns(alignment, pair: tuple, center_name) -> list[list]:
    """Turn a pairwise alignment into a list of columns with the center sequence's row first.

    `pair` holds the two sequence names in the same order as the aligned rows
    `alignment[0]` and `alignment[1]`.
    """
    if pair[0] == center_name:
        rows = (alignment[0], alignment[1])
    else:
        rows = (alignment[1], alignment[0])

    return list(map(list, zip(*rows)))


def sp_approx(sequences: dict, score_matrix: dict, gap_cost: int) -> tuple[list[str], int]:
    """Build a multiple alignment around a center sequence and score it with sum-of-pairs.

    Steps:
      1. Align every pair of sequences with `nw` and pick as center the sequence
         with the lowest average distance to all sequences.
      2. Seed the alignment with the center and its closest partner, then merge in
         every other (center, sequence) pairwise alignment with `msa`, one at a time.
      3. Sum `score_matrix` over every pair of symbols in every column.

    Progress and intermediate results are printed along the way.

    Args:
        sequences: Mapping from sequence name to sequence.
        score_matrix: Nested dict, `score_matrix[a][b]` is the cost of aligning
            symbol `a` with symbol `b` (must cover '-' because gaps appear in columns).
        gap_cost: Gap cost forwarded to `nw`.

    Returns:
        `(rows, sp_score)`: `rows` is the alignment as one string per sequence, center
        sequence first, and `sp_score` is its sum-of-pairs cost.

    Raises:
        ValueError: If fewer than two sequences are given.

    Note:
        `nw(a, b, score_matrix, gap_cost)` is used as returning
        `(aligned_a, aligned_b, cost)`. `msa(M, A)` is assumed to return the extended
        alignment as a new list of columns, where `M` and `A` are lists of columns
        whose first row is the center sequence -- confirm against extend_msa.py.
    """
    seq_names = list(sequences.keys())

    if len(seq_names) < 2:
        raise ValueError('sp_approx needs at least two sequences to build an alignment')

    # ---- Part 1: Initializing the MSA ----
    print('\033[1mPart 1: Initializing the MSA\033[0m\n')

    alignments = {}

    # a) Finding center sequence from distance matrix and average distances
    print('\t\033[1ma) Finding center sequence from distance matrix and average distances\033[0m\n')

    # Distance matrix
    distanceMatrix = [[0] * len(seq_names) for _ in range(len(seq_names))]

    for i, seq_i in enumerate(seq_names):
        for j, seq_j in enumerate(seq_names):
            alignment = nw(sequences[seq_i], sequences[seq_j], score_matrix, gap_cost)
            distanceMatrix[i][j] = alignment[2]  # the cost is the third element of nw()'s output

            # Keep the full nw() output once per unordered pair for later
            if i < j:
                alignments[(seq_i, seq_j)] = alignment

    print(f'Alignments and scores: {alignments}\n')
    print(f'       \033[1mDistance matrix\033[0m\n{pd.DataFrame(distanceMatrix, index = list(seq_names), columns = list(seq_names))}\n')

    # Average distance from each sequence to all sequences (its own row of the matrix)
    exact_avg = {name: sum(row) / len(row) for name, row in zip(seq_names, distanceMatrix)}
    avgDist = {name: round(avg, 2) for name, avg in exact_avg.items()}  # rounded for display only

    print(f'Average sequence distances to other sequences: {avgDist}\n')

    # Center sequence: lowest average distance. Chosen on the unrounded averages so that
    # rounding cannot create artificial ties; real ties go to the first sequence.
    centerSeq_name = min(exact_avg, key = exact_avg.get)

    print(f'Center sequence: {centerSeq_name}\n')

    # b) Initializing alignment with the most similar sequences, with the center sequence in the first row
    print('\t\033[1mb) Initializing alignment with the most similar sequences, with the center sequence in the first row\033[0m\n')

    # Only pairs that involve the center can seed or extend the alignment; the globally
    # closest pair may not contain the center at all.
    center_pairs = [pair for pair in alignments if centerSeq_name in pair]
    firstPair_name = min(center_pairs, key = lambda pair: alignments[pair][2])

    M = _center_first_columns(alignments[firstPair_name], firstPair_name, centerSeq_name)

    print(f'First pair: {firstPair_name}\n')
    print(f'Initialized alignment, M: {M}\n\n')

    # ---- Part 2: Computing the multiple sequence alignment with msa() from extend_msa.py ----
    print('\033[1mPart 2: Computing the multiple sequence alignment using the extend_msa.py file\033[0m\n')

    # Progressive MSA: start from the seed (so two sequences already give a valid result)
    # and feed each extension back in, so every sequence ends up in the final alignment.
    MA = M

    for pair in center_pairs:
        if pair == firstPair_name:
            continue  # already part of the seed alignment

        A = _center_first_columns(alignments[pair], pair, centerSeq_name)
        MA = msa(MA, A)

    # Transpose the columns back into one string per sequence
    MSA = [''.join(row) for row in zip(*MA)]

    print(f'MSA: {MSA}\n\n')

    # ---- Part 3: Getting the SP score from the sum of columns ----
    print('\033[1mPart 3: Getting the SP score from the sum of columns\033[0m\n')

    SP_score = sum(
        score_matrix[first][second]
        for column in MA
        for first, second in itertools.combinations(column, 2)
    )

    print(f'Approximate SP score: {SP_score}')

    return MSA, SP_score

In [8]:
# Read the test sequences from the FASTA file, then run the approximation on them
# with the cost matrix and gap cost from the parameter cell.
sequences = read_fasta('brca1-testseqs-3.fasta')
sp_approx(sequences, score_matrix = scoreMatrix, gap_cost = gapCost)

[1mPart 1: Initializing the MSA[0m

	[1ma) Finding center sequence from distance matrix and average distances[0m

Alignments and scores: {('brca1_bos_taurus', 'brca1_canis_lupus'): (['A', 'T', 'G', 'G', 'A', 'T', 'T', 'T', 'A', 'T', 'C', 'T', 'G', 'C', 'G', 'G', 'A', 'T', 'C', 'A', 'T', 'G', 'T', 'T', 'G', 'A', 'A', 'G', 'A', 'A', 'G', 'T', 'A', 'C', 'A', 'A', 'A', 'A', 'T', 'G', 'T', 'C', 'C', 'T', 'C', 'A', 'A', 'T', 'G', 'C', 'T', 'A', 'T', 'G', 'C', 'A', 'G', 'A', 'A', 'A', 'A', 'T', 'C', 'T', 'T', 'A', 'G', 'A', 'G', 'T', 'G', 'T', 'C', 'C', 'A', 'A', 'T', 'A', 'T', 'G', 'T', 'C', 'T', 'G', 'G', 'A', 'G', 'T', 'T', 'G', 'A', 'T', 'C', 'A', 'A', 'A', 'G', 'A', 'G', 'C', 'C', 'T', 'G', 'T', 'C', 'T', 'C', 'T', 'A', 'C', 'A', 'A', 'A', 'G', 'T', 'G', 'T', 'G', 'A', 'C', 'C', 'A', 'C', 'A', 'T', 'A', 'T', 'T', 'T', 'T', 'G', 'C', 'A', 'A', 'A', 'T', 'T', 'T', 'T', 'G', 'T', 'A', 'T', 'G', 'C', 'T', 'G', 'A', 'A', 'A', 'C', 'T', 'T', 'C', 'T', 'C', 'A', 'A', 'C', 'C', 'A', 'G', 'A'

(['ATGGATTTATCTGCGGATCATGTTGAAGA-AGTACAAAATGTCCTCAATGCTATGCA-GAAAATCTTAG--AGTGTCCAATA-TGTCTGGAGTTGATCAAAGAGCCT-GTC-TCTACAAAGTGTGACC-A-CA-TATTTTGCAAATTTTGTATGC-TGAAAC-T--TCTCAACCA-GAAGAAAGGGCCTTCACAATGTCC--TTTGTGTAAGAATGA-',
  'ATGGATTTATCTGCGGATCGTGTTGAAGA-AGTACAAAATGTTCTTAATGCTATGCA-GAAAATCTTAG--AGTGTCCAATA-TGTCTGGAGTTGATCAAAGAGCCT-GTT-TCTACAAAGTGTGATC-A-CA-TATTTTGCAAATTTTGTATGC-TGAAAC-T--TCTCAACCA-GAGGAAGGGGCCTTCACAGTGTCC--TTTGTGTAAGAACGA-',
  'GCGAA---AT--GTA-A-CACGGTAGAGGTGAT-CGGGGTG-CGTTA-TAC-GTGCGTGGTGACCTCGGTCGGTGTT-GACGGTGCCTGGGGTTCCTCAGAGTGTTTTGGGGTCTGAAGGATG-GACTTGTCAGTGATT-GCCA-TTGGAGACGTGCAAAATGTGCTTTCAGCCATGCAGAA-GAAC-TTGG-AGTGTCCAGTCTGTTTAGATGTGAT'],
 792)

### Exact algorithm

In [None]:
def sp_exact_3():
    """Placeholder for the exact algorithm; not implemented yet, so it does nothing and returns None."""
    return None