In [1]:
import global_align as ga

In [2]:
# Scoring parameters for the nucleotide alignment.
ncbi_match_score = 3
ncbi_mismatch_score = -4
gap_open_cost = 5
ncbi_gap_extension_score = -2

# Similarity table over the symbols A, C, G, T and the gap symbol "-":
#   * identical symbols (including "-" against "-") get the match score,
#   * any other pairing that involves "-" gets the gap-extension score,
#   * every remaining pairing gets the mismatch score.
scoring_mat = {
    row_symbol: {
        col_symbol: (
            ncbi_match_score
            if row_symbol == col_symbol
            else ncbi_gap_extension_score
            if "-" in (row_symbol, col_symbol)
            else ncbi_mismatch_score
        )
        for col_symbol in "ACGT-"
    }
    for row_symbol in "ACGT-"
}

# Derive the cost-based table from the similarity table and display it.
max_score = ga.get_max_similarity_score(scoring_mat=scoring_mat)
cost_mat = ga.transform_scoring_mat_to_cost_mat(scoring_mat=scoring_mat, max_score=max_score)
cost_mat

{'A': {'A': 0, 'C': 7, 'G': 7, 'T': 7, '-': 4},
 'C': {'A': 7, 'C': 0, 'G': 7, 'T': 7, '-': 4},
 'G': {'A': 7, 'C': 7, 'G': 0, 'T': 7, '-': 4},
 'T': {'A': 7, 'C': 7, 'G': 7, 'T': 0, '-': 4},
 '-': {'A': 3, 'C': 3, 'G': 3, 'T': 3, '-': 0}}

In [9]:
# Draw the pair of random DNA sequences used by the cells below.
# NOTE(review): no random seed is set in any visible cell, so a re-run will
# presumably produce different sequences than the recorded output - confirm.
seq_1, seq_2 = ga.draw_two_random_seqs(
    alphabet=list("ACGT"),
    min_len_seq_1=5, max_len_seq_1=30,
    min_len_seq_2=15, max_len_seq_2=20,
    divergence=0.8,
)
print(seq_1, seq_2, sep="\n")

CGGTCTTAGCATATGTTGGCATAC
ATTAGCATCATAGTGGA


In [3]:
# Worked example: convert a final similarity score of 10 to the cost scale.
# m and n are presumably the two sequence lengths - confirm against global_align.
ga.final_score_to_cost(score=10, m=10, n=7, max_score=3)

17

In [4]:
# Round trip of the previous example: cost 17 back to the similarity scale.
ga.final_cost_to_score(cost=17, m=10, n=7, max_score=3)

10

In [11]:
# Second worked example: a negative score (-12) with max_score=1.
ga.final_score_to_cost(score=-12, m=10, n=7, max_score=1)

22

In [12]:
# Round trip of the second example: cost 22 back to the similarity scale.
ga.final_cost_to_score(cost=22, m=10, n=7, max_score=1)

-12

In [4]:
# Allocate the best-paths matrix with len(seq_1) + 1 rows and
# len(seq_2) + 1 columns, then display it.
middle_row_index = len(seq_1)
best_paths_mat = ga.init_best_paths_matrix(
    dynamic_prog_num_cols=1 + len(seq_2),
    dynamic_prog_num_rows=1 + middle_row_index,
)
best_paths_mat

[[2, 2, 2, 2, 2, 2, 2, 2, 2],
 [4, 2, 2, 2, 2, 2, 2, 2, 2],
 [4, 2, 2, 2, 2, 2, 2, 2, 2],
 [4, 2, 2, 2, 2, 2, 2, 2, 2],
 [4, 2, 2, 2, 2, 2, 2, 2, 2],
 [4, 2, 2, 2, 2, 2, 2, 2, 2],
 [4, 2, 2, 2, 2, 2, 2, 2, 2],
 [4, 2, 2, 2, 2, 2, 2, 2, 2],
 [4, 2, 2, 2, 2, 2, 2, 2, 2]]

In [5]:
# Initialise the partial dynamic-programming matrix from the sequences,
# the cost table and the gap-open cost; one column per seq_2 symbol plus one.
dynamic_prog_num_cols = 1 + len(seq_2)
partial_dp_mat = ga.init_partial_dynamic_prog_matrix_2(
    seq_1=seq_1, seq_2=seq_2,
    cost_mat=cost_mat, gap_open_cost=gap_open_cost,
    dynamic_prog_num_cols=dynamic_prog_num_cols,
)
partial_dp_mat

[[0, 8, 11, 14, 17, 20, 23, 26, 29], [8, 0, 0, 0, 0, 0, 0, 0, 0]]

In [6]:
# Lookup tables handed to ga.do_core_align_2 in a later cell.
#
# The two sets hold integer move codes. Judging by their names they list the
# codes for which a gap-open penalty applies when a cell is entered from the
# left / from above - NOTE(review): confirm against global_align.
moves_for_gap_open_penalty_from_left = {0, 3, 4, 11, 12, 14, 19, 22, 23}
moves_for_gap_open_penalty_from_up = {0, 1, 2, 9, 10, 13, 19, 20, 21}
# situation_mapper maps a pair of 3-tuples to an integer code.
#   * Second tuple: (from_diag_best_path_type, from_left_best_path_type,
#     from_up_best_path_type), as labelled by the section comments below.
#     The diag entry is 0 or 19, the left entry 1 or 2, the up entry 3 or 4.
#   * First tuple: grouped below into 3-way ties, 2-way ties and no ties;
#     presumably the rank of the (diag, left, up) candidate costs with
#     0 = best - TODO confirm.
#   * Value: presumably the best-path type written into best_paths_mat -
#     TODO confirm.
situation_mapper={
    # from_diag_best_path_type = 0
    # from_left_best_path_type == 1
    # and from_up_best_path_type == 3
    # 3-way ties
    ((0, 0, 0), (0, 1, 3)): 15,
    # 2-way ties for low
    ((0, 0, 2), (0, 1, 3)): 9,
    ((0, 2, 0), (0, 1, 3)): 11,
    ((2, 0, 0), (0, 1, 3)): 5,
    # 2-way ties for high
    ((0, 1, 1), (0, 1, 3)): 0,
    ((1, 0, 1), (0, 1, 3)): 1,
    ((1, 1, 0), (0, 1, 3)): 3,
    # no ties
    ((0, 1, 2), (0, 1, 3)): 0,
    ((1, 0, 2), (0, 1, 3)): 1,
    ((1, 2, 0), (0, 1, 3)): 3,
    ((0, 2, 1), (0, 1, 3)): 0,
    ((2, 0, 1), (0, 1, 3)): 1,
    ((2, 1, 0), (0, 1, 3)): 3,
    # from_left_best_path_type == 1
    # and from_up_best_path_type == 4
    # 3-way ties
    ((0, 0, 0), (0, 1, 4)): 16,
    # 2-way ties for low
    ((0, 0, 2), (0, 1, 4)): 9,
    ((0, 2, 0), (0, 1, 4)): 12,
    ((2, 0, 0), (0, 1, 4)): 6,
    # 2-way ties for high
    ((0, 1, 1), (0, 1, 4)): 0,
    ((1, 0, 1), (0, 1, 4)): 1,
    ((1, 1, 0), (0, 1, 4)): 4,
    # no ties
    ((0, 1, 2), (0, 1, 4)): 0,
    ((1, 0, 2), (0, 1, 4)): 1,
    ((1, 2, 0), (0, 1, 4)): 4,
    ((0, 2, 1), (0, 1, 4)): 0,
    ((2, 0, 1), (0, 1, 4)): 1,
    ((2, 1, 0), (0, 1, 4)): 4,
    # from_left_best_path_type == 2
    # and from_up_best_path_type == 3
    # 3-way ties
    ((0, 0, 0), (0, 2, 3)): 17,
    # 2-way ties for low
    ((0, 0, 2), (0, 2, 3)): 10,
    ((0, 2, 0), (0, 2, 3)): 11,
    ((2, 0, 0), (0, 2, 3)): 7,
    # 2-way ties for high
    ((0, 1, 1), (0, 2, 3)): 0,
    ((1, 0, 1), (0, 2, 3)): 2,
    ((1, 1, 0), (0, 2, 3)): 3,
    # no ties
    ((0, 1, 2), (0, 2, 3)): 0,
    ((1, 0, 2), (0, 2, 3)): 2,
    ((1, 2, 0), (0, 2, 3)): 3,
    ((0, 2, 1), (0, 2, 3)): 0,
    ((2, 0, 1), (0, 2, 3)): 2,
    ((2, 1, 0), (0, 2, 3)): 3,
    # from_left_best_path_type == 2
    # and from_up_best_path_type == 4
    # 3-way ties
    ((0, 0, 0), (0, 2, 4)): 18,
    # 2-way ties for low
    ((0, 0, 2), (0, 2, 4)): 10,
    ((0, 2, 0), (0, 2, 4)): 12,
    ((2, 0, 0), (0, 2, 4)): 8,
    # 2-way ties for high
    ((0, 1, 1), (0, 2, 4)): 0,
    ((1, 0, 1), (0, 2, 4)): 2,
    ((1, 1, 0), (0, 2, 4)): 4,
    # no ties
    ((0, 1, 2), (0, 2, 4)): 0,
    ((1, 0, 2), (0, 2, 4)): 2,
    ((1, 2, 0), (0, 2, 4)): 4,
    ((0, 2, 1), (0, 2, 4)): 0,
    ((2, 0, 1), (0, 2, 4)): 2,
    ((2, 1, 0), (0, 2, 4)): 4,
    # from_diag_best_path_type = 19
    # from_left_best_path_type == 1
    # and from_up_best_path_type == 3
    # 3-way ties
    ((0, 0, 0), (19, 1, 3)): 24,
    # 2-way ties for low
    ((0, 0, 2), (19, 1, 3)): 20,
    ((0, 2, 0), (19, 1, 3)): 22,
    ((2, 0, 0), (19, 1, 3)): 5,
    # 2-way ties for high
    ((0, 1, 1), (19, 1, 3)): 19,
    ((1, 0, 1), (19, 1, 3)): 1,
    ((1, 1, 0), (19, 1, 3)): 3,
    # no ties
    ((0, 1, 2), (19, 1, 3)): 19,
    ((1, 0, 2), (19, 1, 3)): 1,
    ((1, 2, 0), (19, 1, 3)): 3,
    ((0, 2, 1), (19, 1, 3)): 19,
    ((2, 0, 1), (19, 1, 3)): 1,
    ((2, 1, 0), (19, 1, 3)): 3,
    # from_left_best_path_type == 1
    # and from_up_best_path_type == 4
    # 3-way ties
    ((0, 0, 0), (19, 1, 4)): 25,
    # 2-way ties for low
    ((0, 0, 2), (19, 1, 4)): 20,
    ((0, 2, 0), (19, 1, 4)): 23,
    ((2, 0, 0), (19, 1, 4)): 6,
    # 2-way ties for high
    ((0, 1, 1), (19, 1, 4)): 19,
    ((1, 0, 1), (19, 1, 4)): 1,
    ((1, 1, 0), (19, 1, 4)): 4,
    # no ties
    ((0, 1, 2), (19, 1, 4)): 19,
    ((1, 0, 2), (19, 1, 4)): 1,
    ((1, 2, 0), (19, 1, 4)): 4,
    ((0, 2, 1), (19, 1, 4)): 19,
    ((2, 0, 1), (19, 1, 4)): 1,
    ((2, 1, 0), (19, 1, 4)): 4,
    # from_left_best_path_type == 2
    # and from_up_best_path_type == 3
    # 3-way ties
    ((0, 0, 0), (19, 2, 3)): 26,
    # 2-way ties for low
    ((0, 0, 2), (19, 2, 3)): 21,
    ((0, 2, 0), (19, 2, 3)): 22,
    ((2, 0, 0), (19, 2, 3)): 7,
    # 2-way ties for high
    ((0, 1, 1), (19, 2, 3)): 19,
    ((1, 0, 1), (19, 2, 3)): 2,
    ((1, 1, 0), (19, 2, 3)): 3,
    # no ties
    ((0, 1, 2), (19, 2, 3)): 19,
    ((1, 0, 2), (19, 2, 3)): 2,
    ((1, 2, 0), (19, 2, 3)): 3,
    ((0, 2, 1), (19, 2, 3)): 19,
    ((2, 0, 1), (19, 2, 3)): 2,
    ((2, 1, 0), (19, 2, 3)): 3,
    # from_left_best_path_type == 2
    # and from_up_best_path_type == 4
    # 3-way ties
    ((0, 0, 0), (19, 2, 4)): 27,
    # 2-way ties for low
    ((0, 0, 2), (19, 2, 4)): 21,
    ((0, 2, 0), (19, 2, 4)): 23,
    ((2, 0, 0), (19, 2, 4)): 8,
    # 2-way ties for high
    ((0, 1, 1), (19, 2, 4)): 19,
    ((1, 0, 1), (19, 2, 4)): 2,
    ((1, 1, 0), (19, 2, 4)): 4,
    # no ties
    ((0, 1, 2), (19, 2, 4)): 19,
    ((1, 0, 2), (19, 2, 4)): 2,
    ((1, 2, 0), (19, 2, 4)): 4,
    ((0, 2, 1), (19, 2, 4)): 19,
    ((2, 0, 1), (19, 2, 4)): 2,
    ((2, 1, 0), (19, 2, 4)): 4
}

In [7]:
# Run the core alignment pass. The call returns three values, which replace
# partial_dp_mat and best_paths_mat and bind cur_cell_best_cum_cost.
# NOTE(review): the recorded output of this cell is
# "NameError: name 'gap_extension_cost' is not defined"; that name is not used
# anywhere in this call, so the error presumably comes from inside
# global_align - confirm and fix there.
partial_dp_mat, cur_cell_best_cum_cost, best_paths_mat = ga.do_core_align_2(
    seq_1=seq_1, seq_2=seq_2,
    partial_dp_mat=partial_dp_mat, best_paths_mat=best_paths_mat,
    cost_mat=cost_mat, gap_open_cost=gap_open_cost,
    situation_mapper=situation_mapper,
    moves_for_gap_open_penalty_from_up=moves_for_gap_open_penalty_from_up,
    moves_for_gap_open_penalty_from_left=moves_for_gap_open_penalty_from_left,
)

NameError: name 'gap_extension_cost' is not defined

In [None]:
# Display the current best_paths_mat (rebound by the do_core_align_2 call
# in the preceding cell when that call succeeds).
best_paths_mat

[[2, 2, 2, 2, 2, 2],
 [4, 0, 0, 19, 19, 0],
 [4, 1, 19, 19, 19, 19],
 [4, 19, 19, 0, 0, 19],
 [4, 1, 0, 19, 19, 0],
 [4, 19, 19, 19, 19, 19]]

In [None]:
# Display the current partial_dp_mat (rebound by the do_core_align_2 call
# in an earlier cell when that call succeeds).
partial_dp_mat

[[0, 26, 31, 36, 41, 46], [6, 31, 36, 41, 46, 51]]

In [10]:
# Convert a final cost of 22 back to the similarity scale for m=9, n=6.
ga.final_cost_to_score(
    cost=22,
    m=9,
    n=6,
    max_score=1,
)

13

In [10]:
# Recover the alignment from the best-paths matrix; the returned value is
# displayed as the cell output.
ga.traceback_2(seq_1=seq_1, seq_2=seq_2, best_paths_mat=best_paths_mat)

seq_1_aligned
['G']
seq_2_aligned
['A']
seq_1_aligned
['G', 'A']
seq_2_aligned
['A', 'C']
seq_1_aligned
['G', 'A', 'C']
seq_2_aligned
['A', 'C', 'C']
seq_1_aligned
['G', 'A', 'C', 'T']
seq_2_aligned
['A', 'C', 'C', 'A']
seq_1_aligned
['G', 'A', 'C', 'T', 'A']
seq_2_aligned
['A', 'C', 'C', 'A', 'A']


('ATCAG', '|*|**', 'AACCA')

In [None]:
# seq_1 = "CATGGG"
# seq_1 = "C"
# seq_2 = "ACTG"
# seq_2 = "TATT"
# seq_1 = "ACACAACTAGTGCTACGTAT"
# seq_2 = "T"
# seq_1 = "TC"
# seq_2 = "T"
# seq_1 = "GTCAGCAT"
# seq_2 = "CTCTGAACACG"
# seq_1 = "CGCCTC"
# seq_2 = "GTCG"
# seq_1 = "CGCCT"
# seq_2 = "GTCG"
# seq_1 = "CATGGG"
# seq_2 = "ACTG"
# Matrix dimensions: one extra row / column beyond each sequence's length.
dynamic_prog_num_rows, dynamic_prog_num_cols = 1 + len(seq_1), 1 + len(seq_2)

In [None]:
# partial_A_mat, partial_B_mat, partial_C_mat = (ga.init_partial_dynamic_prog_matrix(
#     gap_existence_cost=gap_existence_cost,
#     seq_1=seq_1,
#     seq_2=seq_2,
#     scoring_mat=scoring_mat,
#     dynamic_prog_num_cols=dynamic_prog_num_cols
# ) for u in range(3)) 

# best_paths_mat = ga.init_best_paths_matrix(
#     dynamic_prog_num_rows=dynamic_prog_num_rows,
#     dynamic_prog_num_cols=dynamic_prog_num_cols
# )

In [None]:
# ga.do_core_align(
#     seq_1=seq_1,
#     seq_2=seq_2,
#     scoring_mat=scoring_mat,
#     gap_existence_cost=gap_existence_cost,
#     dynamic_prog_num_rows=dynamic_prog_num_rows,
#     dynamic_prog_num_cols=dynamic_prog_num_cols,
#     partial_A_mat=partial_A_mat,
#     partial_B_mat=partial_B_mat,
#     partial_C_mat=partial_C_mat,
#     best_paths_mat=best_paths_mat
# )

In [None]:
# partial_A_mat, partial_B_mat, partial_C_mat, best_paths_mat, score = ga.warmup_align(
#     seq_1=seq_1,
#     seq_2=seq_2,
#     scoring_mat=scoring_mat,
#     gap_existence_cost=gap_existence_cost,
#     dynamic_prog_num_cols=dynamic_prog_num_cols,
#     partial_A_mat=partial_A_mat,
#     partial_B_mat=partial_B_mat,
#     partial_C_mat=partial_C_mat,
#     best_paths_mat=best_paths_mat
# )

In [None]:
# ga.traceback(
#     best_paths_mat=best_paths_mat,
#     seq_1=seq_1[0],
#     seq_2=seq_2
# )

In [None]:
# Align seq_1 against seq_2 end to end and pretty-print the result,
# wrapped at 70 characters per line.
#
# The gap existence cost is taken from gap_open_cost, which is defined with
# the scoring parameters near the top of the notebook. The previous argument,
# a bare `gap_existence_cost`, is not assigned in any live cell, so this cell
# raised NameError on a fresh kernel.
# NOTE(review): assumes ga.align expects the same positive gap-opening
# penalty that do_core_align_2 receives as gap_open_cost - confirm.
alignment = ga.align(
    seq_1=seq_1,
    seq_2=seq_2,
    scoring_mat=scoring_mat,
    gap_existence_cost=gap_open_cost,
)
ga.print_alignment(*alignment, chars_per_line=70)

In [None]:
# Hand-written 7 x 5 best-paths matrix used to exercise ga.traceback directly.
best_paths_mat = [
    [1, 1, 1, 1, 1],
    [2, 0, 1, 0, 0],
    [2, 0, 0, 1, 0],
    [2, 0, 0, 0, 1],
    [2, 2, 0, 0, 0],
    [2, 2, 0, 2, 0],
    [2, 2, 2, 0, 2],
]
best_paths_mat

In [None]:
# Trace back through best_paths_mat for the current sequences and print the
# resulting alignment.
# NOTE(review): when best_paths_mat is the hand-written 7 x 5 matrix, it
# presumably only fits a 6-symbol seq_1 and a 4-symbol seq_2 - confirm that
# the sequences in scope match.
alignment = ga.traceback(seq_1=seq_1, seq_2=seq_2, best_paths_mat=best_paths_mat)
ga.print_alignment(*alignment)