In [1]:
import numpy as np
import pandas as pd


In [4]:
import numpy as np
import pandas as pd

def build_markov_matrix(sequence):
    """Estimate first-order Markov transition probabilities of a DNA sequence.

    Every adjacent pair ``(sequence[i], sequence[i + 1])`` in which both
    symbols are one of ``'a'``, ``'c'``, ``'g'``, ``'t'`` (lowercase, matched
    exactly) is counted as one transition. Pairs that contain any other
    symbol are skipped. The counts are then normalized row by row.

    Parameters
    ----------
    sequence : str (or any sliceable sequence of single-character strings)
        The nucleotide sequence to analyse. Empty and single-symbol inputs
        are allowed and simply produce an all-zero matrix.

    Returns
    -------
    pandas.DataFrame
        A 4x4 frame labelled ``a, c, g, t`` on both axes. Rows are the
        current nucleotide, columns the next one. A row with at least one
        observed transition sums to 1; a row for a nucleotide with no
        outgoing transitions is all ``0.0``.
    """
    # Fixed nucleotide order; it defines both the matrix layout and the labels.
    nucleotides = ['a', 'c', 'g', 't']
    nuc_index = {nuc: i for i, nuc in enumerate(nucleotides)}

    # transition_counts[i, j] = number of times nucleotide i is followed by j.
    transition_counts = np.zeros((4, 4), dtype=int)

    # Walk over consecutive pairs; zip stops at the shorter (shifted) operand,
    # so inputs of length 0 or 1 yield no pairs.
    for curr_nuc, next_nuc in zip(sequence, sequence[1:]):
        if curr_nuc in nuc_index and next_nuc in nuc_index:
            transition_counts[nuc_index[curr_nuc], nuc_index[next_nuc]] += 1

    # Normalize each row by its total. `where=` skips rows whose total is 0,
    # and numpy leaves skipped positions untouched, so the output buffer must
    # be pre-filled with zeros; otherwise those rows would contain
    # uninitialized memory. Because the masked divisions are never performed,
    # no warning is raised and the global numpy error state is left alone.
    row_totals = transition_counts.sum(axis=1, keepdims=True)
    transition_probs = np.divide(
        transition_counts.astype(float),
        row_totals,
        out=np.zeros((4, 4), dtype=float),
        where=row_totals != 0,
    )

    # Create and return a labeled DataFrame
    return pd.DataFrame(transition_probs, index=nucleotides, columns=nucleotides)

# Example usage: build the transition matrix for a sample sequence and print it.
# The adjacent string literals below are concatenated by Python into a single
# lowercase a/c/g/t string.
dna_seq = (
    "ttgaatccctgtacgttaagtatatcacagtgttgtatgtcgagttgggtcgtagccaatacgtgcctccgtacagaggtctattttaactagtaggctcatttacttgagggactaatgt"
    "ccaactcatattagcgggggttgggacgcgtaatggacggagccagcctaaggcgaaccgatggcatcaaatacggttgacgtccttatggggaagctcagggtagaagacagttttaacagat"
    "ccctacggggcgccccttggcattagccagacctcggtgcaacatcagacttgttgggtttcaaataagtaccccgcctgtaaactcccgcgagccatgccgggtggagttactgcgttt"
    "tgcggctcggagtataatgcctataaacgtctaccgcaaaatgaggatatgagggatctcaacctcaaagactctattaagcccagacgacgtgaaacaggggctactctctgatagccccat"
    "cgacatatagttcccgattaatattttaatttctatatagatcctcgggaagccgcctcgcgtcggttgcaggcattccaagagtatcccgctgtcagagatatgaggtggtgatatcat"
    "tgacctatacttcaaaagcaacggggatagacgttggcgggctcgcaccattttagtcgattacatcaccgacccggatcagagcccgcgataacccataatatgcagagtcgactcacattcagga"
    "gccgtagcatcatcatcggctggcaatcgtacaaccccggggatcctaagccatccgttgctatcgagttatttgcgttcgacaataattgctgcttagtacggaacgtggacccgtagc"
    "tgggaatattttttatcagagtttccctttacgctcgatcgtgtgttcgacacgcgttgattgatgattactacaggaagagccaccgcgacaatcgcgtggctgcttctgcgcgcatcacgaggc"
    "taggataagcaaacatctacgcgatttttgcctgcgcgga"
)
# Provenance: random DNA sequence generated at
# https://www.bioinformatics.org/sms2/random_dna.html
markov_df = build_markov_matrix(dna_seq)
print(markov_df)


          a         c         g         t
a  0.211765  0.227451  0.250980  0.309804
c  0.240964  0.257028  0.293173  0.208835
g  0.263566  0.255814  0.263566  0.217054
t  0.286822  0.236434  0.205426  0.271318
