In [5]:
def frequent_words(text: str, k: int) -> list[str]:
    """Return the most frequent k-mers (length-``k`` substrings) of ``text``.

    Args:
        text: The string to scan.
        k: Length of the substrings to count.

    Returns:
        Every k-mer whose occurrence count equals the maximum count, in order
        of first appearance in ``text``. Overlapping occurrences are counted.
        An empty list is returned when ``text`` is shorter than ``k`` and so
        contains no k-mer at all.
    """

    def frequency_table(text: str, k: int) -> dict[str, int]:
        """Count every (overlapping) length-``k`` window of ``text``."""
        freq_map: dict[str, int] = {}
        for i in range(len(text) - k + 1):
            pattern = text[i:i + k]
            freq_map[pattern] = freq_map.get(pattern, 0) + 1
        return freq_map

    freq_map = frequency_table(text, k)
    # Guard: max() raises ValueError on an empty sequence, which happens
    # whenever there are no windows to count.
    if not freq_map:
        return []
    max_value = max(freq_map.values())
    return [kmer for kmer, count in freq_map.items() if count == max_value]


# Demo input: look for the most frequent 4-mers in a short DNA string.
# NOTE: `text` and `k` are notebook-level names that later cells rebind.
text = 'ACGTTGCATGTCGCATGATGCATGAGAGCT'
k = 4

print(frequent_words(text, k))

['GCAT', 'CATG']


In [8]:
def reverse_complement(pattern: str) -> str:
    """Return the reverse complement of a DNA string.

    Every base is swapped for its partner (A<->T, G<->C) and the swapped
    sequence is then read back to front.

    Only the uppercase characters 'A', 'T', 'G' and 'C' are accepted. On the
    first other character a message is printed and ``None`` is returned
    instead of a string.
    """
    pairing = {'A': 'T', 'T': 'A', 'G': 'C', 'C': 'G'}
    swapped = []
    for base in pattern:
        if base not in pairing:
            print("Pattern must only contain 'A', 'T', 'G', or 'C'.")
            return
        swapped.append(pairing[base])
    swapped.reverse()
    return ''.join(swapped)

# Demo call; the recorded output for this input is ACCGGGTTTT.
print(reverse_complement('AAAACCCGGT'))

ACCGGGTTTT


In [9]:
def pattern_matching(pattern: str, genome: str) -> list[int]:
    """Find every 0-based position at which ``pattern`` occurs in ``genome``.

    Overlapping occurrences are all reported, in increasing order. When
    ``pattern`` is longer than ``genome`` the result is an empty list.
    """
    # Last index at which a full-length window still fits inside the genome.
    last_start = len(genome) - len(pattern)
    return [pos for pos in range(last_start + 1) if genome.startswith(pattern, pos)]

# Demo call; the occurrences of 'ATAT' overlap, so positions 1 and 3 are both reported.
print(pattern_matching('ATAT', 'GATATATGCATATACTT'))

[1, 3, 9]


In [19]:
def minimum_skew(genome: str) -> list[int]:
    """Return the prefix lengths at which the G-C skew of ``genome`` is lowest.

    The skew starts at 0 for the empty prefix, rises by one for every 'G',
    falls by one for every 'C' and is unchanged by any other character.
    Position ``i`` in the result refers to the skew after the first ``i``
    characters, so valid positions run from 0 to ``len(genome)`` inclusive.
    """
    step = {'G': 1, 'C': -1}
    running = 0
    prefix_skews = [running]
    for base in genome:
        running += step.get(base, 0)
        prefix_skews.append(running)

    lowest = min(prefix_skews)
    return [pos for pos, value in enumerate(prefix_skews) if value == lowest]

# Demo call; the recorded output for this genome is [11, 24].
print(minimum_skew('TAAAGACTGCCGAGAGGCCAACACGAGTGCTAGAACGAGGGGCGTAAACGCGGGTCCGAT'))

[11, 24]


In [20]:
def hamming_distance(p: str, q: str) -> int:
    """Count the positions at which ``p`` and ``q`` hold different characters.

    The comparison walks the indices of ``p`` only: characters of ``q``
    beyond ``len(p)`` are ignored, and a ``q`` shorter than ``p`` raises
    ``IndexError``. Callers are expected to pass strings of equal length.
    """
    return sum(1 for idx, left in enumerate(p) if left != q[idx])

# Demo pair of DNA strings; the recorded output for this run is 28.
p = 'AGAAACAGACCGCTATGTTCAACGATTTGTTTTATCTCGTCACCGGGATATTGCGGCCACTCATCGGTCAGTTGATTACGCAGGGCGTAAATCGCCAGAATCAGGCTG'
q = 'AGAAACCCACCGCTAAAAACAACGATTTGCGTAGTCAGGTCACCGGGATATTGCGGCCACTAAGGCCTTGGATGATTACGCAGAACGTATTGACCCAGAATCAGGCTC'
print(hamming_distance(p,q))

28


In [21]:
def approximate_pattern_matching(pattern: str, text: str, d: int) -> list[int]:
    """List the 0-based starts where ``pattern`` occurs in ``text`` with at most ``d`` mismatches.

    Every window of ``text`` with the same length as ``pattern`` is compared
    against ``pattern`` via ``hamming_distance``; windows whose distance is
    ``d`` or less are reported in increasing order of position.
    """
    width = len(pattern)
    return [
        start
        for start in range(len(text) - width + 1)
        if hamming_distance(text[start:start + width], pattern) <= d
    ]

# Demo input: allow up to 3 mismatches when locating the 8-mer in `text`.
# NOTE: this rebinds the notebook-level `text` set by an earlier cell.
pattern = 'ATTCTGGA'
text = 'CGCCCGAATCCAGAACGCATTCCCATATTTCGGGACCACTGGCCTCCACGGTACGGACGTCAATCAAAT'
d = 3
print(approximate_pattern_matching(pattern,text,d))

[6, 7, 26, 27]


In [23]:
def approximate_pattern_count(text: str, pattern: str, d: int) -> int:
    """Count the occurrences of ``pattern`` in ``text`` with at most ``d`` mismatches.

    Note the argument order: ``text`` comes first here, whereas
    ``approximate_pattern_matching`` takes ``pattern`` first.
    """
    matches = approximate_pattern_matching(pattern, text, d)
    return len(matches)

# Demo input: count approximate occurrences of 'GAGG' with up to 2 mismatches.
# NOTE: `pattern`, `text` and `d` are rebound here, replacing earlier values.
pattern = 'GAGG'
text = 'TTTAGAGCCTTCAGAGG'
d = 2
print(approximate_pattern_count(text,pattern,d))

4


In [25]:
def neighbors(s: str, d: int) -> set[str]:
    """Return the d-neighborhood of ``s`` over the alphabet {A, C, G, T}.

    The neighborhood is the set of all strings of the same length as ``s``
    that differ from ``s`` in at most ``d`` positions; ``s`` itself is always
    included.

    Args:
        s: The pattern to expand.
        d: Maximum number of mismatching positions allowed; must be >= 0.

    Returns:
        A set of strings. For an empty ``s`` the result is ``{''}``.

    Raises:
        ValueError: If ``d`` is negative.
    """
    if d < 0:
        raise ValueError("d must be non-negative.")
    # Base cases: no mismatches allowed, or nothing left to mutate. The empty
    # string check also stops the recursion below from never terminating.
    if d == 0 or not s:
        return {s}

    nucleotides = ('A', 'C', 'G', 'T')
    if len(s) == 1:
        return set(nucleotides)

    suffix = s[1:]
    neighborhood: set[str] = set()
    for candidate in neighbors(suffix, d):
        if hamming_distance(suffix, candidate) < d:
            # The suffix has mismatch budget to spare, so the first
            # character may be any nucleotide.
            neighborhood.update(nuc + candidate for nuc in nucleotides)
        else:
            # The budget is used up: the first character must match s[0].
            neighborhood.add(s[0] + candidate)
    return neighborhood

# Demo call; the result is a set, so the printed element order carries no meaning.
print(neighbors("ACG", 1))

{'ACG', 'AAG', 'ACC', 'ATG', 'GCG', 'TCG', 'ACA', 'ACT', 'AGG', 'CCG'}


In [28]:
def frequent_words_with_mismatches(text: str, k: int, d: int) -> list[str]:
    """Return the most frequent k-mers of ``text``, allowing up to ``d`` mismatches.

    For every length-``k`` window of ``text``, each string returned by
    ``neighbors(window, d)`` receives one vote. The strings with the highest
    vote total are returned; they need not occur exactly in ``text``.

    Args:
        text: The string to scan.
        k: Length of the k-mers.
        d: Maximum number of mismatches passed on to ``neighbors``.

    Returns:
        All strings sharing the maximum vote total, in the order they were
        first inserted into the vote table. An empty list is returned when
        ``text`` is shorter than ``k`` and so has no window to expand.
    """
    freq_map: dict[str, int] = {}
    for i in range(len(text) - k + 1):
        for neighbor in neighbors(text[i:i + k], d):
            freq_map[neighbor] = freq_map.get(neighbor, 0) + 1

    # Guard: max() raises ValueError on an empty sequence, which happens
    # whenever no window was available.
    if not freq_map:
        return []
    top = max(freq_map.values())
    return [pat for pat, count in freq_map.items() if count == top]

# Demo input: most frequent 4-mers with at most 1 mismatch.
# NOTE: rebinds the notebook-level `text`, `k` and `d`.
text = 'ACGTTGCATGTCGCATGATGCATGAGAGCT'
k = 4
d = 1
print(frequent_words_with_mismatches(text, k, d))

['ATGT', 'GATG', 'ATGC']


In [30]:
def frequent_words_mismatches_reverse_complements(text: str, k: int, d: int) -> list[str]:
    """Return the most frequent k-mers with mismatches, counting both strands.

    Votes are gathered exactly as in the single-strand search, but over two
    sequences: ``text`` and ``reverse_complement(text)``. For every length-``k``
    window of either sequence, each string returned by ``neighbors(window, d)``
    receives one vote.

    Args:
        text: The DNA string to scan.
        k: Length of the k-mers.
        d: Maximum number of mismatches passed on to ``neighbors``.

    Returns:
        All strings sharing the maximum vote total, in the order they were
        first inserted into the vote table. An empty list is returned when
        ``text`` is shorter than ``k``.

    Raises:
        ValueError: If ``reverse_complement`` rejects ``text`` (it returns
            ``None`` for input containing characters other than A, T, G, C).
    """
    rc = reverse_complement(text)
    if rc is None:
        # Fail here with a clear message rather than later with a TypeError
        # from calling len() on None.
        raise ValueError("text must only contain 'A', 'T', 'G', or 'C'.")

    freq_map: dict[str, int] = {}
    for seq in (text, rc):
        for i in range(len(seq) - k + 1):
            for neighbor in neighbors(seq[i:i + k], d):
                freq_map[neighbor] = freq_map.get(neighbor, 0) + 1

    # Guard: max() raises ValueError on an empty sequence, which happens
    # whenever no window was available.
    if not freq_map:
        return []
    top = max(freq_map.values())
    return [pat for pat, count in freq_map.items() if count == top]

# Demo input: same string and parameters as the single-strand search,
# now counting windows from the reverse complement as well.
text = 'ACGTTGCATGTCGCATGATGCATGAGAGCT'
k = 4
d = 1
print(frequent_words_mismatches_reverse_complements(text, k, d))

['ACAT', 'ATGT']
