# 1: Find a Position in a Genome Minimizing the Skew

In [6]:
import numpy as np
from Bio.Seq import Seq
import re
import itertools

In [3]:
# Read the genome, dropping line breaks so it becomes one continuous string.
with open('rosalind_ba1f.txt') as file:
    dna = file.read().replace('\n', '')

# Contribution of each nucleotide to the running skew (#G - #C).
skew_map = {'C': -1, 'G': 1, 'T': 0, 'A': 0}

# skew[i] is the skew of the first i nucleotides, so skew[0] = 0 stands for
# the empty prefix. A 64-bit dtype is used because a 16-bit counter can
# overflow on long, strongly biased genomes.
skew = np.zeros(len(dna) + 1, dtype=np.int64)
count = 0
for i, char in enumerate(dna, start=1):
    count += skew_map[char]
    skew[i] = count

# Use the array method and a distinct name so the builtin ``min`` is not
# overwritten for the rest of the session.
min_skew = skew.min()
# Array index i already equals the prefix length, so no +1 shift is needed.
indices = np.where(skew == min_skew)[0]
print(indices)

[12643]


# 2: Find Patterns Forming Clumps in a String

In [5]:
def get_pattern_count(text, pattern):
    """Return how many times ``pattern`` occurs in ``text``.

    The text is wrapped in a Biopython ``Seq`` and counted with
    ``count_overlap``, so overlapping occurrences are included.
    """
    return Seq(text).count_overlap(pattern)

def find_clumps(genome, k, window_len, min_count):
    """Return the k-mers that form a (window_len, min_count)-clump in genome.

    A k-mer is reported when it occurs (overlaps included) at least
    ``min_count`` times inside some window of ``window_len`` characters.
    Patterns are listed once each, in the order in which they are first
    detected while scanning windows from left to right.

    The k-mer counts are maintained incrementally while the window slides,
    instead of recounting every k-mer of every window from scratch.
    """
    found = []
    if len(genome) < window_len or window_len < k:
        return found  # no full window, or windows too short to hold a k-mer

    seen = set()
    counts = {}
    kmers_per_window = window_len - k + 1

    # Count every k-mer of the first window.
    for j in range(kmers_per_window):
        kmer = genome[j:j + k]
        counts[kmer] = counts.get(kmer, 0) + 1

    # Report qualifying k-mers of the first window in order of appearance.
    for j in range(kmers_per_window):
        kmer = genome[j:j + k]
        if counts[kmer] >= min_count and kmer not in seen:
            seen.add(kmer)
            found.append(kmer)

    # Slide the window one character at a time. Only the k-mer entering on
    # the right can gain an occurrence, so it is the only new candidate.
    for i in range(1, len(genome) - window_len + 1):
        counts[genome[i - 1:i - 1 + k]] -= 1
        kmer = genome[i + window_len - k:i + window_len]
        counts[kmer] = counts.get(kmer, 0) + 1
        if counts[kmer] >= min_count and kmer not in seen:
            seen.add(kmer)
            found.append(kmer)
    return found


# Input layout: line 1 is the genome, line 2 holds "k L t" separated by spaces.
with open('rosalind_ba1e.txt') as file:
    stuffs = file.readlines()

char_regex = re.compile('[^a-zA-Z]')
int_regex = re.compile('[^0-9]')
genome = char_regex.sub('', stuffs[0])
numbers = stuffs[1].split(' ')
k = int(int_regex.sub('', numbers[0]))
l = int(int_regex.sub('', numbers[1]))
t = int(int_regex.sub('', numbers[2]))

genome_len = len(genome)
clump = find_clumps(genome, k, l, t)
print(' '.join(clump))

GGCACCTACG TCGCCACCGA AAGCACTTGG CAGCGATTTA TGCGACGGGA AAGTGAACTC


# 3: Find a Median String

In [7]:
def get_hamming_distance(dna1, dna2):
    """Count the positions at which ``dna1`` and ``dna2`` differ.

    Every index of ``dna1`` is compared with the same index of ``dna2``;
    any extra trailing symbols of ``dna2`` are ignored, and a ``dna2``
    shorter than ``dna1`` raises ``IndexError``.
    """
    return sum(
        1
        for position, symbol in enumerate(dna1)
        if symbol != dna2[position]
    )


def get_text_distance(text, pattern):
    """Return the smallest Hamming distance between ``pattern`` and any
    substring of ``text`` that has the same length as ``pattern``.

    If ``text`` is shorter than ``pattern`` there is no such substring and
    ``len(pattern)`` is returned.
    """
    pattern_len = len(pattern)
    min_distance = pattern_len

    for i in range(len(text) - pattern_len + 1):
        # Compute the distance once per window and reuse it for both the
        # comparison and the update.
        distance = get_hamming_distance(text[i:i + pattern_len], pattern)
        if distance < min_distance:
            min_distance = distance
            if min_distance == 0:
                break  # an exact match cannot be improved upon
    return min_distance


def get_dna_distance(dna, pattern):
    """Return the sum, over every string in ``dna``, of that string's
    distance to ``pattern`` as computed by ``get_text_distance``.

    An empty ``dna`` collection yields 0.
    """
    return sum(get_text_distance(text, pattern) for text in dna)


# Input layout: line 1 is k, every following line is one DNA string.
with open('rosalind_ba2b.txt') as file:
    stuffs = file.readlines()

char_regex = re.compile('[^a-zA-Z]')
int_regex = re.compile('[^0-9]')
k = int(int_regex.sub('', stuffs[0]))
dna = [char_regex.sub('', text) for text in stuffs[1:]]

symbols = 'GACT'
# Iterate the 4**k candidate k-mers lazily; building them all as a list
# first costs memory that grows exponentially with k and is never reused.
all_patterns = itertools.product(symbols, repeat=k)

# Starting bound: each string contributes at most k to the total distance.
min_distance = k*len(dna)
median = ''
for pattern in all_patterns:
    pattern = ''.join(pattern)
    distance = get_dna_distance(dna, pattern)
    # Strict comparison keeps the first pattern that reaches the minimum.
    if min_distance > distance:
        min_distance = distance
        median = pattern
print(median)


CTTGGT


# 4: Find a Profile-most Probable k-mer in a String

In [8]:
# Row of the profile matrix that belongs to each nucleotide.
symbol_index = {symbol: row for row, symbol in enumerate('ACGT')}

def get_probability(profile, pattern):
    """Return the product of the profile entries selected by ``pattern``.

    For each position of ``pattern`` the entry taken is
    ``profile[row of that symbol][position]``. An empty pattern gives 1.
    """
    probability = 1
    for column, symbol in enumerate(pattern):
        probability *= profile[symbol_index[symbol]][column]
    return probability


def get_most_probable_k_mer(profile, text, k):
    """Return the length-``k`` substring of ``text`` with the highest
    probability under ``profile``.

    Ties go to the leftmost substring. If ``text`` has no substring of
    length ``k`` the empty string is returned.
    """
    best_k_mer, best_probability = '', -1
    candidates = (text[start:start + k] for start in range(len(text) - k + 1))
    for candidate in candidates:
        candidate_probability = get_probability(profile, candidate)
        if candidate_probability > best_probability:
            best_k_mer, best_probability = candidate, candidate_probability
    return best_k_mer


# Input layout: line 1 is the text, line 2 is k, the remaining lines are the
# rows of the profile matrix.
text, k, profile_matrix = '', 0, []
with open('rosalind_ba2c.txt') as file:
    stuffs = file.readlines()

char_regex = re.compile('[^a-zA-Z]')
text = char_regex.sub('', stuffs[0])
k = int(stuffs[1])

# Each remaining line becomes one matrix row of floats.
number_regex = re.compile(r"[-+]?\d*\.\d+|\d+")
profile_matrix = [
    [float(token) for token in number_regex.findall(row)]
    for row in stuffs[2:]
]

print(get_most_probable_k_mer(profile_matrix, text, k))

TGTACT


# 5: Implement GreedyMotifSearch

In [9]:
# Row of the profile matrix that belongs to each nucleotide.
symbol_index = dict(zip('ACGT', range(4)))

def get_profile_from_motifs(motifs):
    """Build the 4-row frequency profile of a collection of motifs.

    Row order follows ``symbol_index`` and there is one column per position
    of the first motif. Each entry is the fraction of motifs carrying that
    symbol at that position. The result is a plain list of lists of floats.
    """
    motif_len = len(motifs[0])
    motif_count = len(motifs)
    counts = np.zeros((4, motif_len))
    for motif in motifs:
        for column in range(motif_len):
            counts[symbol_index[motif[column]], column] += 1
    return (counts / motif_count).tolist()


def get_probability(profile, pattern):
    """Multiply together the profile entries picked out by ``pattern``.

    Position ``p`` of the pattern selects column ``p`` of the row that
    ``symbol_index`` assigns to the symbol found there. An empty pattern
    gives 1.
    """
    probability = 1
    for position, symbol in enumerate(pattern):
        symbol_row = profile[symbol_index[symbol]]
        probability *= symbol_row[position]
    return probability


def get_score_from_motifs(motifs):
    """Return the score of a motif collection.

    For every column (one per position of the first motif) the number of
    motifs that do not carry the most frequent symbol of that column is
    added to the score.
    """
    motif_count = len(motifs)
    total = 0
    for position in range(len(motifs[0])):
        # Tally how often each symbol appears in this column.
        tally = {}
        for motif in motifs:
            symbol = motif[position]
            tally[symbol] = tally.get(symbol, 0) + 1
        total += motif_count - max(tally.values())
    return total


def get_most_probable_k_mer(profile, text, k):
    """Return the leftmost length-``k`` substring of ``text`` whose
    probability under ``profile`` is highest.

    The empty string is returned when ``text`` contains no substring of
    length ``k``.
    """
    winner = ''
    winning_probability = -1
    last_start = len(text) - k
    for start in range(last_start + 1):
        window = text[start:start + k]
        window_probability = get_probability(profile, window)
        # Strict comparison: a later window must beat, not tie, the winner.
        if window_probability > winning_probability:
            winner = window
            winning_probability = window_probability
    return winner


def greedy_motif_search(dna, k, t):
    """Greedily pick one k-mer per string of ``dna`` to form a motif set.

    The starting candidate is the first k-mer of every string. Then, for
    each k-mer of the first string, a motif set is grown one string at a
    time: the profile of the motifs chosen so far selects the most probable
    k-mer of the next string (strings 1 .. t-1). The set with the lowest
    score is returned; on equal scores the earlier set is kept.
    """
    best_motifs = [text[:k] for text in dna]
    best_score = get_score_from_motifs(best_motifs)

    first_text = dna[0]
    for start in range(len(first_text) - k + 1):
        candidate_motifs = [first_text[start:start + k]]
        for row in range(1, t):
            profile = get_profile_from_motifs(candidate_motifs)
            candidate_motifs.append(get_most_probable_k_mer(profile, dna[row], k))

        candidate_score = get_score_from_motifs(candidate_motifs)
        if candidate_score < best_score:
            best_motifs, best_score = candidate_motifs, candidate_score
    return best_motifs




# Input layout: line 1 holds k and t, every following line is one DNA string.
dna, k, t = [], 0, 0
with open('rosalind_ba2d.txt') as file:
    stuffs = file.readlines()

digits = re.findall(r'\d+', stuffs[0])
k, t = int(digits[0]), int(digits[1])

# Strip everything that is not a letter (line breaks, stray whitespace).
char_regex = re.compile('[^a-zA-Z]')
dna = [char_regex.sub('', line) for line in stuffs[1:]]

best_motifs = greedy_motif_search(dna, k, t)
print('\n'.join(best_motifs))

GTTTGTATACAC
ATAGCAACGCTG
CACAACCTTACT
TTTAACTGCGGT
GTTAGCTTGAGT
CTCTCCTTAGCT
GTTAGAACAGGC
GTTAGCATGACC
CTAAGCTCAAGT
GTTAGCCTGAAT
ATCTGTATAGGT
CTTAACACACAG
GTTAGCCTGAGT
GTTAGAATAACC
TTCAGCACGAGC
GTTACTCTACTC
GTTTACCTAATT
ATTAGACCGAAT
GTTAGCTTGACT
TTAACCCTGGGT
GTATACTTGACC
CTTAAAATGCGT
GTTACCATGACG
GTTAACATGATC
CTTGCAACGGCG
