In [1]:
#Ihsan Muchsin

from collections import OrderedDict

def _read_fasta_records(infile):
    '''
    Parse a FASTA file into a list of (header, sequence) tuples, in file order.

    Records are kept in a list rather than a dict keyed by header, so two
    records that share a header are both retained. Sequence lines are stripped
    of surrounding whitespace, upper-cased and concatenated per record.
    Text that appears before the first header is discarded when a header
    follows; a file with no header at all yields one record whose header is None.
    '''
    records = []
    header = None
    chunks = []
    with open(infile, 'r') as fin:
        for line in fin:
            if line.startswith(">"):  # a FASTA record name starts with >
                if header is not None:
                    records.append((header, ''.join(chunks)))
                header = line.lstrip(">").strip()
                chunks = []  # start collecting the new record's sequence
            else:
                chunks.append(line.strip().upper())
    # The last record has no following header to trigger its storage.
    records.append((header, ''.join(chunks)))
    return records


def _build_profile(records):
    '''
    Count, per position, how often each of A, C, G, T occurs.

    Records with an empty sequence contribute nothing and are skipped. The
    profile width is the length of the first non-empty sequence; every other
    non-empty sequence must have that same length.

    Raises ValueError on a length mismatch and KeyError on a character other
    than A, C, G or T.
    '''
    non_empty = [(header, seq) for header, seq in records if seq]
    length = len(non_empty[0][1]) if non_empty else 0
    profile = OrderedDict((nucleotide, [0] * length) for nucleotide in 'ACGT')
    for header, seq in non_empty:
        if len(seq) != length:
            raise ValueError(
                'sequence {!r} has length {} but {} was expected'.format(
                    header, len(seq), length))
        for index, char in enumerate(seq):
            profile[char][index] += 1
    return profile


def _consensus_string(profile):
    '''
    Build a consensus string by taking the most frequent nucleotide per column.
    '''
    # max() returns the first maximal item, so scanning the keys in reverse
    # resolves ties in favour of the LAST key (T, then G, then C, then A).
    candidates = tuple(profile)[::-1]
    width = len(profile['A'])
    return ''.join(
        max(candidates, key=lambda nucleotide: profile[nucleotide][index])
        for index in range(width)
    )


def cons(infile, outfile):
    '''
    Given: 1) An input file containing a collection of DNA strings of equal length in FASTA format. 2) The name of output file
    Return: A tuple (profile_matrix, cons_string). profile_matrix is an OrderedDict mapping 'A', 'C', 'G', 'T' (in that order) to a list of per-position counts; cons_string is a consensus string for the collection. (If several possible consensus strings exist, the one returned picks, per position, the last of A, C, G, T that reaches the maximal count.)

    Side effect: writes the consensus string on the first line of outfile, followed by one "X: n n n ..." line per nucleotide (no trailing newline).

    Notes:
    - Records with identical headers are all counted.
    - Lower-case nucleotides are counted as their upper-case equivalents.
    - An input without any sequence data yields empty count lists and an empty consensus string.

    Raises: ValueError if the sequences do not all have the same length; KeyError if a sequence contains a character other than A, C, G, T.
    '''
    records = _read_fasta_records(infile)
    profile_matrix = _build_profile(records)
    cons_string = _consensus_string(profile_matrix)

    # The output file is only opened after all validation has succeeded, so a
    # malformed input never leaves a partial result behind.
    with open(outfile, 'w') as fout:
        fout.write(cons_string + '\n')
        prof_mat_string = [
            '{}: {}'.format(key, ' '.join(str(count) for count in counts))
            for key, counts in profile_matrix.items()
        ]
        fout.write('\n'.join(prof_mat_string))

    return (profile_matrix, cons_string)

In [2]:
# Input FASTA file and the file that receives the written result.
infile = 'rosalind_cons.txt'
outfile = 'cons_sol.txt'

# cons() writes its result to `outfile`; the same result is echoed below.
profile_matrix, cons_string = cons(infile, outfile)
print(cons_string)
for key, chars in profile_matrix.items():
    row = ' '.join(map(str, chars))  # counts for this nucleotide, space-separated
    print('{}: {}'.format(key, row))

ATTCGCCTAGTTACAACTATCAATTACGCAGAGGCTATTCGGTATGGGGTGAGCTAGCGAATTACGCCGAGGAGGAATACGGTACCTCTAGCAATCGGCGAATCATTTCAACTAGGCCGACTTGCTTAGGTCCAGGATTCTCTTAGGTGGCCGGTTTATGGACCCCAGGTTGCGGATGCTTGATGTGCGGTGATTTATTAAGGGATTTTTGGGGTGAGTTAAGGGCGTGTTTGACAGTTGGTTGCAGTCTTTAGTGGCTCTCTCTGTCGTGCGCACACAGCGGCTGTACTAGACTGCGGTCTCTGGTACTTTCGCCTACTACTGTAGCGTGAGTCTAGTGCTGGAGGCGCTCATACCTGTGTTGATCCAAGTCGCTAAATTGGTGTGCTGTTCGTCCATTTGTAGCGGGGTAAGAGGTGATATTTGCAGCCGTGCTACCTGGTACTCGATCCCTCTACGTGCCTAATCAAATACCACCTGTATAGTCGTGTTGCCTCGGCACAGAGTTAGTATCGCTCCTATTCCGTCACGTAACCACAATTACATGTCCCCGAGCGCCCGAGCCGGGGTGCGCTACATCGCTTGGTAGGCATGTTACTGTTATTTTTTTCCGAACTGTGTTAGTATTTGAACCTGTAGGAAGAGACAACCTGTTTGACACTCAGGTCGGTTAGAGTTTTTAATGCCTATGGAGCCATAATATCATGGCACGATGTCTCTGGTTACAATTAGTTGACCTTACGTGGACGTCTGCTGATTACTGTTTTCACCGCTGCCCTAGATTGGCTTAATGGGTCGTATGCCTACATGTGCATTCTTCGGCTTGCCGAGGGTTTATGCCTCGAGAGCTTCCGTAGGCCTGCTTGTGAGAGATAAGCTCGTTGTAATTCGGCGCAGGCGGTTCACTTAGGTAGACCCCACCTTGTACAGAACTCTTGATCATGCCAACGACGTGCACTCTCGTCTGTCTCGACGCTAATCGACATCG
A: 5 2 3 1 