In [1]:
import ReadNewickFormat
import BinaryTree
import pandas as pd

# Symbols over which score() builds its per-node cost tables.
alphabet = ["A","T","C","G"]

# Leaf node class of the tree; score() compares type(node) against it
# to decide between the leaf and the internal-node recurrence.
type_LeafNode = BinaryTree.BinaryTree.LeafNode

# Internal node class of the tree, kept as an alias next to type_LeafNode.
type_InternalNode = BinaryTree.BinaryTree.InternalNode

In [2]:
# Parse a small hard-coded topology to check the Newick reader.
T, Error = ReadNewickFormat.run("((2,3),((1,4),5))")

# One single-character sequence per leaf of the topology above.
testSeq = ["A", "A", "A", "A", "C"]

# Show the parse error if there is one, otherwise the parsed tree.
print(Error.as_string() if Error else T)
    


((2,3),((1,4),5))


In [3]:
def prepareLeaves(T,sequences):
    """Attach the given sequences to the leaves of ``T``.

    The list returned by ``T.getLeaves()`` is sorted in place by each leaf's
    ``num`` attribute; ``sequences[k]`` is then handed to the leaf with the
    k-th smallest ``num`` via ``setSequence``.

    Raises:
        IndexError: if there are more sequences than leaves.
    """
    ordered = T.getLeaves()
    ordered.sort(key=lambda leaf: leaf.num)

    position = 0
    for seq in sequences:
        # Index (rather than zip) so surplus sequences still raise.
        ordered[position].setSequence(seq)
        position += 1
    

In [4]:
def score(node, seq_index):
    """Compute the per-symbol cost table of ``node`` for one sequence column.

    The table is a dict keyed by every symbol of the module-level
    ``alphabet`` and is stored on ``node.score``; nothing is returned.

    * Leaf (``type(node) == type_LeafNode``): cost 0 for the symbol found at
      ``node.sequence[seq_index]``, infinity for every other symbol.
    * Any other node: for each symbol, the sum over both children of the
      cheapest ``child.score[other] + mismatch`` where the mismatch is 0 for
      equal symbols and 1 otherwise. Both children must already carry a
      ``score`` table for this column.
    """
    unreachable = float('inf')

    def mismatch(a, b):
        # Unit substitution cost.
        return 0 if a == b else 1

    if type(node) == type_LeafNode:
        table = {}
        for symbol in alphabet:
            table[symbol] = 0 if node.sequence[seq_index] == symbol else unreachable
        node.score = table
        return

    def cheapest(child, symbol):
        # Best cost of the child's subtree given the parent carries `symbol`.
        return min([child.score[other] + mismatch(other, symbol) for other in alphabet])

    node.score = {
        symbol: cheapest(node.left, symbol) + cheapest(node.right, symbol)
        for symbol in alphabet
    }
            
    



In [5]:
def smallParsimony(T, leaves):
    """Return the total parsimony score of tree ``T`` over all sequence columns.

    For every column index of the leaf sequences the tree is scored bottom-up
    with the module-level ``score`` function: leaves first, then each internal
    node as soon as both of its children have been scored for that column.
    The minimum entry of ``T.root.score`` is added to the running total.

    Args:
        T: tree object exposing ``root``; nodes expose ``left``, ``right``,
            ``parent``, ``setTag`` and receive a ``score`` dict from ``score``.
        leaves: the leaf nodes of ``T``, each with a ``sequence``; the number
            of columns is taken from ``leaves[0].sequence``.

    Returns:
        The summed minimum root cost over all columns; 0 if ``leaves`` is
        empty.
    """
    if not leaves:
        return 0

    n = len(leaves[0].sequence)

    res = 0
    for i in range(n):
        # Readiness is tracked per column in a local set instead of the
        # nodes' persistent tags: tags left over from a previous column (or a
        # previous call) would make a parent look ready before its children
        # were rescored, so it would be scored from stale child tables.
        scored = set()
        ripeNodes = []

        for leaf in leaves:
            # Tag is still set so the nodes end up marked exactly as before.
            leaf.setTag(True)
            score(leaf, i)
            scored.add(id(leaf))
            parent = leaf.parent

            # parent is None only when the tree consists of a single leaf.
            if (parent is not None
                    and id(parent.left) in scored
                    and id(parent.right) in scored):
                ripeNodes.append(parent)

        while ripeNodes:
            node = ripeNodes.pop()
            node.setTag(True)
            score(node, i)
            scored.add(id(node))
            parent = node.parent

            # The root has no parent; keep draining instead of bailing out so
            # no queued node is ever skipped.
            if (parent is not None
                    and id(parent.left) in scored
                    and id(parent.right) in scored):
                ripeNodes.append(parent)

        res += min(T.root.score.values())

    return res

In [6]:
def readFile(filename):
    """Read ``filename`` and return its non-blank lines as a list of strings.

    Trailing whitespace (including the newline) is stripped from every line;
    lines that are empty after stripping are dropped. Leading whitespace is
    kept.
    """
    with open(filename) as handle:
        stripped = [raw.rstrip() for raw in handle]
    return [entry for entry in stripped if entry != ""]

In [7]:
def getCostMatrix(costMatrix):
    """Build a labelled cost table from the text lines of a cost file.

    All space characters are removed from every line first. The characters of
    the first line become both the row and the column labels. Row ``r`` of
    the table is read from line ``r + 1``: its first character is skipped
    (row label) and the following ``r + 1`` characters fill columns
    ``0..r``.

    Consequences of this character-wise parsing:
      * every cost is a single character and is stored as a string,
      * only the lower triangle including the diagonal is filled; the
        remaining cells stay NaN.

    Returns:
        pandas.DataFrame indexed and labelled by the header characters.
    """
    rows = [line.replace(" ", "") for line in costMatrix]
    labels = list(rows[0])

    table = pd.DataFrame(columns=labels, index=labels)

    for r in range(len(labels)):
        row_text = rows[r + 1]
        for c in range(r + 1):
            # +1 skips the row label at the start of the line.
            table.iloc[r, c] = row_text[c + 1]
    return table

In [11]:
# Read the sequences, the cost table and the tree topology from disk.
sequences = readFile("sequenzen.txt")

# NOTE: the cost table is loaded here but is not passed to smallParsimony,
# which scores with a fixed 0/1 mismatch cost.
costMatrix = getCostMatrix(readFile("kosten.txt"))

# Only the first non-blank line of the topology file is parsed.
T, Error =  ReadNewickFormat.run(readFile("topologie.txt")[0])


if Error:
    print(Error.as_string())
elif not all(len(sequences[0]) == len(seq) for seq in sequences):
    print("Sequenzen haben nicht gleiche Länge.")
elif len(sequences) != len(T.getLeaves()):
    # The check is "!=", so the message must cover too few sequences as well
    # as too many.
    print("Anzahl der Sequenzen stimmt nicht mit der Anzahl der Blätter im Newick-Format überein.")
else:
    prepareLeaves(T, sequences)
    print(smallParsimony(T, T.getLeaves()))

10
