# Creating the Baseline

1. Run multiple sequence alignment on mammals.fna. Execution is documented in [clustalw.out.txt](./clustalw.out.txt).
2. Generate NJ tree using clustalw (baseline_nj.ph).
3. Generate UPGMA tree using clustalw (baseline_upgma.ph).
4. Generate Maximum Likelihood tree using fasttree (baseline_ml.ph).

## Rename accessions to common names

In [3]:
def accession_to_commmon_name(input_file, output_file):
    """Copy a text file, swapping sequence accessions for common names.

    The full contents of ``input_file`` are read, every accession in the
    table below is replaced by its common name, and the result is written
    to ``output_file`` (overwriting it if it already exists).
    """
    # (accession, common name) pairs, applied strictly in this order.
    # Most NC_* accessions appear twice: first with the ".1" version suffix,
    # then with a bare trailing "." -- presumably for labels whose version
    # digit was truncated by the tree-building tool (confirm against the
    # tool output).
    substitutions = (
        ('CM054508.1', 'Pygmy_chimpanzee'),
        ('NC_001321.1', 'Finback_whale'),
        ('NC_001321.', 'Finback_whale'),
        ('NC_001325.1', 'Harbor_seal'),
        ('NC_001325.', 'Harbor_seal'),
        ('NC_001601.1', 'Blue_whale'),
        ('NC_001601.', 'Blue_whale'),
        ('NC_001602.1', 'Gray_seal'),
        ('NC_001602.', 'Gray_seal'),
        ('NC_001610.1', 'Opossum'),
        ('NC_001610.', 'Opossum'),
        ('NC_001640.1', 'Horse'),
        ('NC_001640.', 'Horse'),
        ('NC_001643.1', 'Chimpanzee'),
        ('NC_001643.', 'Chimpanzee'),
        ('NC_001645.1', 'Gorilla'),
        ('NC_001645.', 'Gorilla'),
        ('NC_001700.1', 'Cat'),
        ('NC_001700.', 'Cat'),
        ('NC_001794.1', 'Wallaroo'),
        ('NC_001794.', 'Wallaroo'),
        ('NC_001808.1', 'White_rhinoceros'),
        ('NC_001808.', 'White_rhinoceros'),
        ('NC_002083.1', 'Orangutan'),
        ('NC_002083.', 'Orangutan'),
        ('NC_010339.1', 'House_mouse'),
        ('NC_010339.', 'House_mouse'),
        ('NC_012374.1', 'Rat'),
        ('NC_012374.', 'Rat'),
        ('OK135155.1', 'Cow'),
        ('OM287160.1', 'Gibbon'),
        ('OM864526.1', 'Chicken'),
        ('OP605624.1', 'Human'),
    )

    # Load the whole tree file into memory.
    with open(input_file, 'r') as source:
        text = source.read()

    # Apply every substitution to the full text, one after another.
    for accession, common_name in substitutions:
        text = text.replace(accession, common_name)

    # Persist the relabelled tree.
    with open(output_file, 'w') as sink:
        sink.write(text)

In [4]:
# Directory that holds both the raw tool outputs and the renamed baseline trees.
baseline_dir = './trees'

# Destination for the consensus tree.
baseline_consensus_tree = f'{baseline_dir}/baseline_consensus.ph'

# Neighbor-joining tree: clustalw output -> renamed baseline.
clustal_nj_tree = f'{baseline_dir}/clustal_mammals_nj.ph'
baseline_nj_tree = f'{baseline_dir}/baseline_nj.ph'

# UPGMA tree: clustalw output -> renamed baseline.
clustal_upgma_tree = f'{baseline_dir}/clustal_mammals_upgma.ph'
baseline_upgma_tree = f'{baseline_dir}/baseline_upgma.ph'

# Maximum-likelihood tree: fasttree output -> renamed baseline.
fasttree_ml_tree = f'{baseline_dir}/fasttree_mammals_ml.ph'
baseline_ml_tree = f'{baseline_dir}/baseline_ml.ph'

# Rewrite each tool output with common names, in NJ, UPGMA, ML order.
for tool_output, renamed_output in (
    (clustal_nj_tree, baseline_nj_tree),
    (clustal_upgma_tree, baseline_upgma_tree),
    (fasttree_ml_tree, baseline_ml_tree),
):
    accession_to_commmon_name(tool_output, renamed_output)

In [5]:
from dendropy import TaxonNamespace, Tree, TreeList

# Parse each renamed baseline tree from its Newick file (NJ, UPGMA, ML order)
# and gather the results into a single TreeList.
tree_list = TreeList(
    [
        Tree.get(path=newick_path, schema="newick")
        for newick_path in (baseline_nj_tree, baseline_upgma_tree, baseline_ml_tree)
    ]
)

In [6]:
# Build a consensus tree from the trees collected in tree_list.
# No arguments are passed, so dendropy's default settings apply; the original
# note gives the frequency threshold as 0.50 -- NOTE(review): confirm that this
# matches the default of the dendropy version in use.
consensus_tree = tree_list.consensus()

In [7]:
# Write the consensus tree to disk in Newick format. The file is opened in a
# `with` block so the handle is flushed and closed as soon as the write
# finishes, instead of being left open until garbage collection.
with open(baseline_consensus_tree, 'w') as consensus_file:
    consensus_tree.write(file=consensus_file, schema="newick")