Align mitochondrial genes based on mitochondrial sequences from Wolters et al 2023

In [None]:
import os
import pandas as pd
import matplotlib.pyplot as plt

from Bio import SeqIO
from Bio import Seq

from ete3 import Tree

import shutil

import diverse_yeast_tools as dyt  


# Root directories come from the project helper module.
genomes_dir = dyt.genomes_dir
y1000plus_dir = dyt.y1000plus_dir
base_dir = dyt.base_dir

# Folder with the per-gene mitochondrial alignments.
mito_aln_dir = os.sep.join([
    genomes_dir,
    os.path.normpath("yeast_mitochondrial/Finalized_Assemblies/gene_alignments/ind_genes"),
])

# Load y1000 species table (first column is used as the index).
# NOTE(review): no os.sep is inserted here, so y1000plus_dir presumably ends
# with a path separator - confirm against diverse_yeast_tools.
y1000_species = pd.read_csv(
    y1000plus_dir + os.path.normpath("y1000plus_tools_data/y1000plus/y1000_species_table.csv"),
    index_col=0,
)

In [13]:
# #mito_gois is the core set of genes shared by all species (per Wolters et al)
# mito_gois = ['cox1','cox2','cox3','cob','atp6','atp8','atp9']

# name_errata = {goi: {'substitutions': {}, 'missing_species':set()} for goi in mito_gois}

# for goi in mito_gois: 
#     name_errata[goi]['substitutions'] = {'Pachysolen__tannophilus_' + goi: 'Pachysolen tannophilus' , 
#                                         'Candida__apicola_' + goi: 'Starmerella apicola'
#                                         }

In [None]:
# #old version
# #Load selected species: 
# species_set = pd.read_csv(os.path.normpath(base_dir + '/selected_proteins/species_selection/species_selection.csv'), index_col = 0)
# species_set_filt = species_set.loc[species_set['Load']=='Y']
# species_names = set(species_set_filt.index)

# mito_encoded_genes = os.listdir(mito_aln_dir)

# og_list = {}

# for goi in mito_gois: #os.listdir(mito_aln_dir):
#   print(goi)
#   #goi = 'atp6'


#   name_errata_goi = name_errata[goi]

#   og_list_goi = {}

#   species_present = []

#   #load fasta of all mitochondrially aligned genes and cycle through to capture sequence information of selected genes.  Also capture missing species
#   for record in SeqIO.parse(mito_aln_dir + os.sep + goi + os.sep + goi + '.combined.translated.fasta', 'fasta'):
#       spec_name = ''
#       if len(record.id.split('.'))==3:
#         sep = '.'
#         spec_name = ' '.join(record.id.split(sep)[0:2])
#         if spec_name == 'Komagataella phaffii':  # species as in shen et al 2018 in table 1 from supplement is changed.  
#             spec_name = 'Komagataella pastoris'
#       elif len(record.id.split('__'))>=3:
#         sep = '__'
#         spec_name = ' '.join(record.id.split(sep)[0:2])
#       elif record.id in name_errata_goi['substitutions'].keys():
#         spec_name = name_errata_goi['substitutions'][record.id]         
      
#       if spec_name in species_names:
#           if spec_name == 'Candida albicans':
#             gene_id = record.description.split('.')[2]
#           elif len(record.description.split(' '))==1:
#             gene_id = record.description
#             print(record.description + 'does not have a name separated by a space.')
#           else: 
#             gene_id = record.description.split(' ')[2]
#           species_present.append(spec_name)
#           og_list_goi[record.id] = {'species': spec_name,
#                                     'gene_id': gene_id,
#                                   'seq':str(record.seq)}

#   missing_specs = species_names - set(species_present)
#   print(missing_specs)
#   name_errata_goi['missing_species']= missing_specs
  
#   og_list[goi] = og_list_goi

In [31]:
#rewritten version

# Load the species selection table; only rows with Load == 'Y' are used.
species_selection_csv = os.path.normpath(base_dir + '/selected_proteins/species_selection/species_selection.csv')
species_set = pd.read_csv(species_selection_csv, index_col=0)
species_set_filt = species_set.loc[species_set['Load'] == 'Y']
species_names = set(species_set_filt.index)

mito_genes_core_list = ['cox1', 'cox2', 'cox3', 'cob', 'atp6', 'atp8', 'atp9']

# Record ids that fit neither the 'Genus.species.x' nor the 'Genus__species__x'
# pattern, mapped to the species name used in the selection table:
#   - Pachysolen tannophilus has only a single underscore after its name
#   - Starmerella apicola is called Candida apicola in Wolters et al.
name_errata = {
    goi: {
        'Pachysolen__tannophilus_' + goi: 'Pachysolen tannophilus',
        'Candida__apicola_' + goi: 'Starmerella apicola',
    }
    for goi in mito_genes_core_list
}

# goi -> set of selected species with no record in the Wolters alignment
missing_mito_genes = {}

# goi -> {record id: {'species', 'gene_id', 'seq'}} for the selected species
mito_genes_wolters = {}

for goi in mito_genes_core_list:
    # Fasta of all translated sequences for this gene
    goi_fasta_fname = mito_aln_dir + os.sep + goi + os.sep + goi + '.combined.translated.fasta'

    mito_genes_wolters_goi = {}
    species_present = []

    for record in SeqIO.parse(goi_fasta_fname, 'fasta'):
        dot_fields = record.id.split('.')
        underscore_fields = record.id.split('__')

        # The species name is the first two fields of the record id, or comes
        # from the errata table; it stays empty when nothing matches.
        if len(dot_fields) >= 3:
            spec_name = ' '.join(dot_fields[0:2])
        elif len(underscore_fields) >= 3:
            spec_name = ' '.join(underscore_fields[0:2])
        else:
            spec_name = name_errata[goi].get(record.id, '')

        # In Wolters et al, K. pastoris is called K. phaffii
        if spec_name == 'Komagataella phaffii':
            spec_name = 'Komagataella pastoris'

        if spec_name not in species_names:
            continue

        species_present.append(spec_name)

        # Two records use a different separator in their description line.
        if goi == 'cox3' and spec_name == 'Candida albicans':
            gene_id = record.description.split('.')[2]
        elif goi == 'atp9' and spec_name == 'Wickerhamomyces anomalus':
            gene_id = record.description.split('__')[2]
        else:
            gene_id = record.description.split(' ')[2]

        mito_genes_wolters_goi[record.id] = {
            'species': spec_name,
            'gene_id': gene_id,
            'seq': str(record.seq),
        }

    missing_specs = species_names - set(species_present)
    missing_mito_genes[goi] = missing_specs
    mito_genes_wolters[goi] = mito_genes_wolters_goi

print(missing_mito_genes)

{'cox1': {'Ascoidea rubescens', 'Schizosaccharomyces pombe', 'Sporopachydermia lactativora', 'Alloascoidea hylecoeti'}, 'cox2': {'Pachysolen tannophilus', 'Ascoidea rubescens', 'Schizosaccharomyces pombe', 'Alloascoidea hylecoeti'}, 'cox3': {'Ascoidea rubescens', 'Schizosaccharomyces pombe', 'Alloascoidea hylecoeti'}, 'cob': {'Ascoidea rubescens', 'Schizosaccharomyces pombe', 'Alloascoidea hylecoeti'}, 'atp6': {'Ascoidea rubescens', 'Schizosaccharomyces pombe', 'Alloascoidea hylecoeti'}, 'atp8': {'Eremothecium gossypii', 'Ascoidea rubescens', 'Schizosaccharomyces pombe', 'Alloascoidea hylecoeti'}, 'atp9': {'Ascoidea rubescens', 'Schizosaccharomyces pombe', 'Kluyveromyces marxianus', 'Alloascoidea hylecoeti'}}


Missing Species: 
1. S. pombe - extracted from Pombase


2) Alloascoidea hylecoeti and Ascoidea rubescens

From Antonis: 

 we left these two species out because we could only find a few genes and only get very fragmented assemblies. For Alloascoidea, we could find 11 mt genes but the longest contig contained only 2 of them. For Ascoidea, we could only find 4 genes and the longest contig contained a single gene. Thus, it was hard to trust that we knew what was going on (in contrast to most other species, where we could retrieve a single contig that contained the entire genome).


Search of ref genomes with tblastn for missing genes: 

These genes need to be translated using NCBI translation table 4, which is the one used for mitochondria for the other three Ascoidea species and also for Sporopachydermia, which are closest in the tree from that paper.

Searched, then saved hit as [spec_abbrev]_[gene_id]_[search original filename].xml

Then got sequence for hit and saved as [spec_abbrev]_[gene_id]_cds.fasta

Then imported and translated with Biopython's `Bio.Seq.translate`

A. hyl: https://www.ncbi.nlm.nih.gov/datasets/genome/GCA_001600815.1/

['cox1', 'cox2', 'cox3', 'cob', 'atp6','atp8','atp9']

Cox1: NP_009305.1, Hit was spread across several scaffolds.   Left out
Cox2: NP_009326.1, Single hit 
Cox3: NP_009328.1, Single hit
Cob: NP_009315.1, Three hits, somewhat overlapping, across 3 scaffolds. Left out
Atp6: NP_009313.1, Single hit, only covered about 6 residues, left out
Atp8: NP_009312.1, No hits
Atp9: NP_009319.1, No hits


A. rub: https://www.ncbi.nlm.nih.gov/datasets/genome/GCF_001661345.1/

Cox1: NP_009305.1, Two nonoverlapping hits across two scaffolds, but big gaps. Left out.
Cox2: NP_009326.1, No hits
Cox3: NP_009328.1, No hits
Cob: NP_009315.1, No hits
Atp6: NP_009313.1, No Hits
Atp8: NP_009312.1, No Hits
Atp9: NP_009319.1, No Hits

3) Other searches: 

Sporopachydermia lactativora for the cox1 gene.  Antonis found the mitochondrial genome and found a good hit there by doing tblastn with

Query: 
Cox1: NP_009305.1 

Subject: S. lactativora mitochondrial genome
gi|2618218503|gb|OR704033.1|
https://www.ncbi.nlm.nih.gov/nuccore/OR704033.1?report=GenBank 

Translated with codon table 4




Missing Genes found: 

These genes were placed into a fasta file called: 

examples/etc/mitochondrial_sequences/missing_sequences.fasta


>Eremothecium__gossypii__atp8 atp8 uniprot__Q75G40
https://www.uniprot.org/uniprotkb/Q75G40/entry
>Pachysolen__tannophilus__cox2 cox2 uniprot__Q9XM29
https://www.uniprot.org/uniprotkb/Q9XM29/entry
>Pachysolen__tannophilus__cox2 cox2 uniprot__A0A1E4TMZ3 - this one is shorter, probably incorrect
https://www.uniprot.org/uniprotkb/A0A1E4TMZ3/entry
>Kluyveromyces__marxianus__atp9 atp9 ncbi__AP012221.1


Saved search: 


In [2]:
#Import hits from A. hylecoeti and Sporopachydermia and translate using translation table 4

retranslated_genes_fname = base_dir + os.sep + os.path.normpath('examples/etc/mitochondrial_sequences/retranslated.fasta')

# For every species: the NCBI translation table to use and, per gene, the blast
# hits keyed by hit number.  hit_start / hit_stop are used below to order the
# hits and to measure the gap or overlap (in residues) between consecutive hits.
# NOTE(review): the key 'Sporopahcydermia__lactitavora' is spelled differently
# from 'Sporopachydermia lactativora' in the species table.  It is left as is
# because it is written into the fasta headers - confirm before renaming.
genes_to_translate = {
    'Alloascoidea__hylecoeti': {
        'coding_table': 4,
        'genes': {
            'cox2': {1: {'hit_frame': -2, 'hit_start': 86, 'hit_stop': 151}},
            'cox3': {1: {'hit_frame': -2, 'hit_start': 6, 'hit_stop': 153}},
        },
    },
    'Sporopahcydermia__lactitavora': {
        'coding_table': 4,
        'genes': {
            'cox1': {
                1: {'hit_frame': -1, 'hit_start': 370, 'hit_stop': 533},
                2: {'hit_frame': -2, 'hit_start': 4, 'hit_stop': 129},
                3: {'hit_frame': -2, 'hit_start': 130, 'hit_stop': 237},
                4: {'hit_frame': -2, 'hit_start': 238, 'hit_stop': 322},
                5: {'hit_frame': -3, 'hit_start': 320, 'hit_stop': 374},
            },
        },
    },
}

#Not sure what hit frame means in the context of the blast hits.  Using Expasy, only the 3'5' frame1 seemed to match the search. 

spec_abbrevs = {'Alloascoidea__hylecoeti': 'Ahyl',
                'Sporopahcydermia__lactitavora': 'Slac'}

spec_abbrev_rlookup = dict(zip(spec_abbrevs.values(), spec_abbrevs.keys()))

cds_dir = base_dir + os.sep + os.path.normpath('examples/etc/mitochondrial_sequences/cds')
cds_fnames = os.listdir(cds_dir)


def _translate_cds_file(cds_fname, coding_table):
    """Translate one hit stored as a fasta file in cds_dir.

    cds_fname: file name (not path) of a single-record nucleotide fasta.
    coding_table: NCBI translation table passed to Bio.Seq.translate.
    Returns the translated protein sequence as a string.
    """
    cds_rc = str(SeqIO.read(cds_dir + os.sep + cds_fname, 'fasta').seq)
    # For some reason the reverse complement is needed here - probably something
    # about the reading frame (the hit frames are all negative, but the value
    # didn't seem to matter).
    cds = Seq.reverse_complement(cds_rc)
    return Seq.translate(cds, table=coding_table)


# Look up the hit coordinates for every cds file.  The first three
# underscore-separated fields of the file name are the species abbreviation,
# the gene and the hit number.
cds_fnames_dict = {}
for cds_fname in cds_fnames:
    (spec_abbrev, goi, hit_number, *_) = cds_fname.split('_')
    species = spec_abbrev_rlookup[spec_abbrev]
    goi_hit_data = genes_to_translate[species]['genes'][goi][int(hit_number)]
    cds_fnames_dict[cds_fname] = (
        species,
        spec_abbrev,
        goi,
        goi_hit_data['hit_start'],
        goi_hit_data['hit_stop'],
        goi_hit_data['hit_frame'],
        int(hit_number),
    )

cds_fnames_df = pd.DataFrame.from_dict(
    cds_fnames_dict,
    orient='index',
    columns=['species', 'spec_abbrev', 'goi', 'start', 'stop', 'frame', 'hit_number'],
)

with open(retranslated_genes_fname, 'w') as f_out:
    for species, genes_to_translate_data in genes_to_translate.items():
        spec_abbrev = spec_abbrevs[species]
        coding_table = genes_to_translate_data['coding_table']
        gois = genes_to_translate_data['genes']

        for goi in gois:
            #Subset files based on spec_abbrev and goi, ordered by hit start
            cds_fnames_subset = cds_fnames_df[(cds_fnames_df['spec_abbrev'] == spec_abbrev) & (cds_fnames_df['goi'] == goi)]
            cds_fnames_subset_sorted = cds_fnames_subset.sort_values(by='start')

            if cds_fnames_subset_sorted.empty:
                raise FileNotFoundError('No cds fasta file found in {} for {} {}'.format(cds_dir, spec_abbrev, goi))

            hit_fnames = list(cds_fnames_subset_sorted.index)
            hit_starts = list(cds_fnames_subset_sorted['start'])
            hit_stops = list(cds_fnames_subset_sorted['stop'])

            # The first hit starts the merged sequence.  A gene with a single
            # hit never enters the merge loop, so it is simply translated with
            # the species' coding table like every other hit.
            start = hit_starts[0]
            current_fname = hit_fnames[0]
            current_stop = hit_stops[0]
            current_protein_seq = _translate_cds_file(current_fname, coding_table)
            protein_seq_list = [current_protein_seq]

            for next_fname, next_start, next_stop in zip(hit_fnames[1:], hit_starts[1:], hit_stops[1:]):
                next_protein_seq = _translate_cds_file(next_fname, coding_table)

                # gap == 1 means the next hit starts right after the current one.
                gap = next_start - current_stop
                print(gap)
                if gap > 1:
                    print('Warning:  Gap between {} and {}, merging sequence ignoring gap'.format(current_fname, next_fname) )
                elif gap < 1:
                    # The hits overlap by 1 - gap residues: drop them from the
                    # end of the previous piece and keep the next hit whole.
                    print('Warning:  Overlap between {} and {}'.format(current_fname, next_fname) )
                    print(current_protein_seq)
                    overlap_current = current_protein_seq[gap-1:]
                    print(next_protein_seq)
                    overlap_next = next_protein_seq[0:-gap+1]
                    print('first sequence overlap is {}, second sequence overlap is {}, using second sequence overlap'.format(overlap_current, overlap_next))
                    protein_seq_list[-1] = protein_seq_list[-1][0:gap-1]
                protein_seq_list.append(next_protein_seq)

                current_stop = next_stop
                current_fname = next_fname
                current_protein_seq = next_protein_seq

            protein_seq = ''.join(protein_seq_list)
            stop = current_stop

            f_out.write('>' + species + '__' + goi + ' ' + goi + ' retranslated__start_' + str(start) + '__stop_' + str(stop) + '\n')
            f_out.write(protein_seq + '*\n')



# # coding_dna = 
# # 
# # print(sequence)

1
1
-2
FFGHPEVYILIIPGFGVVSQIVSTYAKKPIFGRIGMIYAMASIGLLGFLVWSHHMYVVGLDADTRAYFTSATMIIAIPTGIKIWA
LFSWIASLYGGSIVFTTPMLYALAFLFLFTIGGLSGVALSNSSLDIAFHDKIFIV
first sequence overlap is IWA, second sequence overlap is LFS, using second sequence overlap
-4
LFSWIASLYGGSIVFTTPMLYALAFLFLFTIGGLSGVALSNSSLDIAFHDKIFIV
TYYVIAHFHYVLSMGAVFSLFAGYYYWSPKILGLYYNERLAQIQFWLLFIGANLTFFVMHFLGLQGMPRRIPDYPDAYYGWNYIASFGSLISVISVIVFIYVLYDQLVYGLDNKYNGKAVTILTTPDVSESNLVYLNNNSHRGTSLEWVLNSPPAFHSFNTPAIQ
first sequence overlap is KIFIV, second sequence overlap is TYYVI, using second sequence overlap


In [None]:
#Extract peptide sequences from wolters et al

mito_genes_wolters_fname = base_dir + os.sep + os.path.normpath('examples/etc/mitochondrial_sequences/from_wolters.fasta')

with open(mito_genes_wolters_fname, 'w', newline='\n') as f_out:

    # mito_genes_wolters maps goi -> {wolters record id: {'species', 'gene_id', 'seq'}}.
    # (og_list, used here before, only exists in the commented-out old version.)
    for goi, mito_genes_wolters_goi in mito_genes_wolters.items():

        for gene_info in mito_genes_wolters_goi.values():
            spec_fields = gene_info['species'].split(' ')
            # Header format: >Genus__species__gene gene wolters__<gene_id>
            f_out.write('>' + spec_fields[0] + '__' + spec_fields[1] + '__' + goi + ' ' + goi + ' wolters__' + gene_info['gene_id'] + '\n')
            f_out.write(gene_info['seq'] + '\n')
        



In [None]:
#Extract proteins from Pombe genome

# Core gene name -> S. pombe gene name.  Only cob differs (cob1 in pombe).
# mito_genes_core_list replaces mito_gois, which is only defined in
# commented-out code.
pombe_genename_map_rev = {mito_goi: mito_goi for mito_goi in mito_genes_core_list}
pombe_genename_map_rev['cob'] = 'cob1'
# S. pombe gene name -> core gene name
pombe_genename_map = {genename_pombe: genename for genename, genename_pombe in pombe_genename_map_rev.items()}

# os.sep is added as for mito_aln_dir; a doubled separator is harmless if
# genomes_dir already ends with one.
pombe_proteome_fname = genomes_dir + os.sep + os.path.normpath('schizosaccharomyces_pombe/peptide_20240918.fa')

pombe_mito_genes_fname = base_dir + os.sep + os.path.normpath('examples/etc/mitochondrial_sequences/pombe_core_genes.fasta')

with open(pombe_mito_genes_fname, 'w', newline='\n') as f_out:
    for record in SeqIO.parse(pombe_proteome_fname, 'fasta'):
        # The gene name is the text before the first '|' in the second
        # space-separated field of the description line.
        pombe_genename = record.description.split(' ')[1].split('|')[0]
        if pombe_genename not in pombe_genename_map:
            continue
        genename = pombe_genename_map[pombe_genename]
        pombase_id = record.id.split(':')[0]
        f_out.write('>Schizosaccharomyces__pombe__' + genename + ' ' + genename + ' pombase__' + pombe_genename + '_' + pombase_id + '\n')
        f_out.write(str(record.seq) + '\n')

In [9]:
# Combine all sources of mitochondrial core gene sequences into one fasta file.
mito_genes_fname = base_dir + os.sep + os.path.normpath('examples/etc/mitochondrial_sequences/mito_core_genes.fasta')

pombe_mito_genes_fname = base_dir + os.sep + os.path.normpath('examples/etc/mitochondrial_sequences/pombe_core_genes.fasta')
mito_genes_wolters_fname = base_dir + os.sep + os.path.normpath('examples/etc/mitochondrial_sequences/from_wolters.fasta')
missing_genes_fname = base_dir + os.sep + os.path.normpath('examples/etc/mitochondrial_sequences/missing_sequences.fasta')
retranslated_genes_fname = base_dir + os.sep + os.path.normpath('examples/etc/mitochondrial_sequences/retranslated.fasta')


with open(mito_genes_fname,'w') as f_out:
    for fname in [pombe_mito_genes_fname, mito_genes_wolters_fname, missing_genes_fname,retranslated_genes_fname]:
        with open(fname,'r') as f_in:
            contents = f_in.read()
        f_out.write(contents)
        # A source file without a trailing newline would otherwise glue the
        # first header of the next file onto its last sequence line.
        if contents and not contents.endswith('\n'):
            f_out.write('\n')


# with open(newfile,'wb') as destination:
#     with open(file_1,'rb') as source:
#         shutil.copyfileobj(source, destination)
#     destination.write(os.linesep*2)
#     with open(file_2,'rb') as source:
#         shutil.copyfileobj(source, destination)
#     destination.close()

In [4]:
shutil.copyfileobj?

Signature: shutil.copyfileobj(fsrc, fdst, length=0)
Docstring: copy data from file-like object fsrc to file-like object fdst
File:      c:\users\bheineike\anaconda3\envs\bmh_bioinformatics\lib\shutil.py
Type:      function

## Move mitochondrial PDB files into appropriate folders, then build list of filenames to run USalign
usalign: 
Note: Sandra didn't make this sequence >Alloascoidea__hylecoeti__cox2 cox2 retranslated__start_86__stop_151

#Kluyveromyces__marxianus__atp9 is far too long and doesn't align well.  Remove from list. 
Lipomyces__starkeyi__cox2.pdb looks bad but it is from the Wolters data Lipomyces__starkeyi__cox2 cox2 wolters__gene-94-mRNA-1 will leave it in



In [13]:
# Write one text file per core gene listing the pdb base names to hand to USalign.
mito_genes_core_list = ['cox1', 'cox2', 'cox3', 'cob', 'atp6','atp8','atp9']

pdb_base_dir =  base_dir + os.sep + os.path.normpath('examples/etc/mitochondrial_sequences/pdbs')

# Structures excluded from the lists, per gene.
# NOTE(review): the notes above this cell say Lipomyces__starkeyi__cox2 will be
# left in, yet it is listed here - confirm which is intended.
seq_filter = {'atp9': ['Kluyveromyces__marxianus__atp9'], 'cox2': ['Lipomyces__starkeyi__cox2']}

mito_pdbs = os.listdir(pdb_base_dir + os.sep + 'pdbs')

# Group pdb base names (text before the first '.') by gene, which is the third
# '__'-separated field of the name.
mito_pdb_lists = {mito_gene: [] for mito_gene in mito_genes_core_list}
for mito_pdb in mito_pdbs:
    pdb_stem = mito_pdb.split('.')[0]
    mito_pdb_lists[pdb_stem.split('__')[2]].append(pdb_stem)

# Remove the filtered structures.
for mito_gene, drop_list in seq_filter.items():
    mito_pdb_lists[mito_gene] = [
        pdb_stem for pdb_stem in mito_pdb_lists[mito_gene] if pdb_stem not in drop_list
    ]

# One name per line in pdb_list_<gene>.txt
for mito_gene in mito_genes_core_list:
    pdb_list_fname = pdb_base_dir + os.sep + 'pdb_list_' + mito_gene + '.txt'
    with open(pdb_list_fname, 'w') as f_out:
        f_out.writelines(pdb_stem + '\n' for pdb_stem in mito_pdb_lists[mito_gene])



use 20241023_usalign_mito_gene.sh which runs 

<code>
/home/heineike/usalign/USalign -dir $PDB_DIR $PDB_LIST -suffix .pdb -mm 4 > ${MITO_PDB_BASE}/us_align_${MITO_CORE_GENE}.fasta
</code>

In [24]:
#Remove headers from usalign files
# Only the lines starting with '>' and the line directly after each of them are
# copied to the cleaned file; every other line of the USalign output is dropped.
for mito_gene in mito_genes_core_list:
    us_align_fname = pdb_base_dir + os.sep + 'us_align_' + mito_gene + '.fasta'
    us_align_clean_fname = pdb_base_dir + os.sep + 'fasta_renamed' + os.sep + 'us_align_clean_' + mito_gene + '.fasta'
    with open(us_align_fname, 'r') as f_in, open(us_align_clean_fname, 'w') as f_out:
        for line in f_in:
            if not line.startswith('>'):
                continue
            f_out.write(line)
            # The sequence is the line immediately following its '>' line.
            f_out.write(next(f_in))



In [17]:
#Trees of protein sequences

#Make files for Oliver to order sequence names into trees and visualize them
tree_dir = base_dir + os.sep + os.path.normpath('/examples/etc/mitochondrial_sequences/pdbs/trees/mito_trees')

for tree_fname in os.listdir(tree_dir):
    # Only process files whose last '.'-separated field is 'treefile'
    if tree_fname.split('.')[-1] != 'treefile':
        continue

    # The gene name is the part of the file name before the first '.'
    mito_gene = tree_fname.split('.')[0]

    t = Tree(tree_dir + os.sep + tree_fname, format=1)

    # ladderize must be called: the bare attribute access used before was a
    # no-op, so the leaf order and the image were never ladderized.
    t.ladderize()

    # Leaf names in tree order, one per line, tidied to end in '.pdb'
    fname_out = tree_dir + os.sep + mito_gene + '_tree_list.txt'
    with open(fname_out,'w') as f_out:
        for leaf in t.iter_leaf_names():
            leaf_tidy = leaf.split('.pdb_A')[0] + '.pdb'
            f_out.write(leaf_tidy + '\n')

    image_out_fname = tree_dir + os.sep + mito_gene + '_tree.png'
    t.render(file_name=image_out_fname)

    #t.render('%%inline')