Align mitochondrial genes based on mitochondrial sequences from Wolters et al 2023

In [1]:
import os
import pandas as pd
import matplotlib.pyplot as plt

from Bio import SeqIO

# Per-gene mitochondrial alignments that accompany the paper.
# Windows form of the parent folder:
#G:\My Drive\Crick_LMS\external_data\yeast_mitochondrial\Finalized_Assemblies\gene_alignments
mito_aln_dir = os.path.normpath(
    "G:/My Drive/Crick_LMS/external_data/yeast_mitochondrial/Finalized_Assemblies/gene_alignments/ind_genes"
)

# Base folders used by the later cells (both normalised, no trailing separator).
external_data_base = os.path.normpath('G:/My Drive/Crick_LMS/external_data/genomes')
base_dir = os.path.normpath('G:/My Drive/Crick_LMS/projects/diverse_yeasts/alphafold')

# Import list of species.
# y1000plus_dir deliberately keeps a trailing separator so file names can be appended to it.
y1000plus_dir = os.path.normpath('C:/Users/bheineike/Documents/GitHub/y1000plus_tools') + os.sep

# Load y1000 species table; the first CSV column becomes the index.
y1000_species = pd.read_csv(
    os.path.join(y1000plus_dir, os.path.normpath("y1000plus_tools_data/y1000plus/y1000_species_table.csv")),
    index_col=0,
)


In [7]:
# mito_gois is the core set of genes shared by all species (per Wolters et al)
mito_gois = ['cox1', 'cox2', 'cox3', 'cob', 'atp6', 'atp8', 'atp9']

# Per-gene bookkeeping:
#   'substitutions'   - record IDs mapped directly to a species name
#   'missing_species' - starts as an empty set
name_errata = {}
for goi in mito_gois:
    name_errata[goi] = {
        'substitutions': {
            'Pachysolen__tannophilus_' + goi: 'Pachysolen tannophilus',
            'Candida__apicola_' + goi: 'Starmerella apicola',
        },
        'missing_species': set(),
    }

In [18]:
#Load selected species: only rows whose 'Load' column equals 'Y' are kept.
species_set = pd.read_csv(os.path.normpath(base_dir + '/selected_proteins/species_selection/species_selection.csv'), index_col = 0)
species_set_filt = species_set.loc[species_set['Load']=='Y']
species_names = set(species_set_filt.index)

mito_encoded_genes = os.listdir(mito_aln_dir)


def _species_from_record_id(record_id, substitutions):
    """Derive a 'Genus species' name from a FASTA record ID.

    Three ID shapes are recognised, checked in this order:

    1. Exactly three '.'-separated parts: the first two parts are the name.
    2. Three or more '__'-separated parts: the first two parts are the name.
    3. The ID is a key of ``substitutions``: the mapped value is the name.

    Parameters
    ----------
    record_id : str
        The ID of the FASTA record.
    substitutions : dict
        Mapping of record ID -> species name for IDs that do not follow
        either separator convention.

    Returns
    -------
    str
        The species name, or '' when none of the shapes match.
    """
    dot_parts = record_id.split('.')
    if len(dot_parts) == 3:
        spec_name = ' '.join(dot_parts[0:2])
        if spec_name == 'Komagataella phaffii':
            # species as in shen et al 2018 in table 1 from supplement is changed.
            spec_name = 'Komagataella pastoris'
        return spec_name

    underscore_parts = record_id.split('__')
    if len(underscore_parts) >= 3:
        return ' '.join(underscore_parts[0:2])

    return substitutions.get(record_id, '')


def _gene_id_from_description(spec_name, description):
    """Extract the gene identifier from a FASTA record description.

    Parameters
    ----------
    spec_name : str
        Species name already derived for the record.
    description : str
        The full description line of the FASTA record.

    Returns
    -------
    str
        For 'Candida albicans' the third '.'-separated part of the
        description; for a description without any space the description
        itself (a notice is printed); otherwise the third space-separated
        token.

    Raises
    ------
    IndexError
        If the description does not have enough parts for the rule that
        applies to it.
    """
    if spec_name == 'Candida albicans':
        return description.split('.')[2]

    tokens = description.split(' ')
    if len(tokens) == 1:
        # The separating space was missing from the message, so it ran
        # straight on from the description text.
        print(description + ' does not have a name separated by a space.')
        return description

    return tokens[2]


# og_list[goi][record.id] -> {'species', 'gene_id', 'seq'} for every record
# that belongs to one of the selected species.
og_list = {}

for goi in mito_gois:
    print(goi)

    name_errata_goi = name_errata[goi]

    og_list_goi = {}

    species_present = []

    #load fasta of all mitochondrially aligned genes and cycle through to capture sequence information of selected genes.  Also capture missing species
    goi_fasta_fname = os.path.join(mito_aln_dir, goi, goi + '.combined.translated.fasta')
    for record in SeqIO.parse(goi_fasta_fname, 'fasta'):
        spec_name = _species_from_record_id(record.id, name_errata_goi['substitutions'])

        # Records from species outside the selection are skipped.
        if spec_name not in species_names:
            continue

        gene_id = _gene_id_from_description(spec_name, record.description)
        species_present.append(spec_name)
        og_list_goi[record.id] = {'species': spec_name,
                                  'gene_id': gene_id,
                                  'seq': str(record.seq)}

    # Selected species for which no record of this gene was found.
    missing_specs = species_names - set(species_present)
    print(missing_specs)
    name_errata_goi['missing_species'] = missing_specs

    og_list[goi] = og_list_goi


cox1
{'Ascoidea rubescens', 'Alloascoidea hylecoeti', 'Sporopachydermia lactativora', 'Schizosaccharomyces pombe'}
cox2
{'Pachysolen tannophilus', 'Alloascoidea hylecoeti', 'Ascoidea rubescens', 'Schizosaccharomyces pombe'}
cox3
{'Ascoidea rubescens', 'Alloascoidea hylecoeti', 'Schizosaccharomyces pombe'}
cob
{'Ascoidea rubescens', 'Alloascoidea hylecoeti', 'Schizosaccharomyces pombe'}
atp6
{'Ascoidea rubescens', 'Alloascoidea hylecoeti', 'Schizosaccharomyces pombe'}
atp8
{'Ascoidea rubescens', 'Eremothecium gossypii', 'Schizosaccharomyces pombe', 'Alloascoidea hylecoeti'}
atp9
Wickerhamomyces__anomalus__35273_atp9_copy_1__consensusdoes not have a name separated by a space.
{'Ascoidea rubescens', 'Alloascoidea hylecoeti', 'Kluyveromyces marxianus', 'Schizosaccharomyces pombe'}


In [20]:
og_list

{'cox1': {'Candida.albicans.AF285261_cox1': {'species': 'Candida albicans',
   'gene_id': 'AF285261_cox1 cox1 gene-40-mRNA-1',
   'seq': 'MSYVTRWLFSTSHKDIGILYLIYGMVSAMVATGMSVIIRLELSGPGPMFLHGNNQVFNVLVTGHAIAMIFLFVMPILIGSFGNYFLPIMIGAVDMAFARLNNISFWCLPPALVCVIASVLIETGAGTGWTVYPPLSSISAHSGPSVDLAIFAIHLTSISSLLGAINFIVTTLNMRSIGIHMIDMPLFVWAIFFTAILLLLSLPVLTAGVTLLLMDRNFNTGFYEVAAGGDPILYEHLFWFFGHPEVYIIIIPGFGIISHVISTYTKKPIFGQIGMIYAIGSIGLLGFLVWSHHMYVVGLDIDSRAYFTSATMVIAIPTGIKIFSWLATLYGGELRLAVPMLFALGFLFLFTIGGLTGVMLSNASIDVAFHDTYYVVGHFHYVLSMGALFSLIAGYYYWGPAMFGLKYNKVLAEVHYWLLFVSVNIIFLPMHFLGLNGMPRRIPQYPDAFVGWNVVSSWGSIMSVISVLIGLYSVLVQLTNGENEREEIQVTPDYLESNNTRDVRDSDLELITSRPAQYHTFSELPILIQQI*'},
  'Candida__apicola_cox1': {'species': 'Starmerella apicola',
   'gene_id': 'gene-54-mRNA-1',
   'seq': 'MSNLTLNYINKVNNNPVDRGYHWVESWLFSTNAKQIGILYGIFAVFSGLVGLSLSILMRIELASPNPQILMHNGQLWNVLVTAHAVFMVFFLVMPITMGALGNYLTPLMIGTSDTAFPRINNIAFWFLVPSMLFAVLSCLIDEGPGTGWTIYPPLSSLQSHSGSSVDLAIFSLHLSGLSSIFGAINLMVTIINMRANGMDYSKLPLFVWSVLITAVLIILA

In [25]:
#Add pombe genes
# pombe_genename_map_rev: gene name used in this notebook -> name used in the pombe file.
# The names are identical except for cob, which is cob1 on the pombe side.
pombe_genename_map_rev = dict(zip(mito_gois, mito_gois))
pombe_genename_map_rev['cob'] = 'cob1'
# pombe_genename_map is the inverse lookup: pombe name -> notebook gene name.
pombe_genename_map = {pombe_name: core_name for core_name, pombe_name in pombe_genename_map_rev.items()}

pombe_proteome_fname = os.path.join(external_data_base, os.path.normpath('schizosaccharomyces_pombe/peptide_20240918.fa'))

pombe_mito_genes_fname = os.path.join(base_dir, os.path.normpath('examples/etc/mitochondrial_sequences/pombe_core_genes.fasta'))

# Write one two-line FASTA entry for every proteome record whose gene name is in the map.
with open(pombe_mito_genes_fname, 'w', newline='\n') as f_out:
    pombe_prots = SeqIO.parse(pombe_proteome_fname, 'fasta')
    for record in pombe_prots:
        # Gene name: second space-separated token of the description, up to the first '|'.
        pombe_genename = record.description.split(' ')[1].split('|')[0]
        if pombe_genename not in pombe_genename_map:
            continue

        genename = pombe_genename_map[pombe_genename]
        # Identifier: part of the record ID before the first ':'.
        pombase_id = record.id.split(':')[0]
        f_out.write(f">Schizosaccharomyces__pombe__{genename} {genename} pombase__{pombe_genename}_{pombase_id}\n")
        f_out.write(f"{record.seq}\n")

In [27]:
#Extract peptide sequences from wolters et al

mito_genes_wolters_fname = os.path.join(base_dir, os.path.normpath('examples/etc/mitochondrial_sequences/from_wolters.fasta'))

# One two-line FASTA entry per collected record, grouped by gene in og_list order.
with open(mito_genes_wolters_fname, 'w', newline='\n') as f_out:
    for goi, og_list_goi in og_list.items():
        for gene_info in og_list_goi.values():
            spec = gene_info['species']
            name_parts = spec.split(' ')
            # Header: <first word>__<second word>__<gene> <gene> wolters__<gene_id>
            seq_name = '__'.join([name_parts[0], name_parts[1], goi])
            header_fields = ['>' + seq_name, goi, 'wolters__' + gene_info['gene_id']]
            f_out.write(' '.join(header_fields) + '\n')
            f_out.write(gene_info['seq'] + '\n')
        



In [28]:
import shutil

mito_genes_fname = os.path.join(base_dir, os.path.normpath('examples/etc/mitochondrial_sequences/mito_core_genes.fasta'))

missing_genes_fname = os.path.join(base_dir, os.path.normpath('examples/etc/mitochondrial_sequences/missing_sequences.fasta'))

# Concatenate the three FASTA files byte-for-byte, in this order, into mito_genes_fname.
fasta_parts = [pombe_mito_genes_fname, mito_genes_wolters_fname, missing_genes_fname]

with open(mito_genes_fname, 'wb') as f_out:
    for fname in fasta_parts:
        with open(fname, 'rb') as f_in:
            shutil.copyfileobj(f_in, f_out)

Go through the missing species and make an errata list. Send the list to Antonis and Jacob.

Missing genes from Uniprot: 

E. gos ATP8

https://www.uniprot.org/uniprotkb/Q75G40/entry


P. tan cox2
https://www.uniprot.org/uniprotkb/Q9XM29/entry
or 
https://www.uniprot.org/uniprotkb/A0A1E4TMZ3/entry


Cox1 from 'Sporopachydermia lactativora'

Antonis found some hits using tblastn vs the S. lac mito genome:
https://www.ncbi.nlm.nih.gov/nuccore/OR704033.1?report=GenBank 

Saved search: 

In [None]:
# Pombe missing
#
# No 'Alloascoidea hylecoeti'
#
# Komagataella__pseudopastoris__34939_atp6
#
# No 'Ascoidea rubescens'

In [None]:
#Extract mitochondrial genes for key species and put into single fasta
#


