In [41]:
import os
import pandas as pd
import pickle
import json
from Bio import SeqIO
from Bio import AlignIO   #, Align


%load_ext autoreload
%autoreload 2
import diverse_yeast_tools as dyt

base_dir = os.path.normpath('G:/My Drive/Crick_LMS/projects/diverse_yeasts/alphafold')
divyeast_dir = os.path.normpath('C:/Users/heineib/Documents/GitHub/diverse_yeast')
y1000plus_dir = os.path.normpath('C:/Users/heineib/Documents/GitHub/y1000plus_tools/data') + os.sep
genomes_dir = os.path.normpath('G:/My Drive/Crick_LMS/external_data/genomes')


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [32]:
#Load main analysis file
struct_analysis = pickle.load(open(base_dir + os.sep + os.path.normpath('Output/data/Analysis_new_02.pkl'), 'rb'))

#Load original sequence file, and make dictionary of gene_id to fasta header and gene_id to peptide sequence
#Read in Sequence File, extract all names make dict of seq_alignment sequences
selected_proteins = SeqIO.parse(base_dir +os.sep +  'selected_proteins.fasta', 'fasta')
selected_proteins_headers = {}
selected_proteins_seqs = {}
for record in selected_proteins: 
    selected_proteins_headers[record.id] = record.description
    selected_proteins_seqs[record.id] = str(record.seq)

#Load peptide sequences for model species, make dictionary from gene id to peptide sequence
model_protein_dict = {}
for spec_abbrev in ['Scer', 'Spom', 'Calb']: 
    model_protein_dict[spec_abbrev] = dyt.load_model_protein_dict(spec_abbrev)
    

#ID Mapping for model species
#Lookup from gene_id to y1000_id 
gene_id_2_y1000_id = dyt.load_model_gene_id_2_y1000_id()

#swissprot_id_2_gene_id = dyt.load_model_swissprot_id_2_gene_id()
swissprot_id_2_gene_id = dyt.load_model_swissprot_id_2_gene_id()


# #Load S.cer lookup table: 
# scer_lookup_fname = y1000plus_dir + os.path.normpath('y1000plus_tools_data/y1000plus/id_lookups/saccharomyces_cerevisiae.csv')
# scer_lookup = pd.read_csv(scer_lookup_fname, index_col=0)
# gene_id_2_y1000_id['Scer'] = dict(zip(scer_lookup.index,scer_lookup['y1000_id']))

# #Load C.alb lookup table
# calb_lookup_fname = y1000plus_dir + os.path.normpath('y1000plus_tools_data/y1000plus/id_lookups/candida_albicans.csv')
# calb_lookup = pd.read_csv(calb_lookup_fname, index_col=0)
# gene_id_2_y1000_id['Calb'] = dict(zip(calb_lookup.index,calb_lookup['y1000_id']))

# #Lookup from swissprot_id to gene_id
# #would prefer to use a more official source for these for S.cer and S.pom

# swissprot_id_2_gene_id= {}
# scer_swissprot_id_2_gene_id_df = pd.read_table(base_dir + os.sep + os.path.normpath('msas/structural/Scer_protein_names.tsv'))
# swissprot_id_2_gene_id['Scer'] = dict(zip(scer_swissprot_id_2_gene_id_df['Swiss-Prot'],scer_swissprot_id_2_gene_id_df['OLN']))

# calb_swissprot_id_2_gene_id = pickle.load(open(base_dir + os.sep + os.path.normpath('msas/structural/Mapping_calb.pkl'),"rb"))
# swissprot_id_2_gene_id['Calb'] = dict(zip(calb_swissprot_id_2_gene_id.values(),calb_swissprot_id_2_gene_id.keys()))

# spom_swissprot_id_2_gene_id_df = pd.read_table(genomes_dir + os.sep + os.path.normpath('Schizosaccharomyces_pombe/PomBase2UniProt.tsv'), header=None)
# swissprot_id_2_gene_id['Spom'] = dict(zip(spom_swissprot_id_2_gene_id_df[1],spom_swissprot_id_2_gene_id_df[0]))



In [37]:
specs

['saccharomyces_cerevisiae',
 'debaryomyces_hansenii',
 'wickerhamomyces_anomalus',
 'lachancea_thermotolerans',
 'kazachstania_naganishii',
 'kluyveromyces_marxianus',
 'yHMPu5000034957_hanseniaspora_osmophila_160519',
 'geotrichum_candidum',
 'zygosaccharomyces_rouxii',
 'candida_albicans',
 'candida_tropicalis',
 'kluyveromyces_lactis',
 'komagataella_pastoris',
 'vanderwaltozyma_polyspora',
 'torulaspora_delbrueckii',
 'eremothecium_gossypii',
 'ascoidea_rubescens',
 'cyberlindnera_jadinii',
 'ogataea_parapolymorpha',
 'pachysolen_tannophilus',
 'yHMPu5000034604_sporopachydermia_lactativora_160519',
 'alloascoidea_hylecoeti',
 'candida_apicola',
 'yarrowia_lipolytica',
 'tortispora_caseinolytica',
 'lipomyces_starkeyi']

In [43]:
base_dir

'G:\\My Drive\\Crick_LMS\\projects\\diverse_yeasts\\alphafold'

In [67]:
#Filter original structural alignments to remove bad structures
#Also include metadata from original species selection fasta and add metadata for model species. 
# base_dir/selected_proteins.fasta
#
#
# Make a protein fasta for each alignment from orignal sequence and with model species sequences
#
# Make a dictionary of lists of each protein present in each species - save as a .json

species_table = pd.read_csv(base_dir + os.sep + 'species_selection.csv')
specs = list(species_table.loc[species_table['Load']=='Y']['original_genome_id'])
proteins_present_by_spec = {spec: [] for spec in specs}

modelspec_params = {'Scer':('saccharomyces_cerevisiae'), 
                    'Calb':('candida_albicans'), 
                    'Spom':('schizosaccharomyces_pombe')
                   }
proteins_present_by_spec_fname = base_dir + os.sep + 'selected_protein_ids.json'

for fasta_fname in os.listdir(base_dir + os.sep +  os.path.normpath('msas\structural\FASTA') + os.sep  ):
    #fasta_fname = 'OG1004_REF_Scer_AF-P15938-F1-model_v2.FASTA'
    fasta_fname_base = fasta_fname.split('.')[0]

    seqs_to_remove = struct_analysis[fasta_fname.split('.')[0]]['Files to be excluded']

    fname_struct_aln_orig = base_dir + os.sep +  os.path.normpath('msas\structural\FASTA\\' + fasta_fname)   # os.path.normpath('msas\FILES_ogs_pep_aligned\\' + og + '.mfaa.mafft')
    fname_struct_aln_filt_out = base_dir + os.sep + os.path.normpath('msas\structural\\fasta_filt\\' + fasta_fname_base + '.struct_filt.fasta')
    fname_proteome = base_dir + os.sep + os.path.normpath('og_sequences\proteome\\' + fasta_fname_base + '.pep.fasta')

    #Read in dictionary of ref name map to sequence name
    struct_align = AlignIO.read(fname_struct_aln_orig, 'fasta')
    with open(fname_struct_aln_filt_out , 'w') as f_out_filt:
        with open(fname_proteome, 'w') as f_out_prot: 
            for record in struct_align: 
                structure_imported = False
                name_orig = record.id    

                #Check if name in ref sequence (S.cer, C.alb, and S. pom) + convert
                if name_orig.split('_')[0] in set(['Scer', 'REF', 'Calb', 'Spom']):
                    structure_imported = True
                    if name_orig.split('_')[0] == 'REF': 
                        spec_abbrev = name_orig.split('_')[1]
                        swissprot_id = name_orig.split('_')[2].split('-')[1]
                    else: 
                        spec_abbrev = name_orig.split('_')[0]
                        swissprot_id = name_orig.split('_')[1].split('-')[1]
                    
                    (spec) = modelspec_params[spec_abbrev]
                    gene_id = swissprot_id_2_gene_id[spec_abbrev][swissprot_id]
                    if spec_abbrev == 'Spom':
                        y1000_id = 'None'
                    else: 
                        #spec_old = spec_abbrev_dict[spec_abbrev]
                        y1000_id = gene_id_2_y1000_id[spec_abbrev][gene_id]

                else: 
                    (spec, og, y1000_id) = name_orig.split('__')


                #Filter out seqs with bad alignments
                if not(name_orig in seqs_to_remove): 
                    
                    #Extract sequence from original peptide fasta
                    
                    if structure_imported: 
                        prot_seq = model_protein_dict[spec_abbrev][gene_id]
                        L = len(prot_seq)
                        header = '>' + name_orig + ' source=af2  gene_full=' + gene_id +' y1000_id=' + y1000_id + ' L=' + str(L) + '\n' 
                        
                        proteins_present_by_spec[spec].append((name_orig,gene_id,y1000_id,swissprot_id))
                    
                        
                    else: 
                        header_dict = {}
                        header_raw =selected_proteins_headers[name_orig] 
                        for item in header_raw.split(' ')[1:]: 
                            key,val = item.split('=')
                            header_dict[key] = val
                        
                        if header_dict['source']=='uniprot':
                            swissprot_id = header_dict['gene_full'].split('|')[1]
                        elif header_dict['source']=='shen':
                            swissprot_id = None

                        proteins_present_by_spec[spec].append((name_orig,None,y1000_id,swissprot_id))
            
                        prot_seq = selected_proteins_seqs[name_orig]
                        header = '>' + header_raw + '\n'  
                        

                    f_out_prot.write(header)
                    f_out_prot.write(prot_seq + '\n')

                    f_out_filt.write(header)
                    f_out_filt.write(str(record.seq) + '\n')
                    



# Save proteins_present_by_spec

#remove_duplicates
for spec in specs: 
    proteins_present_by_spec[spec] = list(set(proteins_present_by_spec[spec]))

with open(proteins_present_by_spec_fname, 'w') as f:
    json.dump(proteins_present_by_spec, f, sort_keys=True, indent=4 )


IndexError: list index out of range

In [69]:
header_dict

{'source': 'uniprot',
 'gene_full': 'augustus_masked-Deha2E-processed-gene-7.91',
 'gene_full_shen': 'augustus_masked-Deha2E-processed-gene-7.91',
 'L': '1184',
 'sim_score_vs_shen': '60.1',
 'sim_score_vs_shen_diff': '0.5'}

In [64]:
header_dict

{'source': 'uniprot',
 'gene_full': 'sp|Q6BUG5|PURA_DEBHA',
 'gene_full_shen': 'maker-Deha2C-augustus-gene-9.144',
 'L': '429',
 'sim_score_vs_shen': '95.2',
 'sim_score_vs_shen_diff': '44.5'}

In [None]:
cds_params = { 
                 'saccharomyces_cerevisiae': ,
                 'schizosaccharomyces_pombe': ,
                 'candida_albicans' :,
                 'debaryomyces_hansenii',
                 #shen_only
                 'kazachstania_naganishii',
                 'geotrichum_candidum',
                 'vanderwaltozyma_polyspora',                              
                 'eremothecium_gossypii',
                 'ascoidea_rubescens',
                 'cyberlindnera_jadinii',
                 'ogataea_parapolymorpha',
                 'pachysolen_tannophilus',
                 'yHMPu5000034604_sporopachydermia_lactativora_160519',
                 'alloascoidea_hylecoeti',
                 'candida_apicola',
                 'yarrowia_lipolytica',
                 'tortispora_caseinolytica',
                 'lipomyces_starkeyi'
                 #shen/uniprot-ENA         
                 'wickerhamomyces_anomalus',
                 'yHMPu5000034957_hanseniaspora_osmophila_160519',
                 'candida_tropicalis',
                 #shen/uniprot-NCBI
                 'zygosaccharomyces_rouxii',             
                 'lachancea_thermotolerans',
                 'kluyveromyces_marxianus',
                 'kluyveromyces_lactis',
                 'komagataella_pastoris',
                 'torulaspora_delbrueckii',
    }

In [55]:
specs

['saccharomyces_cerevisiae',
 'debaryomyces_hansenii',
 'wickerhamomyces_anomalus',
 'lachancea_thermotolerans',
 'kazachstania_naganishii',
 'kluyveromyces_marxianus',
 'yHMPu5000034957_hanseniaspora_osmophila_160519',
 'geotrichum_candidum',
 'zygosaccharomyces_rouxii',
 'candida_albicans',
 'candida_tropicalis',
 'kluyveromyces_lactis',
 'komagataella_pastoris',
 'schizosaccharomyces_pombe',
 'vanderwaltozyma_polyspora',
 'torulaspora_delbrueckii',
 'eremothecium_gossypii',
 'ascoidea_rubescens',
 'cyberlindnera_jadinii',
 'ogataea_parapolymorpha',
 'pachysolen_tannophilus',
 'yHMPu5000034604_sporopachydermia_lactativora_160519',
 'alloascoidea_hylecoeti',
 'candida_apicola',
 'yarrowia_lipolytica',
 'tortispora_caseinolytica',
 'lipomyces_starkeyi']

In [None]:
#For each species subset the database of CDS files and make a dictionary

for spec in specs: 
    


In [53]:
proteins_present_by_spec_fname = base_dir + os.sep + 'selected_protein_ids.json'

In [50]:
a = proteins_present_by_spec['saccharomyces_cerevisiae']

In [52]:
proteins_present_by_spec_fname

'selected_protein_ids.json'

In [51]:
len(a)

531

In [None]:
#For each of the Clustered OG sets, make a CDS file

#G:\My Drive\Crick_LMS\projects\diverse_yeasts\alphafold\msas\structural\fasta_renamed_struct_filt
#G:\My Drive\Crick_LMS\projects\diverse_yeasts\alphafold\og_sequences 
#    cds
#    proteome

#for fname_proteome in os.listdir(base_dir + os.sep + os.path.normpath('alphafold/og_sequences/proteome')): 
fname_proteome = 'OG1004_REF_Scer_AF-P15938-F1-model_v2.pep.fasta'
fname_base = fname_proteome.split('.')[0]

fname_proteome_full = base_dir + os.sep + os.path.normpath('og_sequences\proteome\\' + fname_proteome)
fname_cds = base_dir + os.sep + os.path.normpath('og_sequences\proteome\\' + fname_proteome)


#Cycle through the proteomes make a dictionary of lists of proteins from each species.  Save as a .json

#For each species build a dictionary (and save as a short fasta) with only the appropriate proteins from that species. 

#Cycle through the proteomes again
proteome = SeqIO.parse(fname_proteome_full, 'fasta')
with open(fname_cds, 'w') as f_out_cds:
    for record in proteome: 
        #extract species
        
        #extract cds from cds file
        
        



In [30]:
#Model species info

#C. albicans
#http://www.candidagenome.org/download/sequence/C_albicans_SC5314/Assembly22/current/  
# C_albicans_SC5314_A22_current_default_coding.fasta
# C_albicans_SC5314_A22_current_default_protein.fasta
#downloaded on 20221012 and stored in
#G:\My Drive\Crick_LMS\external_data\genomes\Candida_albicans


#S. cerevisiae
#G:\My Drive\Crick_LMS\external_data\genomes\Saccharomyces_cerevisiae\S288C_reference_genome_R64-2-1_20150113
#orf_coding_all_R64-2-1_20150113.fasta
#orf_trans_all_R64-2-1_20150113.fasta

#S. pombe
#From https://www.pombase.org/data/genome_sequence_and_features/feature_sequences/ on 20221012
#cds.fa 
#peptide.fa  
#both last modified 2022-10-12 03:26
#Stored at
#G:\My Drive\Crick_LMS\external_data\genomes\Schizosaccharomyces_pombe

## Make a Fasta of structures for S.pom, S.cer, and C. alb that we need Kcat/Km from

In [None]:
#Import og_metadata