In [160]:
%load_ext autoreload
%autoreload 2
import diverse_yeast_tools as dyt
import os
import sys
import pandas as pd
from collections import Counter
import pickle
import json
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import matplotlib.colors as colors
import warnings
from Bio import SeqIO, AlignIO, pairwise2, SeqUtils


#from Bio import AlignIO   #, Align
#import shutil

base_dir = os.path.normpath('G:/My Drive/Crick_LMS/projects/diverse_yeasts/alphafold')

summary_data_dir = base_dir + os.sep + os.path.normpath('/Output/Summary')

#Load dictionary to translate 3 letter aa code to 1 letter aa code
trans_dict = SeqUtils.IUPACData.protein_letters_3to1

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [99]:
# Trim alignment by structural feature

In [148]:
feature

True

In [151]:
# First Feature:  Binding Site
feature='binding_site'
min_feature_size = 5

In [161]:
#Example OG1316_REF_Scer_AF-P19097-F1-model_v2 -- FAS2 as it is an enzyme with a less conserved binding site
#og_ref = 'OG1316_REF_Scer_AF-P19097-F1-model_v2'

feature_sizes = {}

for fasta_file in os.listdir(base_dir + os.sep + os.path.normpath('msas/structural/tm_align/fasta_renamed/')):
    og_ref = fasta_file.split('.')[0]
    print(og_ref)

    ref = '_'.join(og_ref.split('_')[1:])
     
    
    if ref+'.pkl' in os.listdir(summary_data_dir): 
        #Load Summary data for given og_ref
        og_summary_fname = summary_data_dir + os.sep + ref + '.pkl'
        with open(og_summary_fname, 'rb') as f_in:
                og_summary = pickle.load(f_in)

        #Check to see if the feature exists for that protein
        if 'Binding Site' in set(og_summary['Mapping Colors'].keys()):

            aln_fname = base_dir + os.sep + os.path.normpath('msas/structural/tm_align/fasta_renamed/' + og_ref + '.tm.fasta')
            aln = AlignIO.read(open(aln_fname),'fasta')

            aln_cds_fname = base_dir + os.sep + os.path.normpath('msas/structural/tm_align/cds_aln/' + og_ref + '.tm.cds.aln.fasta')
            aln_cds = AlignIO.read(open(aln_cds_fname),'fasta')


            #Load alignment and make dictionary to map ref structure coordinates to alignment coordinates

            ref_ind=None
            for (jj, record) in enumerate(aln): 
                if ref == record.id.split('.')[0]:
                    ref_ind = jj

            #Get map for reference sequence from msa index
            ref_seq_from_msa = str(aln[ref_ind,:].seq)
            ref_seq, msa2ref, ref2msa, pair_mapping = dyt.seq_squeeze(ref_seq_from_msa)


            #Get list of indices for the feature   
            feat_inds_ref = []
            for jj, feature_present in enumerate((og_summary['Mapping Colors']['Binding Site']==0)): 
                if feature_present:
                    feat_inds_ref.append(jj)

            feature_size = len(feat_inds_ref)
            feature_sizes[og_ref] = feature_size

            if feature_size>= min_feature_size: 
                feat_inds_msa = []


                for feat_ind in feat_inds_ref: 
                    feat_inds_msa.append(ref2msa[feat_ind])

                    #Verify seq_squeeze is mapping correctly
                    assert (ref_seq[feat_ind] == ref_seq_from_msa[ref2msa[feat_ind]]), "bad mapping to ref from seq_squeeze"

                    #Verify identical sequence in tm_align reference and in Output/Summary data
                    aa_3L_caps = og_summary['Mapping']['Amino Acid'][0,:][feat_ind]
                    aa_3L = aa_3L_caps[0] + aa_3L_caps[1:].lower()
                    trans_dict[aa_3L]
                    assert (ref_seq[feat_ind] == trans_dict[aa_3L]), "Difference in sequence between tm_align alignment and Output/Summary data: " + og_ref + ' ' + str(feat_ind) + ' ' + ref_seq[feat_ind] + ' vs ' + aa3L 

                #Extract Alignment Subsets
                #A more efficient way of doing this is here: https://www.biostars.org/p/460408/
                #Converts alignment to numpy array then converts back to aln object

                aln_temp = aln[:,0:0]
                aln_cds_temp = aln_cds[:,0:0]

                for x in feat_inds_msa:
                    aln_subset = aln_temp[:,:] + aln[:, (x-1):x]
                    aln_temp = aln_subset

                    aln_cds_subset = aln_cds_temp[:,:] + aln_cds[:, (3*(x-1)):(3*x)]
                    aln_cds_temp = aln_cds_subset

                #Save alignment subset
                feature_subset_dir = base_dir + os.sep + os.path.normpath('msas/structural/tm_align/feature_subsets/' + feature) + os.sep

                aln_subset_fname_out = feature_subset_dir + os.path.normpath('fasta_renamed/' + og_ref + '.tm.fasta')
                with open(aln_subset_fname_out,'w') as f_out: 
                    for record in aln_subset:
                        f_out.write('>' + record.id + '\n')   # Could add a length to this
                        f_out.write(str(record.seq) + '\n')

                aln_cds_subset_fname_out = feature_subset_dir + os.path.normpath('cds_aln/' + og_ref + '.tm.cds.aln.fasta')
                with open(aln_cds_subset_fname_out,'w') as f_out: 
                    for record in aln_cds_subset:
                        f_out.write('>' + record.id + '\n')   # Could add a length to this
                        f_out.write(str(record.seq) + '\n')
            else: 
                print(feature + ' size of ' + str(feature_size) + ' below threshold size of ' + str(min_feature_size))

        else: 
            print('No ' + feature + ' for ' + og_ref)
    
    else: 
        print('No .pkl file for ' + og_ref)

OG1004_REF_Scer_AF-P15938-F1-model_v2
OG1004_REF_Scer_AF-P40459-F1-model_v2
OG1012_REF_Scer_AF-P08004-F1-model_v2
No binding_site for OG1012_REF_Scer_AF-P08004-F1-model_v2
OG1016_REF_Scer_AF-P00330-F1-model_v2
No binding_site for OG1016_REF_Scer_AF-P00330-F1-model_v2
OG1021_REF_Scer_AF-P40047-F1-model_v2
No binding_site for OG1021_REF_Scer_AF-P40047-F1-model_v2
OG1022_REF_Scer_AF-P38715-F1-model_v2
No binding_site for OG1022_REF_Scer_AF-P38715-F1-model_v2
OG1022_REF_Scer_AF-P49954-F1-model_v2
No binding_site for OG1022_REF_Scer_AF-P49954-F1-model_v2
OG1030_REF_Scer_AF-P37291-F1-model_v2
No binding_site for OG1030_REF_Scer_AF-P37291-F1-model_v2
OG1032_REF_Scer_AF-P03965-F1-model_v2
No binding_site for OG1032_REF_Scer_AF-P03965-F1-model_v2
OG1039_REF_Scer_AF-P19414-F1-model_v2
No binding_site for OG1039_REF_Scer_AF-P19414-F1-model_v2
OG1052_REF_Scer_AF-P07245-F1-model_v2
No binding_site for OG1052_REF_Scer_AF-P07245-F1-model_v2
OG1054_REF_Scer_AF-P16861-F1-model_v2
OG1055_REF_Scer_AF-P52

In [162]:
feature_sizes

{'OG1004_REF_Scer_AF-P15938-F1-model_v2': 1026,
 'OG1004_REF_Scer_AF-P40459-F1-model_v2': 291,
 'OG1054_REF_Scer_AF-P16861-F1-model_v2': 956,
 'OG1055_REF_Scer_AF-P52910-F1-model_v2': 661,
 'OG1056_REF_Scer_AF-P21954-F1-model_v2': 385,
 'OG1060_REF_Scer_AF-P06169-F1-model_v2': 507,
 'OG1060_REF_Scer_AF-P50873-F1-model_v2': 370,
 'OG1070_REF_Scer_AF-P21524-F1-model_v2': 799,
 'OG1112_REF_Scer_AF-P23254-F1-model_v2': 641,
 'OG1145_REF_Scer_AF-P00358-F1-model_v2': 295,
 'OG1180_REF_Scer_AF-P41903-F1-model_v2': 341,
 'OG1193_REF_Scer_AF-P00924-F1-model_v2': 399,
 'OG1208_REF_Scer_AF-P04806-F1-model_v2': 470,
 'OG1266_REF_Scer_AF-P23337-F1-model_v2': 616,
 'OG1299_REF_Scer_AF-P00549-F1-model_v2': 459,
 'OG1310_REF_Scer_AF-P15202-F1-model_v2': 481,
 'OG1316_REF_Scer_AF-P19097-F1-model_v2': 1792,
 'OG1364_REF_Scer_AF-P27796-F1-model_v2': 391,
 'OG1365_REF_Scer_AF-P29509-F1-model_v2': 250,
 'OG1377_REF_Scer_AF-P53090-F1-model_v2': 472,
 'OG1388_REF_Scer_AF-P07342-F1-model_v2': 586,
 'OG1414_RE

In [157]:
set(og_summary['Mapping Colors'].keys())

{'Color keys Amino Acid',
 'Color keys Amino Acid Type',
 'Color keys Amino Acid Type reduced',
 'Color keys Amino Acid reduced',
 'Color keys DSSP',
 'Color keys DSSP reduced',
 'Colormap Amino Acid',
 'Colormap Amino Acid Type',
 'Colormap Amino Acid Type reduced',
 'Colormap Amino Acid reduced',
 'Colormap DSSP',
 'Colormap DSSP reduced',
 'Colormap No Mutations',
 'Colormap No Type Mutations',
 'Colormap SASA',
 'Colormap pLDDT',
 'Mapping Amino Acid',
 'Mapping Amino Acid Type',
 'Mapping Amino Acid Type reduced',
 'Mapping Amino Acid reduced',
 'Mapping DSSP',
 'Mapping DSSP reduced',
 'No Mutations',
 'No Type Mutations'}

In [123]:
print(aln_cds_subset)

Alignment with 26 rows and 285 columns
ACTAAATAT---CCAGAAGGTTCCGGTGGTGCTTACGAAGGTCC...ATT Calb_AF-A0A1D8PK65-F1-model_v2.pdb
GAAAAATAC---GCTGAGGGTTCTGGTGGTGCCTACGAGGGTGC...CAC REF_Scer_AF-P19097-F1-model_v2.pdb
AAAAAATAT---TCTGATGGATCTGGTGGTGCTTATGAGGGACC...ATT Spom_AF-Q10289-F1-model_v2.pdb
ACCAAATAT---CCTGATGGATCTGGTGGTGCCTACGAAGGACC...ATT alloascoidea_hylecoeti__OG1316__0_7309.pdb
ACCAAATAT---CCTGATAACTCTGGTGGTGCCTATGAAGGTCC...ATT ascoidea_rubescens__OG1316__6_2952.pdb
GAGCCCTAC---GACAAGGATAGTGGTGGTGCCTACGAGGGT--...ATC candida_apicola__OG1316__13_3208.pdb
ACCAAATAT---CCAGAAGGTTCCGGTGGTGCTTACGAAGGTCC...ATC candida_tropicalis__OG1316__30_3680.pdb
ACCAAGTAT---CCAGATGGTTCTGGTGGTGCATATGAAGGTCC...ATT debaryomyces_hansenii__OG1316__36_4062.pdb
ACCAAGTAT---GGAGAAGGCTCTGGTGGTGCATATGAGGGTCC...ATC eremothecium_gossypii__OG1316__40_3799.pdb
ACTAAGTAC---CCTGAGGGTTCCGGTGGTGCTTATGAGGGTGC...ATT geotrichum_candidum__OG1316__43_5811.pdb
GAGAAGTAC---AGCGACGGTTCTGGTGGCGCTTATGAGGGTCC...ATA geotrichum_ca