# Find alternative secondary structures for 11ntR variants
Copyright 2023 John Shin under GPL-3.0

The supporting figures are generated from this notebook.

In [1]:
from tqdm import tqdm
import re

import numpy as np

In [2]:
def populateSeqDictFold(filename,seq_dict):
    '''Read a fold-output file and merge its records into ``seq_dict`` in place.

    Recognised lines:

    * ``>id::xxxSCAFFOLD::seq`` header -- the leading ``>`` is dropped from the
      id and the first three characters are dropped from the scaffold field.
      Creates ``seq_dict[id]['seq']`` (kept if already present) and
      ``seq_dict[id][scaffold]`` (kept if already present).
    * a line starting with a/u/c/g (any case) -- stored as ``'full_seq'``.
    * a line starting with ``' free energy'`` -- first signed decimal on the
      line is stored as ``'dG_ensemble'`` (np.float64).
    * a line starting with ``' frequency'`` -- first number on the line is
      stored as ``'MFE_freq'`` (np.float64).

    Every other line is ignored.  Data lines are attached to the most recent
    header, so a header must precede its data lines.

    Parameters
    ----------
    filename : path of the file to read.
    seq_dict : dict that is updated in place; nothing is returned.

    Raises
    ------
    ValueError
        If a header line does not contain exactly three ``::``-separated fields.
    '''
    # Signed decimal such as "-28.50" or "0.00"; the dot is escaped so it
    # cannot match an arbitrary character.
    energy_re = re.compile(r'-?\d+\.\d+')
    # Plain or scientific notation ("0.104", "9.5e-06", "1"), so small
    # frequencies are not truncated to their mantissa.
    frequency_re = re.compile(r'\d+(?:\.\d+)?(?:[eE][-+]?\d+)?')

    with open(filename,'r') as f:
        for line in f:
            if line.startswith('>'):
                seq_id,scaffold,seq = line.split('::')
                seq_id = seq_id[1:]
                scaffold = scaffold[3:]
                seq = seq.rstrip()

                # setdefault keeps whatever an earlier file already stored.
                record = seq_dict.setdefault(seq_id,{})
                record.setdefault('seq',seq)
                record.setdefault(scaffold,{})

            elif line.lower().startswith(('a','u','c','g')):
                seq_dict[seq_id][scaffold]['full_seq'] = line.rstrip()

            elif line.startswith(' free energy'):
                # Test the match object itself: calling .group() on a failed
                # search would raise before the warning could be printed.
                free_energy = energy_re.search(line)

                if free_energy:
                    seq_dict[seq_id][scaffold]['dG_ensemble'] = np.float64(free_energy.group(0))
                else:
                    print(f"oh no, issue with {seq_id}::{scaffold}")

            elif line.startswith(' frequency'):
                frequency = frequency_re.search(line)

                if frequency:
                    seq_dict[seq_id][scaffold]['MFE_freq'] = np.float64(frequency.group(0))
                else:
                    print(f"oh no, issue with {seq_id}::{scaffold}")


In [3]:
def freqFromDG(dG,dG_ensemble,temp_C=37.0):
    '''Boltzmann weight of a structure relative to its ensemble.

    Returns ``exp(-(dG - dG_ensemble) / (R*T))`` with
    R = 1.98717 cal/(mol*K), divided by 1000 so that energies given in
    kcal/mol are handled, and T = ``temp_C`` + 273.15 K.

    Parameters
    ----------
    dG : free energy of the structure.
    dG_ensemble : free energy of the ensemble, same units as ``dG``.
    temp_C : temperature in degrees Celsius; defaults to 37.
    '''
    gas_constant = 1.98717/1000
    return np.exp(-(dG-dG_ensemble)/(gas_constant*(temp_C+273.15)))

def populateSeqDictSubopt(filename,seq_dict):
    '''Read a suboptimal-structure file and add ``alt<N>`` entries to ``seq_dict``.

    Header lines (``>id::xxxSCAFFOLD::seq ...``) are split like in
    populateSeqDictFold and reset the per-record counters.  Each following
    line that starts with a dot-bracket string is stored as
    ``seq_dict[id][scaffold]['alt<N>']`` with keys ``'free_energy'``,
    ``'structure'`` and ``'frequency'`` (from freqFromDG, relative to the
    scaffold's ``'dG_ensemble'``).  Structures are read until the accumulated
    frequency of the current record reaches 0.95; later structure lines of
    that record are skipped.  All other lines are ignored.

    Parameters
    ----------
    filename : path of the file to read.
    seq_dict : dict that is updated in place; nothing is returned.  The
        scaffold entry must already hold ``'dG_ensemble'`` before its
        structures are read.

    Raises
    ------
    ValueError
        If a header line does not contain exactly three ``::``-separated fields.
    KeyError
        If a structure is read for a scaffold without ``'dG_ensemble'``.
    '''
    structure_re = re.compile(r'[.()]+')
    # Signed decimal; the sign is optional so energies >= 0 are parsed too,
    # and the dot is escaped so it cannot match an arbitrary character.
    energy_re = re.compile(r'-?\d+\.\d+')

    with open(filename,'r') as f:
        for line in f:
            if line.startswith('>'):
                seq_id,scaffold,seq = line.split('::')
                seq_id = seq_id[1:]
                scaffold = scaffold[3:]
                # Keep the text before the first space; rstrip removes the
                # newline that survives when nothing follows the sequence.
                seq = seq.split(' ')[0].rstrip()

                record = seq_dict.setdefault(seq_id,{})
                record.setdefault('seq',seq)
                scaffold_info = record.setdefault(scaffold,{})

                alt_structs = 0
                cum_struct_freq = 0
                continue

            structure = structure_re.match(line)
            if structure is None or not (cum_struct_freq < 0.95):
                continue

            # The energy follows the structure on the same line.
            free_energy = energy_re.search(line,structure.end())

            if free_energy:
                free_energy = np.float64(free_energy.group(0))
                try:
                    dG_ensemble = scaffold_info['dG_ensemble']
                except KeyError:
                    raise KeyError(
                        f"no dG_ensemble stored for {seq_id}::{scaffold}"
                    ) from None
                frequency = freqFromDG(free_energy,dG_ensemble)

                alt = scaffold_info.setdefault(f"alt{alt_structs}",{})
                alt['free_energy'] = free_energy
                alt['structure'] = structure.group(0)
                alt['frequency'] = frequency

                cum_struct_freq += frequency

            else:
                print(f"oh no, issue with {seq_id}::{scaffold}::alt{alt_structs}")

            # The index advances even when the energy could not be parsed.
            alt_structs += 1
                

In [4]:
# Directory prefix prepended to the *_fold.fasta and *_subopt.fasta file names.
data_path = 'Data/structures/'

In [5]:
# Shared store filled in place by the parsers: seq_id -> {'seq': ..., scaffold: {...}}.
all_seq_info = {}

# Fold-output files, read in this fixed order.
for fold_file in (
        'all_muts_13854_fold.fasta',
        'all_muts_14007_fold.fasta',
        'all_muts_14073_fold.fasta',
        'all_muts_35311_A_fold.fasta',
        'all_muts_35600_fold.fasta',
):
    populateSeqDictFold(data_path+fold_file,all_seq_info)

In [6]:
# Suboptimal-structure files, read in the same order as the fold files.
for subopt_file in (
        'all_muts_13854_subopt.fasta',
        'all_muts_14007_subopt.fasta',
        'all_muts_14073_subopt.fasta',
        'all_muts_35311_A_subopt.fasta',
        'all_muts_35600_subopt.fasta',
):
    populateSeqDictSubopt(data_path+subopt_file,all_seq_info)

In [7]:
# Pairs of (start, stop) slice bounds into a full-length dot-bracket string.
# NOTE(review): presumably the two strands of the scaffold helix -- confirm;
# this constant is not used in the code visible here.
scaffold_params = ((11,21),(25,-12))
# (start, stop) slice bounds that get11ntRFolds unpacks into slice() and
# concatenates to build its per-structure key.
# NOTE(review): presumably the two strands of the 11ntR receptor -- confirm.
receptor_params = ((6,11),(-12,-6))

## Synergistic Mutants

In [9]:
import pandas as pd

In [10]:
# Load the cooperativity table and index it by its 'seq' column.
syn_df = pd.read_csv('Data/GAAA_coop.csv').set_index('seq')
# Build a compact mutation label by concatenating the four columns as
# strings, e.g. 1, 'A', 5, 'C' -> '1A5C'.
syn_df['muts'] = syn_df[['first_loc','first_res','second_loc','second_res']].apply(
                            (lambda s: ''.join([str(x) for x in s])), axis=1)
# Keep only rows flagged coop == 'S' and the two columns used afterwards.
# NOTE(review): 'S' presumably means "synergistic", per the section heading -- confirm.
syn_df = syn_df[syn_df['coop']=='S'][['muts','dddG']]
syn_df.head()

Unnamed: 0_level_0,muts,dddG
seq,Unnamed: 1_level_1,Unnamed: 2_level_1
UAUGG_ACUACG,1A5C,"-0.39 (-0.75,-0.18)"
AAUGG_GCUAAG,7A1G,"-0.31 (-0.75,-0.08)"
UAUGG_UCUAUG,1U5U,"-0.94 (-1.32,-0.70)"
UAUGG_UCUAGG,1U5G,"-0.46 (-0.94,-0.05)"
UAUGG_GCUCAG,1G4C,"-1.02 (-1.37,-0.82)"


In [11]:
def fold11ntR(seq,fold,verbose=False,pad=''):
    '''Draw a single stem-loop as vertical ASCII art, loop at the top.

    The sequence is consumed from both ends towards the middle, one output
    row per step:

    * ``x-y``  both ends paired (``fold[left]`` is ``(`` and ``fold[right]``
      is not ``.``)
    * ``x y``  both ends unpaired
    * ``x |``  unpaired residue on the 5' side only
    * ``| y``  unpaired residue on the 3' side only
    * ``x``    the single middle residue of an odd-length sequence

    Rows are collected from the outermost positions inwards and reversed
    before joining, so the 5'/3' ends are the last row.

    Parameters
    ----------
    seq : nucleotide string.
    fold : dot-bracket string indexed with the same positions as ``seq``.
    verbose : if true, print the cursors and the partial drawing each step.
    pad : string prepended to every row.

    Returns
    -------
    str
        The rows joined with newlines; ``''`` for an empty ``seq``.

    Raises
    ------
    ValueError
        If the 5' cursor reaches a character other than ``.`` or ``(``
        (for example a second hairpin), which this layout cannot draw.
    '''
    left = 0
    right = len(seq)-1

    lines = []
    seen = 0

    # Loop on the residue count directly so an empty sequence is a no-op.
    while seen < len(seq):

        if verbose:
            print(left,right,seen)
            print('\n'.join(lines[::-1]),'\n')

        if left == right:
            lines.append(f"{pad}{seq[left]}")
            left += 1
            seen += 1
        elif fold[left] == '.':
            if fold[right] == '.':
                lines.append(f"{pad}{seq[left]} {seq[right]}")
                left += 1
                right -= 1
                seen += 2
            else:
                lines.append(f"{pad}{seq[left]} |")
                left += 1
                seen += 1
        elif fold[left] == '(':
            if fold[right] == '.':
                lines.append(f"{pad}| {seq[right]}")
                right -= 1
                seen += 1
            else:
                lines.append(f"{pad}{seq[left]}-{seq[right]}")
                left += 1
                right -= 1
                seen += 2
        else:
            # Without this branch neither cursor moves and the loop never ends.
            raise ValueError(
                f"cannot draw {fold[left]!r} at position {left}: "
                "only a single stem-loop is supported"
            )

    return '\n'.join(reversed(lines))
                   

def get11ntRFolds(d,verbose=False,params=None):
    '''Summarise the alternative structures of one record per scaffold.

    For every key of ``d`` other than ``'seq'`` (treated as a scaffold), each
    sub-entry whose name contains ``'alt'`` is reduced to a short fold key:
    the pieces of its ``'structure'`` selected by ``params`` are
    concatenated.  Alternatives sharing a fold key are merged into a
    frequency-weighted mean of their ``'free_energy'``.

    Parameters
    ----------
    d : dict with an optional ``'seq'`` entry and one dict per scaffold, each
        holding ``alt<N>`` dicts with ``'structure'``, ``'free_energy'`` and
        ``'frequency'``.
    verbose : if true, return every fold key per scaffold instead of only
        the lowest-energy one.
    params : iterable of (start, stop) slice bounds; defaults to the
        module-level ``receptor_params``.

    Returns
    -------
    dict
        ``verbose=False``: ``{fold: {scaffold: mean_energy}}`` holding, for
        each scaffold, only its lowest-energy fold.  Scaffolds without any
        alternative are left out.
        ``verbose=True``: ``{scaffold: {fold: mean_energy}}``.
    '''
    if params is None:
        params = receptor_params

    scaffold_dicts = {}

    for scaffold,info in d.items():
        if scaffold == 'seq':
            continue

        weighted_energy = {}
        fold_freqs = {}

        for name,alt in info.items():
            if 'alt' not in name:
                continue
            fold = ''.join([alt['structure'][slice(*bounds)] for bounds in params])
            weighted_energy[fold] = weighted_energy.get(fold,0) + alt['free_energy']*alt['frequency']
            fold_freqs[fold] = fold_freqs.get(fold,0) + alt['frequency']

        # Divide by the summed frequency to turn the weighted sum into a mean.
        scaffold_dicts[scaffold] = {
            fold: weighted_energy[fold]/fold_freqs[fold] for fold in weighted_energy
        }

    if verbose:
        return scaffold_dicts

    output = {}

    for scaffold,fold_energies in scaffold_dicts.items():
        if not fold_energies:
            # No alternatives were stored, so there is no best fold to report.
            continue
        fold = min(fold_energies,key=fold_energies.get)
        output.setdefault(fold,{})[scaffold] = fold_energies[fold]

    return output

In [12]:
# (label, sequence, dot-bracket structure) for each scaffold drawing.
scaffold_examples = (
    ('Scaffold 1',
     'cuaggaUAUGGaacugagucgGGAAcgacugaguuCCUAAGuccuag',
     '(((((((..((((((.(((((....))))).))))))...)))))))'),
    ('Scaffold 2',
     'cuaggaUAUGGaaugcacaggGGAAccugugcauuCCUAAGuccuag',
     '(((((((..((((((((((((....))))))))))))...)))))))'),
    ('Scaffold 3',
     'cuaggaUAUGGagggaucuugGGAAcaagaucccuCCUAAGuccuag',
     '(((((((..((((((((((((....))))))))))))...)))))))'),
    ('Scaffold 4',
     'cuaggaUAUGGaagccggucgGGAAcgaccguggcuuCCUAAGuccuag',
     '(((((((..((((((((((((....)))))..)))))))...)))))))'),
    ('Scaffold 5',
     'cuaggaUAUGGaagccggucgGGAAcgaccaggcuuCCUAAGuccuag',
     '(((((((..((((((((((((....))))).)))))))...)))))))'),
)

for label,sequence,dot_bracket in scaffold_examples:
    print(label)
    # Underscores are stripped before drawing, as in every call here.
    print(fold11ntR(sequence.replace('_',''),dot_bracket,pad='   '))



Scaffold 1
   G A
   G A
   g-c
   c-g
   u-a
   g-c
   a-u
   g g
   u-a
   c-g
   a-u
   a-u
   G-C
   G-C
   | U
   U A
   A A
   U-G
   a-u
   g-c
   g-c
   a-u
   u-a
   c-g
Scaffold 2
   G A
   G A
   g-c
   g-c
   a-u
   c-g
   a-u
   c-g
   g-c
   u-a
   a-u
   a-u
   G-C
   G-C
   | U
   U A
   A A
   U-G
   a-u
   g-c
   g-c
   a-u
   u-a
   c-g
Scaffold 3
   G A
   G A
   g-c
   u-a
   u-a
   c-g
   u-a
   a-u
   g-c
   g-c
   g-c
   a-u
   G-C
   G-C
   | U
   U A
   A A
   U-G
   a-u
   g-c
   g-c
   a-u
   u-a
   c-g
Scaffold 4
   G A
   G A
   g-c
   c-g
   u-a
   g-c
   g-c
   | g
   | u
   c-g
   c-g
   g-c
   a-u
   a-u
   G-C
   G-C
   | U
   U A
   A A
   U-G
   a-u
   g-c
   g-c
   a-u
   u-a
   c-g
Scaffold 5
   G A
   G A
   g-c
   c-g
   u-a
   g-c
   g-c
   | a
   c-g
   c-g
   g-c
   a-u
   a-u
   G-C
   G-C
   | U
   U A
   A A
   U-G
   a-u
   g-c
   g-c
   a-u
   u-a
   c-g


In [13]:
# Summarise each mutant once and reuse the result for both new columns.
syn_folds = [get11ntRFolds(all_seq_info[muts]) for muts in syn_df['muts']]

# First fold entry of each summary: its energy under scaffold key '13854',
# and the fold key itself.
syn_df['dG_fold'] = [list(folds.values())[0]['13854'] for folds in syn_folds]
syn_df['Structure'] = [list(folds.keys())[0] for folds in syn_folds]

syn_df[['muts','dddG','dG_fold','Structure']].sort_values('dddG',ascending=False)

Unnamed: 0_level_0,muts,dddG,dG_fold,Structure
seq,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
GAUGG_CCUAAC,7G6C,"-2.09 (-3.28,-1.38)",-28.488971,(..(())...)
GAUGG_CCUACG,7G5C,"-1.87 (-2.36,-1.64)",-26.868439,...(())....
UAUGG_CCUAUA,5U6A,"-1.49 (-1.76,-1.18)",-30.488971,((((()).)))
UAAGG_CCUGAG,9A4G,"-1.37 (-2.00,-0.90)",-29.588971,(.((()))..)
UAUGG_GCUAUG,1G5U,"-1.13 (-1.49,-0.92)",-28.151093,(((((.)))))
UAUGG_GCUCAG,1G4C,"-1.02 (-1.37,-0.82)",-22.741249,(..(..)...)
UAUGG_CCUACA,5C6A,"-0.99 (-1.44,-0.71)",-26.788971,(..(())...)
UAGGG_CCUGAG,9G4G,"-0.99 (-1.42,-0.77)",-29.588971,(.((()))..)
UAUGG_UCUAUG,1U5U,"-0.94 (-1.32,-0.70)",-28.188971,(((((.)))))
UAUGG_CCUACC,5C6C,"-0.83 (-1.23,-0.60)",-25.82287,...(())....


## Wild Type

In [14]:
# Frequency-weighted mean free energy stored under scaffold key '13854' in
# the first fold entry returned for the 'wt' record.
list(get11ntRFolds(all_seq_info['wt']).values())[0]['13854']

-27.088971165679713

In [18]:
# Fold key of the first entry returned for the 'wt' record.
list(get11ntRFolds(all_seq_info['wt']).keys())[0]

'(..(())...)'