In [None]:
# imports and settings
import os
import warnings
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import MDAnalysis as mda
import seaborn as sns
import nglview as nv                            # for visualisation
from MDAnalysis.analysis.align import alignto   # for aligning structures
from MDAnalysis.analysis.pca import PCA         # for PCA
from Bio.PDB import PDBParser
from Bio.PDB.DSSP import DSSP                   # for secondary structure selection
from Bio.PDB.SASA import ShrakeRupley           # for SASA calculation
from IPython.display import display             # for data frame display
from multiprocessing import Pool                # for multiprocessing
from tqdm import tqdm                           # for progress bars

# surpress warnings
warnings.filterwarnings(action='ignore', module='matplotlib')
warnings.filterwarnings(action='ignore', module='mdanalysis')
warnings.filterwarnings('ignore')

# pandas settings
pd.set_option('display.max_colwidth', None)

############################################################################################################

# base directories
# NOTE: these are absolute, user-specific paths. Every path in this notebook is built by
# plain string concatenation, so both directories must end with "/".
# alternative data set used previously: "/biggin/b212/bioc1781/Projects/KDELR/red-msa/"
base_directory = "/biggin/b212/bioc1781/Projects/CTNS/human/monomer/red-msa/with-dropout/"
structure_directory = base_directory + "ensemble/" # directory with the AF2 ensemble, cannot be the same as base directory

# settings
selection_for_writeout  = "protein and name CA"  # useful if you want all the outputs to have a specific selection
# residues whose side-chain atoms are added to the RMSD/PCA selection (empty string = none)
# alternative list used previously:
#   "resid 205 or resid 305 or resid 346 or resid 169 or resid 308 or resid 339 or resid 260 or resid 158 or resid 338 or resid 177 or resid 288 or resid 222 or resid 139 or resid 141 or resid 298 or resid 182 or resid 280 or resid 335 or resid 142 or resid 138 or resid 170 or resid 208 or resid 173 or resid 134"
conserved_residues      = "resid 162 or resid 281 or resid 142 or resid 143 or resid 346 or resid 345 or resid 305 or resid 280 or resid 138 or resid 211 or resid 208 or resid 273 or resid 173 or resid 335 or resid 205 or resid 332 or resid 176 "
excluded_residues       = ""        # residues removed from the selection, e.g. "resid 134-149" (empty string = none)
# added to the resIDs of the ensemble models and subtracted from the reference structures
# before they are written into the ensemble, so both use the same numbering
# (previously tried values: 0, 115)
resid_offset    = 115
# structure filtering   
thresh_rmsd     = 10                # threshold for discarding structures based on RMSD (previously 6)
thresh_pLDDT    = 90                # threshold for discarding structures based on pLDDT
thresh_sasa_coeff = 100             # multiple of highest sasa of the reference structure above which structures are discarded
#diffmat_thresh  = 0                # threshold for ignoring atoms in RMSD calculations based on how much they differ between the reference structures
# analysis  
num_processes   = 12                # CPU cores to use for multiprocessing
n_pcs           = 3                 # number of principal components to keep in PCA
# monte carlo   
mc_temp         = 500               # monte carlo temperature
mc_wf_sasa      = 0                 # monte carlo energy function weight factor for SASA (TODO: check if this should be negative; depends on order of subtraction and matrix indexing)
mc_n_runs       = 1                 # number of monte carlo runs
mc_n_bins       = 24                # total number of bins along the collective variable
collective_variable = 'lumen_helix_bundle_separation'         # collective_variable to bin for path finding
start_from_endstates = False
# for plotting
plot_variable_1     = 'PC1'   
plot_variable_2     = 'PC2'
plot_variable_3     = 'sasa'

# NOTE(review): this is set after numpy/MDAnalysis have already been imported at the top of
# the cell; presumably it only affects libraries that read OMP_NUM_THREADS later - confirm.
os.environ["OMP_NUM_THREADS"] = str(num_processes)

############################################################################################################

# reference structures
use_reference_structures = True


def _write_reindexed_pdb(universe, out_path, offset):
    """Write `universe` to `out_path` with every resID lowered by `offset`.

    The resIDs of `universe` are shifted only for the duration of the write and are
    restored afterwards, even if writing fails, so the in-memory universe always keeps
    its original numbering.
    """
    universe.atoms.residues.resids -= offset
    try:
        universe.atoms.write(out_path)
    finally:
        universe.atoms.residues.resids += offset


if use_reference_structures:

    outward_open_pdb = base_directory + "8DKE_cytosol_noNTD_hydrogens.pdb"
    inward_open_pdb  = base_directory + "8DKI_lumen_noNTD_hydrogens.pdb"

    # universes for reference structures (the same file provides topology and coordinates)
    u_outward_open = mda.Universe(outward_open_pdb, outward_open_pdb)
    u_inward_open  = mda.Universe(inward_open_pdb, inward_open_pdb)

    # align reference structures to each other (inward-open is moved onto outward-open)
    alignto(u_inward_open, u_outward_open, select='all', weights="mass")

    # write reindexed pdbs of reference structures for inclusion in the ensemble;
    # make sure the ensemble directory exists so the write cannot fail on a fresh setup
    os.makedirs(structure_directory, exist_ok=True)
    _write_reindexed_pdb(u_outward_open, structure_directory + "ref_outward.pdb", resid_offset)
    _write_reindexed_pdb(u_inward_open, structure_directory + "ref_inward.pdb", resid_offset)

else:
    # the reference files are chosen in the next cell from the ensemble itself
    outward_open_pdb = None
    inward_open_pdb  = None



In [None]:
# retrieve structures

# user specific - do whatever you need here to get a dictionary of structures and their associated pLDDT values

# # ONLY NEED TO RUN ONCE 
# # copy structures from outputs to ensemble
# msa_depths = ['32-64', '64-128', '128-256', '256-512', '512-1024']
# 
# for msa in msa_depths:
#     structures_msa = [f for f in os.listdir(base_directory + 'output_' + msa) if f.endswith('.pdb')]
#     for structure in structures_msa:
#         # copy all the pdb files with "_relaxed_" in the name to the structure directory
#         if "_relaxed_" in structure:
#             # get the rank of the structure (last thing before the file extension)
#             rank = 'rank_' + structure.split('_')[structure.split('_').index('rank')+1]
#             os.system('cp ' + base_directory + 'output_' + msa + '/' + structure + ' ' + structure_directory + '/msa-' + msa + '_' + rank + '.pdb')
# 
# # get list of all files in the structure directory with the pdb extension
# structures = [f for f in os.listdir(structure_directory) if f.endswith('.pdb')]
# structures.sort()

# associate structures with pLDDT values
def _line_mentions_rank(line, rank):
    """Return True if `line` contains `rank` as a complete rank label.

    A plain substring test would let e.g. 'rank_116' match a line about 'rank_1163';
    an occurrence only counts here when it is not immediately followed by another digit.
    """
    start = line.find(rank)
    while start != -1:
        end = start + len(rank)
        if end == len(line) or not line[end].isdigit():
            return True
        start = line.find(rank, start + 1)
    return False


structure_and_pLDDT = {}

msa_depths = ['8-16', '16-32', '32-64', '64-128', '128-256', '256-512', '512-1024']
pdb_files = [f for f in os.listdir(structure_directory) if f.endswith('.pdb')]   # list the directory once
for msa in msa_depths:
    structures_msa = [f for f in pdb_files if msa in f]
    log_file = structure_directory + msa + "_log.txt"   # look up the corresponding log file
    if not structures_msa:
        continue                            # no structures at this depth, so the log is not needed
    with open(log_file, "r") as file:       # read the log once per depth rather than once per structure
        log_lines = file.readlines()
    for structure in structures_msa:
        rank = 'rank_' + structure.split('_')[structure.split('_').index('rank')+1] # get the rank of the structure
        rank = rank.split('.')[0]           # remove the file extension
        for line in log_lines:              # match the rank with the pLDDT value
            if _line_mentions_rank(line, rank):
                # the fourth whitespace-separated field is expected to look like 'pLDDT=<value>'
                pLDDT = line.split()[3].replace('pLDDT=','')
                structure_and_pLDDT[structure] = float(pLDDT)
                break                       # first matching line wins; structures with no match are skipped

print('Structures:', len(structure_and_pLDDT))

# without experimental references, fall back to the two highest-pLDDT models
if not use_reference_structures:
    # structure names ordered from highest to lowest pLDDT; keep the top two
    ranked_by_pLDDT = sorted(structure_and_pLDDT, key=structure_and_pLDDT.get, reverse=True)
    ref_structures = ranked_by_pLDDT[:2]
    print('Reference structures:', ref_structures)

    # file paths of the two stand-in references
    outward_open_pdb = structure_directory + ref_structures[0]
    inward_open_pdb  = structure_directory + ref_structures[1]

    # one universe per reference (same file used for topology and coordinates)
    u_outward_open = mda.Universe(outward_open_pdb, outward_open_pdb)
    u_inward_open  = mda.Universe(inward_open_pdb, inward_open_pdb)

    # superimpose the second reference onto the first using the protein atoms
    alignto(u_inward_open, u_outward_open, select='protein', weights="mass")

    # (in this mode no reindexed ref_outward.pdb / ref_inward.pdb files are written)


In [None]:
# automatic "clever" selection definer

# temporary universe (the outward-open reference)
u = u_outward_open

# run DSSP on the outward-open reference to label secondary structure
p = PDBParser()
structure = p.get_structure('reference', outward_open_pdb)
model = structure[0]
dssp = DSSP(model, outward_open_pdb, dssp='mkdssp')

# DSSP one-letter codes grouped into the three classes used below;
# residues with any other code are not assigned to a class
HELIX_CODES = ('H', 'G', 'I')
SHEET_CODES = ('E',)
LOOP_CODES  = ('T', 'S')

helices = set()
sheets  = set()
loops   = set()

# collect the residue number of every residue by its secondary structure class
for key in dssp.keys():
    ss_code = dssp[key][2]      # third field of the DSSP record: secondary structure code
    resnum = key[1][1]          # second element of the residue id: residue number
    if ss_code in HELIX_CODES:
        helices.add(resnum)
    elif ss_code in SHEET_CODES:
        sheets.add(resnum)
    elif ss_code in LOOP_CODES:
        loops.add(resnum)

# unique residue numbers in ascending order
helices = sorted(helices)
sheets = sorted(sheets)
loops = sorted(loops)

def _contiguous_runs(sorted_resids):
    """Split an ascending list of residue numbers into runs of consecutive numbers.

    Example: [1, 2, 3, 7, 8] -> [[1, 2, 3], [7, 8]]. An empty input gives an empty list.
    """
    runs = []
    for resid in sorted_resids:
        if runs and resid == runs[-1][-1] + 1:
            runs[-1].append(resid)      # extends the current run
        else:
            runs.append([resid])        # gap (or first residue): start a new run
    return runs


def _range_tokens(runs):
    """Turn each run into an MDAnalysis range token of the form '(resid first-last)'."""
    return ['(resid %s-%s)' % (run[0], run[-1]) for run in runs]


# get contiguous regions of secondary structure
helices_contiguous = _contiguous_runs(helices)
sheets_contiguous  = _contiguous_runs(sheets)
loops_contiguous   = _contiguous_runs(loops)

# make mdanalysis selections corresponding to these regions (one token per region)
selection_helices = _range_tokens(helices_contiguous)
selection_sheets  = _range_tokens(sheets_contiguous)
selection_loops   = _range_tokens(loops_contiguous)

# conserved residues, restricted to the listed atom names
selection_conserved_residues = f'(({conserved_residues}) and (name CA or name CG or name CZ* or name NZ))'

# collapse each list of range tokens into a single 'A or B or ...' selection string
selection_helices = ' or '.join(selection_helices)
selection_sheets = ' or '.join(selection_sheets)
selection_loops = ' or '.join(selection_loops)

endstates = {}

# residues picked from a CA distance difference matrix; that step is currently disabled
# (it depended on the commented-out diffmat_thresh setting), so the list stays empty
resids_from_ca_dist_diffmat = []

# selection string for those residues (an empty string while the list above is empty)
selection_from_ca_dist_diffmat = ' or '.join(
    'resid %s' % resid for resid in resids_from_ca_dist_diffmat
)

# final rmsd_selection for analysis: CA atoms of all helix and loop regions ...
ca_of_helices = f'( ({selection_helices}) and name CA )'
ca_of_loops = f'( ({selection_loops}) and name CA )'
rmsd_selection = f'( {ca_of_helices} or {ca_of_loops} )'

# ... plus the chosen side-chain atoms of conserved residues that lie in helices
if conserved_residues:
    rmsd_selection += f' or ( ({selection_helices}) and{selection_conserved_residues})'

# ... minus any explicitly excluded residues
# NOTE(review): the exclusion is appended without wrapping the preceding expression in
# parentheses, so what it applies to depends on how the selection parser groups and/or -
# confirm before setting excluded_residues.
if excluded_residues:
    rmsd_selection += f' and not ({excluded_residues})'

print(rmsd_selection)

# sanity check: write the selected atoms of the outward-open structure to a pdb file
check = u.select_atoms(rmsd_selection)
check.write(base_directory + 'rmsd_selection.pdb')




In [None]:
# make main pandas dataframe and filter data
ensemble_df = pd.DataFrame(index=structure_and_pLDDT.keys())
# rename the index column to structure
ensemble_df.index.names = ['structure']

pLDDT_scores = {}
# convert the dictionary to a DataFrame (the pLDDT column is named 'pDDLDT' throughout this notebook)
plddt_df = pd.DataFrame(list(structure_and_pLDDT.items()), columns=['structure', 'pDDLDT'])

# add entries for the reference structures with pLDDT scores of 100
if use_reference_structures:
    reference_rows = pd.DataFrame({'structure': ['ref_outward.pdb', 'ref_inward.pdb'],
                                   'pDDLDT': [100, 100]})
    plddt_df = pd.concat([plddt_df, reference_rows], ignore_index=True)

# Merge the two DataFrames on the structure name.
# NOTE(review): this is an inner merge against the keys of structure_and_pLDDT, so the two
# reference rows added above only survive if those file names are also keys of that
# dictionary - confirm whether the references are meant to be part of the ensemble.
ensemble_df = pd.merge(ensemble_df, plddt_df, on='structure')

# filter by pLDDT (discard <= thresh_pLDDT); take an explicit copy because later cells add
# columns to this frame, which must not be done on a slice of the unfiltered frame
ensemble_df = ensemble_df[ensemble_df['pDDLDT'] > thresh_pLDDT].copy()
structures = list(ensemble_df['structure'])                          # apply the filter to the list of structure names

display(ensemble_df)

In [None]:
# run PCA on the ensemble

# write a multistate pdb file for the whole ensemble so MDA will interpret it as a trajectory
# NOTE: `u` must still be a Universe from an earlier cell here; it is reassigned below.
with mda.Writer(base_directory + 'ensemble.pdb', u.atoms.n_atoms) as W:
    for structure in ensemble_df['structure']:
        u = mda.Universe(structure_directory + structure,
                         structure_directory + structure)
        # renumber for consistency and to match the selection tokens
        u.atoms.residues.resids += resid_offset
        u.atoms.segments.segids = 'A'
        u.atoms.chainIDs = 'A'
        W.write(u.select_atoms(rmsd_selection))

# make a universe containing all the structures (one frame per structure, in ensemble_df order)
u = mda.Universe(base_directory + "ensemble.pdb")

# align the ensemble on the selection
aligner = mda.analysis.align.AlignTraj(u, u, select=rmsd_selection, in_memory=True).run()

# perform principal component analysis
pc = PCA(u, select=rmsd_selection, align=True, mean=None, n_components=None).run()

# project coordinates onto the first n_pcs principal components
pc_projection = pc.transform(u.select_atoms(rmsd_selection), n_components=n_pcs)

# make a dataframe to store the principal components; its rows follow the frame order,
# i.e. the row order of ensemble_df, so the structure names can be attached positionally
pc_columns = ['PC{}'.format(i+1) for i in range(n_pcs)]
pca_df = pd.DataFrame(pc_projection, columns=pc_columns)
pca_df['structure'] = ensemble_df['structure'].values

# print out the PCs
display(pca_df.head())

# show table of variances explained by each PC
display(pd.DataFrame((pc.cumulated_variance*100).round(), columns=['PC cumulated variance']).head())

# pairwise scatter plots of the PCs (structure column dropped for plotting)
pca_df_pairgrid = pca_df.drop('structure', axis=1)
g = sns.PairGrid(pca_df_pairgrid)
g.map(plt.scatter, marker='.')
plt.show()
pca_df_pairgrid = None

# add principal components to the main dataframe; both frames are already in the same row
# order, so no re-sorting is done (sorting only one of them would misalign the rows)
for column in pc_columns:
    ensemble_df[column] = pca_df[column].values

# visualisation: for each kept PC, write the ensemble projected onto that PC alone
for i in range(n_pcs):
    pc_n = pc.p_components[:, i]
    trans_n = pc_projection[:, i]
    projected = np.outer(trans_n, pc_n) + pc.mean.flatten()
    coordinates = projected.reshape(len(trans_n), -1, 3)

    proj_n = mda.Merge(u.select_atoms(rmsd_selection))
    proj_n.load_new(coordinates)

    # write this to a multistate pdb file
    with mda.Writer(base_directory + 'pca{}.pdb'.format(i+1), proj_n.atoms.n_atoms) as W:
        for ts in proj_n.trajectory:
            W.write(proj_n.atoms)

In [None]:
# select reference structures for which to perform rmsd calculations below
from sklearn.cluster import HDBSCAN

# min_samples is 1 percent of the ensemble size (factor 0.01), but never below 1 so that
# small ensembles (fewer than 100 structures) still give a valid value
# NOTE(review): the previous comment said 10 percent while the code used 0.01 - confirm which is intended
clustering_min_samples = max(1, int(len(ensemble_df) * 0.01))

cluster = HDBSCAN(min_samples=clustering_min_samples, store_centers="medoid").fit(ensemble_df[['PC1', 'PC2', 'PC3']])

# add the cluster labels to the dataframe
ensemble_df['cluster'] = cluster.labels_

# get the number of clusters that are not noise (-1)
num_clusters = len(set(cluster.labels_)) - (1 if -1 in cluster.labels_ else 0)

# one representative per cluster: the structure whose PC coordinates equal the medoid
# (a medoid is one of the fitted points, so an exact match is expected to exist)
cluster_representatives = []
for medoid in cluster.medoids_:
    at_medoid = (
        (ensemble_df['PC1'] == medoid[0])
        & (ensemble_df['PC2'] == medoid[1])
        & (ensemble_df['PC3'] == medoid[2])
    )
    cluster_representatives.append(ensemble_df[at_medoid]['structure'].values[0])

# for each cluster number, get the structure with the highest pLDDT
cluster_best_pLDDT = []
for label in range(num_clusters):
    members = ensemble_df[ensemble_df['cluster'] == label]
    cluster_best_pLDDT.append(members.sort_values(by=['pDDLDT'], ascending=False)['structure'].values[0])

# display the best structures per cluster
print('Top pLDDT per cluster:')
display(ensemble_df[ensemble_df['structure'].isin(cluster_best_pLDDT)].sort_values(by=['pDDLDT'], ascending=False))

# the cluster medoids become the reference structures used by the following cells
ref_structures = cluster_representatives

xaxis = 'PC1'
yaxis = 'PC2'
zaxis = 'PC3'

# 3D plot of the ensemble, coloured by cluster label
fig = plt.figure(figsize=(10,10))
ax = fig.add_subplot(111, projection='3d')
ax.scatter(ensemble_df[xaxis], ensemble_df[yaxis], ensemble_df[zaxis], c=ensemble_df['cluster'], cmap='Paired', s=1, alpha=1)
# mark the cluster representatives
for representative in cluster_representatives:
    row = ensemble_df[ensemble_df['structure'] == representative]
    ax.scatter(row[xaxis], row[yaxis], row[zaxis], c='black', marker='x', s=200)
ax.set_xlabel(xaxis)
ax.set_ylabel(yaxis)
ax.set_zlabel(zaxis)
plt.show()

In [None]:
# calculate RMSDs of all structures relative to the reference structures

# RMSD of one ensemble member to the current reference
def get_rmsd_to_structure(structure):
    """Superimpose one ensemble structure onto the global `ref` and return its RMSD.

    `structure` is a file name inside `structure_directory`. The model's resIDs are
    shifted by `resid_offset` before `rmsd_selection` is applied to both the model and
    the module-level universe `ref`, which the caller must set beforehand.

    Returns a two-element list: [structure, RMSD after alignment].
    """
    path = structure_directory + structure
    mobile = mda.Universe(path, path)
    mobile.atoms.residues.resids += resid_offset
    # alignto returns (rmsd before fitting, rmsd after fitting)
    _, rmsd_after = alignto(mobile, ref, select=rmsd_selection, match_atoms=True, weights=None)
    return [structure, rmsd_after]

# solvent accessible surface area of one ensemble member
def get_sasa(structure):
    """Compute the Shrake-Rupley SASA of one structure file.

    `structure` is a file name inside `structure_directory`. The area is computed at
    structure level with a 1.4 probe radius and 100 sphere points per atom.

    Returns a two-element list: [structure, total SASA].
    """
    parser = PDBParser(QUIET=1)
    parsed = parser.get_structure(structure, structure_directory + structure)
    calculator = ShrakeRupley(probe_radius=1.4, n_points=100)
    calculator.compute(parsed, level="S")   # stores the result on parsed.sasa
    return [structure, parsed.sasa]


def _map_over_structures(func, names):
    """Run `func` on every structure name in a fresh process pool.

    `func` must return a [name, value] pair; the result is a dict {name: value}.
    A new pool is created on every call so that the workers see the module-level
    state (in particular `ref`) as it is at call time.
    """
    with Pool(processes=num_processes) as pool:
        pairs = list(tqdm(pool.imap(func, names), total=len(names)))
    return dict(pairs)


if use_reference_structures:

    # get average positions and make new universe with these positions
    average_positions = (u_outward_open.atoms.positions + u_inward_open.atoms.positions) / 2
    u_average = mda.Universe(outward_open_pdb, outward_open_pdb)
    u_average.atoms.positions = average_positions

    # `ref` is read as a global by get_rmsd_to_structure, so it is set before each run

    # outward open
    ref = u_outward_open
    rmsd_to_outward = _map_over_structures(get_rmsd_to_structure, structures)

    # inward open
    ref = u_inward_open
    rmsd_to_inward = _map_over_structures(get_rmsd_to_structure, structures)

    # average of the two references
    ref = u_average
    rmsd_to_average = _map_over_structures(get_rmsd_to_structure, structures)

    # add the rmsd values to the dataframe, matched by structure name rather than by position
    ensemble_df['rmsd_to_outward'] = ensemble_df['structure'].map(rmsd_to_outward)
    ensemble_df['rmsd_to_inward'] = ensemble_df['structure'].map(rmsd_to_inward)
    ensemble_df['rmsd_to_average'] = ensemble_df['structure'].map(rmsd_to_average)

else:

    # get rmsd with respect to each of the reference structures
    for ref_structure in ref_structures:

        # make a universe with the reference structure
        # NOTE(review): unlike the mobile structures, this universe is not shifted by
        # resid_offset; presumably resid_offset is 0 in this mode - confirm.
        ref = mda.Universe(structure_directory + ref_structure, structure_directory + ref_structure)

        # get rmsd to reference structure
        rmsds = _map_over_structures(get_rmsd_to_structure, structures)

        # add to the dataframe, matched by structure name
        ensemble_df['rmsd_to_' + ref_structure] = ensemble_df['structure'].map(rmsds)

# solvent accessible surface areas (identical in both modes, so computed once here)
sasa_dict = _map_over_structures(get_sasa, structures)

# add the sasa values to the dataframe, matched by structure name
ensemble_df['sasa'] = ensemble_df['structure'].map(sasa_dict)

display(ensemble_df)

In [None]:
# selections whose separations are measured below and added to the dataframe
# (each one is the CA atoms of a few short residue ranges)
_CA_OF = '({}) and name CA'
cyto_helix_bundle_1  = _CA_OF.format('resid 286-291 or resid 349-356 or resid 299-304')
cyto_helix_bundle_2  = _CA_OF.format('resid 147-152 or resid 220-225')
lumen_helix_bundle_1 = _CA_OF.format('resid 178-182 or resid 203-207')
lumen_helix_bundle_2 = _CA_OF.format('resid 121-125 or resid 319-323')

# distance between two groups of atoms, e.g. to use as a CV
def calc_distance(structure, selection1, selection2):
    """Return the distance between the mean positions of two selections.

    `structure` is a file name inside `structure_directory`; its resIDs are shifted by
    `resid_offset` before `selection1` and `selection2` are evaluated. The unweighted
    mean of each selection's coordinates is used (equal to the centre of mass when
    every selected atom is a CA).
    """
    path = structure_directory + structure
    universe = mda.Universe(path, path)
    universe.atoms.residues.resids += resid_offset
    centre1 = np.mean(universe.select_atoms(selection1).positions, axis=0)
    centre2 = np.mean(universe.select_atoms(selection2).positions, axis=0)
    return np.linalg.norm(centre1 - centre2)

# separation of the two cytoplasmic helix bundles, one value per structure
distances = {
    name: calc_distance(name, cyto_helix_bundle_1, cyto_helix_bundle_2)
    for name in structures
}
ensemble_df['cyto_helix_bundle_separation'] = distances.values()

# separation of the two lumenal helix bundles, one value per structure
distances = {
    name: calc_distance(name, lumen_helix_bundle_1, lumen_helix_bundle_2)
    for name in structures
}
ensemble_df['lumen_helix_bundle_separation'] = distances.values()

# show the dataframe with the two new columns
display(ensemble_df)


In [None]:
# make plots
if use_reference_structures:

    # display the structures closest to each reference structure
    print('Closest to outward open:')
    display(ensemble_df.sort_values(by=['rmsd_to_outward']).head(1)[['structure', 'pDDLDT', 'rmsd_to_outward', 'PC1', 'PC2']])
    print('Closest to inward open:')
    display(ensemble_df.sort_values(by=['rmsd_to_inward']).head(1)[['structure', 'pDDLDT', 'rmsd_to_inward', 'PC1', 'PC2']])

    # rmsd to the two reference structures, coloured by pLDDT
    plt.scatter(ensemble_df['rmsd_to_outward'], ensemble_df['rmsd_to_inward'], c=ensemble_df['pDDLDT'], cmap='coolwarm')
    plt.xlabel('RMSD to outward open')
    plt.ylabel('RMSD to inward open')
    plt.colorbar(label='pLDDT')
    # start both axes at zero
    plt.xlim(0)
    plt.ylim(0)
    plt.show()

    # rmsd to the two reference structures, coloured by SASA
    plt.scatter(ensemble_df['rmsd_to_outward'], ensemble_df['rmsd_to_inward'], c=ensemble_df['sasa'], cmap='coolwarm')
    plt.xlabel('RMSD to outward open')
    plt.ylabel('RMSD to inward open')
    plt.colorbar(label='SASA')
    plt.show()

    # overlaid histograms of rmsd to each reference, finished the same way as the
    # histogram in the other branch (legend and axis labels make the labels visible)
    plt.hist(ensemble_df['rmsd_to_outward'], bins=20, alpha=0.5, label='outward open')
    plt.hist(ensemble_df['rmsd_to_inward'], bins=20, alpha=0.5, label='inward open')
    plt.legend(loc='upper right')
    plt.xlabel('RMSD')
    plt.ylabel('Count')
    plt.show()

else:

    # display the structure closest to each cluster representative, and the lowest SASA
    print('Represnetative structures from each cluster:')
    for i in ref_structures:
        display(ensemble_df.sort_values(by=['rmsd_to_' + i]).head(1)[['structure', 'pDDLDT', 'PC1', 'PC2']])
    print('Lowest 3 SASA:')
    display(ensemble_df.sort_values(by=['sasa']).head(3)[['structure', 'cluster', 'pDDLDT', 'PC1', 'PC2']])

    # plot ensemble
    plt.scatter(ensemble_df[plot_variable_1], ensemble_df[plot_variable_2], c=ensemble_df[plot_variable_3], cmap='coolwarm')
    plt.xlabel(plot_variable_1)
    plt.ylabel(plot_variable_2)
    plt.colorbar(label=plot_variable_3)

    # annotate reference structures; annotate() is given scalar coordinates, not a Series
    for i in ref_structures:
        ref_row = ensemble_df.loc[ensemble_df['structure'] == i]
        ref_x = ref_row[plot_variable_1].iloc[0]
        ref_y = ref_row[plot_variable_2].iloc[0]
        plt.scatter(ref_x, ref_y, c='black', marker='x', s=100)
        plt.annotate(i, (ref_x, ref_y))
    plt.show()

    # for each reference structure, plot the histogram of rmsd to that structure
    for i in ref_structures:
        plt.hist(ensemble_df['rmsd_to_' + i], bins=20, alpha=0.5, label=i)
    plt.legend(loc='upper right')
    plt.xlabel('RMSD')
    plt.ylabel('Count')
    plt.xlim(0)
    fig = plt.gcf()
    fig.set_size_inches(10, 2)
    plt.show()

    # plot 2 pane histogram of SASA and pLDDT scores
    fig, (ax1, ax2) = plt.subplots(1, 2)
    ax1.hist(ensemble_df['sasa'], bins=20, alpha=0.5, label='sasa', color='C3')
    ax1.set_xlabel('SASA')
    ax1.set_ylabel('Count')
    ax2.hist(ensemble_df['pDDLDT'], bins=20, alpha=0.5, label='pDDLDT')
    ax2.set_xlabel('pDDLDT')
    fig = plt.gcf()
    fig.set_size_inches(10, 2)
    plt.show()


In [None]:
# pairwise RMSD matrix (for the MC energy function used in the next cell)

## load backup matrix (previously used file: 'rmsd_matrix_80_6_0_3326-strucs.npy')
rmsd_matrix = np.load(base_directory + 'rmsd_matrix_backup_90_bigsample.npy')

# a cached matrix is only meaningful for the ensemble it was computed from, so fail loudly
# if its shape does not match the current ensemble instead of silently using stale data
expected_shape = (len(ensemble_df), len(ensemble_df))
if rmsd_matrix.shape != expected_shape:
    raise ValueError(
        'cached RMSD matrix has shape {} but the current ensemble needs {}; '
        'recompute the matrix or load the matching backup'.format(rmsd_matrix.shape, expected_shape)
    )

# # Initialize a matrix to store pairwise RMSD values
# rmsd_matrix = np.zeros((len(ensemble_df), len(ensemble_df)))
# 
# # function to calculate RMSD values in parallel
# def calculate_rmsd(i):
# 
#     rmsd_values = []
# 
#     # setup universe for i-th structure
#     mobile = mda.Universe(structure_directory + ensemble_df['structure'].values[i], structure_directory + ensemble_df['structure'].values[i])
#     if mobile.atoms.residues.resids[0] == 1: # if first resID == one, then residues are already numbered correctly
#         mobile.atoms.residues.resids += resid_offset
# 
#     for j in range(len(ensemble_df)):
# 
#         # setup universe for j-th structure
#         ref = mda.Universe(structure_directory + ensemble_df['structure'].values[j], structure_directory + ensemble_df['structure'].values[j])
#         if ref.atoms.residues.resids[0] == 1: # if first resID == one, then residues are already numbered correctly
#             ref.atoms.residues.resids += resid_offset
# 
#         # Calculate RMSD and store it in the list
#         rmsd = alignto(mobile, ref, select=rmsd_selection, match_atoms=True)[1]
#         rmsd_values.append(rmsd)
# 
#     mobile = None       # Release memory by setting the loaded structures to None
#     ref = None
# 
#     return rmsd_values
# 
# # Perform parallel computation of RMSD values
# with Pool(processes=num_processes) as pool:
#     results = list(tqdm(pool.imap(calculate_rmsd, range(len(ensemble_df))), total=len(ensemble_df)))
# 
# # Update the rmsd_matrix with the results
# for i, rmsd_values in enumerate(results):
#     rmsd_matrix[i] = rmsd_values
# 
# results = None  # Release memory by setting the results to None
# 
# np.save(base_directory + 'rmsd_matrix.npy', rmsd_matrix)    # save the matrix (to avoid recomputing it in the future)
# 
# # Display progress information
# print('Calculation completed for', len(ensemble_df), 'pairs of structures', end='\r')
# 
# # plot the matrix
# plt.imshow(rmsd_matrix, cmap='inferno')
# plt.colorbar()


In [None]:
# make a matrix of pairwise SASA differences: sasa_matrix[i, j] = sasa[i] - sasa[j]
sasa_values = ensemble_df['sasa'].to_numpy(dtype=float)
n_elements = len(ensemble_df) * len(ensemble_df)

# broadcasting a column against a row gives every pairwise difference in one step
sasa_matrix = sasa_values[:, None] - sasa_values[None, :]

# show the matrix
plt.imshow(sasa_matrix, cmap='coolwarm')
plt.colorbar()


In [None]:
# make a matrix of differences in collective variable between structures:
# cv_matrix[i, j] = cv[i] - cv[j]
cv_values = ensemble_df[collective_variable].to_numpy(dtype=float)
n_elements = len(ensemble_df) * len(ensemble_df)

# broadcasting a column against a row gives every pairwise difference in one step
cv_matrix = cv_values[:, None] - cv_values[None, :]

# show the matrix
plt.imshow(cv_matrix, cmap='coolwarm')
plt.colorbar()



In [None]:
# assign bins for binned mc path finding

use_defined_end_states = True

occluded_state = 'msa-16-32_rank_024.pdb'
# other candidate end states: lumen open 'msa-128-256_rank_1163.pdb', cyto open 'msa-32-64_rank_127.pdb'

# end states whose CV values bound the interpolation
end_state_1 = 'msa-128-256_rank_1163.pdb'
end_state_2 = occluded_state

# rows of the two end states and their CV values
end_state_1_row = ensemble_df[ensemble_df['structure'] == end_state_1]
end_state_2_row = ensemble_df[ensemble_df['structure'] == end_state_2]
end_state_1_cv_value = end_state_1_row[collective_variable].values[0]
end_state_2_cv_value = end_state_2_row[collective_variable].values[0]

if use_defined_end_states:
    max_value, min_value = end_state_1_cv_value, end_state_2_cv_value
else:
    # fall back to the global extremes of the CV over the whole ensemble
    max_value = ensemble_df[collective_variable].max()
    min_value = ensemble_df[collective_variable].min()

## ideal CV value of each window: linear interpolation from max_value (window 0) to min_value (last window)
ideal_window_values = np.array(
    [np.interp(window, [0, mc_n_bins-1], [max_value, min_value]) for window in range(mc_n_bins)],
    dtype=float,
)

# for each window, the index label of the structure whose CV is closest to the ideal value
ideal_initial_path = [
    ensemble_df[collective_variable].sub(target).abs().idxmin()
    for target in ideal_window_values
]

# pin the first and last windows to the chosen end states
if use_defined_end_states:
    ideal_initial_path[0] = end_state_1_row.index[0]
    ideal_initial_path[-1] = end_state_2_row.index[0]


def _closest_path_member(cv_value):
    """Path entry of the window whose ideal CV value is nearest to `cv_value`."""
    return ideal_initial_path[np.argmin(np.abs(ideal_window_values - cv_value))]


def _first_window_of(path_member):
    """Position of the first window in the path that holds `path_member`."""
    return ideal_initial_path.index(path_member)


## set up the bins for the binned path
# NOTE: if the same structure appears in several windows, all of them map to the first one
ensemble_df['closest_ideal_structure'] = ensemble_df[collective_variable].apply(_closest_path_member)
ensemble_df['bin'] = ensemble_df['closest_ideal_structure'].apply(_first_window_of)


# population of each bin
plt.hist(ensemble_df['bin'], bins=mc_n_bins)
plt.show()

# show bin distribution in the plotting variables
plt.scatter(ensemble_df[plot_variable_1], ensemble_df[plot_variable_2], c=ensemble_df['bin'], cmap='tab20c')
plt.colorbar()

# plot the initial guesses
for i in range(mc_n_bins):
    plt.scatter(ensemble_df.loc[ensemble_df.index == ideal_initial_path[i]][plot_variable_1], ensemble_df.loc[ensemble_df.index == ideal_initial_path[i]][plot_variable_2], marker='x', color='black', s=10)
# label the ideal structures wit htheir associated bin
for i in range(mc_n_bins):
    plt.annotate(i, (ensemble_df.loc[ensemble_df.index == ideal_initial_path[i]][plot_variable_1], ensemble_df.loc[ensemble_df.index == ideal_initial_path[i]][plot_variable_2]))

In [None]:
# MC path finding by exchanging structures within bins

def calc_energy(path, path_length):
    """Return the smoothness "energy" of a path through the ensemble.

    The energy is the sum of two terms, each of the form
    ``weight * sqrt(1/path_length * sum(step**2))`` taken over consecutive
    pairs of structures on the path: one uses the pairwise RMSD
    (``rmsd_matrix``) and one the pairwise collective-variable difference
    (``cv_matrix``). Both weights are currently 1.

    Parameters
    ----------
    path : sequence
        Index labels of ``ensemble_df``; only the first ``path_length``
        entries are read.
    path_length : int
        Number of structures on the path.

    Returns
    -------
    The combined energy (RMSD term + CV term).
    """
    wf_rmsd = 1
    wf_cv = 1

    index_labels = ensemble_df.index.values

    def _position(label):
        # the matrices are addressed by row position, so translate the index
        # label into the position of its first occurrence in ensemble_df
        return np.where(index_labels == label)[0][0]

    # consecutive (current, next) label pairs along the path
    steps = [(path[k], path[k+1]) for k in range(path_length - 1)]

    # for minimising rmsd between structures on path
    squared_rmsd = sum(rmsd_matrix[_position(a), _position(b)]**2 for a, b in steps)
    energy_rmsd = wf_rmsd * np.sqrt ( 1/path_length * squared_rmsd )

    # for minimising pairwise differences in collective variable between structures on path
    squared_cv = sum(cv_matrix[_position(a), _position(b)]**2 for a, b in steps)
    energy_cv = wf_cv * np.sqrt ( 1/path_length * squared_cv )

    # final energy is the sum of both terms
    return energy_rmsd + energy_cv

# define a function that runs monte carlo simulated annealing to optimise path smoothness
def mc_path_optimisation(seed):
    """Run one Metropolis Monte Carlo optimisation of the path smoothness.

    Starting from ``ideal_initial_path``, each MC step visits every movable
    window and proposes swapping its structure for a random structure from the
    same bin. A proposal is accepted if it lowers ``calc_energy`` or, otherwise,
    with probability ``exp(-delta_energy / temperature)``.

    Parameters
    ----------
    seed : int
        Seed for NumPy's global random number generator, so a run is
        reproducible.

    Returns
    -------
    list
        ``[final_energy, final_path, final_path_structures, relaxation_energies]``:
        the lowest energy seen, the corresponding path (index labels of
        ``ensemble_df``), the matching ``'structure'`` names as a tuple, and the
        energies of the recorded paths in the order they were first recorded
        (the initial guess first).
    """

    fixed_endpoints = True
    mc_n_steps = 1000
    initial_temperature = 0.00001

    np.random.seed(seed)

    # log-spaced temperature schedule, one temperature per MC step
    # NOTE(review): the temperature *rises* from initial_temperature to
    # 1000 * initial_temperature, whereas simulated annealing normally cools —
    # confirm that this direction is intended.
    final_temperature = initial_temperature*1000
    temperatures = np.logspace(np.log10(initial_temperature), np.log10(final_temperature), num=mc_n_steps)

    mc_path_length = mc_n_bins

    # private copy of the initial guess so the shared list is never modified
    mcpath = list(ideal_initial_path)
    energy = calc_energy(mcpath, mc_path_length)

    # Recorded paths keyed by energy, seeded with the initial guess so a best
    # path exists even if no proposal is ever accepted. A path list is never
    # mutated after being stored here: every proposal works on a fresh copy.
    path_energies = {energy: mcpath}

    if fixed_endpoints:
        # keep the first and last structures (the end states) fixed
        start_point = 1
        end_point = len(mcpath)-1
    else:
        start_point = 0
        end_point = len(mcpath)

    # index labels of the structures in each bin, looked up once per bin
    bin_members = {}

    for step in range(mc_n_steps):

        temperature = temperatures[step]

        for point in range(start_point, end_point):

            # candidates are all structures sharing the bin of the current one
            current_bin = ensemble_df.loc[mcpath[point]]['bin']
            if current_bin not in bin_members:
                bin_members[current_bin] = ensemble_df[ensemble_df['bin'] == current_bin].index.values
            random_structure_in_pool = np.random.choice(bin_members[current_bin])

            # Propose on a fresh copy and replace by position: a rejected move
            # leaves mcpath untouched, and duplicate labels elsewhere in the
            # path cannot redirect the replacement to another window.
            new_mcpath = mcpath.copy()
            new_mcpath[point] = random_structure_in_pool

            new_energy = calc_energy(new_mcpath, mc_path_length)
            delta_energy = new_energy - energy

            # metropolis criterion: always accept downhill moves, accept uphill
            # moves with probability e^(-difference/temp)
            if delta_energy < 0 or np.random.rand() < np.exp(-delta_energy / temperature):
                mcpath = new_mcpath
                energy = new_energy  # later proposals are compared with the accepted path
                path_energies[new_energy] = mcpath

    # get the path with the lowest energy
    final_energy = min(path_energies.keys())
    final_path = path_energies[final_energy]
    relaxation_energies = list(path_energies.keys())
    final_path_structures = tuple(ensemble_df.loc[final_path]['structure'].values)

    return [final_energy, final_path, final_path_structures, relaxation_energies]

mc_n_runs = 100

# run mc_n_runs independent optimisations in parallel (the run number is the seed)
with Pool(processes=num_processes) as pool:
    mc_runs = list(tqdm(pool.imap(mc_path_optimisation, range(mc_n_runs)), total=mc_n_runs))

# add the final energies and final path structures to a dataframe
mc_runs_df = pd.DataFrame(mc_runs, columns=['energy', 'path', 'path structures', 'relaxation energies'])
mc_runs_df = mc_runs_df.sort_values(by=['energy']) 

# display the three best runs
display(mc_runs_df[['energy', 'path structures']].head(3))

# plot relaxation of the three best runs (the rows displayed above).
# The frame is sorted by ascending energy, so the best run is at position 0;
# the previous range(1, 4) skipped it.
for i in range(min(3, len(mc_runs_df))):
    plt.plot(mc_runs_df.iloc[i]['relaxation energies'])

# label
plt.xlabel('MC step')
plt.ylabel('Energy')
plt.show()

# RMSD between consecutive structures along the best (position 0) and the
# worst (last position) path, for plotting
energy_over_path = []
energy = {}
for run_label, run_position in (('best', 0), ('worst', -1)):
    # get the "path" of this run (in index labels of ensemble_df)
    test = mc_runs_df.iloc[run_position]['path']
    for i in range(mc_n_bins - 1):
        energy[i] = (rmsd_matrix[np.where(ensemble_df.index.values == test[i])[0][0], np.where(ensemble_df.index.values == test[i+1])[0][0]]**2)
        energy[i] = np.sqrt ( 1 * energy[i] )
    # pass a list rather than a dict view so matplotlib receives a plain sequence
    plt.plot(list(energy.values()), label=run_label)

# plot energy
plt.legend()
# label axes
plt.xlabel('point in path')
plt.ylabel('MC energy')
plt.show()

# write out the best run to a multistate pdb file
# NOTE(review): `u` must already exist (from an earlier cell) to supply n_atoms here;
# its atom count may differ from the 'protein' selections written below — confirm.
with mda.Writer(base_directory + 'best_run.pdb', u.atoms.n_atoms) as W:
    # get the structures of the lowest energy path
    for structure in mc_runs_df.head(1)['path structures'].values[0]:
        u = mda.Universe(structure_directory + structure, 
                         structure_directory + structure) # make universe
        # residues are renumbered for every structure, references included
        u.atoms.residues.resids += resid_offset
        # reference structures additionally get segment and chain IDs set to 'A'
        if structure == 'ref_outward.pdb' or structure == 'ref_inward.pdb':
            u.atoms.segments.segids = 'A'
            u.atoms.chainIDs = 'A'
        W.write(u.select_atoms('protein'))
        

In [None]:
# plot the best path

# index labels (into ensemble_df) of the structures on the lowest-energy path
best_path_labels = mc_runs_df.head(1)['path'].values[0]

# plot all the structures, coloured by bin
plt.scatter(ensemble_df[plot_variable_1], ensemble_df[plot_variable_2], c=ensemble_df['bin'], cmap='tab20c', alpha=0.8)

# mark each structure of the best path and annotate it with its bin number
for structure in best_path_labels:
    path_row = ensemble_df.loc[structure]
    point_x = path_row[plot_variable_1]
    point_y = path_row[plot_variable_2]
    plt.scatter(point_x, point_y, c='black')
    #plt.annotate(structure, (ensemble_df.loc[structure][plot_variable_1], ensemble_df.loc[structure][plot_variable_2]))   
    #plt.annotate(ensemble_df.loc[structure]['structure'], (ensemble_df.loc[structure][plot_variable_1], ensemble_df.loc[structure][plot_variable_2]), xytext=(ensemble_df.loc[structure][plot_variable_1], ensemble_df.loc[structure][plot_variable_2]+0.5), ha='center', va='bottom', arrowprops=dict(arrowstyle='->', connectionstyle='arc3,rad=0.2'))

    # the label sits 0.5 above the point, joined to it by a curved arrow
    plt.annotate(path_row['bin'], (point_x, point_y), xytext=(point_x, point_y+0.5), ha='center', va='bottom', arrowprops=dict(arrowstyle='->', connectionstyle='arc3,rad=0.2'))

# draw dashed connecting lines between consecutive structures on the path
for i in range(len(best_path_labels) - 1):
    segment_start = ensemble_df.loc[best_path_labels[i]]
    segment_end = ensemble_df.loc[best_path_labels[i+1]]
    x1 = segment_start[plot_variable_1]
    y1 = segment_start[plot_variable_2]
    x2 = segment_end[plot_variable_1]
    y2 = segment_end[plot_variable_2]
    plt.plot([x1, x2], [y1, y2], c='black', alpha=0.6, linestyle='dashed')

# plot the initial guesses
for i in range(mc_n_bins):
    initial_guess_rows = ensemble_df.loc[ensemble_df.index == ideal_initial_path[i]]
    plt.scatter(initial_guess_rows[plot_variable_1], initial_guess_rows[plot_variable_2], marker='x', color='black', s=10)



plt.xlabel(plot_variable_1)
plt.ylabel(plot_variable_2)
plt.show()

# # plot SASA
# for structure in mc_runs_df.head(1)['path'].values[0]:
#     # plot rmsd to each end state on x and y axis and colour by sasa
#     plt.scatter(ensemble_df.loc[structure]['rmsd_to_outward'], ensemble_df.loc[structure]['rmsd_to_inward'], c=ensemble_df.loc[structure]['sasa'], cmap='coolwarm')
# #labels
# plt.xlabel('rmsd_to_outward')
# plt.ylabel('rmsd_to_inward')
# #plt.ylim(ensemble_df['rmsd_ratio_outwardness'].min(), ensemble_df['rmsd_ratio_outwardness'].max())
# fig = plt.gcf()
# #fig.set_size_inches(15, 2)
# plt.xticks(rotation=90)
# plt.show()

#print('Best Run:')
#mc_runs_df.head(1)


In [None]:
# Plot how good this is: compare the ideal CV value of each window with the CV
# of the structure chosen for that window in the best MC path

# Set to True to additionally mark the structures closest to a linear
# interpolation between the global minimum and maximum of the CV.
# The previous guard was `ideal_initial_path == True`, which compares a list
# with a bool and is therefore always False, so this overlay never ran; the
# default of False keeps the output unchanged.
plot_linear_initial_guesses = False

# plot the ideal window values
plt.plot(ideal_window_values, label='ideal', color='black', alpha=0.2)
# show dots
plt.scatter(range(mc_n_bins), ideal_window_values, c=range(mc_n_bins), cmap='coolwarm', alpha=0.5)

# get the actual values: the CV of each structure on the lowest-energy path
best_path = mc_runs_df.head(1)['path'].values[0]
actual_window_values = np.zeros(mc_n_bins)
for i in range(mc_n_bins):
    actual_window_values[i] = ensemble_df.loc[best_path[i]][collective_variable]

# print the ideal and actual values in a table
print('Ideal vs Actual Values:')
display(pd.DataFrame({'ideal': ideal_window_values, 'actual': actual_window_values}))

# plot the actual values
plt.plot(actual_window_values, label='actual', c='black', alpha=0.8)
# show dots
plt.scatter(range(mc_n_bins), actual_window_values, c=range(mc_n_bins), cmap='coolwarm')

# plot vertical lines between the two
for i in range(mc_n_bins):
    plt.plot([i, i], [ideal_window_values[i], actual_window_values[i]], c='black', alpha=0.2, linestyle='dashed')

if plot_linear_initial_guesses:

    # alternative where the initial guess indices are just those of the structures closest
    # to the ideal values of the collective variable (from a linear interpolation)
    initial_guess_indices = []

    # endpoint values are the min and max of the collective variable
    ideal_values = np.linspace(ensemble_df[collective_variable].min(), ensemble_df[collective_variable].max(), mc_n_bins)

    # get the closest structure to each ideal value
    for i in ideal_values:
        initial_guess_indices.append(ensemble_df[collective_variable].sub(i).abs().idxmin())
    # plot the initial guesses
    for i in range(mc_n_bins):
        plt.scatter(i, ensemble_df.loc[initial_guess_indices[i]][collective_variable], marker='x', color='black')
    print(len(initial_guess_indices))   

# x axis labels
plt.xticks(range(mc_n_bins), range(mc_n_bins))
plt.xlabel('Window Number')
plt.ylabel(collective_variable + 'CV')
plt.legend()

# add a title
plt.title('Ideal vs Actual Window CV Values')
plt.show()

# now just plot the offsets (absolute) as a bar chart
offsets = np.abs(ideal_window_values - actual_window_values)
plt.bar(range(mc_n_bins), offsets, color='black', alpha=0.2)
plt.show()


In [None]:
# set up umbrella sampling windows
from MDAnalysis.analysis import align, rms

def align_universe(mobile, ref):
    """Align ``mobile`` onto ``ref`` using the protein C-alpha atoms.

    ``ref`` is set to its first frame and ``mobile`` to its last frame before
    the atoms are selected. The alignment itself is delegated to
    ``align.AlignTraj(..., in_memory=True)``.

    Parameters
    ----------
    mobile : universe to be aligned
    ref : universe used as the alignment reference

    Returns
    -------
    The ``mobile`` universe that was passed in.
    """
    alignment_selection = 'protein and name CA'

    ref.trajectory[0]       # reference: first frame
    mobile.trajectory[-1]   # mobile: last frame

    ca_positions_mobile = mobile.select_atoms(alignment_selection).positions
    ca_positions_ref = ref.select_atoms(alignment_selection).positions

    # RMSD without superposition; the value is computed but not used
    rms.rmsd(ca_positions_mobile, ca_positions_ref, superposition=False)

    align.AlignTraj(mobile, ref, select=alignment_selection, in_memory=True).run()
    return mobile

# template universe embedded in lipid bilayer with gromacs topology etc
# (the same PDB file is passed as both topology and coordinates)
template_path = base_directory + 'template.pdb'
template_universe = mda.Universe(template_path, template_path)

umbrella_sampling_directory = base_directory + 'umbrella_sampling/'

# one directory path per window: <umbrella_sampling_directory>window_<i>
window_directories = [
    umbrella_sampling_directory + 'window_' + str(window)
    for window in range(mc_n_bins)
]

# create any window directory that does not exist yet
for window_directory in window_directories:
    if not os.path.exists(window_directory):
        os.makedirs(window_directory)

#gmx_exec = '/biggin/b212/bioc1781/software/gromacs/2022.4_CUDA_AVX2_plumed/bin/gmx'
#
#grompp = gmx.commandline_operation('gmx', 'grompp',
#                                   input_files={
#                                       '-f': mdpfile,
#                                       '-p': solvate.output.file['-p'],
#                                       '-c': solvate.output.file['-o'],
#                                       '-po': mdout_mdp,
#                                   },
#                                   output_files={'-o': tprfile})


# for each window, replace the coordinates of the protein in the template universe with the coordinates of the structure at the corresponding index in the best path
for i in range(mc_n_bins):

    # write the relevant structure to a pdb file in the window directory
    # pass the structure to gmx pdb2gmx to get the correct atom order (discard topology)

    # TODO: the window set-up below is still commented out. A loop whose body
    # contains only comments is a syntax error (expected an indented block),
    # which prevented the whole cell from running; `pass` keeps it valid.
    pass

    ## make a copy of the template universe
    #u = template_universe.copy()
    #
    ## make a universe out of the structure corresponding to the index in the best path
    #structure = mc_runs_df.head(1)['path structures'].values[0][i]
    #u_structure = mda.Universe(structure_directory + structure, structure_directory + structure)
    ## select the protein atoms in the template universe
    #protein = u.select_atoms('protein')
    ## align the structure to the template universe
    #u_structure = align_universe(u_structure, u)
    ## replace the coordinates of the protein in the template universe with the coordinates of the protein in the structure universe
    #protein.positions = u_structure.select_atoms('protein').positions
#
    ## write out the universe to a pdb file
    #u.atoms.write(window_directories[i] + '/window_' + str(i) + '.pdb')

In [None]:
# run propka
from propkatraj import PropkaTraj

# residue identifiers (as strings) whose pKa values are collected below
residues = ['205', '305']
# independent copy of the dataframe: a plain assignment would only alias
# ensemble_df, so pKa columns added to pka_df would also appear in ensemble_df
pka_df = ensemble_df.copy()




## write a multistate pdb file for the whole ensemble so MDA will interpret it as a trajectory
#with mda.Writer(base_directory + 'ensemble.pdb', u.atoms.n_atoms) as W:
#    for structure in ensemble_df['structure']:
#        u = mda.Universe(structure_directory + structure, 
#                         structure_directory + structure)
#        # renumber for consistency and to match selection token   s     
#        u.atoms.residues.resids += resid_offset
#        u.atoms.segments.segids = 'A'
#        u.atoms.chainIDs = 'A'
#        W.write(u.select_atoms(rmsd_selection))
 
 ## make a universe containing all the structures
 #u = mda.Universe(base_directory + "ensemble.pdb")
 #
 #u.atoms.residues.resids += resid_offset
 #u.atoms.segments.segids = 'A'
 #u.atoms.chainIDs = 'A'
 #
 #pkatraj = PropkaTraj(u, select='protein', skip_failure=True, Verbose=True)
 #pkatraj.run()   # creates a pandas dataframe with the pka values for each residue in each frame results.pkas
 #struc_pkas = pkatraj.results.pkas.describe()

# align the ensemble to the selection token
#aligner = mda.analysis.align.AlignTraj(u, u, select=rmsd_selection, in_memory=True).run()




# make a universe object containing all the structures
#u = mda.Universe(structure_directory + structures[0], structure_directory + structures[0])

# loop through the structures (skipping the first one), making a universe object for each
# and running propka on it
# make a blank array to store results for all structures: one row per structure,
# one column per entry of `residues`; the row of structures[0] stays zero because
# that structure is skipped below
results = np.zeros((len(structures), len(residues)))
print(structures[1:])
for structure in structures[1:]:
    # the same PDB file is passed as both topology and coordinates
    u = mda.Universe(structure_directory + structure, structure_directory + structure)
    # renumber residues and set segment / chain IDs to 'A'
    u.atoms.residues.resids += resid_offset
    u.atoms.segments.segids = 'A'
    u.atoms.chainIDs = 'A'

    #print(structure)
    # NOTE(review): `Verbose` is capitalised here — confirm PropkaTraj honours it
    # (the usual keyword is presumably lowercase `verbose`)
    pkatraj = PropkaTraj(u, select='protein', skip_failure=True, Verbose=True)
    pkatraj.run()   # results are read from pkatraj.results.pkas below
    #print(pkatraj.results.pkas)
    # summary statistics of the pKa values; only the 'mean' row is used
    struc_pkas = pkatraj.results.pkas.describe()
    # add to results array, in the row matching this structure's position in `structures`
    # NOTE(review): `structures == structure` assumes `structures` is a numpy array, and
    # `residues` holds strings — confirm they match the column labels of results.pkas
    results[np.where(structures == structure)[0][0]] = struc_pkas.loc['mean', residues]


  
  #for residue in residues:
  #    # get the mean pka for the residue of interest
  #    pka = struc_pkas.loc['mean', residue]
  #    # add this pka value to a column labelled residue in the pka_df with the structure name as the index 
  #    pka_df.loc[structure, 'pka_' + str(residue)] = pka
        