# pLDDT processing

**NOTE**: This notebook should be renamed: no actual B-factor is involved here, only the pLDDT score, which happens to be stored in the B-factor column.



In [None]:
# Declare the list of tasks whose products you want to use as inputs.
# NOTE(review): later cells index `upstream` by task name
# (upstream['02-rename-chains']['data']), so this list is presumably replaced
# with a mapping by the pipeline runner before execution -- confirm.
upstream = ['02-rename-chains']
# Placeholder; later cells read product['data'] and product['stripped'], so a
# mapping is presumably injected at run time -- confirm.
product = None
# Placeholder; not referenced by any other cell visible in this notebook.
system_name = None

Load libraries

In [None]:
import prody as pdy
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

We use a somewhat convoluted scheme to remove the disordered/badly-predicted N- and C-terminal tails. The idea is to slide a window along each chain and remove the terminal regions whose window-averaged pLDDT is not higher than 70. Once we find a window with an average pLDDT higher than 70, coming from either end, we stop cutting on that side.

This ain't the best, but it works.

In [None]:
def window_betas(beta, window_size=3):
    """Smooth per-residue scores with a sliding window.

    For every position ``i`` that has ``window_size`` values on each side,
    the mean of ``beta[i - window_size:i + window_size]`` is computed. The
    slice is half-open, so each window holds ``2 * window_size`` values and
    does not include ``beta[i + window_size]``.

    Parameters
    ----------
    beta : sequence of float
        Scores ordered along the chain. It only needs to support ``len``
        and slicing (list, NumPy array, pandas Series, ...).
    window_size : int, optional
        Number of values taken on each side of the window centre.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(n_windows, 2)``: column 0 holds the zero-based
        position ``i`` and column 1 the windowed mean. When the input is too
        short to hold a single window the shape is ``(0, 2)``.
    """
    rows = [
        [i, np.mean(beta[i - window_size:i + window_size])]
        for i in range(window_size, len(beta) - window_size)
    ]
    # The reshape keeps the result two-dimensional even when there are no
    # rows, so callers can always index it as ``result[:, 1]``.
    return np.array(rows, dtype=float).reshape(-1, 2)

def mark_n_tail(betas, threshold = 70):
    """Find where the low-scoring stretch at the start of ``betas`` ends.

    Rows are scanned from the first one onwards until a row whose score
    (column 1) is strictly greater than ``threshold`` is met.

    Parameters
    ----------
    betas : numpy.ndarray
        Rows of ``(position, score)`` pairs.
    threshold : float, optional
        Scores must exceed this value to stop the scan.

    Returns
    -------
    The position (column 0) of the last row scanned before the stop, or
    ``None`` if the very first row already exceeds the threshold or there
    are no rows at all.
    """
    last_low = None
    row = 0
    n_rows = betas.shape[0]
    # `not (score > threshold)` mirrors the stop condition exactly, so a
    # score equal to the threshold still counts as part of the tail.
    while row < n_rows and not betas[row, 1] > threshold:
        last_low = betas[row, 0]
        row += 1
    return last_low

def mark_c_tail(betas, threshold = 70):
    """Find where the low-scoring stretch at the end of ``betas`` begins.

    Rows are scanned from the last one backwards until a row whose score
    (column 1) is strictly greater than ``threshold`` is met.

    Parameters
    ----------
    betas : numpy.ndarray
        Rows of ``(position, score)`` pairs.
    threshold : float, optional
        Scores must exceed this value to stop the scan.

    Returns
    -------
    The position (column 0) of the last row scanned before the stop, i.e.
    the earliest row of the trailing low-scoring stretch, or ``None`` if
    the final row already exceeds the threshold or there are no rows.
    """
    earliest_low = None
    for row in range(betas.shape[0] - 1, -1, -1):
        if betas[row, 1] > threshold:
            # First confident window seen from the end: stop here.
            return earliest_low
        earliest_low = betas[row, 0]
    return earliest_low

def clean(structure, pLDDT_df):
    """Trim the low-confidence terminal tails of every chain.

    For each chain the occupancy of all atoms is first reset to 1.0, summary
    features are recorded, and the atoms beyond the tail cut points are then
    flagged with occupancy 0.0. The cut points come from ``window_betas`` /
    ``mark_n_tail`` / ``mark_c_tail`` applied to the chain's rows of
    ``pLDDT_df``. Occupancies are modified on ``structure`` itself.

    Parameters
    ----------
    structure
        Object providing ``getHierView().iterChains()`` and ``select``
        (a ProDy atom group, judging by how the notebook builds it).
    pLDDT_df : pandas.DataFrame
        Must have ``chain``, ``resnum`` and ``pLDDT`` columns.

    Returns
    -------
    tuple
        ``(selection, features)``: the result of selecting atoms with
        ``occupancy > 0.0`` and a dict with, per chain, the mean CA beta
        value (``chain_<id>``), the sequence (``chain_<id>_sequence``) and
        its length (``chain_<id>_len``).
        NOTE(review): ProDy documents ``select`` as returning ``None`` when
        nothing matches, so ``selection`` may be ``None`` if every atom was
        trimmed -- confirm against the installed version.
    """
    features = dict()
    for chain in structure.getHierView().iterChains():
        chid = chain.getChid()
        sequence = chain.getSequence()
        chain.setOccupancies(1.0)

        features[f'chain_{chid}'] = np.mean(chain.select('name CA').getBetas())
        features[f'chain_{chid}_sequence'] = sequence
        features[f'chain_{chid}_len'] = len(sequence)

        chain_pLDDT = pLDDT_df.query(f'chain == "{chid}"').sort_values(by='resnum')
        # Cut points are zero-based positions into the sorted rows; translate
        # them through the actual residue numbers instead of assuming that
        # numbering is contiguous and starts at 1 (for which this lookup
        # gives exactly ``position + 1``).
        resnums = chain_pLDDT['resnum'].to_numpy()
        betas = window_betas(chain_pLDDT['pLDDT'])
        ctail_cutpoint = mark_c_tail(betas)
        ntail_cutpoint = mark_n_tail(betas)
        if ctail_cutpoint is not None:
            c_resnum = int(resnums[int(ctail_cutpoint)])
            c_tail = chain.select(f'resnum > {c_resnum:d}')
            # Guard against an empty selection, which would otherwise raise
            # AttributeError on ``None``.
            if c_tail is not None:
                c_tail.setOccupancies(0.0)
        if ntail_cutpoint is not None:
            n_resnum = int(resnums[int(ntail_cutpoint)])
            n_tail = chain.select(f'resnum < {n_resnum:d}')
            if n_tail is not None:
                n_tail.setOccupancies(0.0)

    structure = structure.select('occupancy > 0.0')
    return structure, features

def build_pLDDT_df(items):
    """Parse a group of PDB files and average their per-residue scores.

    Every file in ``items`` is parsed with ``pdy.parsePDB``; for each chain
    the beta-column value and residue number of its CA atoms are collected.
    The values are then averaged over all files per ``(chain, resnum)``.

    Parameters
    ----------
    items : iterable of str
        Paths of the PDB files to parse.

    Returns
    -------
    tuple
        ``(averaged, structures)``: a DataFrame with ``chain``, ``resnum``
        and the mean ``pLDDT``, and the list of parsed structures in the
        same order as ``items``.
    """
    structures = []
    rows = []
    for path in items:
        structure = pdy.parsePDB(path)

        for chain in structure.getHierView().iterChains():
            ca_atoms = chain.select('name CA')
            chid = chain.getChid()
            rows.extend(
                [path, plddt, resnum, chid]
                for plddt, resnum in zip(ca_atoms.getBetas(), ca_atoms.getResnums())
            )
        structures.append(structure)

    per_residue = pd.DataFrame(rows, columns=['file', 'pLDDT', 'resnum', 'chain'])
    # numeric_only drops the non-numeric 'file' column from the average.
    averaged = per_residue.groupby(['chain', 'resnum'], as_index=False).mean(numeric_only=True)
    return averaged, structures


def group_structures(files):
    """Group file names by the first two ``_``-separated fields.

    Each entry is stripped of surrounding whitespace (including the newline
    left by ``readlines``); blank entries are skipped. The group key is the
    first two underscore-separated fields of the stripped entry joined back
    with ``_``. The split is applied to the whole string, so any directory
    part of a path takes part in the key as well.

    Parameters
    ----------
    files : iterable of str
        File names or paths, e.g. the lines of a listing file.

    Returns
    -------
    dict
        Maps each key to the list of stripped entries sharing it, in input
        order.
    """
    groups = dict()
    for entry in files:
        path = entry.strip()
        if not path:
            # A blank line is not a file; do not create a group for it.
            continue
        parent = '_'.join(path.split('_')[:2])
        groups.setdefault(parent, []).append(path)
    return groups


We apply it here

In [None]:
# Paths of the cleaned structures written so far.
stripped = []
# One feature dict per cleaned structure.
out = []

# Read the listing inside a context manager so the file handle is closed.
with open(upstream['02-rename-chains']['data'], 'r') as listing:
    groups = group_structures(listing.readlines())

for key, items in groups.items():
    print(key)
    df, pdbs = build_pLDDT_df(items)
    for pdb_file, pdb in zip(items, pdbs):
        print(pdb_file)
        try:
            clean_pdb, features = clean(pdb, df)
        except Exception as error:
            # Best effort: report the failure and move on. Without the
            # `continue`, the code below would run with undefined (or stale,
            # from the previous iteration) `clean_pdb` and `features`.
            print(f"unable to clean {pdb_file}: {error}")
            continue
        if clean_pdb is None:
            # Nothing survived the trimming, so there is no structure to write.
            print(f"unable to clean {pdb_file}: no atoms left after trimming")
            continue
        clean_file = pdb_file.replace('.rechained', '.clean')
        features['id'] = pdb_file
        pdy.writePDB(clean_file, clean_pdb)
        stripped.append(clean_file)
        out.append(features)

The results provide the pLDDT of all the chains.

In [None]:
# One row per processed structure, one column per recorded feature.
pd.DataFrame.from_records(data=out).to_csv(path_or_buf=product['data'])

We also save the list of cleaned structure files for future steps.

In [None]:
# Write the cleaned-structure paths one per line, with no trailing newline.
with open(product['stripped'], 'w') as handle:
    handle.write('\n'.join(stripped))