# Get Crystal Structures Metadata

In [1]:
from pathlib import Path
from glob import glob
import pandas as pd
import numpy as np
import sys
sys.path.append('../..')
%load_ext autoreload
%autoreload 2

In [2]:
prot_name       = 'cdk2'
uniprot_id      = 'P24941'
ref_struc_id    = '1fin'
ref_ligand_name = 'ATP'

- Load the `df_pdbids` computed in the previous notebook.

In [3]:
# Load the pdbids dataframe
df_pdbids = pd.read_pickle(f'./df_{prot_name}_pdbids.pkl')
df_pdbids.head()

Unnamed: 0,pdb_id,method,resolution,chain,start,end,seq_len
0,1aq1,X-ray,2.0,A,1,298,298
1,1b38,X-ray,2.0,A,1,298,298
2,1b39,X-ray,2.1,A,1,298,298
3,1buh,X-ray,2.6,A,1,298,298
4,1ckp,X-ray,2.05,A,1,298,298


In [4]:
import pypdb
from helper_modules.pdbs_from_uniport import get_cocrystal_ligand
from helper_modules.pdbs_from_uniport import get_pdb_sequence
from helper_modules.pdbs_from_uniport import get_nonstd_aa
from helper_modules.pdbs_from_uniport import get_identity
from helper_modules.find_gaps import get_gaps_and_coverage
from helper_modules.run_or_load import run_or_load_joblib

In [5]:
def get_metadata(df_pdbids):
    prot_crys_mtd_dict = {}
    columns = ['pdb_id', 'chain', 'file_path']
    for i, pdb_id, chain, pdb_file in df_pdbids[columns].itertuples():
        print(i, '->', pdb_id)
        # Get the metadata using pypdb and in-house functions
        descrip_pdb = pypdb.describe_pdb(pdb_id)
        crystal     = pypdb.get_entity_info(pdb_id)
        ligs_names  = get_cocrystal_ligand(pdb_id)
        num_ligs    = len(ligs_names)
        sequence    = get_pdb_sequence(pdb_id)
        nonstd_resnames, nonstd_resnums = get_nonstd_aa(pdb_file)
        # Find number of gaps and coverage 
        seq_algn, coverage, gaps = get_gaps_and_coverage(
                                     pdb_file = pdb_file, 
                                     full_sequence = seq_prot,
                                     chain = 'A') # Hardcoded as the chain was renamed in the previous notebook
        identity = get_identity(seq_alg, seq_prot, gap_char = '-') 
        prot_crys_mtd_dict.update(
            {pdb_id : 
                 {'describe_pdb': descrip_pdb, 
                  'pdb_info'    : crystal, 
                  'identity'    : identity,
                  'name_ligs'   : ligs_names,
                  'seq'         : sequence,
                  'seq_alg'     : seq_algn, 
                  'coverage'    : coverage,
                  'gaps'        : gaps,
                  'chain'       : chain,
                  'nonstd_resnames': nonstd_resnames, 
                  'nonstd_resnums' : nonstd_resnums
                 }
            })
    return prot_crys_mtd_dict

# Save the results to a file to ommit repeate the analysis
@run_or_load_joblib
def get_metadata_SAVE(filename, **kwargs):
    return get_metadata(**kwargs)

## Get the available metadata for all pdb entries

In [6]:
prot_crys_mtd_dict = get_metadata_SAVE(
                        filename = f'./dict_metadata_{prot_name}_pdbids.joblib', 
                        df_pdbids = df_pdbids)

File loaded: ./dict_metadata_cdk2_pdbids.joblib


In [7]:
def get_data_rows(pdb_entry_values: dict) -> pd.Series:
    '''
    This function takes a nested dictionary containing the metadata of 
    a pdb_entry from the `prot_crys_mtd_dict` dictionary created in 
    the previous cell, and returns a pandas series containing the entry metadata
    '''
    e = pdb_entry_values
    d = e['describe_pdb']
    l = e

    dic = {
          "PDB_ID"     : d['rcsb_id'].lower(),
          "Title"      : d['citation'][0]['title'].lower(),
          "Date"       : d['rcsb_accession_info']\
                          ['initial_release_date'].split('T')[0],
          "Entities"   : d['pdbx_vrpt_summary']['protein_dnarnaentities'],
          "ChainID"    : e['chain'],
          "Resolution" : d['pdbx_vrpt_summary']['pdbresolution'],
          "Identity"   : round(e['identity'], 3),
          "Coverage"   : round(e['coverage'], 3),
          "NumGaps"    : e['gaps']['num_gaps'],
          "GapLen"     : e['gaps']['gap_lengths'],
          "GapPos"     : e['gaps']['gap_list'],
          "NumLigs"    : e['num_ligs'], 
          "NameLigs"   : e['name_ligs'],
          "NonStndResnames": e['nonstd_resnames'],
          "NonStndResnums" : e['nonstd_resnums'],
    }
    entry_series = pd.Series(dic)
    return entry_series

def get_mtd_table(mtd_dic: dict):
    '''
     This function is designed to take the 
     `prot_crys_mtd_dict` dictionary
     and extract a pandas series for each pdb entry
    '''
    df_metadata = pd.concat([get_data_rows(values) 
                             for pdb_id, values in mtd_dic.items()], 
                            axis = 1).T
    return df_metadata

## Create a DataFrame with the main properties

In [11]:
# Now we create the dataframe
df_prot = get_mtd_table(prot_crys_mtd_dict)
df_prot

Unnamed: 0,PDB_ID,Title,Date,Entities,ChainID,Resolution,Identity,Coverage,NumGaps,GapLen,GapPos,NumLigs,NameLigs,NonStndResnames,NonStndResnums
0,1aq1,protein kinase inhibition by staurosporine rev...,1997-11-12,1,A,2.0,0.993,92.953,2,"[8, 13]","[[36, 43], [149, 161]]",1,[STU],,
1,1b38,effects of phosphorylation of threonine 160 on...,1998-12-23,1,A,2.0,0.993,97.315,1,[8],"[[36, 43]]",2,"[ATP, MG]",,
2,1b39,effects of phosphorylation of threonine 160 on...,1998-12-23,1,A,2.1,0.993,97.315,1,[8],"[[36, 43]]",2,"[ATP, MG]",,
3,1buh,crystal structure and mutational analysis of t...,1998-09-09,12,A,2.6,0.993,96.309,2,"[7, 4]","[[40, 46], [295, 298]]",0,[],,
4,1ckp,"exploiting chemical libraries, structure, and ...",1999-01-13,1,A,2.05,0.993,93.624,2,"[8, 11]","[[36, 43], [153, 163]]",2,"[PVB, EDO]",,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
420,7b7s,"3 h -pyrazolo[4,3- f ]quinoline-based kinase i...",2021-08-04,12,A,2.54,0.993,96.309,4,"[2, 2, 2, 5]","[[15, 16], [39, 40], [72, 73], [294, 298]]",2,"[NO3, T1T]",[TPO],[160]
421,7e34,the sun1-spdya interaction plays an essential ...,2021-04-14,123,A,3.19,0.993,100.0,0,[],[],1,[GOL],,
422,7kjs,"discovery of pf-06873600, a cdk2/4/6 inhibitor...",2021-06-23,12,A,2.19,0.993,99.664,1,[1],"[[298, 298]]",1,[WG1],[TPO],[160]
423,7m2f,balancing properties with carboxylates: a lead...,2021-07-07,1,A,1.63,0.993,94.295,2,"[10, 7]","[[36, 45], [148, 154]]",1,[YOS],,


## Drop conformations with missing C-term and N-term regions

As mentioned in the previous notebook (LINK), some conformations lack the more than 10 C-terminus or N-terminus residues. We will identify and drop these conformations from the crystal ensemble.

In [9]:
gap_thr         = 10
protein_seq_len = 298
confs_to_drop   = []

for conf_gaps in df_prot['GapPos']:
    if len(conf_gaps) == 0:
        confs_to_drop.append(False)
        continue
    first_gap = conf_gaps[0]
    last_gap  = conf_gaps[-1]
    if gap_thr in range(*first_gap):
        confs_to_drop.append(True)
    elif protein_seq_len - gap_thr in range(*last_gap):
        confs_to_drop.append(True)
    else:
        confs_to_drop.append(False)

print('Conformations that will be discarded:')
df_prot.loc[confs_to_drop]

Conformations that will be discarded:


Unnamed: 0,PDB_ID,Title,Date,Entities,ChainID,Resolution,Identity,Coverage,NumGaps,GapLen,GapPos,NumLigs,NameLigs,NonStndResnames,NonStndResnums
10,1f5q,crystal structure of a gamma-herpesvirus cycli...,2000-12-27,12,A,2.5,0.993,99.329,1,[2],"[[10, 11]]",1,[CL],,
39,1jsu,crystal structure of the p27kip1 cyclin-depend...,1997-07-29,123,A,2.3,0.993,95.973,1,[12],"[[1, 12]]",1,[SO4],[TPO],[160]
383,5uq3,structural basis of divergent cyclin-dependent...,2017-07-05,123,A,3.6,0.993,90.604,3,"[18, 4, 6]","[[1, 18], [37, 40], [293, 298]]",0,[],,
384,6ath,dynamic anticipation by cdk2/cyclin a-bound p2...,2018-09-12,123,A,1.82,0.993,93.289,2,"[16, 4]","[[1, 16], [38, 41]]",1,[SO4],[TPO],[160]
413,6sg4,discriminative skp2 interactions with cdk-cycl...,2021-01-27,12,A,2.43,0.993,87.919,2,"[32, 4]","[[1, 32], [295, 298]]",0,[],[TPO],[160]
418,7b5l,ubiquitin ligation to f-box protein targets by...,2021-02-10,123456789101112,L,3.8,0.993,95.302,2,"[12, 2]","[[1, 12], [154, 155]]",0,[],[TPO],[160]
419,7b5r,ubiquitin ligation to f-box protein targets by...,2021-02-10,1234567,L,3.8,0.993,95.302,2,"[12, 2]","[[1, 12], [154, 155]]",0,[],[TPO],[160]


## Save the final DataFrame

In [10]:
idx_drop = df_prot.loc[confs_to_drop].index
df_prot = df_prot.drop(idx_drop, axis = 0)

filename = f'./df_metadata_{prot_name}_pdbids.pkl'
df_prot.to_pickle(filename)

## Exploratory Data Analysis

In [None]:
# TODO: Exploratory data Analysis

In [138]:
import itertools
# Which conformations have modified residues
df_modified_res = df_prot[~ df_prot['NonStndResnames'].isnull()]
print(F'{len(df_modified_res)} of {len(df_prot)} structures have at least one MODIFIED residue.')

89 of 425 structures have at least one MODIFIED residue.


In [137]:
df_prot_ligs = df_prot[df_prot.NumLigs > 0]
print(F'{len(df_prot_ligs)} of {len(df_prot)} structures have at least one HETATM molecule.')

394 of 425 structures have at least one HETATM molecule.


In [141]:
# How many cocrystalized molecules are there?
# A set of all HETATM molecules
lig_full_list = list( itertools.chain.from_iterable( df_prot.NameLigs ) )
print(f'There are a total of {len(lig_full_list)}', 
       'HETATM molecules (duplicates included).')

lig_all_list = list(set(lig_full_list))
print(f'There are {len(lig_all_list)} unique', 
       'HETATM molecules (without duplicates).')

There are a total of 541 HETATM molecules (duplicates included).
There are 359 unique HETATM molecules (without duplicates).
