1. Imports, etc

In [None]:
#install biopython library
%pip install biopython

#install DSSP (the apt package provides the mkdssp executable)
!sudo apt-get update && sudo apt-get install -y dssp

import difflib
import io
import json
import os
import re
import warnings
from contextlib import redirect_stderr

import requests

#make mkdssp also reachable under the name `dssp` if that path is missing
if not os.path.exists('/usr/bin/dssp'):
  !sudo ln -s /usr/bin/mkdssp /usr/bin/dssp

#google drive
from google.colab import drive

#Biopython
from Bio import SeqIO
from Bio import pairwise2
from Bio.PDB import PDBList, MMCIFParser
from Bio.PDB.DSSP import DSSP
from Bio.PDB.PDBExceptions import PDBConstructionWarning

#PyTorch
import torch



In [None]:
# FASTA file listing the pep2d peptide chains
DATA_URL = "https://webs.iiitd.edu.in/raghava/pep2d/dataset/pep2ddataset.fasta"

# local directory where downloaded structure files are stored
STRUCT_DIR = "pdb_structures"

#output file
DRIVE_OUTPUT = "/content/drive/MyDrive/pep2d_ss8labled_final.json"

#mapping dictionary: DSSP secondary-structure code -> 8-state label
SS8_MAP = {
    'H': 'H', 'G': 'G', 'I': 'I',
    'E': 'E', 'B': 'B',
    'S': 'S', 'T': 'T',
    # "no secondary structure": raw DSSP output uses a blank, while
    # Biopython's DSSP wrapper reports it as '-'; map both to loop.
    ' ': 'L',
    '-': 'L',
    }

# silence warnings whose message starts with "Invalid mmCIF file"
warnings.filterwarnings("ignore", message="Invalid mmCIF file")

2. Get PDB data from pep2d

In [None]:
def get_data(url, timeout=60):
  """Download `url` and return the response body as text.

  Args:
    url: address of the resource to fetch.
    timeout: seconds to wait for the server before giving up. Without a
      timeout `requests` waits indefinitely, which would hang the notebook
      on a stalled connection.

  Returns:
    The decoded response body (`response.text`).

  Raises:
    requests.HTTPError: if the server answers with an error status.
    requests.Timeout: if the server does not respond within `timeout`.
  """
  response = requests.get(url, timeout=timeout)

  # turn 4xx/5xx answers into an exception instead of returning an error page
  response.raise_for_status()

  return response.text
def get_entries(data):
  """Parse FASTA text into a list of PDB chain entries.

  Each record id is expected to start with a four-character PDB code
  followed by a one-character chain id (e.g. ``1abcA``). Records whose id
  does not match that pattern are skipped.

  Args:
    data: FASTA-formatted text.

  Returns:
    A list of dicts with the keys ``header``, ``pdb_id`` (lower-cased),
    ``chain_id`` and ``sequence``, in file order. The number of entries is
    also printed.
  """
  # four word characters (PDB code) + one word character (chain id)
  header_re = re.compile(r"^(\w{4})(\w)")

  parsed = []
  for rec in SeqIO.parse(io.StringIO(data), "fasta"):
    rec_id = rec.id
    residues = str(rec.seq)

    hit = header_re.search(rec_id)
    if hit is None:
      continue

    code, chain = hit.group(1), hit.group(2)
    parsed.append({
        "header": rec_id,
        "pdb_id": code.lower(),
        "chain_id": chain,
        "sequence": residues,
    })

  print(f"Total entries: {len(parsed)}")
  return parsed




3. Download structure from PDB and run DSSP

In [None]:
def get_structure(pdb_id, chain_id, storage_loc):
  """Fetch a structure file, parse it and run DSSP on its first model.

  Args:
    pdb_id: PDB identifier of the structure to retrieve.
    chain_id: accepted for symmetry with the caller but not used here;
      DSSP is run on the whole first model.
    storage_loc: directory passed to the downloader as `pdir`.

  Returns:
    A `(model, dssp)` tuple: the first model of the parsed structure and
    the DSSP result computed for it with the `mkdssp` executable.
  """
  # retrieve the mmCIF file (overwrite=False is passed to the downloader)
  cif_path = PDBList().retrieve_pdb_file(
      pdb_id,
      pdir=storage_loc,
      file_format="mmCif",
      overwrite=False,
  )

  # parse quietly and keep only the first model of the structure
  first_model = MMCIFParser(QUIET=True).get_structure(pdb_id, cif_path)[0]

  return first_model, DSSP(first_model, cif_path, dssp="mkdssp")




4. Align FASTA and DSSP sequences

In [None]:
def fix_mapping(fasta, dssp, chain_id, mappings):
    """Project DSSP secondary-structure labels onto a FASTA sequence.

    The residues DSSP reports for `chain_id` are matched against `fasta`
    and a label is copied only where the residue letters line up. FASTA
    positions with no matching DSSP residue (e.g. residues missing from
    the structure) are labelled 'L'. Laying the labels down by position
    and padding the tail, as was done previously, shifts every label after
    the first unresolved residue.

    Args:
        fasta: amino-acid sequence (one-letter codes) to label.
        dssp: mapping keyed by ``(chain_id, residue_id)`` whose values hold
            the residue letter at index 1 and the DSSP code at index 2
            (the layout of Biopython's DSSP result tuples).
        chain_id: chain whose residues are used.
        mappings: dict translating a DSSP code to an output label; codes
            missing from it become 'L'.

    Returns:
        A string with exactly ``len(fasta)`` labels.
    """
    #get DSSP entries of this chain, ordered by residue number
    chain_keys = sorted((k for k in dssp.keys() if k[0] == chain_id), key=lambda x: x[1][1])

    #residue letters and mapped labels actually present in the structure
    observed_seq = "".join(dssp[k][1] for k in chain_keys)
    observed_ss = [mappings.get(dssp[k][2], 'L') for k in chain_keys]

    #start with all loops, then fill in every stretch that matches the structure
    labels = ['L'] * len(fasta)
    # autojunk=False: the default heuristic discards frequent letters in
    # sequences of 200+ items, which would break matching for long chains
    matcher = difflib.SequenceMatcher(None, fasta, observed_seq, autojunk=False)
    for fasta_start, observed_start, size in matcher.get_matching_blocks():
        labels[fasta_start:fasta_start + size] = observed_ss[observed_start:observed_start + size]
    return "".join(labels)

5. Final execution

In [None]:
# DRIVE_OUTPUT lives under /content/drive, so Drive has to be mounted
drive.mount('/content/drive')
data = get_data(DATA_URL)
entries = get_entries(data)
os.makedirs(STRUCT_DIR, exist_ok=True)
final_data = []
failed_entries = []

for entry in entries:
  try:
    model, dssp_results = get_structure(
        entry['pdb_id'],
        entry['chain_id'],
        STRUCT_DIR)
    label = fix_mapping(
        entry['sequence'],
        dssp_results,
        entry['chain_id'],
        SS8_MAP)
    final_data.append({
        "pdb_id": entry['pdb_id'],
        "chain_id": entry['chain_id'],
        "sequence": entry['sequence'],
        "label": label
    })
  except Exception as exc:
    # Catch Exception rather than using a bare except, so that interrupting
    # the kernel (KeyboardInterrupt) still stops this long loop. Report the
    # cause so failures can be diagnosed instead of silently skipped.
    failed_entries.append((entry['pdb_id'], entry['chain_id']))
    print(f"Error processing {entry['pdb_id']} chain {entry['chain_id']}: {exc!r}")

with open(DRIVE_OUTPUT, "w") as f:
  json.dump(final_data, f, indent=4)

print(f"SAVED {len(final_data)}")
print(f"FAILED {len(failed_entries)}")