# Install & Import Dependencies

In [1]:
# ! pip install rcsb-api
# ! pip install biopython
# ! pip install swifter

In [2]:
from rcsbapi.data import DataQuery as Query
import json
from rcsbapi.search import search_attributes as attrs
import pandas as pd
from Bio import pairwise2
from Bio.Seq import Seq
from Bio.Align import substitution_matrices
import re
from Bio.pairwise2 import format_alignment
import os
import subprocess
import swifter

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# import os
# num_cores = os.cpu_count()
# if num_cores is not None:
#     print(f"Number of logical CPU cores: {num_cores}")
# else:
#     print("Could not determine CPU count.")

# RCSB Search Query

In [4]:
q1 = attrs.rcsb_entity_source_organism.scientific_name == "Homo sapiens"
q2 = attrs.exptl.method == "X-RAY DIFFRACTION"

In [5]:
query = q1 & q2

In [6]:
results = query()
output = list()
for rid in results:
    output.append(rid)

In [7]:
len(output)

59477

# RCSB Data Query

In [8]:
query = Query(
    input_type="entries",
    input_ids=output,
    return_data_list=[
        "exptl.method",
        "polymer_entities.polymer_entity_instances.rcsb_polymer_entity_instance_container_identifiers.entity_id",
        "polymer_entities.uniprots.rcsb_uniprot_protein.sequence",
        "polymer_entities.entity_poly.pdbx_seq_one_letter_code",
        "polymer_entities.uniprots.rcsb_uniprot_protein.source_organism"
    ]
)
query.exec(progress_bar=True)
response_data = query.get_response()
# response_data

100%|██████████| 199/199 [03:07<00:00,  1.06it/s]


In [9]:
len(response_data['data']['entries'])

59477

# Creating Pandas DF

In [10]:
rcsb_ids = list()
rcsb_entity_ids = list()
uniprot_seqs = list()
pbd_ids = list()

for result in response_data['data']['entries']:
  for entity in result['polymer_entities']:
    if entity['uniprots']:
      for uniprot in entity['uniprots']:
        if uniprot['rcsb_uniprot_protein']['source_organism']['taxonomy_id'] == 9606:
          rcsb_ids.append(result['rcsb_id'])
          rcsb_entity_ids.append(entity['polymer_entity_instances'][0]['rcsb_polymer_entity_instance_container_identifiers']['entity_id'])
          uniprot_seqs.append(uniprot['rcsb_uniprot_protein']['sequence'])
          pbd_ids.append(entity['entity_poly']['pdbx_seq_one_letter_code'])

In [11]:
len(rcsb_ids), len(rcsb_entity_ids), len(uniprot_seqs), len(pbd_ids)

(70732, 70732, 70732, 70732)

In [12]:
df = pd.DataFrame(
    data = {'rcsb_id': rcsb_ids, 'rcsb_entity_ids': rcsb_entity_ids, 'uniprot_seq': uniprot_seqs, 'pbd_id': pbd_ids}
)

In [13]:
df.head()

Unnamed: 0,rcsb_id,rcsb_entity_ids,uniprot_seq,pbd_id
0,2GD8,1,MSHHWGYGKHNGPEHWHKDFPIAKGERQSPVDIDTHTAKYDPSLKP...,SHHWGYGKHNGPEHWHKDFPIAKGERQSPVDIDTHTAKYDPSLKPL...
1,2GDD,1,MLNLLLLALPVLASRAYAAPAPGQALQRVGIVGGQEAPRSKWPWQV...,IVGGQEAPRSKWPWQVSLRVHGPYWMHFCGGSLIHPQWVLTAAHCV...
2,2GDE,1,MAHVRGLQLPGCLALAALCSLVHSQHVFLAPQQARSLLQRVRRANT...,TFGSGEADCGLRPLFEKKSLEDKTERELLESYIDGR
3,2GDE,2,MAHVRGLQLPGCLALAALCSLVHSQHVFLAPQQARSLLQRVRRANT...,IVEGSDAEIGMSPWQVMLFRKSPQELLCGASLISDRWVLTAAHCLL...
4,2GDO,1,MAVPFVEDWDLVQTLGEGAYGEVQLAVNRVTEEAVAVKIVDMKRAV...,MAVPFVEDWDLVQTLGEGAYGEVQLAVNRVTEEAVAVKIVDMKRAV...


In [15]:
df.to_excel("/Users/haripat/Desktop/SF/protein/data/protein_constructs.xlsx")

In [16]:
df.shape

(70732, 4)

In [3]:
df = pd.read_excel('/Users/haripat/Desktop/SF/protein/data/protein_constructs.xlsx')

# Cleaning Data

In [4]:
def sanitize_sequence_advanced(sequence: str) -> str:
    if not isinstance(sequence, str):
        return "" 

    ptm_replacements = {
        "(MSE)": "M",  # Selenomethionine -> Methionine
        "(SEP)": "S",  # Phosphoserine -> Serine
        "(TPO)": "T",  # Phosphothreonine -> Threonine
        "(PTR)": "Y",  # Phosphotyrosine -> Tyrosine
        "(NEP)": "K",  # N-Epsilon-Phospholysine -> Lysine
        "(MLY)": "K",  # Monomethyllysine -> Lysine
        "(M2L)": "K",  # Dimethyllysine -> Lysine
        "(M3L)": "K",  # Trimethyllysine -> Lysine
        "(ALY)": "K",  # Acetyllysine -> Lysine
        "(HLY)": "K",  # Hydroxylysine -> Lysine
        "(M1G)": "R",  # Monomethylarginine -> Arginine
        "(M2G)": "R",  # Dimethylarginine -> Arginine
        "(CIR)": "R",  # Citrulline -> Arginine
        "(HYP)": "P",  # Hydroxyproline -> Proline
        "(CGU)": "E",  # Gamma-carboxyglutamate -> Glutamate
        "(NH2)": "",   # C-Terminal Amidation -> Remove
        "(ACE)": "",   # N-Acetyl Group -> Remove
    }

    processed_seq = sequence
    for mod_code, standard_aa in ptm_replacements.items():
        processed_seq = processed_seq.replace(mod_code, standard_aa)
    valid_chars = "ACDEFGHIKLMNPQRSTVWY"
    sanitized_seq = re.sub(f"[^{valid_chars}]", "X", processed_seq.upper())
    return sanitized_seq

In [5]:
# df['pbd_id'] = df['pbd_id'].str.replace("(MSE)", "M")
# df['pbd_id'] = df['pbd_id'].str.replace("(TPO)", "T")
# df['pbd_id'] = df['pbd_id'].str.replace("(SEP)", "S")
# df['pbd_id'] = df['pbd_id'].str.replace("(NH2)", "")
# df['pbd_id'] = df['pbd_id'].str.replace("(PTR)", "Y")
# df['pbd_id'] = df['pbd_id'].str.replace("(M3L)", "K")
# df['pbd_id'] = df['pbd_id'].str.replace("(NEP)", "K")
df['pdb_sequence_sanitized'] = df['pbd_id'].apply(sanitize_sequence_advanced)
df['pdb_sequence_sanitized'] = df['pdb_sequence_sanitized'].str.replace("U", "C")

In [6]:
# df[df['pbd_id'].str.contains("\(")]['pbd_id']

# Labeling Data

In [7]:
# df = pd.read_csv('project_data_v3.csv')

In [8]:
df.head()

Unnamed: 0.1,Unnamed: 0,rcsb_id,rcsb_entity_ids,uniprot_seq,pbd_id,pdb_sequence_sanitized
0,0,2GD8,1,MSHHWGYGKHNGPEHWHKDFPIAKGERQSPVDIDTHTAKYDPSLKP...,SHHWGYGKHNGPEHWHKDFPIAKGERQSPVDIDTHTAKYDPSLKPL...,SHHWGYGKHNGPEHWHKDFPIAKGERQSPVDIDTHTAKYDPSLKPL...
1,1,2GDD,1,MLNLLLLALPVLASRAYAAPAPGQALQRVGIVGGQEAPRSKWPWQV...,IVGGQEAPRSKWPWQVSLRVHGPYWMHFCGGSLIHPQWVLTAAHCV...,IVGGQEAPRSKWPWQVSLRVHGPYWMHFCGGSLIHPQWVLTAAHCV...
2,2,2GDE,1,MAHVRGLQLPGCLALAALCSLVHSQHVFLAPQQARSLLQRVRRANT...,TFGSGEADCGLRPLFEKKSLEDKTERELLESYIDGR,TFGSGEADCGLRPLFEKKSLEDKTERELLESYIDGR
3,3,2GDE,2,MAHVRGLQLPGCLALAALCSLVHSQHVFLAPQQARSLLQRVRRANT...,IVEGSDAEIGMSPWQVMLFRKSPQELLCGASLISDRWVLTAAHCLL...,IVEGSDAEIGMSPWQVMLFRKSPQELLCGASLISDRWVLTAAHCLL...
4,4,2GDO,1,MAVPFVEDWDLVQTLGEGAYGEVQLAVNRVTEEAVAVKIVDMKRAV...,MAVPFVEDWDLVQTLGEGAYGEVQLAVNRVTEEAVAVKIVDMKRAV...,MAVPFVEDWDLVQTLGEGAYGEVQLAVNRVTEEAVAVKIVDMKRAV...


In [9]:
def create_multi_class_mask(uniprot_sequence: str, pdb_construct_sequence: str) -> list[int] | None:
    """
    Generates a multi-class modification mask by globally aligning a UniProt
    sequence with a PDB construct sequence.

    The mask is the same length as the UniProt sequence. Each position is labeled:
    - 0: Maintained (the residue is the same in both sequences)
    - 1: Deleted (the residue is in UniProt but absent in the PDB construct)
    - 2: Mutated (the residue is present but changed to a different amino acid)

    Args:
        uniprot_sequence: The full-length, canonical protein sequence.
        pdb_construct_sequence: The engineered sequence from the PDB.

    Returns:
        A list of integers (0, 1, or 2) representing the modification mask,
        or None if no alignment can be generated.
    """
    try:
      # print("--- Performing Global Alignment ---")

      blosum62 = substitution_matrices.load("BLOSUM62")

      alignments = pairwise2.align.globalds(
          uniprot_sequence,
          pdb_construct_sequence,
          blosum62,
          -10,  # Gap open penalty
          -0.5  # Gap extend penalty
      )

      if not alignments:
          print("Error: Could not generate an alignment.")
          return None

      best_alignment = alignments[0]
      aligned_uniprot, aligned_pdb, score, begin, end = best_alignment


      modification_mask = []

      for uniprot_char, pdb_char in zip(aligned_uniprot, aligned_pdb):
          if uniprot_char == '-':
              # This case means there's an insertion in the PDB sequence (e.g., a tag).
              # It doesn't correspond to a position in the UniProt sequence, so we skip it.
              continue

          if pdb_char == '-':
              # A gap in the PDB sequence means the UniProt residue was deleted.
              modification_mask.append(1) # 1 = Deleted
          elif uniprot_char == pdb_char:
              # The characters match, so the residue was maintained.
              modification_mask.append(0) # 0 = Maintained
          else:
              # The characters are different, so the residue was mutated.
              modification_mask.append(2) # 2 = Mutated

      if len(modification_mask) != len(uniprot_sequence):
          print(f"Error: Mask length ({len(modification_mask)}) does not match UniProt sequence length ({len(uniprot_sequence)}).")
          return None

      return modification_mask, best_alignment
    except:
      return None, None

def format_alignment_for_display(alignment):
    """Helper function to print the alignment nicely."""
    uniprot_alg, pdb_alg, score, begin, end = alignment

    connector = ""
    for u_char, p_char in zip(uniprot_alg, pdb_alg):
        if u_char == p_char:
            connector += "|"
        elif u_char == '-' or p_char == '-':
            connector += " "
        else:
            connector += "."

    return (
        f"Score: {score}\n\n"
        f"UniProt: {uniprot_alg}\n"
        f"         {connector}\n"
        f"PDB    : {pdb_alg}"
    )

In [9]:
i = 51510
pdb_sequence = df.loc[i]['pbd_id'] #df[df['pbd_id'].str.contains('HHHHHH')].loc[i]['pbd_id']
uniprot_sequence = df.loc[i]['uniprot_seq'] #df[df['pbd_id'].str.contains('HHHHHH')].loc[i]['uniprot_seq']

In [10]:
pdb_sequence

'MGAFLDKPKMEKHNAQGQGNGLRYGLSSMQGWRVEMEDAHTAVIGLPSGLESWSFFAVYDGHAGSQVAKYCCEHLLDHITNNQDFKGSAGAPSVENVKNGIRTGFLEIDEHMRVMSEKKHGADRSGSTAVGVLISPQHTYFINCGDSRGLLCRNRKVHFFTQDHKPSNPLEKERIQNAGGSVMIQRVNGSLAVSRALGDFDYKCVHGKGPTEQLVSPEPEVHDIERSEEDDQFIILACDGIWDVMGNEELCDFVRSRLEVTDDLEKVCNEVVDTCLYKGSRDNMSVILICFPNAPKVSPEAVKKEAELDKYLECRVEEIIKKQGEGVPDLVHVMRTLASENIPSLPPGGELASKRNVIEAVYNRLNPYKNDDTDSTSTDDMWLEHHHHHH'

In [11]:
uniprot_sequence

'MGAFLDKPKMEKHNAQGQGNGLRYGLSSMQGWRVEMEDAHTAVIGLPSGLESWSFFAVYDGHAGSQVAKYCCEHLLDHITNNQDFKGSAGAPSVENVKNGIRTGFLEIDEHMRVMSEKKHGADRSGSTAVGVLISPQHTYFINCGDSRGLLCRNRKVHFFTQDHKPSNPLEKERIQNAGGSVMIQRVNGSLAVSRALGDFDYKCVHGKGPTEQLVSPEPEVHDIERSEEDDQFIILACDGIWDVMGNEELCDFVRSRLEVTDDLEKVCNEVVDTCLYKGSRDNMSVILICFPNAPKVSPEAVKKEAELDKYLECRVEEIIKKQGEGVPDLVHVMRTLASENIPSLPPGGELASKRNVIEAVYNRLNPYKNDDTDSTSTDDMW'

In [12]:
pdb_seq = pdb_sequence
uniprot_seq = uniprot_sequence

result = create_multi_class_mask(uniprot_seq, pdb_seq)

if result:
    mask, alignment = result

    print("\n" + "="*80)
    print("RESULTS")
    print("="*80)

    print("\n--- Visual Alignment ---")
    print(format_alignment_for_display(alignment))

    print(f"\n--- Multi-Class Mask (first 100 values) ---")
    print(mask)

    # --- Statistics ---
    maintained_count = mask.count(0)
    deleted_count = mask.count(1)
    mutated_count = mask.count(2)

    print("\n--- Summary ---")
    print(f"UniProt Sequence Length: {len(uniprot_seq)}")
    print(f"Mask Length:             {len(mask)}")
    print(f"Residues Maintained (0): {maintained_count}")
    print(f"Residues Deleted (1):    {deleted_count}")
    print(f"Residues Mutated (2):    {mutated_count}")


RESULTS

--- Visual Alignment ---
Score: 2021.5

UniProt: MGAFLDKPKMEKHNAQGQGNGLRYGLSSMQGWRVEMEDAHTAVIGLPSGLESWSFFAVYDGHAGSQVAKYCCEHLLDHITNNQDFKGSAGAPSVENVKNGIRTGFLEIDEHMRVMSEKKHGADRSGSTAVGVLISPQHTYFINCGDSRGLLCRNRKVHFFTQDHKPSNPLEKERIQNAGGSVMIQRVNGSLAVSRALGDFDYKCVHGKGPTEQLVSPEPEVHDIERSEEDDQFIILACDGIWDVMGNEELCDFVRSRLEVTDDLEKVCNEVVDTCLYKGSRDNMSVILICFPNAPKVSPEAVKKEAELDKYLECRVEEIIKKQGEGVPDLVHVMRTLASENIPSLPPGGELASKRNVIEAVYNRLNPYKNDDTDSTSTDDMW--------
         ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||        
PDB    : MGAFLDKPKMEKHNAQGQGNGLRYGLSSMQGWRVEMEDAHTAVIGLPSGLESWSFFAVYDGHAGSQVAKYCCEHLLDHITNNQDFKGSAGAPSVENVKNGIRTGFLEIDEHMRVMSEKKHGADRSGSTAVGVLISPQHTYF

In [28]:
# def pd_align(row):
#     try:
#         return create_multi_class_mask(row['uniprot_seq'], row['pbd_id'])[0]
#     except:
#         print("jere")

In [29]:
# df['label_mask'] = df.apply(lambda row: create_multi_class_mask(row['uniprot_seq'], row['pdb_sequence_sanitized'])[0], axis=1)

In [30]:
# df['label_mask'] = df.swifter.apply(lambda row: create_multi_class_mask(row['uniprot_seq'], row['pdb_sequence_sanitized'])[0], axis=1)

In [None]:
df['label_mask'] = df.swifter.apply(lambda row: create_multi_class_mask(row['uniprot_seq'], row['pdb_sequence_sanitized'])[0], axis=1)

Pandas Apply:   0%|          | 104/70732 [00:01<21:43, 54.18it/s]Exception ignored in: <bound method IPythonKernel._clean_thread_parent_frames of <ipykernel.ipkernel.IPythonKernel object at 0x107261d90>>
Traceback (most recent call last):
  File "/Users/haripat/Desktop/SF/protein/.venv/lib/python3.12/site-packages/ipykernel/ipkernel.py", line 796, in _clean_thread_parent_frames
    active_threads = {thread.ident for thread in threading.enumerate()}
                                                 ^^^^^^^^^^^^^^^^^^^^^
  File "/Users/haripat/.local/share/uv/python/cpython-3.12.10-macos-aarch64-none/lib/python3.12/threading.py", line 1543, in enumerate
    with _active_limbo_lock:
         ^^^^^^^^^^^^^^^^^^
SystemError: <built-in method __enter__ of _thread.RLock object at 0x102d06380> returned a result with an exception set
Pandas Apply:   0%|          | 312/70732 [00:05<20:13, 58.02it/s]Exception ignored in: <bound method IPythonKernel._clean_thread_parent_frames of <ipykernel.ipkerne

In [13]:
# 51510, 36647, 5304, 6516

In [16]:
import dask.dataframe as dd

ddf = dd.from_pandas(df[0:1000], npartitions=6)

def apply_mask(partition):
    return partition.apply(
        lambda row: create_multi_class_mask(row['uniprot_seq'], row['pdb_sequence_sanitized'])[0], 
        axis=1
    )

# result = ddf.map_partitions(apply_mask, meta=('label_mask', 'object')).compute()
# df['label_mask'] = result 

In [17]:
result = ddf.map_partitions(apply_mask, meta=('label_mask', 'object')).compute()

KeyboardInterrupt: 

In [None]:
results = []
for index, row in df.iterrows():
    try:
        mask = create_multi_class_mask(row['uniprot_seq'], row['pdb_sequence_sanitized'])[0]
        results.append(mask)
    except:
        print(f"Error processing row {index}")
        results.append(None) 

Exception ignored in: <bound method IPythonKernel._clean_thread_parent_frames of <ipykernel.ipkernel.IPythonKernel object at 0x105a62840>>
Traceback (most recent call last):
  File "/Users/haripat/Desktop/SF/protein/.venv/lib/python3.12/site-packages/ipykernel/ipkernel.py", line 796, in _clean_thread_parent_frames
    active_threads = {thread.ident for thread in threading.enumerate()}
                                                 ^^^^^^^^^^^^^^^^^^^^^
  File "/Users/haripat/.local/share/uv/python/cpython-3.12.10-macos-aarch64-none/lib/python3.12/threading.py", line 1543, in enumerate
    with _active_limbo_lock:
         ^^^^^^^^^^^^^^^^^^
SystemError: <built-in method __enter__ of _thread.RLock object at 0x1029c6400> returned a result with an exception set
Exception ignored in: <bound method IPythonKernel._clean_thread_parent_frames of <ipykernel.ipkernel.IPythonKernel object at 0x105a62840>>
Traceback (most recent call last):
  File "/Users/haripat/Desktop/SF/protein/.venv/lib/pyt

In [12]:
# results[0]