In [13]:
import os
import re
import sys
import yaml
import json

# https://stackoverflow.com/questions/17935130/which-module-should-contain-logging-config-dictconfigmy-dictionary-what-about
import logging.config  # noqa
import pandas as pd

# Temporary fix for imports, investigate later
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), os.pardir)))
from common.utils import get_ipc_files, get_timestamp
from common.constants import BASE_RAW_DATA_DIR, BASE_PTMS_DIR
from common.logger import get_logger_config

In [14]:
logger_config = get_logger_config(subdir="scripts")
logging.config.dictConfig(logger_config)
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)

In [15]:
df = pd.read_csv(BASE_PTMS_DIR / "identified_ptms_annotated.csv")
df.head(5)

Unnamed: 0,amino_acid,glycan_mass,project_name,file_name,spectrum_id,ipc_index,modified_peptide,assigned_mod,observed_mz,composition,unimod accession,proposed encoding
0,N,1493,PXD026629,20180904YLJ-VSV4h-02.ipc,4558,0,HN[1493]GTGGR,2N(1378.4757),,HexNAc(2)Hex(6) % 1378.4757,1465.0,[UNIMOD:1465]
1,N,1493,PXD026629,20180904YLJ-VSV4h-02.ipc,7340,29,AAGMN[1493]HTK,5N(1378.4757),Mod1: Unannotated mass-shift 1379.5092 (PeakAp...,HexNAc(2)Hex(6) % 1378.4757,,[UNIMOD:1465]
2,N,1493,PXD026629,20180904YLJ-VSV4h-02.ipc,8650,42,KCLN[1493]HTTQK,"2C(57.0215),4N(1378.4757)","Mod1: Hex(6) HexNAc(2) (PeakApex: 1378.4760, T...",HexNAc(2)Hex(6) % 1378.4757,,[UNIMOD:1465]
3,N,1493,PXD026629,20180904YLJ-VSV4h-02.ipc,8903,51,CLN[1493]HTTQK,"1C(57.0215),3N(1378.4757)","Mod1: Hex(6) HexNAc(2) (PeakApex: 1378.4760, T...",HexNAc(2)Hex(6) % 1378.4757,,[UNIMOD:1465]
4,N,1493,PXD026629,20180904YLJ-VSV4h-02.ipc,9178,75,HQN[1493]QTLR,3N(1378.4757),"Mod1: Hex(6) HexNAc(2) (PeakApex: 1378.4760, T...",HexNAc(2)Hex(6) % 1378.4757,,[UNIMOD:1465]


In [16]:
residue_remapping_config = {}
residue_mass_mapping_config = {}

In [17]:
df.columns

Index(['amino_acid', 'glycan_mass', 'project_name', 'file_name', 'spectrum_id',
       'ipc_index', 'modified_peptide', 'assigned_mod', 'observed_mz',
       'composition', 'unimod accession', 'proposed encoding'],
      dtype='object')

In [18]:
df = df.rename(
    columns={
        "unimod accession": "unimod_accession",
        "proposed encoding": "proposed_encoding",
    }
)
df.columns

Index(['amino_acid', 'glycan_mass', 'project_name', 'file_name', 'spectrum_id',
       'ipc_index', 'modified_peptide', 'assigned_mod', 'observed_mz',
       'composition', 'unimod_accession', 'proposed_encoding'],
      dtype='object')

In [19]:
previous_ptm = None
for sequence_object in df.itertuples(name="SequenceObject"):
    if previous_ptm == (sequence_object.amino_acid, sequence_object.glycan_mass):
        # We're still on the same modification
        continue

    # Get the UNIMOD encoding
    proposed_encoding = (
        f"{sequence_object.amino_acid}{sequence_object.proposed_encoding}"
    )
    residue_remapping_config[
        f"{sequence_object.amino_acid}[{sequence_object.glycan_mass}]"
    ] = proposed_encoding
    try:
        residue_mass_mapping_config[proposed_encoding] = float(
            sequence_object.composition.split("%")[-1]
        )
    except AttributeError as exc:
        logger.error(exc.args[0], exc_info=exc)
        residue_mass_mapping_config[proposed_encoding] = "N/A"
    previous_ptm = (sequence_object.amino_acid, sequence_object.glycan_mass)

with open(BASE_PTMS_DIR / "residue_remapping_config.yaml", "w") as yaml_file, open(
    BASE_PTMS_DIR / "residue_remapping_config.json", "w"
) as json_file:
    yaml.dump(residue_remapping_config, yaml_file, default_flow_style=False)
    json.dump(residue_remapping_config, json_file, indent=4)

with open(BASE_PTMS_DIR / "residue_mass_mapping_config.yaml", "w") as yaml_file, open(
    BASE_PTMS_DIR / "residue_mass_mapping_config.json", "w"
) as json_file:
    yaml.dump(residue_mass_mapping_config, yaml_file, default_flow_style=False)
    json.dump(residue_mass_mapping_config, json_file, indent=4)

2025-03-31 04:57:41,166 - __main__ - ERROR - 'float' object has no attribute 'split'
Traceback (most recent call last):
  File "/tmp/ipykernel_215344/3584271470.py", line 16, in <module>
    sequence_object.composition.split("%")[-1]
AttributeError: 'float' object has no attribute 'split'


In [20]:
print()


