In [1]:
import pandas as pd
import re
import random
from psm_utils.psm import PSM
from psm_utils.psm_list import PSMList
from psm_utils.io import write_file
from pathlib import Path
import sys

import pandas as pd

from deeplc import DeepLC

# Alternative way if __file__ is not defined in Jupyter notebooks
project_root = Path.cwd().parent

sys.path.append(str(project_root))

from make_predictions.intensity_predictions import obtain_predictions_pairs
from seq_utils.fasta_to_peptides import create_tryptic_peptides
from seq_utils.peptide import (
    remove_non_il,
    remove_ux_containing,
    switch_first_il,
    switch_random_il,
)
from metrics.get_metrics import metrics_comparison

In [2]:
def to_lowercase(match) -> str:
    """
    Return the complete text of a regex match in lowercase.

    The signature (one match object in, one string out) makes this usable as
    the replacement callable of ``re.sub``.

    Parameters
    ----------
    match : re.Match
        Match object whose whole matched text (group 0) is converted.

    Returns
    -------
    str
        The matched text, lowercased.
    """
    matched_text = match.group()
    return matched_text.lower()


def count_chars(input_string: str, isalpha: bool = True, isupper: bool = True) -> int:
    """
    Count the characters of a string that pass the enabled filters.

    Each flag switches one filter on. A character is counted when it passes
    every filter that is switched on; with both flags off no filter applies
    and every character is counted.

    Parameters
    ----------
    input_string : str
        The input string.
    isalpha : bool, optional
        Require ``str.isalpha()`` to be true for a character to be counted.
        Defaults to True.
    isupper : bool, optional
        Require ``str.isupper()`` to be true for a character to be counted.
        Defaults to True.

    Returns
    -------
    int
        Number of characters that pass all enabled filters. Always an int;
        previously the both-flags-off case fell through and returned None.
    """
    # `not flag or test` reads as "the filter is off, or the character passes it".
    return sum(
        1
        for char in input_string
        if (not isalpha or char.isalpha()) and (not isupper or char.isupper())
    )


def match_brackets(
    input_string: str,
    pattern: str = r"\[([^]]+)\]",
    isalpha: bool = True,
    isupper: bool = True,
) -> tuple:
    """
    Locate modification tokens in a string and report where they sit.

    Every non-overlapping match of ``pattern`` is one token. The position of a
    token is the number of characters in front of it that ``count_chars``
    counts under the given ``isalpha`` / ``isupper`` flags.

    Parameters
    ----------
    input_string : str
        The input string.
    pattern : str, optional
        Regular expression that matches one modification token.
        Defaults to "\\[([^]]+)\\]".
    isalpha : bool, optional
        Forwarded to ``count_chars``. Defaults to True.
    isupper : bool, optional
        Forwarded to ``count_chars``. Defaults to True.

    Returns
    -------
    tuple
        ``(mods, positions)``. ``mods`` yields the full matched text of each
        token, ``positions`` yields the matching counts. Both are lazy
        generators, so each can be iterated only once.
    """
    # The search itself runs eagerly; only the two projections below are lazy.
    hits = list(re.finditer(pattern, input_string))

    mods = (hit.group() for hit in hits)
    positions = (
        count_chars(input_string[: hit.start()], isalpha=isalpha, isupper=isupper)
        for hit in hits
    )
    return mods, positions


def get_stripped_seq(
    input_string: str, isalpha: bool = True, isupper: bool = True
) -> str:
    """
    Keep only the characters of a string that pass the enabled filters.

    Each flag switches one filter on. A character is kept when it passes every
    filter that is switched on; with both flags off no filter applies and the
    string is returned unchanged.

    Parameters
    ----------
    input_string : str
        The input string.
    isalpha : bool, optional
        Require ``str.isalpha()`` to be true for a character to be kept.
        Defaults to True.
    isupper : bool, optional
        Require ``str.isupper()`` to be true for a character to be kept.
        Defaults to True.

    Returns
    -------
    str
        The retained characters in their original order. Always a str;
        previously the both-flags-off case fell through and returned None.
    """
    # `not flag or test` reads as "the filter is off, or the character passes it".
    return "".join(
        char
        for char in input_string
        if (not isalpha or char.isalpha()) and (not isupper or char.isupper())
    )


def get_proforma_bracketed(
    input_string: str,
    before_aa: bool = True,
    isalpha: bool = True,
    isupper: bool = True,
    pattern: str = r"\[([^]]+)\]",
    modification_dict: dict = {
        "+57.0215": "Carbamidomethyl",
        "+15.9949": "Oxidation",
        "-17.026548": "Gln->pyro-Glu",
        "-18.010565": "Glu->pyro-Glu",
        "+42": "Acetyl",
    },
) -> str:
    """
    Rewrite a modified peptide string with square-bracketed modification names.

    Every match of ``pattern`` is one modification token. Its index is the
    number of residue characters in front of it (see ``match_brackets``), and
    the residues themselves are obtained with ``get_stripped_seq``.

    Parameters
    ----------
    input_string : str
        The input sequence string.
    before_aa : bool, optional
        Controls on which side of the residue at a token's index the
        modification is written. Defaults to True.

        * True: the residue is written first and the modification directly
          behind it. A token located after the last residue has no residue of
          its own and is written as a trailing terminal ``-[mod]``.
        * False: the modification is written first, i.e. directly behind the
          preceding residue. A token located before the first residue has no
          preceding residue and is written as a leading terminal ``[mod]-``.
    isalpha : bool, optional
        Forwarded to ``match_brackets`` and ``get_stripped_seq``. Defaults to True.
    isupper : bool, optional
        Forwarded to ``match_brackets`` and ``get_stripped_seq``. Defaults to True.
    pattern : str, optional
        Regular expression that matches one modification token.
        Defaults to "\\[([^]]+)\\]".
    modification_dict : dict, optional
        Maps a token to the name written into the output. A token is looked up
        first as matched (delimiters included) and then with its first and
        last character removed. Tokens are lowercased before the lookup, so
        keys containing letters must be lowercase. The default dict is only
        read, never modified.

    Returns
    -------
    str
        The residues with ``[name]`` inserted for every modification. If only
        one modification per index is possible to keep, the last one wins.
    """
    # Lowercase every token so that, with the default uppercase filter, letters
    # inside a token are not mistaken for residues by the helper functions.
    input_string = re.sub(pattern, to_lowercase, input_string)
    modifications, positions = match_brackets(
        input_string, pattern=pattern, isalpha=isalpha, isupper=isupper
    )

    labels = []
    for token in modifications:
        if token in modification_dict:
            labels.append(modification_dict[token])
        elif token[1:-1] in modification_dict:
            # Tokens carry their delimiters ("[+57.0215]") while the default
            # keys do not ("+57.0215"); without this fallback the defaults
            # could never match.
            labels.append(modification_dict[token[1:-1]])
        else:
            # NOTE(review): an unmapped token is kept verbatim, delimiters
            # included, so the output contains e.g. "[(foo)]" - confirm that
            # this is the intended handling of unknown modifications.
            labels.append(token)

    pos_mod_dict = dict(zip(positions, labels))

    stripped_seq = get_stripped_seq(input_string, isalpha=isalpha, isupper=isupper)
    n_residues = len(stripped_seq)

    parts = []
    for idx, aa in enumerate(stripped_seq):
        if before_aa:
            parts.append(aa)
            if idx in pos_mod_dict:
                # The modification belongs to this residue, also at index 0;
                # no terminal hyphen here (it would end up mid-sequence).
                parts.append(f"[{pos_mod_dict[idx]}]")
        else:
            if idx in pos_mod_dict:
                if idx == 0:
                    # Nothing precedes the first residue: leading terminal form.
                    parts.append(f"[{pos_mod_dict[idx]}]-")
                else:
                    parts.append(f"[{pos_mod_dict[idx]}]")
            parts.append(aa)

    # A token behind the last residue has index == n_residues, which the loop
    # above never reaches; it used to be dropped silently.
    if n_residues and n_residues in pos_mod_dict:
        trailing = pos_mod_dict[n_residues]
        parts.append(f"-[{trailing}]" if before_aa else f"[{trailing}]")

    return "".join(parts)


# Load the tab-separated input table.
df = pd.read_csv("../temp_data/input_file.txt", sep="\t")

# Parsing configuration for get_proforma_bracketed.
parse_column = "Modified sequence"  # column holding the annotated sequences
before_aa = False  # write each modification in front of the residue at its index
isalpha = True
isupper = True
# One parenthesised group, optionally containing a single nested parenthesised
# group, e.g. "(ox)" or "(oxidation (m))".
pattern = r"\([^()]*\)|\([^()]*\([^()]*\)[^()]*\)"
# Keys are lowercase because tokens are lowercased before the lookup.
modification_dict = {
    "(ox)": "Oxidation",
    "(ac)": "Acetyl",
    "(oxidation (m))": "Oxidation",
    "(acetyl (protein n-term))": "Acetyl",
}

# Use parse_column rather than repeating the literal, so changing the
# configuration above is enough to parse a different column.
df["proforma"] = df[parse_column].apply(
    get_proforma_bracketed,
    before_aa=before_aa,
    isalpha=isalpha,
    isupper=isupper,
    pattern=pattern,
    modification_dict=modification_dict,
)

In [3]:
# Attach the mean "Calibrated retention time" of each proforma string to every
# row, then keep a single row per proforma string. Both merge inputs carry a
# column of the same name, so pandas suffixes them: the per-row value becomes
# "Calibrated retention time_x" and the per-proforma mean becomes
# "Calibrated retention time_y".
merged_df = (
    df.merge(
        df.groupby("proforma")["Calibrated retention time"].mean(),
        how="left",
        on="proforma",
    )
    .drop_duplicates(subset="proforma", keep="first")
)

In [5]:
fraction = 0.5
random_seed = 42
rt_column = "Calibrated retention time_y"  # per-proforma mean created by the merge

# Seed the stdlib RNG so the random I/L swaps below are repeatable.
# NOTE(review): presumably switch_random_il draws from the `random` module
# (it is imported in this notebook and otherwise unused) - verify; if it uses
# another generator, seed that one instead.
random.seed(random_seed)

# Random split: `fraction` of the rows for calibration, the rest for evaluation.
df_train = merged_df.sample(frac=fraction, random_state=random_seed)
df_eval = merged_df.drop(df_train.index)


def _build_psm_list(frame, peptidoforms, rt_col):
    """Build a PSMList with one PSM per row of `frame`.

    `peptidoforms` supplies the peptidoform for each row (same order as
    `frame`), the retention time is read from column `rt_col`, and the row's
    index label is used as spectrum_id.
    """
    return PSMList(
        psm_list=[
            PSM(peptidoform=peptidoform, retention_time=rt, spectrum_id=row_label)
            for row_label, peptidoform, rt in zip(
                frame.index, peptidoforms, frame[rt_col]
            )
        ]
    )


# One swapped variant per evaluation row, generated in row order.
eval_swapped_sequences = [switch_random_il(seq) for seq in df_eval["proforma"]]

psm_list_train = _build_psm_list(df_train, df_train["proforma"], rt_column)
psm_list_eval = _build_psm_list(df_eval, df_eval["proforma"], rt_column)
# Same rows, retention times and ids as psm_list_eval; only the peptidoform differs.
psm_list_eval_swapped = _build_psm_list(df_eval, eval_swapped_sequences, rt_column)

print(list(df_train["proforma"]))

['LDLVGQLEHGK', 'AVGLPEIQVIR', 'NMFGFESWVGGR', 'DQVLAMLEK', 'IINGEVPEGLK', 'YVDQVLQLVYK', 'AQVESASDGESPVGNK', 'EDLTEIR', 'DIILPFR', 'SEGEGEAASADDGSLNTSGAGPK', 'EIELAEHEMPGLMAIR', 'HGLINFGIYK', 'STGEAFVQFASK', 'AVLLTQDTK', 'SFLLDLLNATGK', 'TLTQEDVEALEK', 'SKDNDDMEIDDLDTELGSSATPSK', 'LYIGNLSENAAPSDLESIFK', 'VIPSIAYTEPEVAWVGLTEK', 'YLEGTSCIAGVLVPAK', 'ELPDPQESIQR', '[Acetyl]-M[Oxidation]EEIGILVEK', 'YGSFFCDCGAK', 'LQSLFDAPDFSK', 'IFDSEEILAGYK', 'AVGIDLGTTYSCVAHFANDR', 'GQEEVEMPSK', 'VNLDTDCQYAYLTGIR', 'DEIPVLLEK', 'NALATIAQAQEESLGDNKPADDLLNLEGVDR', 'GEDEDKGPPCGPVNCNEK', 'VLVPEHEK', 'SESLIDASEDSQLEAAIR', 'HPTPTMLSIQK', 'IHPQTIIAGWR', 'ATVGILITTIASK', 'AIIKPQYVDNIPR', 'ILIANTGMDTDKIK', 'TLAYLLPAIVHINHQPYLER', 'LHTLDTCLK', 'ISPPVVAYR', 'TFGIPVVVAINR', 'TQWPSEQPSDGR', 'YSPTSPTYSPTTPK', 'LVILANNCPALR', 'EALCGCTVNVPTLDGR', 'LVDHHPEGLSAR', 'AAAGEFADDPCSSVK', 'ALLQQQPEDDSK', 'QTLM[Oxidation]WSATWPK', 'ILNEKPTTDEPEK', 'SVAAIHPSLEIPMLIPK', '[Acetyl]-TDAAVSFAK', 'GEDTGTCQGLPDSESSFTYTLDEK', 'GLTVPIAS

In [None]:
# Create the DeepLC predictor with retraining switched on and 50 epochs, then
# calibrate it on the training PSMs.
# NOTE(review): presumably deeplc_retrain/n_epochs control fine-tuning during
# calibrate_preds - confirm against the installed DeepLC version.
dlc = DeepLC(deeplc_retrain=True, n_epochs=50)
dlc.calibrate_preds(psm_list_train)


In [6]:
# Run the calibrated DeepLC instance on the evaluation PSMs (the rows that were
# not sampled into the training split).
preds = dlc.make_preds(psm_list=psm_list_eval)

[1m22/22[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 140ms/step


In [7]:
# Show the predictions returned by dlc.make_preds (the whole list is displayed).
preds

[52.44609069824219,
 41.87855911254883,
 102.1176986694336,
 80.39617919921875,
 67.67731475830078,
 54.236331939697266,
 118.96453094482422,
 108.33065032958984,
 132.85968017578125,
 64.51750183105469,
 82.65251922607422,
 128.79278564453125,
 38.85247039794922,
 128.12144470214844,
 91.38021087646484,
 36.43638229370117,
 66.78545379638672,
 88.05728912353516,
 63.76235580444336,
 119.43968200683594,
 97.90909576416016,
 114.48240661621094,
 114.55039978027344,
 119.95547485351562,
 53.01274490356445,
 55.89055252075195,
 71.0845718383789,
 45.42073440551758,
 120.64756774902344,
 49.51716995239258,
 87.28238677978516,
 57.3756217956543,
 31.931604385375977,
 107.21257019042969,
 96.63986206054688,
 65.37653350830078,
 64.34225463867188,
 110.66448211669922,
 90.71798706054688,
 56.11643600463867,
 129.96914672851562,
 125.24651336669922,
 87.69208526611328,
 80.54931640625,
 99.19086456298828,
 60.109954833984375,
 39.39842224121094,
 100.20773315429688,
 119.71247100830078,
 86.62

In [5]:
# Inspection aid: print every PSM of the evaluation list, one per line, to
# check the peptidoform, spectrum_id and retention_time that were stored.
for v in psm_list_eval:
    print(v)

peptidoform=Peptidoform('LDVTILSPSR') spectrum_id='128635' run=None collection=None spectrum=None is_decoy=None score=None qvalue=None pep=None precursor_mz=None retention_time=91.917 ion_mobility=None protein_list=None rank=None source=None provenance_data={} metadata={} rescoring_features={}
peptidoform=Peptidoform('DLPTVLPPGFTIGAICK') spectrum_id='34119' run=None collection=None spectrum=None is_decoy=None score=None qvalue=None pep=None precursor_mz=None retention_time=134.41500000000002 ion_mobility=None protein_list=None rank=None source=None provenance_data={} metadata={} rescoring_features={}
peptidoform=Peptidoform('AFQYVETHGEVCPANWTPDSPTIKPSPAASK') spectrum_id='4405' run=None collection=None spectrum=None is_decoy=None score=None qvalue=None pep=None precursor_mz=None retention_time=89.64641666666667 ion_mobility=None protein_list=None rank=None source=None provenance_data={} metadata={} rescoring_features={}
peptidoform=Peptidoform('TFEVLATNGDTHLGGEDFDSR') spectrum_id='21603