# Deep learning to predict charges of peptides

### Go beyond MS2/RT prediction for DIA search

Input peptide sequences and output probabilities of each charge states:
| z=1 | z=2 | z=3 | z=4 | z=5 | z=6 |
| --- | --- | --- | --- | --- | --- |
| 0.1 | 0.8 | 0.9 | 0.0 | 0.0 | 0.0 |

### Prepare the data

In [1]:
import os

def get_msms_txt_list(dir, max_file_num=100000):
    msms_txt_list = []
    for msms_txt in os.listdir(dir):
        msms_txt_list.append(os.path.join(dir,msms_txt))
        if len(msms_txt_list) >= max_file_num:
            break
    return msms_txt_list

train_txt_list = get_msms_txt_list("../test_data/ProteomeKingdom/train/", max_file_num=10)
test_txt_list = get_msms_txt_list("../test_data/ProteomeKingdom/test/", max_file_num=10)


In [2]:
import pandas as pd
from alphabase.psm_reader import MaxQuantReader

def load_psm_df(msms_txt_list):
    psm_df_list = [
        MaxQuantReader().import_file(msms_txt) for msms_txt in msms_txt_list
    ]
    return pd.concat(psm_df_list, ignore_index=True)

train_df = load_psm_df(train_txt_list)
test_df = load_psm_df(test_txt_list)
test_df

Unnamed: 0,sequence,charge,rt,scan_num,raw_name,spec_idx,mods,mod_sites,nAA,rt_norm,precursor_mz
0,KLEEELR,3,44.996,34115,20181223_QX3_JoMu_SA_LC04_uPAC200cm_61_Deinoco...,34114,,,7,0.250033,306.174792
1,ALAGVQR,2,38.669,26563,20181223_QX3_JoMu_SA_LC04_uPAC200cm_61_Deinoco...,26562,,,7,0.214876,357.716488
2,SVLFLNK,2,113.480,112715,20181223_QX3_JoMu_SA_LC04_uPAC200cm_61_Deinoco...,112714,,,7,0.630585,410.749996
3,SVLFLNK,2,113.110,112266,20181223_QX3_JoMu_SA_LC04_uPAC200cm_61_Deinoco...,112265,,,7,0.628529,410.749996
4,SVEALNK,2,35.964,23307,20181223_QX3_JoMu_SA_LC04_uPAC200cm_61_Deinoco...,23306,,,7,0.199844,380.713611
...,...,...,...,...,...,...,...,...,...,...,...
959520,GQFALIPVSIFLQQELNAAEDVVVDQDEETITPSEVGGEQK,4,175.880,199212,20181010_QX1_JoMu_SA_Easy12-7_uPAC_500ng_2_Bac...,199211,,,41,0.977491,1111.557698
959521,WLEGGMVEITASYPAGVIGTTLENLQEAAAGEHEEWSLDYPK,3,175.230,198432,20181010_QX1_JoMu_SA_Easy12-7_uPAC_500ng_2_Bac...,198431,,,42,0.973879,1521.395119
959522,WLEGGMVEITASYPAGVIGTTLENLQEAAAGEHEEWSLDYPK,4,174.940,198078,20181010_QX1_JoMu_SA_Easy12-7_uPAC_500ng_2_Bac...,198077,Oxidation@M,6,42,0.972267,1145.296887
959523,MGKPEDMEAEGESASADSGSTSAGGGYGAGAWNSNTAYGTTR,3,97.891,103685,20181010_QX1_JoMu_SA_Easy12-7_uPAC_500ng_2_Bac...,103684,,,42,0.544050,1362.571380


In [3]:
import numpy as np

min_charge = 1
max_charge = 6

def get_charge_array(
    charge_list, 
    min_charge,
    max_charge,
):
    charge_array = np.zeros(max_charge-min_charge+1)
    for charge in charge_list:
        if charge <= max_charge and charge >= min_charge:
            charge_array[charge-min_charge] = 1.0
    return charge_array

In [4]:
train_charge_df = train_df.groupby("sequence")["charge"].apply(
    lambda x: get_charge_array(set(x), 
        min_charge=min_charge, 
        max_charge=max_charge
    )
).reset_index(drop=False).rename(columns={"charge":"charge_vector"})
train_charge_df

Unnamed: 0,sequence,charge_vector
0,AAAAAAAAAAAAGDSAVNGQAEQQAIPTIGR,"[0.0, 0.0, 1.0, 1.0, 0.0, 0.0]"
1,AAAAAAAAAAAVAGTVPENETAEDK,"[0.0, 0.0, 1.0, 0.0, 0.0, 0.0]"
2,AAAAAAAAAAEAAADDGANQR,"[0.0, 0.0, 1.0, 0.0, 0.0, 0.0]"
3,AAAAAAAAAAGVWGATAEK,"[0.0, 1.0, 1.0, 0.0, 0.0, 0.0]"
4,AAAAAAAAAPSGPAPEGPAAK,"[0.0, 1.0, 1.0, 0.0, 0.0, 0.0]"
...,...,...
337362,YYYVHSYK,"[0.0, 1.0, 0.0, 0.0, 0.0, 0.0]"
337363,YYYVPMER,"[0.0, 1.0, 0.0, 0.0, 0.0, 0.0]"
337364,YYYVQLSTGVSQWETPTDPAPVGATPGAAHEHPYGVPGSDADR,"[0.0, 0.0, 0.0, 1.0, 0.0, 0.0]"
337365,YYYVVGEGTSMR,"[0.0, 1.0, 0.0, 0.0, 0.0, 0.0]"


In [5]:
test_charge_df = test_df.groupby("sequence")["charge"].apply(
    lambda x: get_charge_array(set(x), 
        min_charge=min_charge, 
        max_charge=max_charge
    )
).reset_index(drop=False).rename(columns={"charge":"charge_vector"})
test_charge_df = test_charge_df[~test_charge_df.sequence.isin(train_charge_df.sequence)]
test_charge_df

Unnamed: 0,sequence,charge_vector
0,AAAAAAAAAAAAAAAAAAAAAAAAAAAAAMEEDSEASSSR,"[0.0, 0.0, 1.0, 1.0, 0.0, 0.0]"
1,AAAAAAAAAAGAAGGR,"[0.0, 1.0, 0.0, 0.0, 0.0, 0.0]"
2,AAAAAAAAADEQDEEK,"[0.0, 1.0, 0.0, 0.0, 0.0, 0.0]"
3,AAAAAAAAADEQDEEQEEEEAEEEEK,"[0.0, 0.0, 1.0, 0.0, 0.0, 0.0]"
4,AAAAAAAAAGGAK,"[0.0, 1.0, 0.0, 0.0, 0.0, 0.0]"
...,...,...
316208,YYYVQNVYTPVDENVYPDHR,"[0.0, 0.0, 1.0, 0.0, 0.0, 0.0]"
316209,YYYVTEGLK,"[0.0, 1.0, 0.0, 0.0, 0.0, 0.0]"
316210,YYYWLDDGGK,"[0.0, 1.0, 0.0, 0.0, 0.0, 0.0]"
316211,YYYYGLGQDLDVGK,"[0.0, 1.0, 0.0, 0.0, 0.0, 0.0]"


In [6]:
from peptdeep.model.model_shop import (
    Model_for_Generic_ModAASeq_BinaryClassification_Transformer,
    ModelInterface_for_Generic_ModAASeq_BinaryClassification,
    Model_for_Generic_AASeq_BinaryClassification_Transformer,
    ModelInterface_for_Generic_AASeq_BinaryClassification,
)
import torch

class ModelInterface_Chargeability(
    ModelInterface_for_Generic_AASeq_BinaryClassification
):
    def __init__(self,
        min_charge = 1,
        max_charge = 6,
    ):
        super().__init__(
            model_class=Model_for_Generic_AASeq_BinaryClassification_Transformer,
            nlayers=4, hidden_dim=256, dropout=0.1,
            output_dim=max_charge-min_charge+1, # six target values
        )
        self.target_column_to_train = 'charge_vector'
        self.target_column_to_predict = 'charge_probs'
        self.num_target_values = max_charge-min_charge+1
        self.min_charge = min_charge
        self.max_charge = max_charge

    def _get_targets_from_batch_df(self, batch_df, **kwargs):
        return self._as_tensor(
            np.stack(batch_df[self.target_column_to_train].values), 
            dtype=torch.float32
        )

    def _prepare_predict_data_df(self, precursor_df, **kwargs):
        precursor_df[self.target_column_to_predict] = [
            [0]*self.num_target_values
        ]*len(precursor_df)
        self.predict_df = precursor_df

    def _set_batch_predict_data(self, batch_df, predict_values, **kwargs):
        if self._predict_in_order:
            self.predict_df.loc[:,self.target_column_to_predict].values[
                batch_df.index.values[0]:batch_df.index.values[-1]+1
            ] = list(predict_values)
        else:
            self.predict_df.loc[
                batch_df.index,self.target_column_to_predict
            ] = list(predict_values)

In [7]:
model = ModelInterface_Chargeability(
    min_charge=min_charge,
    max_charge=max_charge,
)

In [8]:
def convert_probs_to_one_hot(test_charge_df, prob=0.5):
    test_charge_df["charge_pred"] = test_charge_df["charge_probs"].apply(
        lambda x: (x>prob).astype(float)
    )
    return test_charge_df

def test(test_charge_df:pd.DataFrame, prob):
    test_charge_df = convert_probs_to_one_hot(test_charge_df, prob)
    test_charge_df["recall"] = test_charge_df[["charge_vector","charge_pred"]].apply(
        lambda x: np.mean((x[1]==1)[x[0]==1]), axis=1
    ).fillna(0)
    test_charge_df["precision"] = test_charge_df[["charge_vector","charge_pred"]].apply(
        lambda x: np.mean((x[0]==1)[x[1]==1]), axis=1
    ).fillna(0)
    return test_charge_df

sample_df = test_charge_df.sample(10000)
sample_df = model.predict(sample_df, verbose=True)

sample_df = test(sample_df, 0.5)
sample_df

100%|██████████| 38/38 [00:06<00:00,  5.76it/s]
  lambda x: np.mean((x[1]==1)[x[0]==1]), axis=1
  lambda x: np.mean((x[0]==1)[x[1]==1]), axis=1
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


Unnamed: 0,sequence,charge_vector,nAA,charge_probs,charge_pred,recall,precision
0,HFGADYK,"[0.0, 1.0, 0.0, 0.0, 0.0, 0.0]",7,"[0.39429036, 0.5831823, 0.4726178, 0.47824824,...","[0.0, 1.0, 0.0, 0.0, 0.0, 0.0]",1.0,1.000000
1,FSPVTPK,"[0.0, 1.0, 0.0, 0.0, 0.0, 0.0]",7,"[0.43197203, 0.4533282, 0.4839277, 0.44122347,...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0]",0.0,0.000000
2,TSWINLK,"[0.0, 1.0, 0.0, 0.0, 0.0, 0.0]",7,"[0.38592607, 0.45785996, 0.49804157, 0.4770193...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0]",0.0,0.000000
3,VTIIDIR,"[0.0, 1.0, 0.0, 0.0, 0.0, 0.0]",7,"[0.37938038, 0.4684875, 0.5677377, 0.5456818, ...","[0.0, 0.0, 1.0, 1.0, 0.0, 0.0]",0.0,0.000000
4,HFGMLHR,"[0.0, 0.0, 1.0, 0.0, 0.0, 0.0]",7,"[0.41196373, 0.55956733, 0.47842178, 0.4272914...","[0.0, 1.0, 0.0, 0.0, 0.0, 1.0]",0.0,0.000000
...,...,...,...,...,...,...,...
9995,ETDFFDEHTQPAISWDMASPSLTDQNGAENVNPQLAQSNPK,"[0.0, 0.0, 1.0, 1.0, 0.0, 0.0]",41,"[0.4449157, 0.47713187, 0.5012832, 0.54004925,...","[0.0, 0.0, 1.0, 1.0, 0.0, 0.0]",1.0,1.000000
9996,GGTGAIVEYHGPGVDSISCTGMATICNMGAEIGATTSVFPFNHR,"[0.0, 0.0, 0.0, 1.0, 0.0, 0.0]",44,"[0.41215876, 0.56170076, 0.5402071, 0.5379819,...","[0.0, 1.0, 1.0, 1.0, 0.0, 0.0]",1.0,0.333333
9997,IIATAVCHTDAYTLSGADPEGCFPVILGHEGAGIVESVGEGVTR,"[0.0, 0.0, 0.0, 1.0, 1.0, 0.0]",44,"[0.4224049, 0.54194677, 0.53662884, 0.5508797,...","[0.0, 1.0, 1.0, 1.0, 0.0, 0.0]",0.5,0.333333
9998,HGDAEYSASPEQVADNGEEHSEGGLVENHVDGNVNLMGGGGGAGR,"[0.0, 0.0, 0.0, 1.0, 1.0, 0.0]",45,"[0.43189862, 0.5694854, 0.5199196, 0.5315261, ...","[0.0, 1.0, 1.0, 1.0, 0.0, 0.0]",0.5,0.333333


In [9]:
sample_df[["recall","precision"]].mean()

recall       0.517317
precision    0.319217
dtype: float64

In [10]:
model.train(
    train_charge_df.sample(10000), epoch=10, warmup_epoch=3,
    verbose=True, verbose_each_epoch=True
)

2024-01-16 17:44:03> Training with fixed sequence length: 0


  0%|          | 0/36 [00:00<?, ?it/s]

Epoch=1, nAA=22, batch=37, loss=0.6558: 100%|██████████| 36/36 [00:18<00:00,  1.93it/s]


[Training] Epoch=1, lr=3.3333333333333335e-05, loss=0.6468033387854293


Epoch=2, nAA=29, batch=37, loss=0.3951: 100%|██████████| 36/36 [00:18<00:00,  1.91it/s]


[Training] Epoch=2, lr=6.666666666666667e-05, loss=0.4537128129520932


Epoch=3, nAA=10, batch=37, loss=0.2374: 100%|██████████| 36/36 [00:19<00:00,  1.88it/s]


[Training] Epoch=3, lr=0.0001, loss=0.3407206172878678


Epoch=4, nAA=38, batch=37, loss=0.2552: 100%|██████████| 36/36 [00:19<00:00,  1.87it/s]


[Training] Epoch=4, lr=9.504844339512095e-05, loss=0.2816313315887709


Epoch=5, nAA=33, batch=37, loss=0.2727: 100%|██████████| 36/36 [00:19<00:00,  1.87it/s]


[Training] Epoch=5, lr=8.117449009293668e-05, loss=0.2388691914242667


Epoch=6, nAA=33, batch=37, loss=0.3088: 100%|██████████| 36/36 [00:19<00:00,  1.87it/s]


[Training] Epoch=6, lr=6.112604669781572e-05, loss=0.2223756720085402


Epoch=7, nAA=44, batch=37, loss=0.1278: 100%|██████████| 36/36 [00:19<00:00,  1.89it/s]


[Training] Epoch=7, lr=3.887395330218429e-05, loss=0.19951007938062823


Epoch=8, nAA=13, batch=37, loss=0.1703: 100%|██████████| 36/36 [00:19<00:00,  1.87it/s]


[Training] Epoch=8, lr=1.8825509907063327e-05, loss=0.1880632598657866


Epoch=9, nAA=35, batch=37, loss=0.2476: 100%|██████████| 36/36 [00:19<00:00,  1.87it/s]


[Training] Epoch=9, lr=4.951556604879048e-06, loss=0.1824499626417418


Epoch=10, nAA=14, batch=37, loss=0.1662: 100%|██████████| 36/36 [00:19<00:00,  1.87it/s]

[Training] Epoch=10, lr=0.0, loss=0.1795700248834249





In [11]:
sample_df = model.predict(sample_df, verbose=True)
sample_df

100%|██████████| 38/38 [00:06<00:00,  5.62it/s]


Unnamed: 0,sequence,charge_vector,nAA,charge_probs,charge_pred,recall,precision
0,HFGADYK,"[0.0, 1.0, 0.0, 0.0, 0.0, 0.0]",7,"[0.023600304, 0.86607, 0.26293665, 0.03553947,...","[0.0, 1.0, 0.0, 0.0, 0.0, 0.0]",1.0,1.000000
1,FSPVTPK,"[0.0, 1.0, 0.0, 0.0, 0.0, 0.0]",7,"[0.04537858, 0.95262057, 0.088946104, 0.027795...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0]",0.0,0.000000
2,TSWINLK,"[0.0, 1.0, 0.0, 0.0, 0.0, 0.0]",7,"[0.044788714, 0.9518532, 0.08804332, 0.0274212...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0]",0.0,0.000000
3,VTIIDIR,"[0.0, 1.0, 0.0, 0.0, 0.0, 0.0]",7,"[0.039366055, 0.9476792, 0.11181728, 0.0265463...","[0.0, 0.0, 1.0, 1.0, 0.0, 0.0]",0.0,0.000000
4,HFGMLHR,"[0.0, 0.0, 1.0, 0.0, 0.0, 0.0]",7,"[0.018164061, 0.13755983, 0.5690665, 0.3539470...","[0.0, 1.0, 0.0, 0.0, 0.0, 1.0]",0.0,0.000000
...,...,...,...,...,...,...,...
9995,ETDFFDEHTQPAISWDMASPSLTDQNGAENVNPQLAQSNPK,"[0.0, 0.0, 1.0, 1.0, 0.0, 0.0]",41,"[0.030782506, 0.025343182, 0.36109295, 0.88964...","[0.0, 0.0, 1.0, 1.0, 0.0, 0.0]",1.0,1.000000
9996,GGTGAIVEYHGPGVDSISCTGMATICNMGAEIGATTSVFPFNHR,"[0.0, 0.0, 0.0, 1.0, 0.0, 0.0]",44,"[0.039619558, 0.025748314, 0.27117324, 0.91269...","[0.0, 1.0, 1.0, 1.0, 0.0, 0.0]",1.0,0.333333
9997,IIATAVCHTDAYTLSGADPEGCFPVILGHEGAGIVESVGEGVTR,"[0.0, 0.0, 0.0, 1.0, 1.0, 0.0]",44,"[0.043935925, 0.025122413, 0.23979905, 0.92850...","[0.0, 1.0, 1.0, 1.0, 0.0, 0.0]",0.5,0.333333
9998,HGDAEYSASPEQVADNGEEHSEGGLVENHVDGNVNLMGGGGGAGR,"[0.0, 0.0, 0.0, 1.0, 1.0, 0.0]",45,"[0.045176506, 0.025506439, 0.21572013, 0.92795...","[0.0, 1.0, 1.0, 1.0, 0.0, 0.0]",0.5,0.333333


In [12]:
test(sample_df, 0.5)
sample_df[["recall","precision"]].mean()

  lambda x: np.mean((x[1]==1)[x[0]==1]), axis=1
  lambda x: np.mean((x[0]==1)[x[1]==1]), axis=1


recall       0.8976
precision    0.9002
dtype: float64

In [13]:
sample_df.query("recall<=0")

Unnamed: 0,sequence,charge_vector,nAA,charge_probs,charge_pred,recall,precision
51,DLPLDLF,"[1.0, 0.0, 0.0, 0.0, 0.0, 0.0]",7,"[0.039699428, 0.94549924, 0.108411275, 0.02597...","[0.0, 1.0, 0.0, 0.0, 0.0, 0.0]",0.0,0.0
73,YYPDHIK,"[0.0, 0.0, 1.0, 0.0, 0.0, 0.0]",7,"[0.017886074, 0.8092141, 0.42117825, 0.0359392...","[0.0, 1.0, 0.0, 0.0, 0.0, 0.0]",0.0,0.0
95,DVSLVDK,"[1.0, 0.0, 0.0, 0.0, 0.0, 0.0]",7,"[0.040783584, 0.948667, 0.10150761, 0.02672743...","[0.0, 1.0, 0.0, 0.0, 0.0, 0.0]",0.0,0.0
107,DHCVAHK,"[0.0, 1.0, 0.0, 0.0, 0.0, 0.0]",7,"[0.017713934, 0.15756896, 0.6266575, 0.3175838...","[0.0, 0.0, 1.0, 0.0, 0.0, 0.0]",0.0,0.0
113,MEGAGYR,"[1.0, 0.0, 0.0, 0.0, 0.0, 0.0]",7,"[0.047347274, 0.9552527, 0.07566819, 0.0288494...","[0.0, 1.0, 0.0, 0.0, 0.0, 0.0]",0.0,0.0
...,...,...,...,...,...,...,...
9983,HVAGFADAHTSELDADTTHPVIHLMPDQNGIEDIGGTLR,"[0.0, 0.0, 0.0, 0.0, 1.0, 0.0]",39,"[0.05672718, 0.025795449, 0.18834803, 0.945104...","[0.0, 0.0, 0.0, 1.0, 0.0, 0.0]",0.0,0.0
9984,NTGPPSEISEPATDAEVSEAESHASGVSSVLEVQEPIIR,"[0.0, 0.0, 1.0, 0.0, 0.0, 0.0]",39,"[0.030254027, 0.025425123, 0.36136982, 0.88745...","[0.0, 0.0, 0.0, 1.0, 0.0, 0.0]",0.0,0.0
9986,AQSPGGEDFAAAASTHAATDVPENAPDACPGVQSETAGK,"[0.0, 0.0, 1.0, 0.0, 0.0, 0.0]",39,"[0.024586773, 0.030268097, 0.43329796, 0.82445...","[0.0, 0.0, 0.0, 1.0, 0.0, 0.0]",0.0,0.0
9992,SHFGLIVGDSQHSFPFSGDETNHASATSTQDFLDQVTSQK,"[0.0, 0.0, 0.0, 0.0, 1.0, 0.0]",40,"[0.043143958, 0.02523385, 0.23832221, 0.927190...","[0.0, 0.0, 0.0, 1.0, 0.0, 0.0]",0.0,0.0


### Predict spectral library based on charge prediction

In [14]:
def add_charge(df, model, prob=0.5):
    df["charge"] = df.charge_probs.apply(
        lambda x: np.arange(model.min_charge, model.max_charge+1)[
            x>prob
        ]
    )
    df = df.explode("charge")
    df.drop(columns="charge_probs", inplace=True)
    df["charge"] = df.charge.astype(np.int8)
    return df

In [15]:
from peptdeep.protein.fasta import PredictSpecLibFasta
fasta_lib = PredictSpecLibFasta(
    charged_frag_types=["b_z1","b_z2","y_z1","y_z2"],
    precursor_charge_min=0,
    precursor_charge_max=0,
    precursor_mz_min=400,
    precursor_mz_max=1000,
    peptide_length_min=7,
    peptide_length_max=30,
    max_missed_cleavages=2,
    var_mods=["Acetyl@Protein_N-term", "Oxidation@M"],
    fix_mods=["Carbamidomethyl@C"],
    special_mods=["Phospho@S","Phospho@T","Phospho@Y"],
    min_special_mod_num=0,
    max_special_mod_num=1,
)
fasta_lib.get_peptides_from_fasta(
    ["../test_data/small_fasta/test_protein.fasta"]
)
model.predict(fasta_lib.precursor_df)
fasta_lib._precursor_df = add_charge(
    fasta_lib.precursor_df, 
    model, prob=0.5
)
fasta_lib.append_decoy_sequence()
fasta_lib.add_modifications()
fasta_lib.add_special_modifications()
fasta_lib.precursor_df

Unnamed: 0,sequence,protein_idxes,miss_cleavage,is_prot_nterm,is_prot_cterm,mods,mod_sites,nAA,charge
0,QKELDSK,0,1,False,False,Phospho@S,6,7,2
1,QKELDSK,0,1,False,False,,,7,2
2,RKEVVHK,0,2,False,False,,,7,3
3,ILENAQR,0,0,False,False,,,7,2
4,ELDSKVR,0,1,False,False,Phospho@S,4,7,2
...,...,...,...,...,...,...,...,...,...
378,QDWEHAANDVSFATIRFHDLLSQLDDQYSR,0,1,False,False,Phospho@T,14,30,4
379,QDWEHAANDVSFATIRFHDLLSQLDDQYSR,0,1,False,False,Phospho@S,22,30,4
380,QDWEHAANDVSFATIRFHDLLSQLDDQYSR,0,1,False,False,Phospho@Y,28,30,4
381,QDWEHAANDVSFATIRFHDLLSQLDDQYSR,0,1,False,False,Phospho@S,29,30,4


In [16]:
fasta_lib.predict_all(
    predict_items=["rt","ms2"]
)
fasta_lib.precursor_df

2024-01-16 17:47:23> Predicting RT/IM/MS2 for 270 precursors ...
2024-01-16 17:47:23> Predicting RT ...


100%|██████████| 24/24 [00:00<00:00, 111.73it/s]

2024-01-16 17:47:24> Predicting MS2 ...



100%|██████████| 24/24 [00:00<00:00, 50.06it/s]

2024-01-16 17:47:24> End predicting RT/IM/MS2





Unnamed: 0,sequence,protein_idxes,miss_cleavage,is_prot_nterm,is_prot_cterm,mods,mod_sites,nAA,charge,precursor_mz,rt_pred,rt_norm_pred,nce,instrument,frag_start_idx,frag_stop_idx
0,QKELDSK,0,1,False,False,Phospho@S,6,7,2,464.212790,0.009475,0.009475,30.0,Lumos,0,6
1,QKELDSK,0,1,False,False,,,7,2,424.229625,0.012003,0.012003,30.0,Lumos,6,12
2,ILENAQR,0,0,False,False,,,7,2,422.237784,0.106143,0.106143,30.0,Lumos,12,18
3,ELDSKVR,0,1,False,False,Phospho@S,4,7,2,463.720782,0.042219,0.042219,30.0,Lumos,18,24
4,ELDSKVR,0,1,False,False,,,7,2,423.737617,0.044382,0.044382,30.0,Lumos,24,30
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
265,QDWEHAANDVSFATIRFHDLLSQLDDQYSR,0,1,False,False,Phospho@T,14,30,4,915.165315,0.900869,0.900869,30.0,Lumos,4211,4240
266,QDWEHAANDVSFATIRFHDLLSQLDDQYSR,0,1,False,False,Phospho@S,22,30,4,915.165315,0.906644,0.906644,30.0,Lumos,4240,4269
267,QDWEHAANDVSFATIRFHDLLSQLDDQYSR,0,1,False,False,Phospho@Y,28,30,4,915.165315,0.904113,0.904113,30.0,Lumos,4269,4298
268,QDWEHAANDVSFATIRFHDLLSQLDDQYSR,0,1,False,False,Phospho@S,29,30,4,915.165315,0.899686,0.899686,30.0,Lumos,4298,4327


In [17]:
fasta_lib.save_hdf("../test_data/small_fasta/predicted_charged.speclib.hdf")

### DIY: training models for modified sequences