# Deep learning to predict charges of peptides

### Go beyond MS2/RT prediction for DIA search

Given a peptide sequence, the model outputs an independent probability for each charge state (the values in a row need not sum to 1), for example:
| z=1 | z=2 | z=3 | z=4 | z=5 | z=6 |
| --- | --- | --- | --- | --- | --- |
| 0.1 | 0.8 | 0.9 | 0.0 | 0.0 | 0.0 |

### Prepare the data

In [1]:
import os

def get_msms_txt_list(dir, max_file_num=100000):
    """Return the paths of up to ``max_file_num`` entries found in ``dir``.

    Args:
        dir: Directory to list. Every entry is returned (no filtering by
            file type or extension), joined onto ``dir``.
        max_file_num: Maximum number of paths to return. Values of zero or
            below yield an empty list.

    Returns:
        A list of paths, ordered by entry name.

    Raises:
        OSError: Propagated from ``os.listdir`` (e.g. ``dir`` is missing).
    """
    # os.listdir() returns entries in arbitrary order, so sort them to make
    # the "first N files" selection reproducible across machines and runs.
    entry_names = sorted(os.listdir(dir))
    # Clamp at zero so a negative limit cannot slice from the end of the list.
    limit = max(max_file_num, 0)
    return [os.path.join(dir, entry_name) for entry_name in entry_names[:limit]]

# Use at most 10 entries from each of the train and test folders.
train_txt_list = get_msms_txt_list("../test_data/ProteomeKingdom/train/", max_file_num=10)
test_txt_list = get_msms_txt_list("../test_data/ProteomeKingdom/test/", max_file_num=10)


In [2]:
import pandas as pd
from alphabase.psm_reader import MaxQuantReader

def load_psm_df(msms_txt_list):
    """Read several result files and stack them into one PSM table.

    Each path in ``msms_txt_list`` is imported with its own
    ``MaxQuantReader`` instance; the per-file tables are then concatenated
    with a fresh 0..n-1 index.

    Raises:
        ValueError: Raised by ``pd.concat`` when ``msms_txt_list`` is empty.
    """
    per_file_frames = []
    for txt_path in msms_txt_list:
        reader = MaxQuantReader()
        per_file_frames.append(reader.import_file(txt_path))
    combined_df = pd.concat(per_file_frames, ignore_index=True)
    return combined_df

# Load every selected file of each split into one combined PSM table.
train_df = load_psm_df(train_txt_list)
test_df = load_psm_df(test_txt_list)
# Last expression of the cell: display the test PSM table.
test_df

Unnamed: 0,sequence,charge,rt,scan_num,raw_name,spec_idx,mods,mod_sites,nAA,rt_norm,precursor_mz
0,KLEEELR,3,44.996,34115,20181223_QX3_JoMu_SA_LC04_uPAC200cm_61_Deinoco...,34114,,,7,0.250033,306.174792
1,ALAGVQR,2,38.669,26563,20181223_QX3_JoMu_SA_LC04_uPAC200cm_61_Deinoco...,26562,,,7,0.214876,357.716488
2,SVLFLNK,2,113.480,112715,20181223_QX3_JoMu_SA_LC04_uPAC200cm_61_Deinoco...,112714,,,7,0.630585,410.749996
3,SVLFLNK,2,113.110,112266,20181223_QX3_JoMu_SA_LC04_uPAC200cm_61_Deinoco...,112265,,,7,0.628529,410.749996
4,SVEALNK,2,35.964,23307,20181223_QX3_JoMu_SA_LC04_uPAC200cm_61_Deinoco...,23306,,,7,0.199844,380.713611
...,...,...,...,...,...,...,...,...,...,...,...
959520,GQFALIPVSIFLQQELNAAEDVVVDQDEETITPSEVGGEQK,4,175.880,199212,20181010_QX1_JoMu_SA_Easy12-7_uPAC_500ng_2_Bac...,199211,,,41,0.977491,1111.557698
959521,WLEGGMVEITASYPAGVIGTTLENLQEAAAGEHEEWSLDYPK,3,175.230,198432,20181010_QX1_JoMu_SA_Easy12-7_uPAC_500ng_2_Bac...,198431,,,42,0.973879,1521.395119
959522,WLEGGMVEITASYPAGVIGTTLENLQEAAAGEHEEWSLDYPK,4,174.940,198078,20181010_QX1_JoMu_SA_Easy12-7_uPAC_500ng_2_Bac...,198077,Oxidation@M,6,42,0.972267,1145.296887
959523,MGKPEDMEAEGESASADSGSTSAGGGYGAGAWNSNTAYGTTR,3,97.891,103685,20181010_QX1_JoMu_SA_Easy12-7_uPAC_500ng_2_Bac...,103684,,,42,0.544050,1362.571380


In [3]:
import numpy as np

# Inclusive range of precursor charge states that are modelled; each charge
# in this range gets one slot in the charge vector.
min_charge = 1
max_charge = 6

def get_charge_array(
    charge_list,
    min_charge,
    max_charge,
):
    """Encode a collection of charge states as a multi-hot vector.

    Args:
        charge_list: Iterable of integer charge states.
        min_charge: Smallest charge represented (maps to position 0).
        max_charge: Largest charge represented (maps to the last position).

    Returns:
        A float array of length ``max_charge - min_charge + 1`` holding 1.0
        at the position of every charge present in ``charge_list`` and 0.0
        elsewhere. Charges outside the range are ignored.
    """
    n_states = max_charge - min_charge + 1
    encoded = np.zeros(n_states)
    for z in charge_list:
        if min_charge <= z <= max_charge:
            encoded[z - min_charge] = 1.0
    return encoded

In [4]:
# One row per unique peptide sequence; "charge_vector" is the multi-hot
# encoding of every charge state that sequence was identified with.
train_charge_df = (
    train_df.groupby("sequence")["charge"]
    .apply(
        lambda charges: get_charge_array(
            set(charges), min_charge=min_charge, max_charge=max_charge
        )
    )
    .reset_index(drop=False)
    .rename(columns={"charge": "charge_vector"})
)
train_charge_df

Unnamed: 0,sequence,charge_vector
0,AAAAAAAAAAAAGDSAVNGQAEQQAIPTIGR,"[0.0, 0.0, 1.0, 1.0, 0.0, 0.0]"
1,AAAAAAAAAAAVAGTVPENETAEDK,"[0.0, 0.0, 1.0, 0.0, 0.0, 0.0]"
2,AAAAAAAAAAEAAADDGANQR,"[0.0, 0.0, 1.0, 0.0, 0.0, 0.0]"
3,AAAAAAAAAAGVWGATAEK,"[0.0, 1.0, 1.0, 0.0, 0.0, 0.0]"
4,AAAAAAAAAPSGPAPEGPAAK,"[0.0, 1.0, 1.0, 0.0, 0.0, 0.0]"
...,...,...
337362,YYYVHSYK,"[0.0, 1.0, 0.0, 0.0, 0.0, 0.0]"
337363,YYYVPMER,"[0.0, 1.0, 0.0, 0.0, 0.0, 0.0]"
337364,YYYVQLSTGVSQWETPTDPAPVGATPGAAHEHPYGVPGSDADR,"[0.0, 0.0, 0.0, 1.0, 0.0, 0.0]"
337365,YYYVVGEGTSMR,"[0.0, 1.0, 0.0, 0.0, 0.0, 0.0]"


In [5]:
# Same per-sequence multi-hot encoding as for the training table, followed by
# removal of every sequence that also occurs in `train_charge_df`, so the
# evaluation only uses peptides that are absent from the training table.
test_charge_df = (
    test_df.groupby("sequence")["charge"]
    .apply(
        lambda charges: get_charge_array(
            set(charges), min_charge=min_charge, max_charge=max_charge
        )
    )
    .reset_index(drop=False)
    .rename(columns={"charge": "charge_vector"})
    .loc[lambda frame: ~frame["sequence"].isin(train_charge_df["sequence"])]
)
test_charge_df

Unnamed: 0,sequence,charge_vector
0,AAAAAAAAAAAAAAAAAAAAAAAAAAAAAMEEDSEASSSR,"[0.0, 0.0, 1.0, 1.0, 0.0, 0.0]"
1,AAAAAAAAAAGAAGGR,"[0.0, 1.0, 0.0, 0.0, 0.0, 0.0]"
2,AAAAAAAAADEQDEEK,"[0.0, 1.0, 0.0, 0.0, 0.0, 0.0]"
3,AAAAAAAAADEQDEEQEEEEAEEEEK,"[0.0, 0.0, 1.0, 0.0, 0.0, 0.0]"
4,AAAAAAAAAGGAK,"[0.0, 1.0, 0.0, 0.0, 0.0, 0.0]"
...,...,...
316208,YYYVQNVYTPVDENVYPDHR,"[0.0, 0.0, 1.0, 0.0, 0.0, 0.0]"
316209,YYYVTEGLK,"[0.0, 1.0, 0.0, 0.0, 0.0, 0.0]"
316210,YYYWLDDGGK,"[0.0, 1.0, 0.0, 0.0, 0.0, 0.0]"
316211,YYYYGLGQDLDVGK,"[0.0, 1.0, 0.0, 0.0, 0.0, 0.0]"


In [6]:
from peptdeep.model.model_shop import (
    Model_for_Generic_ModAASeq_BinaryClassification_Transformer,
    ModelInterface_for_Generic_ModAASeq_BinaryClassification,
    Model_for_Generic_AASeq_BinaryClassification_Transformer,
    ModelInterface_for_Generic_AASeq_BinaryClassification,
)
import torch

class ModelInterface_Chargeability(
    ModelInterface_for_Generic_AASeq_BinaryClassification
):
    """Model interface predicting which charge states a peptide carries.

    Configures ``Model_for_Generic_AASeq_BinaryClassification_Transformer``
    with one output per charge state in ``[min_charge, max_charge]``.
    Training targets are read from the ``charge_vector`` column and
    predictions are written to the ``charge_probs`` column.
    """
    def __init__(self,
        min_charge = 1,
        max_charge = 6,
    ):
        """Build the model for the inclusive charge range given.

        Args:
            min_charge: Smallest charge state predicted (output position 0).
            max_charge: Largest charge state predicted (last output position).
        """
        n_charge_states = max_charge - min_charge + 1
        super().__init__(
            model_class=Model_for_Generic_AASeq_BinaryClassification_Transformer,
            nlayers=4, hidden_dim=256, dropout=0.1,
            output_dim=n_charge_states, # one output per charge state (six by default)
        )
        self.target_column_to_train = 'charge_vector'
        self.target_column_to_predict = 'charge_probs'
        self.num_target_values = n_charge_states
        self.min_charge = min_charge
        self.max_charge = max_charge

    def _get_targets_from_batch_df(self, batch_df, **kwargs):
        """Stack the per-row target vectors of a batch into a float32 tensor."""
        # np.stack turns the column of 1-D vectors into one 2-D array
        # (rows x charge states) before the tensor conversion.
        return self._as_tensor(
            np.stack(batch_df[self.target_column_to_train].values),
            dtype=torch.float32
        )

    def _prepare_predict_data_df(self, precursor_df, **kwargs):
        """Add a placeholder prediction column and remember the frame.

        The column is added to ``precursor_df`` in place; the same object is
        stored as ``self.predict_df`` and filled batch by batch afterwards.
        """
        # Build a separate placeholder list per row. Multiplying an outer
        # list (`[[0]*n]*len(df)`) would make every row reference the *same*
        # inner list, so mutating one row would silently change all of them.
        precursor_df[self.target_column_to_predict] = [
            [0]*self.num_target_values for _ in range(len(precursor_df))
        ]
        self.predict_df = precursor_df

    def _set_batch_predict_data(self, batch_df, predict_values, **kwargs):
        """Store the predictions of one batch in ``self.predict_df``.

        Args:
            batch_df: The rows of ``self.predict_df`` that were predicted.
            predict_values: One prediction vector per row of ``batch_df``.
        """
        # NOTE(review): `_predict_in_order` is not defined in this class;
        # presumably it is set by the base class - confirm.
        if self._predict_in_order:
            # Fast path: write straight into the column's underlying array,
            # using the first/last index label of the batch as positional
            # bounds. This assumes the frame has a contiguous 0-based index
            # and that `.values` exposes a writable view - TODO confirm for
            # the pandas version in use.
            self.predict_df.loc[:,self.target_column_to_predict].values[
                batch_df.index.values[0]:batch_df.index.values[-1]+1
            ] = list(predict_values)
        else:
            # General path: label-based assignment of the batch rows.
            self.predict_df.loc[
                batch_df.index,self.target_column_to_predict
            ] = list(predict_values)

In [7]:
# Instantiate the charge model for the charge range configured above.
model = ModelInterface_Chargeability(
    min_charge=min_charge,
    max_charge=max_charge,
)

In [8]:
def convert_probs_to_one_hot(test_charge_df, prob=0.5):
    test_charge_df["charge_pred"] = test_charge_df["charge_probs"].apply(
        lambda x: (x>prob).astype(float)
    )
    return test_charge_df

def test(test_charge_df:pd.DataFrame, prob):
    test_charge_df = convert_probs_to_one_hot(test_charge_df, prob)
    test_charge_df["recall"] = test_charge_df[["charge_vector","charge_pred"]].apply(
        lambda x: np.mean((x[1]==1)[x[0]==1]), axis=1
    ).fillna(0)
    test_charge_df["precision"] = test_charge_df[["charge_vector","charge_pred"]].apply(
        lambda x: np.mean((x[0]==1)[x[1]==1]), axis=1
    ).fillna(0)
    return test_charge_df

# Baseline: score the model on a random subset of the test peptides before
# the training cell further down is run.
# `random_state` makes the subset reproducible across re-runs, and `min`
# keeps `sample` from raising when fewer than 10000 peptides are available.
sample_df = test_charge_df.sample(
    n=min(10000, len(test_charge_df)), random_state=42
)
sample_df = model.predict(sample_df, verbose=True)

sample_df = test(sample_df, 0.5)
sample_df

100%|██████████| 39/39 [00:06<00:00,  5.96it/s]
  lambda x: np.mean((x[1]==1)[x[0]==1]), axis=1
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  lambda x: np.mean((x[0]==1)[x[1]==1]), axis=1


Unnamed: 0,sequence,charge_vector,nAA,charge_probs,charge_pred,recall,precision
0,LTLIHWK,"[0.0, 0.0, 1.0, 0.0, 0.0, 0.0]",7,"[0.4498245, 0.6713155, 0.5421653, 0.5817175, 0...","[0.0, 1.0, 1.0, 1.0, 1.0, 0.0]",1.0,0.25
1,HAQQELK,"[0.0, 1.0, 0.0, 0.0, 0.0, 0.0]",7,"[0.43211332, 0.6579582, 0.51035947, 0.5726277,...","[0.0, 1.0, 1.0, 1.0, 1.0, 0.0]",1.0,0.25
2,SVIDSLK,"[0.0, 1.0, 0.0, 0.0, 0.0, 0.0]",7,"[0.4362002, 0.6372726, 0.5346901, 0.5747135, 0...","[0.0, 1.0, 1.0, 1.0, 1.0, 0.0]",1.0,0.25
3,LLQAIAR,"[0.0, 1.0, 0.0, 0.0, 0.0, 0.0]",7,"[0.42873362, 0.61811274, 0.5524835, 0.5848642,...","[0.0, 1.0, 1.0, 1.0, 1.0, 0.0]",1.0,0.25
4,QVGSAPK,"[0.0, 1.0, 0.0, 0.0, 0.0, 0.0]",7,"[0.44608125, 0.601893, 0.5134943, 0.5411422, 0...","[0.0, 1.0, 1.0, 1.0, 1.0, 1.0]",1.0,0.20
...,...,...,...,...,...,...,...
9995,TTASPGVLSSGSHGSTVPSPPEDDEEEDNDEPLLSGSGDVSK,"[0.0, 0.0, 1.0, 0.0, 0.0, 0.0]",42,"[0.4526516, 0.57627165, 0.45317578, 0.4795, 0....","[0.0, 1.0, 0.0, 0.0, 1.0, 1.0]",0.0,0.00
9996,AADIFPEFSAGNTDLFIPGGPGSIANPANYPESFATGATDINK,"[0.0, 0.0, 1.0, 1.0, 0.0, 0.0]",43,"[0.44950494, 0.552338, 0.49170145, 0.46194026,...","[0.0, 1.0, 0.0, 0.0, 1.0, 1.0]",0.0,0.00
9997,DSGLGATGGSVVVVVASVPGAHGPDSGTDSSLTAVSQLPQSEK,"[0.0, 0.0, 1.0, 1.0, 0.0, 0.0]",43,"[0.45489216, 0.54126066, 0.45976508, 0.4902752...","[0.0, 1.0, 0.0, 0.0, 1.0, 1.0]",0.0,0.00
9998,GPAGPPGNDGIPGQPGLPGPPGPPGPPGLGGNFSPQMSGGFDEK,"[0.0, 0.0, 1.0, 1.0, 0.0, 0.0]",44,"[0.42923006, 0.5089521, 0.44896233, 0.48488104...","[0.0, 1.0, 0.0, 0.0, 1.0, 1.0]",0.0,0.00


In [9]:
# Average the per-peptide recall and precision over the sampled peptides.
sample_df[["recall","precision"]].mean()

recall       0.924317
precision    0.257067
dtype: float64

In [10]:
# Train on a random subset of the training peptides. `random_state` makes the
# subset reproducible across re-runs, and `min` keeps `sample` from raising
# when fewer than 10000 peptides are available.
model.train(
    train_charge_df.sample(
        n=min(10000, len(train_charge_df)), random_state=42
    ),
    epoch=10, warmup_epoch=3,
    verbose=True, verbose_each_epoch=True
)

2024-01-15 14:27:40> Training with fixed sequence length: 0


  0%|          | 0/37 [00:00<?, ?it/s]

Epoch=1, nAA=33, batch=38, loss=0.7266: 100%|██████████| 37/37 [00:18<00:00,  1.95it/s]


[Training] Epoch=1, lr=3.3333333333333335e-05, loss=0.7155396765784213


Epoch=2, nAA=30, batch=38, loss=0.3674: 100%|██████████| 37/37 [00:18<00:00,  1.95it/s]


[Training] Epoch=2, lr=6.666666666666667e-05, loss=0.4888677048055749


Epoch=3, nAA=7, batch=38, loss=0.2515: 100%|██████████| 37/37 [00:19<00:00,  1.93it/s] 


[Training] Epoch=3, lr=0.0001, loss=0.3526854130782579


Epoch=4, nAA=42, batch=38, loss=0.3346: 100%|██████████| 37/37 [00:19<00:00,  1.92it/s]


[Training] Epoch=4, lr=9.504844339512095e-05, loss=0.2903699733709034


Epoch=5, nAA=19, batch=38, loss=0.2965: 100%|██████████| 37/37 [00:19<00:00,  1.92it/s]


[Training] Epoch=5, lr=8.117449009293668e-05, loss=0.24534115783478083


Epoch=6, nAA=18, batch=38, loss=0.2336: 100%|██████████| 37/37 [00:19<00:00,  1.90it/s]


[Training] Epoch=6, lr=6.112604669781572e-05, loss=0.22159640569435923


Epoch=7, nAA=14, batch=38, loss=0.1859: 100%|██████████| 37/37 [00:19<00:00,  1.90it/s]


[Training] Epoch=7, lr=3.887395330218429e-05, loss=0.20944912202264132


Epoch=8, nAA=7, batch=38, loss=0.1002: 100%|██████████| 37/37 [00:19<00:00,  1.90it/s] 


[Training] Epoch=8, lr=1.8825509907063327e-05, loss=0.1978456881877623


Epoch=9, nAA=17, batch=38, loss=0.2104: 100%|██████████| 37/37 [00:19<00:00,  1.92it/s]


[Training] Epoch=9, lr=4.951556604879048e-06, loss=0.18900966252151288


Epoch=10, nAA=10, batch=38, loss=0.1227: 100%|██████████| 37/37 [00:19<00:00,  1.87it/s]

[Training] Epoch=10, lr=0.0, loss=0.18553961303673291





In [11]:
# Re-predict the same sampled peptides with the trained model.
# NOTE: the `charge_pred`, `recall` and `precision` columns shown here were
# computed before training and are stale until `test()` is run again.
sample_df = model.predict(sample_df, verbose=True)
sample_df

100%|██████████| 39/39 [00:07<00:00,  5.56it/s]


Unnamed: 0,sequence,charge_vector,nAA,charge_probs,charge_pred,recall,precision
0,LTLIHWK,"[0.0, 0.0, 1.0, 0.0, 0.0, 0.0]",7,"[0.013929854, 0.64184904, 0.7427672, 0.0624784...","[0.0, 1.0, 1.0, 1.0, 1.0, 0.0]",1.0,0.25
1,HAQQELK,"[0.0, 1.0, 0.0, 0.0, 0.0, 0.0]",7,"[0.01641336, 0.70997614, 0.6604279, 0.05098207...","[0.0, 1.0, 1.0, 1.0, 1.0, 0.0]",1.0,0.25
2,SVIDSLK,"[0.0, 1.0, 0.0, 0.0, 0.0, 0.0]",7,"[0.05525869, 0.9426985, 0.09562596, 0.03578910...","[0.0, 1.0, 1.0, 1.0, 1.0, 0.0]",1.0,0.25
3,LLQAIAR,"[0.0, 1.0, 0.0, 0.0, 0.0, 0.0]",7,"[0.047096424, 0.9355265, 0.122900784, 0.034993...","[0.0, 1.0, 1.0, 1.0, 1.0, 0.0]",1.0,0.25
4,QVGSAPK,"[0.0, 1.0, 0.0, 0.0, 0.0, 0.0]",7,"[0.05804504, 0.9445013, 0.09150215, 0.03591959...","[0.0, 1.0, 1.0, 1.0, 1.0, 1.0]",1.0,0.20
...,...,...,...,...,...,...,...
9995,TTASPGVLSSGSHGSTVPSPPEDDEEEDNDEPLLSGSGDVSK,"[0.0, 0.0, 1.0, 0.0, 0.0, 0.0]",42,"[0.025207637, 0.02946127, 0.21876918, 0.885343...","[0.0, 1.0, 0.0, 0.0, 1.0, 1.0]",0.0,0.00
9996,AADIFPEFSAGNTDLFIPGGPGSIANPANYPESFATGATDINK,"[0.0, 0.0, 1.0, 1.0, 0.0, 0.0]",43,"[0.029526327, 0.03102777, 0.17872286, 0.904083...","[0.0, 1.0, 0.0, 0.0, 1.0, 1.0]",0.0,0.00
9997,DSGLGATGGSVVVVVASVPGAHGPDSGTDSSLTAVSQLPQSEK,"[0.0, 0.0, 1.0, 1.0, 0.0, 0.0]",43,"[0.028798232, 0.028433109, 0.1928856, 0.899985...","[0.0, 1.0, 0.0, 0.0, 1.0, 1.0]",0.0,0.00
9998,GPAGPPGNDGIPGQPGLPGPPGPPGPPGLGGNFSPQMSGGFDEK,"[0.0, 0.0, 1.0, 1.0, 0.0, 0.0]",44,"[0.026299888, 0.0374188, 0.16180916, 0.8917818...","[0.0, 1.0, 0.0, 0.0, 1.0, 1.0]",0.0,0.00


In [12]:
# Recompute `charge_pred`, `recall` and `precision` from the current
# `charge_probs`. The result is bound explicitly instead of relying on
# `test()` mutating its argument in place.
sample_df = test(sample_df, 0.5)
sample_df[["recall","precision"]].mean()

  lambda x: np.mean((x[1]==1)[x[0]==1]), axis=1
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  lambda x: np.mean((x[0]==1)[x[1]==1]), axis=1


recall       0.905958
precision    0.901450
dtype: float64

In [18]:
# Inspect the peptides for which none of the observed charge states was predicted.
sample_df.query("recall<=0")

Unnamed: 0,sequence,charge_vector,nAA,charge_probs,charge_pred,recall,precision
84,MNPDLQK,"[1.0, 0.0, 0.0, 0.0, 0.0, 0.0]",7,"[0.049847677, 0.93894637, 0.11579637, 0.034325...","[0.0, 1.0, 0.0, 0.0, 0.0, 0.0]",0.0,0.0
94,RLDFDYK,"[0.0, 0.0, 1.0, 0.0, 0.0, 0.0]",7,"[0.048454612, 0.93664765, 0.12063176, 0.035065...","[0.0, 1.0, 0.0, 0.0, 0.0, 0.0]",0.0,0.0
170,VREYELR,"[0.0, 0.0, 1.0, 0.0, 0.0, 0.0]",7,"[0.04127108, 0.92576957, 0.14717571, 0.0332601...","[0.0, 1.0, 0.0, 0.0, 0.0, 0.0]",0.0,0.0
189,NSPVNTK,"[1.0, 0.0, 0.0, 0.0, 0.0, 0.0]",7,"[0.057770897, 0.94513583, 0.095615566, 0.03454...","[0.0, 1.0, 0.0, 0.0, 0.0, 0.0]",0.0,0.0
225,LGSNMKK,"[1.0, 0.0, 0.0, 0.0, 0.0, 0.0]",7,"[0.05730622, 0.94540334, 0.09056047, 0.0363494...","[0.0, 1.0, 0.0, 0.0, 0.0, 0.0]",0.0,0.0
...,...,...,...,...,...,...,...
9955,AATTVEHLAIQCHWSQRPAVIGDVLQVYSGSEGR,"[0.0, 0.0, 0.0, 0.0, 1.0, 0.0]",34,"[0.027612975, 0.022449806, 0.28275248, 0.90506...","[0.0, 0.0, 0.0, 1.0, 0.0, 0.0]",0.0,0.0
9963,EGLGGNYIAGMSPEETEEPDISNLDVDHEFFQEK,"[0.0, 0.0, 1.0, 0.0, 0.0, 0.0]",34,"[0.018736077, 0.025399135, 0.49745125, 0.81723...","[0.0, 0.0, 0.0, 1.0, 0.0, 0.0]",0.0,0.0
9965,SVGVGDEPVGEFVESPQPQAPSGMETGLDHYIER,"[0.0, 0.0, 1.0, 0.0, 0.0, 0.0]",34,"[0.020644076, 0.025043054, 0.4365088, 0.834918...","[0.0, 0.0, 0.0, 1.0, 0.0, 0.0]",0.0,0.0
9978,LANVQLLDTDGGFVHSDGAISCHDMFDFLHLTGGGYAK,"[0.0, 0.0, 0.0, 0.0, 1.0, 0.0]",38,"[0.046183363, 0.02243994, 0.12290018, 0.950397...","[0.0, 0.0, 0.0, 1.0, 0.0, 0.0]",0.0,0.0


### Predict spectral library based on charge prediction

In [14]:
def add_charge(df, model, prob=0.5):
    """Expand peptides into one row per predicted precursor charge.

    Args:
        df: Frame with a ``charge_probs`` column holding, per row, one
            probability for each charge in
            ``[model.min_charge, model.max_charge]``. Not modified.
        model: Object exposing ``min_charge`` and ``max_charge``.
        prob: A charge state is kept when its probability is strictly
            greater than this threshold.

    Returns:
        A new frame without ``charge_probs`` and with an ``int8`` ``charge``
        column, one row per (peptide, kept charge) pair and a fresh
        0..n-1 index. Peptides with no charge above the threshold are
        dropped.
    """
    charge_states = np.arange(model.min_charge, model.max_charge+1)
    kept_charges = df["charge_probs"].apply(
        # np.asarray lets plain lists be thresholded as well as arrays.
        lambda probs: charge_states[np.asarray(probs) > prob]
    )
    return (
        # Work on a derived frame so the caller's frame is left untouched.
        df.drop(columns="charge_probs")
        .assign(charge=kept_charges)
        .explode("charge")
        # explode() turns an empty selection into NaN, which cannot be cast
        # to an integer; such peptides have no charge to report.
        .dropna(subset=["charge"])
        # explode() repeats index labels; give every row a unique one.
        .reset_index(drop=True)
        .assign(charge=lambda exploded: exploded["charge"].astype(np.int8))
    )

In [15]:
from peptdeep.protein.fasta import PredictSpecLibFasta
# Configure the in-silico digestion and modification settings of the library.
fasta_lib = PredictSpecLibFasta(
    charged_frag_types=["b_z1","b_z2","y_z1","y_z2"],
    # NOTE(review): the charge range is 0/0 here, presumably because the
    # precursor charges are assigned from the charge model below - confirm.
    precursor_charge_min=0,
    precursor_charge_max=0,
    precursor_mz_min=400,
    precursor_mz_max=1000,
    peptide_length_min=7,
    peptide_length_max=30,
    max_missed_cleavages=2,
    var_mods=["Acetyl@Protein_N-term", "Oxidation@M"],
    fix_mods=["Carbamidomethyl@C"],
    special_mods=["Phospho@S","Phospho@T","Phospho@Y"],
    min_special_mod_num=0,
    max_special_mod_num=1,
)
fasta_lib.get_peptides_from_fasta(
    ["../test_data/small_fasta/test_protein.fasta"]
)
# Use the frame returned by `predict` (as the evaluation cells do) instead of
# relying on `predict` having filled `fasta_lib.precursor_df` in place, then
# expand every peptide into one row per predicted charge state.
fasta_lib._precursor_df = add_charge(
    model.predict(fasta_lib.precursor_df),
    model, prob=0.5
)
fasta_lib.append_decoy_sequence()
fasta_lib.add_modifications()
fasta_lib.add_special_modifications()
fasta_lib.precursor_df

Unnamed: 0,sequence,protein_idxes,miss_cleavage,is_prot_nterm,is_prot_cterm,mods,mod_sites,nAA,charge
0,QKELDSK,0,1,False,False,Phospho@S,6,7,2
1,QKELDSK,0,1,False,False,,,7,2
2,RKEVVHK,0,2,False,False,,,7,2
3,RKEVVHK,0,2,False,False,,,7,3
4,ILENAQR,0,0,False,False,,,7,2
...,...,...,...,...,...,...,...,...,...
385,QDWEHAANDVSFATIRFHDLLSQLDDQYSR,0,1,False,False,Phospho@T,14,30,4
386,QDWEHAANDVSFATIRFHDLLSQLDDQYSR,0,1,False,False,Phospho@S,22,30,4
387,QDWEHAANDVSFATIRFHDLLSQLDDQYSR,0,1,False,False,Phospho@Y,28,30,4
388,QDWEHAANDVSFATIRFHDLLSQLDDQYSR,0,1,False,False,Phospho@S,29,30,4


In [16]:
# Run the "rt" and "ms2" predictions for the library precursors, then display
# the precursor table.
fasta_lib.predict_all(
    predict_items=["rt","ms2"]
)
fasta_lib.precursor_df

2024-01-15 14:31:03> Predicting RT/IM/MS2 for 275 precursors ...
2024-01-15 14:31:03> Predicting RT ...


100%|██████████| 24/24 [00:00<00:00, 95.42it/s]

2024-01-15 14:31:03> Predicting MS2 ...



100%|██████████| 24/24 [00:00<00:00, 52.06it/s]

2024-01-15 14:31:04> End predicting RT/IM/MS2





Unnamed: 0,sequence,protein_idxes,miss_cleavage,is_prot_nterm,is_prot_cterm,mods,mod_sites,nAA,charge,precursor_mz,rt_pred,rt_norm_pred,nce,instrument,frag_start_idx,frag_stop_idx
0,QKELDSK,0,1,False,False,Phospho@S,6,7,2,464.212790,0.009475,0.009475,30.0,Lumos,0,6
1,QKELDSK,0,1,False,False,,,7,2,424.229625,0.012003,0.012003,30.0,Lumos,6,12
2,RKEVVHK,0,2,False,False,,,7,2,448.277244,0.082575,0.082575,30.0,Lumos,12,18
3,ILENAQR,0,0,False,False,,,7,2,422.237784,0.106143,0.106143,30.0,Lumos,18,24
4,ELDSKVR,0,1,False,False,Phospho@S,4,7,2,463.720782,0.042219,0.042219,30.0,Lumos,24,30
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
270,QDWEHAANDVSFATIRFHDLLSQLDDQYSR,0,1,False,False,Phospho@T,14,30,4,915.165315,0.900869,0.900869,30.0,Lumos,4241,4270
271,QDWEHAANDVSFATIRFHDLLSQLDDQYSR,0,1,False,False,Phospho@S,22,30,4,915.165315,0.906644,0.906644,30.0,Lumos,4270,4299
272,QDWEHAANDVSFATIRFHDLLSQLDDQYSR,0,1,False,False,Phospho@Y,28,30,4,915.165315,0.904113,0.904113,30.0,Lumos,4299,4328
273,QDWEHAANDVSFATIRFHDLLSQLDDQYSR,0,1,False,False,Phospho@S,29,30,4,915.165315,0.899686,0.899686,30.0,Lumos,4328,4357


In [17]:
# Save the predicted spectral library to an HDF file.
fasta_lib.save_hdf("../test_data/speclib/predicted_charged.speclib.hdf")

### DIY: training models for modified sequences