# AlphaBase, AlphaRaw and Spectrum Vis

### Load Thermo Raw Data using AlphaRaw

To read vendor raw files, `PythonNet` must be installed on top of the .NET Framework, .NET Core, or Mono, depending on the operating system. See https://github.com/MannLabs/alpharaw#installation.

In [1]:
# Optional step, intentionally left commented out: read a vendor raw file with AlphaRaw.
# Uncomment the reader import that matches your file type and point `import_raw`
# at your own file (the path below is a machine-specific example).
# NOTE(review): `save_as_hdf=True` presumably writes the HDF file that the next
# section loads -- confirm against the AlphaRaw documentation.
# from alpharaw.thermo import ThermoRawData
# from alpharaw.sciex import SciexWiffData
# from alpharaw.mzml import MzMLReader
# raw_data = ThermoRawData(save_as_hdf=True)
# raw_data.import_raw("/Users/wenfengzeng/data/ap_msdata/HeLa_500ng.raw")

### Load AlphaRaw's HDF format

In [2]:
from alpharaw.ms_data_base import MSData_Base

# Create an MSData_Base object and load the AlphaRaw HDF file into it.
ms_data = MSData_Base()
ms_data.load_hdf("../test_data/dda/HeLa_500ng.raw.hdf")
# Spectrum-level table (one row per spectrum), shown as the cell output.
ms_data.spectrum_df

Unnamed: 0,isolation_lower_mz,isolation_upper_mz,ms_level,nce,peak_start_idx,peak_stop_idx,precursor_charge,precursor_mz,rt,spec_idx
0,-1.000000,-1.000000,1,0.0,0,947,0,-1.000000,0.002132,0
1,-1.000000,-1.000000,1,0.0,947,1804,0,-1.000000,0.006426,1
2,-1.000000,-1.000000,1,0.0,1804,2745,0,-1.000000,0.010237,2
3,-1.000000,-1.000000,1,0.0,2745,3666,0,-1.000000,0.014216,3
4,-1.000000,-1.000000,1,0.0,3666,4554,0,-1.000000,0.018199,4
...,...,...,...,...,...,...,...,...,...,...
133451,513.694897,515.094897,2,27.0,52689507,52689523,0,514.394897,119.992310,133451
133452,-1.000000,-1.000000,1,0.0,52689523,52690736,0,-1.000000,119.993600,133452
133453,747.706555,749.106555,2,27.0,52690736,52690748,0,748.406555,119.996860,133453
133454,-1.000000,-1.000000,1,0.0,52690748,52691929,0,-1.000000,119.998150,133454


In [3]:
# Flat table of all peaks (`mz`, `intensity`); the next section slices it by the
# `peak_start_idx` / `peak_stop_idx` values stored in `spectrum_df`.
ms_data.peak_df

Unnamed: 0,intensity,mz
0,78647.179688,300.062225
1,16854.835938,300.181488
2,48675.617188,300.277588
3,37171.460938,300.298889
4,12439.951172,300.335724
...,...,...
52691937,8425.477539,684.554565
52691938,9639.876953,897.415161
52691939,10831.402344,1007.536987
52691940,9109.153320,1427.651367


### Get the peak list for a given spectrum

In [4]:
# Position of the spectrum to inspect (scan_num = spec_idx + 1).
spec_idx = 100
# Look up the row range [start, stop) of this spectrum's peaks inside `peak_df`.
peak_start_idx, peak_stop_idx = (
    ms_data.spectrum_df[column].values[spec_idx]
    for column in ("peak_start_idx", "peak_stop_idx")
)
# Display the peak list of the selected spectrum.
ms_data.peak_df.iloc[peak_start_idx:peak_stop_idx]

Unnamed: 0,intensity,mz
76866,82767.609375,300.061005
76867,33471.000000,300.180450
76868,40469.039062,300.276886
76869,12977.758789,300.289520
76870,42692.718750,300.298004
...,...,...
77658,20136.412109,1360.538208
77659,22530.769531,1366.057007
77660,25471.552734,1373.409546
77661,21047.003906,1476.254028


# AlphaBase

### Load PSMs from search engines

In [5]:
from alphabase.psm_reader import (
    AlphaPeptReader, 
    pFindReader, 
    MaxQuantReader, 
    MSFragger_PSM_TSV_Reader,
)

from alphabase.psm_reader.dia_psm_reader import (
    DiannReader, 
    SpectronautReportReader, 
    SpectronautReader, 
    SwathReader
)

In [6]:
# Read pFind PSMs into a DataFrame.
# Note: `psm_reader` and `psm_df` are rebound to the MaxQuant results in the next
# cell, so the cells further down operate on the MaxQuant PSMs, not on these.
psm_reader = pFindReader()
psm_df = psm_reader.import_file("../test_data/dda/pFind-Filtered.spectra")
psm_df

Unnamed: 0,sequence,charge,raw_name,query_id,scan_num,score,proteins,uniprot_ids,fdr,decoy,spec_idx,mods,mod_sites,nAA,precursor_mz
0,ESLLVK,2,HeLa_500ng,HeLa_500ng.35692.35692.2.0.dta,35692,1.622062,sp|Q6S5L8|SHC4_HUMAN;REV_sp|Q8WZ42|TITIN_HUMAN...,sp|Q6S5L8|SHC4_HUMAN;REV_sp|Q8WZ42|TITIN_HUMAN...,0.003092,0,35691,,,6,344.715622
1,VPAFVR,2,HeLa_500ng,HeLa_500ng.42111.42111.2.0.dta,42111,1.221872,REV_sp|Q8N6M6|AMPO_HUMAN;sp|P48739|PIPNB_HUMAN...,REV_sp|Q8N6M6|AMPO_HUMAN;sp|P48739|PIPNB_HUMAN...,0.005711,0,42110,,,6,344.710674
2,RLEDFR,2,HeLa_500ng,HeLa_500ng.18181.18181.2.0.dta,18181,1.221981,sp|Q659C4|LAR1B_HUMAN;sp|Q6PKG0|LARP1_HUMAN,sp|Q659C4|LAR1B_HUMAN;sp|Q6PKG0|LARP1_HUMAN,0.005711,0,18180,,,6,418.224677
3,MENLLR,2,HeLa_500ng,HeLa_500ng.42606.42606.2.0.dta,42606,1.496833,REV_sp|Q92833|JARD2_HUMAN;REV_sp|P08779|K1C16_...,REV_sp|Q92833|JARD2_HUMAN;REV_sp|P08779|K1C16_...,0.003616,0,42605,,,6,388.210181
4,YLMALK,2,HeLa_500ng,HeLa_500ng.57170.57170.2.1.dta,57170,1.222059,sp|P50579|MAP2_HUMAN,sp|P50579|MAP2_HUMAN,0.005711,0,57169,,,6,369.714568
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
165446,TPAEPEVGPAAPAMAPAPASAPAPASAPAPAPVPTPAMVSAPSSTV...,4,HeLa_500ng,HeLa_500ng.119581.119581.4.1.dta,119581,6.142999,sp|Q14244|MAP7_HUMAN,sp|Q14244|MAP7_HUMAN,0.000034,0,119580,Deamidated@N,47,53,1204.107032
165447,AAAAAADPNAAWAAYYSHYYQQPPGPVPGPAPAPAAPPAQGEPPQP...,5,HeLa_500ng,HeLa_500ng.112172.112172.5.0.dta,112172,9.357916,sp|Q92945|FUBP2_HUMAN,sp|Q92945|FUBP2_HUMAN,0.000000,0,112171,,,56,1125.538184
165448,QDPAAAQEGEDEGASAGQGPKPEAHSQEQGHPQTGCECEDGPDGQE...,5,HeLa_500ng,HeLa_500ng.42554.42554.5.0.dta,42554,6.239731,sp|Q4V326|GAG2E_HUMAN;sp|Q13066|GAG2B_HUMAN;sp...,sp|Q4V326|GAG2E_HUMAN;sp|Q13066|GAG2B_HUMAN;sp...,0.000000,0,42553,Pentose@T,34,56,1186.293216
165449,QDPAAAQEGQDEGASAGQGPKPEAHSQEQGHPQTGCECEDGPDGQE...,5,HeLa_500ng,HeLa_500ng.50483.50483.5.0.dta,50483,7.362675,sp|Q6NT46|GAG2A_HUMAN,sp|Q6NT46|GAG2A_HUMAN,0.000000,0,50482,2-succinyl@C,38,56,1182.890153


In [7]:
# Read MaxQuant PSMs (msms.txt); this replaces the pFind `psm_df` from the previous
# cell and is the PSM table used by the following cells.
psm_reader = MaxQuantReader()
psm_df = psm_reader.import_file("../test_data/dda/MaxQuant_msms.txt")
psm_df

Unnamed: 0,sequence,charge,rt,scan_num,raw_name,precursor_mz,score,proteins,genes,decoy,spec_idx,mods,mod_sites,nAA,rt_norm
0,ISFELMR,2,72.800,84562,HeLa_500ng,448.23894,153.880,Q9UNE7,STUB1,0,84561,,,7,0.608340
1,IYCTDLR,2,34.141,34871,HeLa_500ng,470.73148,89.427,Q8TAF3,WDR48,0,34870,Carbamidomethyl@C,3,7,0.285293
2,EGIVALR,2,45.346,49451,HeLa_500ng,379.23197,118.060,P40227,CCT6A,0,49450,,,7,0.378925
3,TLDVAVK,2,35.632,36818,HeLa_500ng,373.22635,66.993,A0FGR8,ESYT2,0,36817,,,7,0.297752
4,EGIPVMR,2,45.502,49650,HeLa_500ng,401.21801,82.263,Q14697,GANAB,0,49649,,,7,0.380229
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
55217,AVNPDEAVAIGAAIQGGVLAGDVTDVLLLDVTPLSLGIETLGGVFTK,4,108.190,128548,HeLa_500ng,1148.12960,36.090,P38646,HSPA9,0,128547,,,47,0.904070
55218,SPVGSGAPQAAAPAPAAHVAGNPGGDAAPAATGTAAAASLATAAGS...,3,75.088,87443,HeLa_500ng,1460.37600,239.620,P16989,YBX3,0,87442,,,51,0.627459
55219,SPVGSGAPQAAAPAPAAHVAGNPGGDAAPAATGTAAAASLATAAGS...,4,75.052,87398,HeLa_500ng,1095.53380,177.480,P16989,YBX3,0,87397,,,51,0.627158
55220,SSEAETQQPPAAPPAAPALSAADTKPGTTGSGAGSGGPGGLTSAAP...,3,76.213,88863,HeLa_500ng,1505.39000,145.780,P67809,YBX1,0,88862,Acetyl@Protein N-term,0,51,0.636860


### Get fragment m/z values of PSMs

In [8]:
from alphabase.peptide.fragment import create_fragment_mz_dataframe

In [9]:
# Build the fragment m/z table for all PSMs; the result has one column per entry
# in `charged_frag_types`.
# NOTE(review): the call appears to add `frag_start_idx` / `frag_stop_idx` columns
# to `psm_df` in place (the next cell reads them from `psm_df`) -- confirm.
fragment_mz_df = create_fragment_mz_dataframe(
    psm_df, 
    charged_frag_types=["b_z1","b_z2","y_z1","y_z2"]
)
fragment_mz_df

Unnamed: 0,b_z1,b_z2,y_z1,y_z2
0,114.091339,57.549309,782.386536,391.696899
1,201.123367,101.065323,695.354492,348.180878
2,348.191772,174.599533,548.286072,274.646698
3,477.234375,239.120819,419.243500,210.125381
4,590.318420,295.662872,306.159424,153.583359
...,...,...,...,...
681643,4067.942871,2034.475098,447.219788,224.113525
681644,4138.979980,2069.993652,376.182678,188.594971
681645,4196.001465,2098.504395,319.161224,160.084244
681646,4253.022949,2127.015137,262.139740,131.573517


In [10]:
# Position of the PSM whose fragments should be shown.
psm_idx = 100
# Each PSM stores the row range [start, stop) of its fragments in `fragment_mz_df`.
frag_start_idx, frag_stop_idx = (
    psm_df[column].values[psm_idx]
    for column in ("frag_start_idx", "frag_stop_idx")
)
# Display the fragment m/z rows that belong to this PSM.
fragment_mz_df.iloc[frag_start_idx:frag_stop_idx]

Unnamed: 0,b_z1,b_z2,y_z1,y_z2
600,88.039307,44.523293,676.406189,338.706757
601,201.123367,101.065323,563.322144,282.164703
602,314.207428,157.607361,450.238068,225.622681
603,385.244537,193.125916,379.200958,190.104126
604,486.292236,243.64975,278.15329,139.580276
605,617.332703,309.169983,147.112808,74.060043


### Extract matched fragment intensities for PSMs against spectra

In [11]:
from alpharaw.match.psm_match import PepSpecMatch

# Configure the PSM-to-spectrum matcher.
match = PepSpecMatch(
    charged_frag_types=["b_z1","b_z2","y_z1","y_z2"],
    match_closest=True,
    use_ppm=True,
    tol_value=20,  # 20 ppm
)
# Match every PSM against the MS2 spectra of the raw file; `psm_df` and
# `fragment_mz_df` are rebound to the frames returned by the matcher.
psm_df, fragment_mz_df, matched_intensity_df, matched_mass_error_df = (
    match.match_ms2_multi_raw(
        psm_df,
        ms_files=["../test_data/dda/HeLa_500ng.raw.hdf"],
        ms_file_type="alpharaw_hdf",
    )
)
matched_intensity_df

100%|██████████| 1/1 [00:02<00:00,  2.14s/it]


Unnamed: 0,b_z1,b_z2,y_z1,y_z2
0,0.00000,0.0,6.393688e+06,0.0000
1,994209.43750,0.0,3.624949e+06,83244.8125
2,0.00000,0.0,1.687645e+06,0.0000
3,247355.09375,0.0,1.748330e+06,0.0000
4,0.00000,0.0,4.013234e+05,0.0000
...,...,...,...,...
681643,0.00000,0.0,0.000000e+00,0.0000
681644,0.00000,0.0,4.155696e+05,0.0000
681645,0.00000,0.0,1.136814e+05,0.0000
681646,0.00000,0.0,6.669666e+04,0.0000


### Visualize annotated peaks for a PSM

In [12]:
from alpharaw.viz.psm_plot import PSM_Plot
# Plotter instance reused by the plotting cells below.
psm_plotter = PSM_Plot()

In [13]:
# Select a single PSM as a one-row DataFrame. `.loc[a:a]` is an inclusive label
# slice, so the selected row keeps its original index label (100), not 0.
psm_idx = 100
plot_psm = psm_df.loc[psm_idx:psm_idx]
plot_psm

Unnamed: 0,sequence,charge,rt,scan_num,raw_name,precursor_mz,score,proteins,genes,decoy,spec_idx,mods,mod_sites,nAA,rt_norm,frag_start_idx,frag_stop_idx,nce
100,SLLATMK,2,44.227,48002,HeLa_500ng,382.22276,86.866,P15924,DSP,0,48001,,,7,0.369575,600,606,27.0


In [14]:
# Remove the fragment index columns inherited from the full PSM table, then build a
# fragment m/z table for this single PSM only (its rows are numbered from 0).
# NOTE(review): presumably `create_fragment_mz_dataframe` would otherwise reuse the
# existing `frag_start_idx` / `frag_stop_idx` values, which refer to rows of the
# full `fragment_mz_df` -- confirm.
plot_psm = plot_psm.drop(columns=["frag_start_idx","frag_stop_idx"])
plot_frag_mz_df = create_fragment_mz_dataframe(
    plot_psm, 
    charged_frag_types=["b_z1","b_z2","y_z1","y_z2"]
)
plot_frag_mz_df

Unnamed: 0,b_z1,b_z2,y_z1,y_z2
0,88.039307,44.523293,676.406189,338.706757
1,201.123367,101.065323,563.322144,282.164703
2,314.207428,157.607361,450.238068,225.622681
3,385.244537,193.125916,379.200958,190.104126
4,486.292236,243.64975,278.15329,139.580276
5,617.332703,309.169983,147.112808,74.060043


In [15]:
# Re-run the matcher for the selected PSM only. This rebinds `plot_psm` and
# `plot_frag_mz_df`, and also overwrites the `matched_intensity_df` /
# `matched_mass_error_df` produced earlier for the full PSM table.
plot_psm, plot_frag_mz_df, matched_intensity_df, matched_mass_error_df = (
    match.match_ms2_multi_raw(
        plot_psm,
        ms_files=["../test_data/dda/HeLa_500ng.raw.hdf"],
        ms_file_type="alpharaw_hdf",
    )
)
matched_intensity_df

100%|██████████| 1/1 [00:00<00:00,  1.64it/s]


Unnamed: 0,b_z1,b_z2,y_z1,y_z2
0,0.0,0.0,20505.599609,0.0
1,477963.625,0.0,984503.75,14939.34082
2,24317.273438,0.0,445672.25,0.0
3,0.0,0.0,404354.96875,0.0
4,0.0,0.0,419753.5625,0.0
5,0.0,0.0,584019.8125,0.0


In [16]:
from alpharaw.viz.df_utils import make_psm_plot_for_dfs

# Fetch the raw peaks of the spectrum that the selected PSM refers to.
spec_idx = plot_psm.spec_idx.values[0]
peak_mzs, peak_intens = ms_data.get_peaks(spec_idx)

# Assemble the plotting table from the measured peaks and the theoretical fragment
# m/z values; no fragment intensities are supplied here (fragment_intensity_df=None).
plot_df = make_psm_plot_for_dfs(
    peak_mzs, peak_intens,
    plot_psm,
    plot_frag_mz_df,
    fragment_intensity_df=None,
)
# Read the sequence positionally, like `spec_idx` above: `plot_psm.sequence[0]` is a
# label lookup and raises KeyError whenever the single row's index label is not 0
# (the row was selected from `psm_df` with label 100).
psm_plotter.plot(
    plot_df, plot_psm.sequence.values[0], "", plot_unmatched_peaks=True
)

### Mirror plot with predicted intensities

In [17]:
from peptdeep.pretrained_models import ModelManager

# Predict MS2 fragment intensities for the selected PSM with peptdeep.
# NOTE(review): `ModelManager()` presumably loads the pretrained models on
# construction -- confirm against the peptdeep documentation.
model_mgr = ModelManager()
pred_inten_df = model_mgr.predict_ms2(plot_psm)

2024-01-04 19:57:36> Predicting MS2 ...


100%|██████████| 1/1 [00:00<00:00, 172.21it/s]


In [18]:
from alpharaw.viz.df_utils import make_psm_plot_for_dfs

# Fetch the raw peaks of the spectrum that the selected PSM refers to.
spec_idx = plot_psm.spec_idx.values[0]
peak_mzs, peak_intens = ms_data.get_peaks(spec_idx)

# Same plotting table as for the annotated spectrum, but with the predicted
# intensities passed in; the columns are selected (and ordered) to match
# `plot_frag_mz_df`.
plot_df = make_psm_plot_for_dfs(
    peak_mzs, peak_intens,
    plot_psm,
    plot_frag_mz_df,
    fragment_intensity_df=pred_inten_df[plot_frag_mz_df.columns],
)
# Read the sequence positionally, like `spec_idx` above: `plot_psm.sequence[0]` is a
# label lookup and raises KeyError whenever the single row's index label is not 0
# (the row was selected from `psm_df` with label 100).
psm_plotter.plot(
    plot_df, plot_psm.sequence.values[0], "", plot_unmatched_peaks=True
)