In [None]:
# libs
import pandas as pd
import altair as alt

from glob import glob

from src.mzml_parser import df_from_mzml

In [None]:
%%javascript
IPython.OutputArea.auto_scroll_threshold = 9999 // raise the auto-scroll threshold to disable scrolling of cell output in the notebook

In [None]:
# signal settings (passed to df_from_mzml when the mzML files are read)

# resample time dimension to this number of scans per second
scans_per_second = 8

# smoothing window length (scipy.signal.savgol_filter)
window_length = 5

# smoothing polynomial order (scipy.signal.savgol_filter)
polyorder = 3

In [None]:
# read in targets of interest (tab-separated file)
targets = pd.read_csv(
    'targets.csv',
    sep="\t",
    # read precursor/product as text so they are used verbatim in the transition key
    converters={
        'precursor': str,
        'product': str
    }
)

# add column with unique transition information: <precursor>_<product>_<polarity>
# (vectorised string concatenation instead of a row-wise apply; this also
# works when the targets table has no rows)
targets['transition'] = (
    targets['precursor'] + "_" + targets['product'] + "_" + targets['polarity'].astype(str)
)
targets

In [None]:
# pciis signal to use: a display name and the transition key
# (<precursor>_<product>_<polarity>) under which it is found in the signal data
pciis = dict(
    name='2F_AEA_02',
    transition='350.3_269.2_pos',
)

In [None]:
# read in signal data (from mzml files)
# glob() returns paths in arbitrary order; sort them so the row order of
# signal_df is reproducible between runs and machines
mzml_files = sorted(glob('mzML/*.mzML'))

# fail with a clear message instead of pandas' "No objects to concatenate"
if not mzml_files:
    raise FileNotFoundError("no mzML files found matching 'mzML/*.mzML'")

# one data frame per file, parsed with the signal settings defined earlier
signal_df = pd.concat(
    [
        df_from_mzml(mzml_file, scans_per_second, window_length, polyorder)
        for mzml_file in mzml_files
    ]
)
signal_df.head(5)

In [None]:
# extract pciis signal: the intensity of the pciis transition, stored in a
# single 'pciis' column and indexed by file/rt
is_pciis = signal_df['transition'] == pciis['transition']
pciis_data = (
    signal_df.loc[is_pciis, ['file', 'rt', 'intensity']]
    .rename(columns={'intensity': 'pciis'})
    .set_index(['file', 'rt'])
)
pciis_data.head(5)

In [None]:
# append pciis signal to targets
dfs = []
for tIdx, target in targets.iterrows():

    # extract target signal
    target_data = signal_df[
        signal_df['transition'] == target['transition']
    ].copy()

    # keep index, in case we have multiple targets with the same transition
    target_data['target'] = tIdx

    # glue the pciis signal to the dataframe as a column
    # (left join on file/rt, so target rows without a pciis value get NaN)
    dfs.append(
        target_data
        .set_index(['file', 'rt'])
        .join(pciis_data['pciis'])
        .reset_index()
    )

# every piece restarts its row labels at 0 after reset_index(); ignore_index
# gives the combined frame unique row labels instead of duplicated ones
df = pd.concat(dfs, ignore_index=True)

# the file name is split on "_": the last part (minus ".mzML") is the
# concentration, the second to last part is the sample
# (names with too few parts give NaN instead of raising)
name_parts = df['file'].str.split("_")

# add concentration
df['concentration'] = name_parts.str[-1].str.replace(".mzML", "", regex=False)

# add sample
df['sample'] = name_parts.str[-2]

# determine ratio
# NOTE(review): a pciis value of 0 gives an infinite ratio and a missing pciis
# value gives NaN - confirm both cases are acceptable for the steps that follow
df['ratio'] = df['intensity'] / df['pciis']
df.head(5)

In [None]:
# code: https://github.com/pearsonkyle/Signal-Alignment
# paper: https://iopscience.iop.org/article/10.3847/1538-3881/aaf1ae/meta
from scipy.ndimage import shift
from src.signal_alignment import phase_align

# add reference (ratio) signal to align to
reference_file = 'mzML/inj043_PlDiv2_LOW.mzML'

# extract ratio by target/rt from the reference file and name it 'ratio_reference'
reference_ratio = (
    df.loc[df['file'] == reference_file, ['target', 'rt', 'ratio']]
    .rename(columns={'ratio': 'ratio_reference'})
    .set_index(['target', 'rt'])
)

# without reference rows every row would be dropped below and the cell would
# end in an unrelated "No objects to concatenate" error - fail early instead
if reference_ratio.empty:
    raise ValueError(f"reference file '{reference_file}' not found in df['file']")

# append a column with the ratio by rt from the reference file;
# rows with a missing value in any column are dropped
df_with_reference = (
    df.set_index(['target', 'rt'])
    .join(reference_ratio)
    .reset_index()
    .dropna()
)

# apply alignment by target/file
aligned_dfs = []
for gIdx, df_grouped in df_with_reference.groupby(['target', 'file']):

    # work on a copy: adding a column to the groupby slice itself triggers
    # pandas' SettingWithCopyWarning
    df_grouped = df_grouped.copy()

    # calculate phase shift of this file's ratio relative to the reference
    phase_shift = float(phase_align(
        df_grouped['ratio_reference'].values, df_grouped['ratio'].values, res=1
    ))

    # apply phase shift; positions shifted in from outside the signal are set to 0
    df_grouped['ratio_aligned'] = shift(
        df_grouped['ratio'].values,
        phase_shift,
        mode='constant',
        cval=0.0
    )

    aligned_dfs.append(df_grouped)

# merge them back together
df_aligned = pd.concat(aligned_dfs, ignore_index=True)

# make all negative value = 0
df_aligned['ratio_aligned'] = df_aligned['ratio_aligned'].clip(lower=0)

df_aligned.head(5)

In [None]:
import warnings
from detecta import detect_peaks
from scipy.signal import peak_widths


def find_best_peak(signal, signal_type, expected_rt, mph=0, mpd=5, rel_height=0.98):
    """Pick the peak whose apex is closest to `expected_rt` and measure it.

    Parameters
    ----------
    signal : pandas.DataFrame
        Signal of one target in one file; must have the columns 'rt' and
        `signal_type`.
    signal_type : str
        Name of the column in which peaks are detected and summed.
    expected_rt : float
        Retention time the apex is compared against.
    mph, mpd : number
        Forwarded to detecta.detect_peaks.
    rel_height : float
        Forwarded to scipy.signal.peak_widths.

    Returns
    -------
    dict or None
        Apex/start/end retention time, width, absolute rt error, area (sum of
        the signal between start and end, borders included) and the number of
        scans in that window. None when no peak was detected.
    """
    values = signal[signal_type]

    peak_index = detect_peaks(
        values,
        mph=mph,
        mpd=mpd,
        threshold=0,
        edge='both',
        kpsh=True
    )

    # nothing to measure when no peak was detected
    if len(peak_index) == 0:
        return None

    with warnings.catch_warnings():  # ignore the peaks warning of the peak_widths function
        warnings.simplefilter("ignore")
        _, _, left_ips, right_ips = peak_widths(
            values,
            peak_index,
            rel_height=rel_height
        )

    # peak_widths returns interpolated (fractional) positions; truncate them to
    # whole scan positions explicitly before using them as positional indexers
    rt = signal['rt'].values
    peaks = pd.DataFrame({
        'peak_rt_apex': rt[peak_index],
        'peak_rt_start': rt[left_ips.astype(int)],
        'peak_rt_end': rt[right_ips.astype(int)],
    })
    peaks['peak_width'] = peaks['peak_rt_end'] - peaks['peak_rt_start']
    peaks['abs_rt_error'] = (peaks['peak_rt_apex'] - expected_rt).abs()

    # keep best hit
    best_rows = peaks.nsmallest(1, 'abs_rt_error')
    if best_rows.empty:
        return None
    best = best_rows.iloc[0]

    # area & scans: all scans inside the peak window, borders included
    in_peak = signal['rt'].between(best['peak_rt_start'], best['peak_rt_end'])

    return {
        'peak_rt_apex': best['peak_rt_apex'],
        'peak_rt_start': best['peak_rt_start'],
        'peak_rt_end': best['peak_rt_end'],
        'peak_width': best['peak_width'],
        'abs_rt_error': best['abs_rt_error'],
        'area': signal.loc[in_peak, signal_type].sum(),
        'scans': int(in_peak.sum()),
    }


# column order of the peak table (and of the csv file written below)
peak_columns = [
    'peak_rt_apex', 'peak_rt_start', 'peak_rt_end', 'peak_width',
    'abs_rt_error', 'signal_type', 'target', 'file', 'area', 'scans',
]

peak_records = []
for tIdx, df_target in df_aligned.groupby('target'):

    # expected retention time of the target
    expected_rt = targets.loc[tIdx, 'rt']

    for fIdx, df_target_file in df_target.groupby('file'):

        # by signal type (intensity or ratio)
        for signal_type in ['intensity', 'ratio', 'ratio_aligned']:

            peak = find_best_peak(df_target_file, signal_type, expected_rt)

            # no peak detected for this target/file/signal combination
            if peak is None:
                continue

            peak['signal_type'] = signal_type
            peak['target'] = tIdx
            peak['file'] = fIdx
            peak_records.append(peak)

# build the table once from all records
df_peaks = pd.DataFrame(peak_records, columns=peak_columns)

# the file name is split on "_": the last part (minus ".mzML") is the
# concentration, the second to last part is the sample
name_parts = df_peaks['file'].str.split("_")

# add concentration
df_peaks['concentration'] = name_parts.str[-1].str.replace(".mzML", "", regex=False)

# add sample
df_peaks['sample'] = name_parts.str[-2]

# save peaks as csv file
df_peaks.to_csv('peaks_found.csv', index=False)

df_peaks

In [None]:
plot_height = 180
plot_width = 210
rt_window = 12


def signal_panel(signal, peaks, signal_type, title, with_pciis=False):
    """Build one layered panel: signal lines per sample plus the peak markers.

    Parameters
    ----------
    signal : pandas.DataFrame
        Signal rows to draw; needs the columns 'rt', 'sample', `signal_type`
        (and 'pciis' when `with_pciis` is set).
    peaks : pandas.DataFrame
        Peak rows; only those whose 'signal_type' equals `signal_type` are used.
    signal_type : str
        Column drawn on the y axis.
    title : str
        Panel title.
    with_pciis : bool
        Also draw the pciis signal as thin lines.

    Returns
    -------
    altair layered chart
    """
    # signal, one line per sample
    panel = alt.Chart(signal).mark_line().encode(
        x='rt', y=signal_type, color='sample'
    ).properties(width=plot_width, height=plot_height, title=title)

    # pciis signal
    if with_pciis:
        panel = panel + alt.Chart(signal).mark_line(
            strokeWidth=0.5
        ).encode(
            x='rt', y='pciis', color='sample'
        ).properties(width=plot_width, height=plot_height)

    picked_peaks = peaks[peaks['signal_type'] == signal_type]

    # vertical rules: mean apex rt (red), area window start and end (grey);
    # all rules share the x title 'rt' so the layered axis keeps a single label
    rules = [
        ('mean(peak_rt_apex)', 'red'),
        ('min(peak_rt_start)', 'grey'),
        ('min(peak_rt_end)', 'grey'),
    ]
    for aggregate, rule_color in rules:
        panel = panel + alt.Chart(picked_peaks).mark_rule(
            color=rule_color, strokeWidth=2
        ).encode(
            alt.X(aggregate, title='rt')
        )

    return panel


for tIdx, target in targets.iterrows():

    # signal of this target inside the rt window around its expected rt
    in_window = df_aligned['rt'].between(
        target['rt'] - rt_window/2,
        target['rt'] + rt_window/2
    )
    target_signal = df_aligned[in_window & (df_aligned['target'] == tIdx)]

    # peaks of this target
    target_peaks = df_peaks[df_peaks['target'] == tIdx]

    rows = []

    # for concentration in target_signal['concentration'].unique():
    for concentration in ['LOW','MEDIUM','HIGH']:

        # data frame with peaks of target, grouped by concentration
        df_concentration_peaks = target_peaks[
            target_peaks['concentration'] == concentration
        ]

        # data frame with signals of target, grouped by concentration
        df_concentration_signal = target_signal[
            target_signal['concentration'] == concentration
        ]

        # original signal (+ pciis signal)
        intensity_plot = signal_panel(
            df_concentration_signal, df_concentration_peaks, 'intensity',
            f"intensity ({concentration}) {target['name']}", with_pciis=True
        )

        # ratio
        unaligned_plot = signal_panel(
            df_concentration_signal, df_concentration_peaks, 'ratio',
            f"ratio ({concentration}) {target['name']}"
        )

        # ratio aligned
        aligned_plot = signal_panel(
            df_concentration_signal, df_concentration_peaks, 'ratio_aligned',
            f"ratio aligned ({concentration}) {target['name']}"
        )

        rows.append(intensity_plot | unaligned_plot | aligned_plot)

    # one row per concentration, stacked in a single flat vertical concatenation
    plots = alt.vconcat(*rows)

    # display the plots
    plots.display()

    # save to disk
    # plots.save(f"plots/{target['name']}.png", scale_factor=1.5)

In [None]:
# area plots per target, once based on the raw intensity and once on the aligned ratio
for tIdx, target in targets.iterrows():

    for signal_type in ['intensity','ratio_aligned']:

        # filtered data source (boolean mask instead of a string-built query,
        # so it does not depend on how the target label formats as text)
        source = df_peaks[
            (df_peaks['target'] == tIdx) &
            (df_peaks['signal_type'] == signal_type)
        ]

        # individual area values by sample/concentration
        area = alt.Chart(source).mark_point(filled=True, color='black').encode(
            x=alt.X('area:Q', axis=alt.Axis(labels=False), scale=alt.Scale(zero=False)),
            y=alt.Y('sample:N'),
            color="concentration"
        ).properties(width=750, height=120, title=f"Area (based on {signal_type}) - {target['name']}")

        # mean area by concentration
        error_points = alt.Chart(source).mark_point(filled=True).encode(
            x=alt.X('area:Q', aggregate='mean', scale=alt.Scale(zero=False)),
            y=alt.Y('concentration:N'),
            color=alt.Color("concentration", legend=None)
        ).properties(width=742, height=30, title=f"Mean area + error bars (based on {signal_type}) - {target['name']}")

        # mean area by concentration error bars (confidence interval)
        error_bars = alt.Chart(source).mark_errorbar(extent='ci').encode(
            x=alt.X('area:Q', axis=alt.Axis(labels=False), scale=alt.Scale(zero=False)),
            y=alt.Y('concentration:N'),
            color=alt.Color("concentration", legend=None)
        ).properties(width=742, height=30)

        area.display()
        (error_points + error_bars).display()