In [1]:
import os
import pandas as pd
import matplotlib.pyplot as plt
from bokeh.plotting import figure, show
from bokeh.io import output_notebook
from bokeh.models import ColumnDataSource
from bokeh.layouts import column

from BaselineRemoval import BaselineRemoval
from scipy import signal

In [3]:
# Directory holding the exported spectra and the two file stems to compare.
DATA_DIR_PATH = "datasets/export"
# Alternative pair: "L185_0_A10_1", "L186_0_A11_1"
file1 = "L103_0_A5_1"
file2 = "L103_0_A6_1"

# Both exports are read the same way: space-separated, no header row,
# columns named 'm/z' and 'intensity'.
df1, df2 = (
    pd.read_table(
        f"{DATA_DIR_PATH}/{stem}.txt",
        sep=" ",
        header=None,
        names=['m/z', 'intensity'],
    )
    for stem in (file1, file2)
)
df1.head()

Unnamed: 0,m/z,intensity
0,1000.322,2552
1,1000.417,2515
2,1000.512,2498
3,1000.606,2506
4,1000.701,2544


In [4]:
# Overlay both raw spectra on a single figure for a quick visual comparison.
output_notebook()  # render Bokeh output inline in the notebook
p = figure(
    x_axis_type="auto",
    x_axis_label='m/z',
    y_axis_label='intensity',
    width=600,
    height=400,
)
for spectrum, stem, colour in ((df1, file1, "red"), (df2, file2, "green")):
    p.line(
        spectrum['m/z'].to_numpy(),
        spectrum['intensity'].to_numpy(),
        color=colour,
        legend_label=f'{stem}',
    )
show(p)

In [5]:
# Reference run: detect peaks directly on the first spectrum, with no preprocessing.
print ('Peak Detection in original spectrum')

x = df1['m/z'].to_numpy()
y = df1['intensity'].to_numpy()

# Peaks must be at least `dist` samples apart and reach a prominence of at least `prom`.
dist, prom = 500, 500
# Index the result instead of unpacking into `_`: rebinding `_` at notebook top
# level stops IPython from updating its last-output variable.
peaks = signal.find_peaks(y, distance=dist, prominence=prom)[0]
print(f'peaks: {len(peaks)}')

# The trace is the unmodified 'intensity' column, so the axis is labelled
# 'intensity' like the other figures in this notebook (it was 'rel. intensity').
p = figure(x_axis_type="auto", x_axis_label='m/z', y_axis_label='intensity', width=600, height=400)
p.line(x, y, color="blue", legend_label=f'{file1}')
p.x(x[peaks], y[peaks], size=10, color="red", alpha=0.5)
show(p)

Peak Detection in original spectrum
peaks: 60


In [6]:
# Banner for this cell's output.
# NOTE(review): the find_peaks helper defined in this cell currently passes the
# signal to the peak finder unchanged (z = y), so no smoothing or baseline
# correction is actually applied despite what this banner says.
print ('Peak Detection with Smoothing and Baseline Correction')

def find_peaks(y, distance=500, prominence=500):
    """Locate peaks in an intensity trace with ``scipy.signal.find_peaks``.

    Preprocessing is currently disabled: the input is handed to the peak
    finder unchanged. The pipeline that was tried earlier and switched off was
    ``signal.savgol_filter(y, 15, 2, deriv=0)`` followed by
    ``BaselineRemoval(smoothed).ZhangFit(lambda_=400, repitition=15, porder=1)``.

    Parameters
    ----------
    y : array-like
        1-D sequence of intensity values.
    distance : number, optional
        Forwarded as ``distance`` to ``scipy.signal.find_peaks`` (minimum
        spacing between neighbouring peaks, in samples). Defaults to 500,
        the value that used to be hard-coded.
    prominence : number, optional
        Forwarded as ``prominence`` to ``scipy.signal.find_peaks``.
        Defaults to 500, the value that used to be hard-coded.

    Returns
    -------
    peaks : numpy.ndarray
        Indices into ``z`` of the detected peaks.
    z : same object as ``y``
        The signal that was searched; returned so callers can plot exactly
        what the peak finder saw.
    """
    z = y  # no smoothing / baseline correction applied
    peaks, _ = signal.find_peaks(z, distance=distance, prominence=prominence)
    return peaks, z

# --- Spectrum 1: detect peaks, keep their (m/z, intensity) coordinates, plot ---
x, y = (df1[col].to_numpy() for col in ('m/z', 'intensity'))
peaks, z = find_peaks(y)
pk1 = (x[peaks], z[peaks])
print(f'{file1} peaks: {len(peaks)}')

p1 = figure(
    x_axis_type="auto",
    x_axis_label='m/z',
    y_axis_label='intensity',
    width=600,
    height=400,
)
p1.line(x, z, color="red", legend_label=f'{file1}')
p1.x(pk1[0], pk1[1], size=10, color="blue", alpha=0.5)
show(p1)

# --- Spectrum 2: same steps; x, y, peaks and z are rebound to this spectrum ---
x, y = (df2[col].to_numpy() for col in ('m/z', 'intensity'))
peaks, z = find_peaks(y)
pk2 = (x[peaks], z[peaks])
print(f'{file2} peaks: {len(peaks)}')

p2 = figure(
    x_axis_type="auto",
    x_axis_label='m/z',
    y_axis_label='intensity',
    width=600,
    height=400,
)
p2.line(x, z, color="blue", legend_label=f'{file2}')
p2.x(pk2[0], pk2[1], size=10, color="red", alpha=0.5)
show(p2)

Peak Detection with Smoothing and Baseline Correction
L103_0_A5_1 peaks: 60


L103_0_A6_1 peaks: 69


In [7]:
import similarity as sim

In [8]:
# Score the overlap between the two peak lists. pk1 and pk2 are each a tuple of
# (m/z values at the peaks, signal values at the peaks).
# NOTE(review): jaccard_similarity comes from the project-local `similarity`
# module, which is not shown here — confirm how it treats these tuples and
# whether it applies any tolerance when matching floating-point m/z values.
sim.jaccard_similarity(pk1,pk2)

0.4659090909090909

In [9]:
import numpy as np
def find_peaks_cwt(y, widths=(50,)):
    """Find peaks in ``y`` using ``scipy.signal.find_peaks_cwt``.

    Parameters
    ----------
    y : array-like
        1-D signal to search.
    widths : number or sequence of numbers, optional
        Forwarded unchanged as the ``widths`` argument of
        ``scipy.signal.find_peaks_cwt``. Defaults to a single width of 50,
        the value that used to be hard-coded, so existing one-argument calls
        behave as before.

    Returns
    -------
    Whatever ``scipy.signal.find_peaks_cwt`` returns: the indices in ``y``
    where peaks were found.
    """
    return signal.find_peaks_cwt(y, widths)
    

In [10]:
%%time
# Wavelet-based peak picking on the first spectrum, using a single CWT width of 20.
x, y = (df1[col].to_numpy() for col in ('m/z', 'intensity'))
peaks = signal.find_peaks_cwt(y, [20])

print(peaks)

# pk1 and p1 are rebound here, replacing the values assigned by the
# prominence-based peak detection cell earlier in the notebook.
pk1 = (x[peaks], y[peaks])
print(f'{file1} peaks: {len(peaks)}')

p1 = figure(
    x_axis_type="auto",
    x_axis_label='m/z',
    y_axis_label='intensity',
    width=600,
    height=400,
)
p1.line(x, y, color="red", legend_label=f'{file1}')
p1.x(pk1[0], pk1[1], size=10, color="blue", alpha=0.5)

show(p1)


[   16    79   466   564   804  1303  1687  2081  2166  2274  2608  3123
  3248  3682  4025  4088  4153  4325  4897  5141  5276  5451  5716  5788
  5925  6011  6360  6511  7091  7585  7679  8023  8245  8455  9295  9590
  9746  9891 10343 10425 10635 11125 11326 11807 11895 12008 13118 13232
 13604 13898 14140 15436 16196 16501 16585 16905 17831 17949 18383 19084
 19456 19556 20329 20912 21289 22734 22924 23636 23713 24256 24572 25443
 25537 26578 26833 27807 28761 29548 29861 30459 30555 32062 32171 32236
 32314 33809 33965 34101 34716 35361 35708 36231 36378 36707 38288 38427
 38816 40598 41547 42137 42239 42369 43004 43458 44264 44688 44817 46290
 46520 46649 47140 48528 48740 49373 49531 49942 50482 50917 51021 51126
 51487 51930 52431 52623 52708 52880 53069 53324 53559 54083 54515 54615
 54742 54849 55106 55395 55818 55912 56495 56581 56721 56924 57019 57249
 57909 58692 59086 60270 60455 60593 60684 60769 60883 60992 61165 61283
 62548 62762 63414 63534 63641 63894 64014 64184 64

CPU times: total: 7.89 s
Wall time: 8.22 s
