In [1]:
import os
import pandas as pd
import matplotlib.pyplot as plt
from bokeh.plotting import figure, show
from bokeh.io import output_notebook
from bokeh.models import ColumnDataSource
from bokeh.layouts import column

from BaselineRemoval import BaselineRemoval
from scipy import signal

In [2]:
# Directory holding the exported spectra and the two files compared below.
DATA_DIR_PATH = "datasets/export"
# file1 = "L187_0_B1_1.txt"
file1 = "L185_0_A9_1.txt"
file2 = "L185_0_A10_1.txt"

# Each file is read as space-separated text without a header row; the two
# columns are named explicitly.
df1, df2 = (
    pd.read_table(
        f"{DATA_DIR_PATH}/{filename}",
        sep=" ",
        header=None,
        names=['m/z', 'intensity'],
    )
    for filename in (file1, file2)
)
df1.head()

Unnamed: 0,m/z,intensity
0,997.552,6279
1,997.646,6207
2,997.741,6192
3,997.836,6163
4,997.931,6157


In [3]:
# Route bokeh output to the notebook before anything is shown.
output_notebook()

# Overlay both raw spectra on one figure, one colour per file.
p = figure(
    x_axis_type="auto",
    x_axis_label='m/z',
    y_axis_label='intensity',
    width=600,
    height=400,
)
for spectrum, label, colour in ((df1, file1, "red"), (df2, file2, "green")):
    p.line(
        spectrum['m/z'].to_numpy(),
        spectrum['intensity'].to_numpy(),
        color=colour,
        legend_label=f'{label}',
    )
show(p)

In [4]:
print('Peak Detection in original spectrum')

# Raw, unprocessed spectrum of the first file.
x, y = (df1[column].to_numpy() for column in ('m/z', 'intensity'))

# The same value is used for both find_peaks thresholds.
dist = prom = 500
peaks, _ = signal.find_peaks(y, prominence=prom, distance=dist)
print(f'peaks: {len(peaks)}')

# Spectrum as a line, detected peaks as "x" markers on top of it.
p = figure(
    x_axis_type="auto",
    x_axis_label='m/z',
    y_axis_label='rel. intensity',
    width=600,
    height=400,
)
p.line(x, y, color="blue", legend_label=f'{file1}')
p.x(x[peaks], y[peaks], size=10, color="red", alpha=0.5)
show(p)

Peak Detection in original spectrum
peaks: 59


In [5]:
# Banner for the output of this cell.
print('Peak Detection with Smoothing and Baseline Correction')

def find_peaks(
    y,
    *,
    distance=500,
    prominence=500,
    window=15,
    polyorder=2,
    baseline_lambda=400,
    baseline_iterations=15,
    baseline_porder=1,
):
    """Smooth a signal, remove its baseline and locate peaks in the result.

    The input is smoothed with ``scipy.signal.savgol_filter``, passed through
    ``BaselineRemoval(...).ZhangFit`` and the corrected signal is searched
    with ``scipy.signal.find_peaks``. All tuning values are keyword-only and
    default to the constants that were previously hard-coded, so
    ``find_peaks(y)`` behaves as before.

    Parameters
    ----------
    y : array-like
        1-D signal (the notebook passes the ``intensity`` column).
    distance : number, optional
        Forwarded to ``scipy.signal.find_peaks`` as ``distance``.
    prominence : number, optional
        Forwarded to ``scipy.signal.find_peaks`` as ``prominence``.
    window : int, optional
        Window length given to ``savgol_filter``.
    polyorder : int, optional
        Polynomial order given to ``savgol_filter``.
    baseline_lambda : number, optional
        Forwarded to ``ZhangFit`` as ``lambda_``.
    baseline_iterations : int, optional
        Forwarded to ``ZhangFit`` as ``repitition``.
    baseline_porder : int, optional
        Forwarded to ``ZhangFit`` as ``porder``.

    Returns
    -------
    peaks : ndarray
        Indices into ``z`` of the detected peaks.
    z
        The smoothed, baseline-corrected signal returned by ``ZhangFit``.
    """
    smoothed = signal.savgol_filter(y, window, polyorder, deriv=0)
    # "repitition" is the keyword spelling the ZhangFit call in this notebook
    # already uses; it must not be "corrected".
    z = BaselineRemoval(smoothed).ZhangFit(
        lambda_=baseline_lambda,
        repitition=baseline_iterations,
        porder=baseline_porder,
    )
    peaks, _ = signal.find_peaks(z, distance=distance, prominence=prominence)
    return peaks, z

def _analyze_spectrum(df, label, line_color, marker_color):
    """Run find_peaks on one spectrum, report the peak count and build its plot.

    Returns the m/z values, the raw intensities, the peak indices, the
    corrected signal and the (not yet shown) bokeh figure.
    """
    mz = df['m/z'].to_numpy()
    intensity = df['intensity'].to_numpy()
    peak_idx, corrected = find_peaks(intensity)
    print(f'{label} peaks: {len(peak_idx)}')
    fig = figure(
        x_axis_type="auto",
        x_axis_label='m/z',
        y_axis_label='rel. intensity',
        width=600,
        height=400,
    )
    fig.line(mz, corrected, color=line_color, legend_label=f'{label}')
    fig.x(mz[peak_idx], corrected[peak_idx], size=10, color=marker_color, alpha=0.5)
    return mz, intensity, peak_idx, corrected, fig


# First spectrum: red trace, blue peak markers.
x, y, peaks, z, p1 = _analyze_spectrum(df1, file1, "red", "blue")
pk1 = (x[peaks], z[peaks])
show(p1)

# Second spectrum: blue trace, red peak markers.
x, y, peaks, z, p2 = _analyze_spectrum(df2, file2, "blue", "red")
pk2 = (x[peaks], z[peaks])
show(p2)

Peak Detection with Smoothing and Baseline Correction
L185_0_A9_1.txt peaks: 63


L185_0_A10_1.txt peaks: 61


In [6]:
import similarity as sim

In [7]:
# Score the two (m/z, intensity) peak tuples against each other.
# NOTE(review): pk1 is rebound by a later cell, so the result depends on run order.
sim.jaccard_similarity(pk1, pk2)

0.8524590163934426

In [45]:
import numpy as np
def find_peaks_cwt(y, widths=None):
    """Locate peaks in ``y`` with ``scipy.signal.find_peaks_cwt``.

    Parameters
    ----------
    y : array-like
        1-D signal to search.
    widths : sequence or array of numbers, optional
        Widths forwarded to ``scipy.signal.find_peaks_cwt``. When omitted,
        ``[50]`` is used, which was the previously hard-coded value.

    Returns
    -------
    The peak indices returned by ``scipy.signal.find_peaks_cwt``.
    """
    # ``None`` sentinel avoids a mutable default and works for array inputs.
    if widths is None:
        widths = [50]
    return signal.find_peaks_cwt(y, widths)
    

In [53]:
%%time
# Wavelet-based peak detection on the raw first spectrum, using a single width of 20.
x, y = (df1[column].to_numpy() for column in ('m/z', 'intensity'))
peaks = signal.find_peaks_cwt(y, [20])

print(peaks)

# NOTE(review): this rebinds pk1 and p1, which an earlier cell also assigns.
pk1 = (x[peaks], y[peaks])
print(f'{file1} peaks: {len(peaks)}')

p1 = figure(
    x_axis_type="auto",
    x_axis_label='m/z',
    y_axis_label='intensity',
    width=600,
    height=400,
)
p1.line(x, y, color="red", legend_label=f'{file1}')
p1.x(x[peaks], y[peaks], size=10, color="blue", alpha=0.5)

show(p1)


[   25   115   286   497   596  1524  1716  2205  2488  2635  3159  3274
  3708  4053  4186  4352  4932  5175  5295  5483  5958  6043  6265  6540
  7119  7613  7704  8059  8269  8458  8596  8820  9191  9326  9550  9924
 10378 10565 10666 11158 11354 11443 11839 12041 12649 13155 13255 13374
 13533 13625 13928 14183 15464 16123 16223 16529 16608 16706 16934 17975
 18413 19483 19998 20939 21316 22767 22958 23741 24283 24600 25471 26605
 26739 26860 27454 27853 28792 29242 29575 30584 31518 32089 32199 32276
 32342 33539 33837 34128 34261 34747 36258 36402 36734 36980 38315 38444
 38846 39295 40622 41589 42161 42264 42405 43033 43484 44715 44847 46318
 46504 46675 47152 47300 47485 48944 49400 49520 49883 49973 50178 50510
 50862 50953 51029 51101 52032 52105 52174 52889 53072 53423 53503 53775
 53884 54099 54418 54753 54968 55140 55473 55546 55617 56093 56458 56524
 56738 57934 58593 58654 58719 58876 59460 60623 60703 60762 61410 62569
 63016 63448 63571 63662 63902 63975 64237 64474 64

CPU times: user 10.4 s, sys: 12.9 ms, total: 10.4 s
Wall time: 10.4 s
