In [1]:
import os
import pandas as pd
import matplotlib.pyplot as plt
from bokeh.plotting import figure, show
from bokeh.io import output_notebook
from bokeh.models import ColumnDataSource
from bokeh.layouts import column

from BaselineRemoval import BaselineRemoval
from scipy import signal

In [2]:
# Location of the exported spectra and the two files compared in this notebook.
DATA_DIR_PATH = "datasets/export"
file1 = "L185_0_A10_1.txt"
file2 = "L186_0_A11_1.txt"

# Both files are parsed identically: space-separated, no header row,
# columns named 'm/z' and 'intensity'.
read_options = dict(sep=" ", header=None, names=['m/z', 'intensity'])
df1 = pd.read_table(f"{DATA_DIR_PATH}/{file1}", **read_options)
df2 = pd.read_table(f"{DATA_DIR_PATH}/{file2}", **read_options)

# Preview the first rows of the first spectrum.
df1.head()

Unnamed: 0,m/z,intensity
0,997.552,7296
1,997.646,7368
2,997.741,7376
3,997.836,7325
4,997.931,7324


In [3]:
# Overlay both raw spectra on a single set of axes.
p = figure(
    x_axis_type="auto",
    x_axis_label='m/z',
    y_axis_label='intensity',
    width=600,
    height=400,
)
for spectrum, label, colour in ((df1, file1, "red"), (df2, file2, "green")):
    p.line(
        spectrum['m/z'].to_numpy(),
        spectrum['intensity'].to_numpy(),
        color=colour,
        legend_label=f'{label}',
    )
output_notebook()  # direct bokeh output into the notebook before showing
show(p)

In [4]:
print('Peak Detection in original spectrum')

# Work on the first spectrum as plain arrays.
mz_column, intensity_column = df1['m/z'], df1['intensity']
x = mz_column.to_numpy()
y = intensity_column.to_numpy()

# Thresholds handed to scipy.signal.find_peaks as `distance` and `prominence`.
dist = 500
prom = 500
peaks, _ = signal.find_peaks(y, distance=dist, prominence=prom)
print(f'peaks: {len(peaks)}')

p = figure(
    x_axis_type="auto",
    x_axis_label='m/z',
    y_axis_label='rel. intensity',
    width=600,
    height=400,
)
p.line(x, y, color="blue", legend_label=f'{file1}')
# Mark every detected peak with a cross on top of the trace.
p.x(x[peaks], y[peaks], size=10, color="red", alpha=0.5)
show(p)

Peak Detection in original spectrum
peaks: 58


In [11]:
# NOTE(review): find_peaks below searches the raw trace; its smoothing and
# baseline-correction steps are currently disabled, so this banner overstates
# what the cell does.
print('Peak Detection with Smoothing and Baseline Correction')

def find_peaks(y, dist=500, prom=500):
    """Locate peaks in a 1-D intensity trace.

    Thin wrapper around ``scipy.signal.find_peaks``. The smoothing and
    baseline-removal steps are currently disabled (see the comment in the
    body), so the trace is searched exactly as given.

    Parameters
    ----------
    y : array-like
        1-D intensity values.
    dist : int, optional
        Passed to ``scipy.signal.find_peaks`` as ``distance`` (minimum
        spacing, in samples, between neighbouring peaks). Defaults to 500.
    prom : float, optional
        Passed to ``scipy.signal.find_peaks`` as ``prominence`` (minimum
        required peak prominence). Defaults to 500.

    Returns
    -------
    peaks : numpy.ndarray
        Indices into ``z`` of the detected peaks.
    z : array-like
        The trace that was searched; currently ``y`` itself, unmodified.
    """
    # Disabled preprocessing, kept for reference (window=15, degree=2):
    #     smoothed = signal.savgol_filter(y, 15, 2, deriv=0)
    #     brm = BaselineRemoval(smoothed)
    #     z = brm.ZhangFit(lambda_=400,repitition=15, porder=1)
    z = y
    peaks, _ = signal.find_peaks(z, distance=dist, prominence=prom)
    return peaks, z

def _show_marked_spectrum(mz, trace, peak_idx, label, line_color, marker_color):
    """Plot ``trace`` over ``mz``, mark the samples at ``peak_idx``, show and return the figure."""
    fig = figure(
        x_axis_type="auto",
        x_axis_label='m/z',
        y_axis_label='intensity',
        width=600,
        height=400,
    )
    fig.line(mz, trace, color=line_color, legend_label=f'{label}')
    fig.x(mz[peak_idx], trace[peak_idx], size=10, color=marker_color, alpha=0.5)
    show(fig)
    return fig


# First spectrum: red trace, blue peak markers.
x = df1['m/z'].to_numpy()
y = df1['intensity'].to_numpy()
peaks, z = find_peaks(y)
pk1 = (x[peaks], z[peaks])  # (m/z, intensity) at the detected peaks
print(f'{file1} peaks: {len(peaks)}')
p1 = _show_marked_spectrum(x, z, peaks, file1, "red", "blue")

# Second spectrum: blue trace, red peak markers.
x = df2['m/z'].to_numpy()
y = df2['intensity'].to_numpy()
peaks, z = find_peaks(y)
pk2 = (x[peaks], z[peaks])  # (m/z, intensity) at the detected peaks
print(f'{file2} peaks: {len(peaks)}')
p2 = _show_marked_spectrum(x, z, peaks, file2, "blue", "red")

Peak Detection with Smoothing and Baseline Correction
L185_0_A10_1.txt peaks: 58


L186_0_A11_1.txt peaks: 59


In [12]:
import similarity as sim

In [13]:
# Compare the two peak sets; pk1 and pk2 are the (m/z, intensity) tuples
# built at the detected peaks of each spectrum.
sim.jaccard_similarity(pk1, pk2)

0.6714285714285714

In [14]:
import numpy as np
def find_peaks_cwt(y, widths=None):
    """Find peaks in a 1-D trace with ``scipy.signal.find_peaks_cwt``.

    Parameters
    ----------
    y : array-like
        1-D trace to search.
    widths : float or sequence of float, optional
        Wavelet width(s) forwarded to ``scipy.signal.find_peaks_cwt``.
        When omitted, the single width ``[50]`` is used.

    Returns
    -------
    numpy.ndarray
        Indices of the detected peaks, as returned by scipy.
    """
    if widths is None:
        # Preserve the historical default of one wavelet width of 50 samples.
        widths = [50]
    return signal.find_peaks_cwt(y, widths)
    

In [15]:
%%time
# Wavelet-based peak search on the first spectrum, using a single width of 20.
mz_column, intensity_column = df1['m/z'], df1['intensity']
x = mz_column.to_numpy()
y = intensity_column.to_numpy()
peaks = signal.find_peaks_cwt(y, [20])

print(peaks)

# NOTE: pk1 and p1 are rebound here, replacing the values from the earlier cell.
mz_at_peaks, intensity_at_peaks = x[peaks], y[peaks]
pk1 = (mz_at_peaks, intensity_at_peaks)
print(f'{file1} peaks: {len(peaks)}')

p1 = figure(
    x_axis_type="auto",
    x_axis_label='m/z',
    y_axis_label='intensity',
    width=600,
    height=400,
)
p1.line(x, y, color="red", legend_label=f'{file1}')
p1.x(mz_at_peaks, intensity_at_peaks, size=10, color="blue", alpha=0.5)

show(p1)


[   23   117   285   597   706  1349  1527  1717  1914  2205  2637  2780
  3160  3274  3700  3930  4053  4188  4350  4930  5175  5296  5484  5957
  6043  6541  7028  7122  7614  7710  8061  8274  8478  8600  8820  9194
  9325  9928 10380 10563 10668 11159 11351 11448 11648 11836 12038 12655
 13076 13156 13258 13381 13929 14184 15466 16123 16222 16528 16610 16710
 16934 17977 18414 19483 19991 20939 21036 21318 22769 22868 22959 23741
 24600 25472 25567 26609 26739 26862 27455 27835 28794 29243 29573 29757
 30585 31518 32091 32201 32278 32344 33538 33836 34130 34749 35391 36259
 36408 36735 36980 38316 38457 38849 40198 40823 41590 42164 42266 42395
 43033 43485 44288 44716 44851 44980 46318 46509 46676 46779 47159 47304
 48776 49402 49729 50510 50864 50950 51065 52522 53113 53329 53436 53508
 53875 54072 55137 55925 56399 56526 56615 56745 58395 58463 58554 58717
 59446 59603 59862 60027 60620 60711 60969 61027 61232 61409 62069 62269
 62575 62770 63095 63265 63440 63532 63572 63672 63

CPU times: user 10.3 s, sys: 10.9 ms, total: 10.3 s
Wall time: 10.3 s
