In [1]:
import os
import pandas as pd
import matplotlib.pyplot as plt
from bokeh.plotting import figure, show
from bokeh.io import output_notebook
from bokeh.models import ColumnDataSource
from bokeh.layouts import column

from BaselineRemoval import BaselineRemoval
from scipy import signal

In [2]:
# Folder that holds meta.csv (read in a later cell).
META_DIR_PATH = "datasets"
# Folder whose entries are listed and read as the per-sample spectrum files.
DATA_DIR_PATH = "datasets/export"

In [3]:
# List everything in the export folder; `files` is reused by later cells.
files = os.listdir(DATA_DIR_PATH)
# The first four characters of each file name are taken as the strain ID;
# a set removes the duplicates and sorted() returns them as an ordered list.
strains = sorted({name[:4] for name in files})
print(strains)

['L001', 'L010', 'L011', 'L012', 'L013', 'L014', 'L100', 'L101', 'L102', 'L103', 'L104', 'L105', 'L106', 'L107', 'L108', 'L109', 'L110', 'L111', 'L112', 'L113', 'L114', 'L115', 'L116', 'L117', 'L118', 'L119', 'L120', 'L121', 'L122', 'L123', 'L124', 'L125', 'L126', 'L127', 'L128', 'L129', 'L130', 'L131', 'L132', 'L133', 'L134', 'L135', 'L136', 'L137', 'L138', 'L139', 'L140', 'L185', 'L186', 'L187', 'L291']


In [4]:
# Read the metadata table, using the first CSV column as the row index.
meta_csv_path = META_DIR_PATH + '/meta.csv'
meta = pd.read_csv(meta_csv_path, index_col=0, encoding='utf-8')
# Preview only the strain-name and serotype columns.
meta.loc[:, ['菌株名', 'Serotype', '血清型']].head()

Unnamed: 0_level_0,菌株名,Serotype,血清型
Listeria Serial No.,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
L001,LM1,1/2a,1/2a
L002,LM3,1/2a,1/2a
L003,LM4,4b,4b
L004,LM7,1/2a,1/2a
L005,LM8,1/2a,1/2a


In [12]:
# Keep only the metadata rows whose index label appears in `strains`.
# NOTE(review): this rebinds `meta`, so the unfiltered table is only
# recoverable by re-running the cell that reads meta.csv.
meta = meta.filter(items=strains, axis="index")
meta.loc[:, ['菌株名', 'Serotype', '血清型']].head()

['L001' 'L010' 'L011' 'L012' 'L013' 'L014' 'L100' 'L101' 'L102' 'L103'
 'L104' 'L105' 'L106' 'L107' 'L108' 'L109' 'L110' 'L111' 'L112' 'L113'
 'L114' 'L115' 'L116' 'L117' 'L118' 'L119' 'L120' 'L121' 'L122' 'L123'
 'L124' 'L125' 'L126' 'L127' 'L128' 'L129' 'L130' 'L131' 'L132' 'L133'
 'L134' 'L135' 'L136' 'L137' 'L138' 'L139' 'L140']


Unnamed: 0_level_0,菌株名,Serotype,血清型
Listeria Serial No.,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
L001,LM1,1/2a,1/2a
L010,LM13,1/2a,1/2a
L011,LM14,1/2a,1/2a
L012,LM15,1/2a,1/2a
L013,LM16,1/2c,1/2c


In [13]:
# Rebind `strains` to the index labels that remain in `meta`
# (it was a sorted list of file-name prefixes before this cell).
strains = meta.index.values

# Spot-check on one example file name: its first four characters select the
# metadata row whose 'Serotype' value is displayed.
file = 'L011_0_G12_1.txt'
b_no = file[:4]
serotype = meta.loc[meta.index == b_no, 'Serotype'].values[0]
serotype

'1/2a'

In [14]:
# Keep only the files whose 4-character prefix is one of the labels in `strains`.
def _has_metadata(name):
    return name[:4] in strains

files = list(filter(_has_metadata, files))
print(files)

['L001_0_F3_1.txt', 'L001_0_F4_1.txt', 'L010_0_G10_1.txt', 'L010_0_G9_1.txt', 'L011_0_G11_1.txt', 'L011_0_G12_1.txt', 'L012_0_H1_1.txt', 'L012_0_H2_1.txt', 'L013_0_H3_1.txt', 'L013_0_H4_1.txt', 'L014_0_H5_1.txt', 'L014_0_H6_1.txt', 'L100_0_G7_1.txt', 'L100_0_G8_1.txt', 'L101_0_A1_1.txt', 'L101_0_A2_1.txt', 'L102_0_A3_1.txt', 'L102_0_A4_1.txt', 'L103_0_A5_1.txt', 'L103_0_A6_1.txt', 'L104_0_A7_1.txt', 'L104_0_A8_1.txt', 'L105_0_A10_1.txt', 'L105_0_A9_1.txt', 'L106_0_A11_1.txt', 'L106_0_A12_1.txt', 'L107_0_B1_1.txt', 'L107_0_B2_1.txt', 'L108_0_B3_1.txt', 'L108_0_B4_1.txt', 'L109_0_B5_1.txt', 'L109_0_B6_1.txt', 'L110_0_B7_1.txt', 'L110_0_B8_1.txt', 'L111_0_B10_1.txt', 'L111_0_B9_1.txt', 'L112_0_B11_1.txt', 'L112_0_B12_1.txt', 'L113_0_C1_1.txt', 'L113_0_C2_1.txt', 'L114_0_C3_1.txt', 'L114_0_C4_1.txt', 'L115_0_C5_1.txt', 'L115_0_C6_1.txt', 'L116_0_C7_1.txt', 'L116_0_C8_1.txt', 'L117_0_C10_1.txt', 'L117_0_C9_1.txt', 'L118_0_C11_1.txt', 'L118_0_C12_1.txt', 'L119_0_D1_1.txt', 'L119_0_D2_1.txt',

In [15]:
%time
print ('Peak Detection with Smoothing and Baseline Correction')

for file in files:
    file_path = f"{DATA_DIR_PATH}/{file}"
    df = pd.read_table(file_path, sep=" ", header=None, names=['m/z', 'intensity']) 
    x, y = df['m/z'].to_numpy(), df['intensity'].to_numpy()
    dist, prom = 500, 500
    window, deg = 15, 2     
    smoothed = signal.savgol_filter(y, window, deg, deriv=0)
    brm = BaselineRemoval(smoothed)
    z = brm.ZhangFit(lambda_=400,repitition=15, porder=1)
    peaks,_ = signal.find_peaks(z, distance=dist, prominence=prom)
    
    b_no = file[:4]
    serotype = meta[meta.index==b_no]['Serotype'].values[0]

    print(f'{file}: {serotype} \tpeaks: {len(peaks)}')

# p = figure(x_axis_type="auto", x_axis_label='m/z',y_axis_label='rel. intensity', width=600, height=400)
# p.line(x, z, color="blue",legend_label=f'{file}')
# p.x(x[peaks], z[peaks], size=10, color="red", alpha=0.5)
# output_notebook()
# show(p)

CPU times: total: 0 ns
Wall time: 0 ns
Peak Detection with Smoothing and Baseline Correction
L001_0_F3_1.txt: 1/2a 	peaks: 61
L001_0_F4_1.txt: 1/2a 	peaks: 71
L010_0_G10_1.txt: 1/2a 	peaks: 64
L010_0_G9_1.txt: 1/2a 	peaks: 58
L011_0_G11_1.txt: 1/2a 	peaks: 63
L011_0_G12_1.txt: 1/2a 	peaks: 55
L012_0_H1_1.txt: 1/2a 	peaks: 64
L012_0_H2_1.txt: 1/2a 	peaks: 66
L013_0_H3_1.txt: 1/2c 	peaks: 65
L013_0_H4_1.txt: 1/2c 	peaks: 69
L014_0_H5_1.txt: 4b 	peaks: 57
L014_0_H6_1.txt: 4b 	peaks: 63
L100_0_G7_1.txt: 1/2b 	peaks: 55
L100_0_G8_1.txt: 1/2b 	peaks: 55
L101_0_A1_1.txt: 1/2a 	peaks: 57
L101_0_A2_1.txt: 1/2a 	peaks: 71
L102_0_A3_1.txt: 1/2b 	peaks: 66
L102_0_A4_1.txt: 1/2b 	peaks: 66
L103_0_A5_1.txt: 1/2b 	peaks: 62
L103_0_A6_1.txt: 1/2b 	peaks: 70
L104_0_A7_1.txt: 1/2a 	peaks: 67
L104_0_A8_1.txt: 1/2a 	peaks: 64
L105_0_A10_1.txt: 1/2b 	peaks: 67
L105_0_A9_1.txt: 1/2b 	peaks: 63
L106_0_A11_1.txt: 1/2a 	peaks: 60
L106_0_A12_1.txt: 1/2a 	peaks: 67
L107_0_B1_1.txt: 1/2b 	peaks: 64
L107_0_B2_1.tx

In [None]:
import similarity as sim
 