In [1]:
import os
import pandas as pd
from BaselineRemoval import BaselineRemoval
from scipy import signal
import pickle
import similarity as sim

def find_peaks(y, cwt=False, smooth=False, baseline=False):
    window, deg = 15, 2     
    z = y
    if smooth:
        z = signal.savgol_filter(y, window, deg, deriv=0)

    if baseline:
        brm = BaselineRemoval(z)
        z = brm.ZhangFit(lambda_=400,repitition=15, porder=1)
    if cwt == True:
        peaks = signal.find_peaks_cwt(z, [20])  
    else:
        dist, prom = 100, 600
        peaks,_ = signal.find_peaks(z, distance=dist, prominence=prom)
    return peaks, z

def similar_to(pk1, pk2, method='jaccard', rank=5):
    if method == 'rank':
        return sim.rank_similarity(pk1, pk2, good_with=rank)
    if method == 'weighted':
        return sim.rank_similarity(pk1, pk2, good_with=rank, weighted=True)
    else:
        return sim.jaccard_similarity(pk1, pk2)

In [2]:
DATA_DIR_PATH = "datasets/export"
META_DIR_PATH = "doc"
PEAK_DIR_PATH = "peaks"
files =[
    "L185_0_A9_1", "L185_0_A10_1",
    "L186_0_A11_1", "L186_0_A12_1",
    "L187_0_B1_1", "L187_0_B2_1",
    "L291_0_A1_1", "L291_0_A2_1",
    "L100_0_G7_1", "L100_0_G8_1",
    "L101_0_A1_1", "L101_0_A2_1",
    "L103_0_A5_1", "L103_0_A6_1",
    "L125_0_E1_1", "L125_0_E2_1",
    "L126_0_E3_1", "L126_0_E4_1",
    "L128_0_E7_1", "L128_0_E8_1",   
]

In [3]:
meta = pd.read_csv(META_DIR_PATH + '/meta.csv', index_col=0,encoding='utf-8')
meta[['菌株名','血清型']].head()

Unnamed: 0_level_0,菌株名,血清型
MALDITOFMS Listeria serial No.,Unnamed: 1_level_1,Unnamed: 2_level_1
L001,LM1,1/2a
L002,LM3,1/2a
L003,LM4,4b
L004,LM7,1/2a
L005,LM8,1/2a


In [4]:
serotype = meta['血清型'].to_dict()

In [5]:
peaks_extracted = []
n = len(files)
for i in range(n):
    df = None
    df = pd.read_table(f"{DATA_DIR_PATH}/{files[i]}.txt",sep=" ", header=None,names=['m/z', 'intensity']) 
    x, y = df['m/z'].to_numpy(), df['intensity'].to_numpy()

    #  Peak detection in new datasets (time-consuming) 

#     %time peaks, _ =  find_peaks(y, cwt=True, smooth=True, baseline=True)
#     print(f'{files[i]} peaks: {len(peaks)}')
#     with  open(f'{PEAK_DIR_PATH}/{files[i]}_peaks_ext.pkl', 'wb') as peak_file:
#         pickle.dump(peaks, peak_file)
#         peaks_extracted += [(x[peaks], y[peaks])]

   #  Read peaks from pickle files

    with open(f'{PEAK_DIR_PATH}/{files[i]}_peaks_ext.pkl', 'rb') as peak_file:
        peaks = pickle.load(peak_file)
        st = serotype.get(files[i][:4],'NA')
        print(f'{files[i]} ({st}) peaks: {len(peaks)}')
        peaks_extracted +=  [(x[peaks], y[peaks]) ] 

L185_0_A9_1 (NA) peaks: 197
L185_0_A10_1 (NA) peaks: 199
L186_0_A11_1 (NA) peaks: 208
L186_0_A12_1 (NA) peaks: 220
L187_0_B1_1 (NA) peaks: 205
L187_0_B2_1 (NA) peaks: 220
L291_0_A1_1 (NA) peaks: 221
L291_0_A2_1 (NA) peaks: 200
L100_0_G7_1 (1/2b) peaks: 208
L100_0_G8_1 (1/2b) peaks: 226
L101_0_A1_1 (1/2a) peaks: 226
L101_0_A2_1 (1/2a) peaks: 195
L103_0_A5_1 (1/2b) peaks: 192
L103_0_A6_1 (1/2b) peaks: 188
L125_0_E1_1 (1/2a) peaks: 211
L125_0_E2_1 (1/2a) peaks: 192
L126_0_E3_1 (1/2b) peaks: 191
L126_0_E4_1 (1/2b) peaks: 194
L128_0_E7_1 (1/2a) peaks: 198
L128_0_E8_1 (1/2a) peaks: 189


In [6]:
import importlib
n = len(files)

In [9]:
importlib.reload(sim)
rs = []
print('Rank_Similarity:')
for i in range(n-1):
    for j in range(i+1,n):
        score = similar_to(peaks_extracted[i], peaks_extracted[j], method='rank', rank=2)
        rs += [(i, j, score)]

rs = sorted(rs, key=lambda x: x[2], reverse=True)
algo_score=0
for k, v in enumerate(rs):
    i, j, s = v
    tag = '*' if files[i][:4]==files[j][:4] else '' # tag the same strain id
    st1 =  serotype.get(files[i][:4],'NA')
    st2 =  serotype.get(files[j][:4],'NA')
    if tag=='*':
        algo_score += k
        print(f'#{k+1:2d}{tag}:\t{files[i]}[{st1}] & {files[j]}[{st2}]\t{s:.4f}{tag}')

p = n/2      # pairs of files   
print(f'algo_score={2*algo_score/float(p*(p+1)):.2f} (closer to 1.0 is better )')

Rank_Similarity:
# 3*:	L128_0_E7_1[1/2a] & L128_0_E8_1[1/2a]	0.1015*
#13*:	L185_0_A9_1[NA] & L185_0_A10_1[NA]	0.0753*
#14*:	L291_0_A1_1[NA] & L291_0_A2_1[NA]	0.0723*
#15*:	L100_0_G7_1[1/2b] & L100_0_G8_1[1/2b]	0.0708*
#21*:	L126_0_E3_1[1/2b] & L126_0_E4_1[1/2b]	0.0599*
#31*:	L125_0_E1_1[1/2a] & L125_0_E2_1[1/2a]	0.0550*
#32*:	L186_0_A11_1[NA] & L186_0_A12_1[NA]	0.0541*
#43*:	L103_0_A5_1[1/2b] & L103_0_A6_1[1/2b]	0.0481*
#54*:	L187_0_B1_1[NA] & L187_0_B2_1[NA]	0.0440*
#56*:	L101_0_A1_1[1/2a] & L101_0_A2_1[1/2a]	0.0438*
algo_score=4.95 (closer to 1.0 is better )


In [11]:
importlib.reload(sim)
rs = []
print('Weighted_Similarity:')
for i in range(n-1):
    for j in range(i+1,n):
        score = similar_to(peaks_extracted[i], peaks_extracted[j], method='weighted', rank=2)
        rs += [(i, j, score)]

rs = sorted(rs, key=lambda x: x[2], reverse=True)
algo_score=0
for k, v in enumerate(rs):
    i, j, s = v
    tag = '*' if files[i][:4]==files[j][:4] else '' # tag the same strain id
    if tag=='*':
        print(f'#{k+1:2d}{tag}:\t{files[i]} & {files[j]}\t{s:.4f}{tag}')
        algo_score += k;      

p = n/2      # pairs of files         
print(f'algo_score={2*algo_score/float(p*(p+1)):.2f} (closer to 1.0 is better )')

Weighted_Similarity:
# 1*:	L128_0_E7_1 & L128_0_E8_1	0.0372*
# 9*:	L185_0_A9_1 & L185_0_A10_1	0.0280*
#12*:	L126_0_E3_1 & L126_0_E4_1	0.0268*
#13*:	L291_0_A1_1 & L291_0_A2_1	0.0264*
#14*:	L125_0_E1_1 & L125_0_E2_1	0.0258*
#15*:	L101_0_A1_1 & L101_0_A2_1	0.0258*
#25*:	L100_0_G7_1 & L100_0_G8_1	0.0238*
#31*:	L187_0_B1_1 & L187_0_B2_1	0.0231*
#75*:	L103_0_A5_1 & L103_0_A6_1	0.0185*
#86*:	L186_0_A11_1 & L186_0_A12_1	0.0179*
algo_score=4.93 (closer to 1.0 is better )
