In [2]:
import os
import pandas as pd
from BaselineRemoval import BaselineRemoval
from scipy import signal

def find_peaks(y, cwt=False, smooth=False, baseline=False, *,
               window=15, polyorder=2, cwt_widths=(20,),
               distance=100, prominence=600):
    """Locate peaks in a 1-D intensity signal after optional preprocessing.

    The signal is optionally smoothed, then optionally baseline-corrected,
    and finally searched for peaks with one of two SciPy detectors.

    Parameters
    ----------
    y : array-like
        Intensity values of the signal.
    cwt : bool, default False
        If truthy, detect peaks with ``scipy.signal.find_peaks_cwt``;
        otherwise use ``scipy.signal.find_peaks``.
    smooth : bool, default False
        If truthy, apply a Savitzky-Golay filter before anything else.
    baseline : bool, default False
        If truthy, baseline-correct the (possibly smoothed) signal with
        ``BaselineRemoval(...).ZhangFit``.
    window, polyorder : int, keyword-only
        Window length and polynomial order of the Savitzky-Golay filter.
        Only used when ``smooth`` is truthy.
    cwt_widths : sequence of numbers, keyword-only
        Widths handed to ``find_peaks_cwt``. Only used when ``cwt`` is truthy.
    distance, prominence : number, keyword-only
        Forwarded to ``scipy.signal.find_peaks``. Only used when ``cwt`` is
        falsy.

    Returns
    -------
    peaks : sequence of int
        Indices of the detected peaks within the processed signal.
    processed
        The signal the detector actually ran on. This is ``y`` itself (the
        same object) when neither ``smooth`` nor ``baseline`` is requested.
    """
    processed = y
    if smooth:
        processed = signal.savgol_filter(processed, window, polyorder, deriv=0)

    if baseline:
        # `repitition` is spelled this way on purpose: it is the keyword this
        # call has always used for the BaselineRemoval package.
        corrector = BaselineRemoval(processed)
        processed = corrector.ZhangFit(lambda_=400, repitition=15, porder=1)

    if cwt:
        # The detector is given a list, exactly as the original `[20]` was.
        peaks = signal.find_peaks_cwt(processed, list(cwt_widths))
    else:
        peaks, _ = signal.find_peaks(processed, distance=distance, prominence=prominence)
    return peaks, processed

def similar_to(pk1, pk2, method='jaccard', rank=5):
    """Score how similar two peak sets are by delegating to the `sim` module.

    `pk1` and `pk2` are passed through untouched to the chosen scorer.
    With ``method='rank'`` the score comes from ``sim.rank_similarity`` and
    `rank` is forwarded as its ``good_with`` argument. Any other `method`
    value falls back to ``sim.jaccard_similarity``, which ignores `rank`.

    The module-level name `sim` must be bound before this function is called.
    """
    use_rank = method == 'rank'
    return (
        sim.rank_similarity(pk1, pk2, good_with=rank)
        if use_rank
        else sim.jaccard_similarity(pk1, pk2)
    )
    

DATA_DIR_PATH = "datasets/export"
files =[
    "L185_0_A9_1.txt", "L185_0_A10_1.txt",
    "L186_0_A11_1.txt", "L186_0_A12_1.txt",
    "L187_0_B1_1.txt", "L187_0_B2_1.txt",
    "L291_0_A1_1.txt", "L291_0_A2_1.txt",
        "L100_0_G7_1.txt", "L100_0_G8_1.txt",
        "L101_0_A1_1.txt", "L101_0_A2_1.txt",
        "L103_0_A5_1.txt", "L103_0_A6_1.txt",
        "L125_0_E1_1.txt", "L125_0_E2_1.txt",
        "L126_0_E3_1.txt", "L126_0_E4_1.txt",
        "L128_0_E7_1.txt", "L128_0_E8_1.txt",
    
]
peaks_extracted = []
n = len(files)
for i in range(n):
    df = None
    df = pd.read_table(f"{DATA_DIR_PATH}/{files[i]}",sep=" ", header=None,names=['m/z', 'intensity']) 
    x, y = df['m/z'].to_numpy(), df['intensity'].to_numpy()
    %time peaks, z =  find_peaks(y, cwt=True, smooth=False, baseline=False)
    print(f'{files[i]} peaks: {len(peaks)}')
    peaks_extracted += [(x[peaks], z[peaks])]

CPU times: user 10.5 s, sys: 14.5 ms, total: 10.5 s
Wall time: 10.5 s
L185_0_A9_1.txt peaks: 195
CPU times: user 10 s, sys: 8.77 ms, total: 10 s
Wall time: 10 s
L185_0_A10_1.txt peaks: 202
CPU times: user 9.93 s, sys: 7.64 ms, total: 9.94 s
Wall time: 9.95 s
L186_0_A11_1.txt peaks: 213
CPU times: user 9.78 s, sys: 6.76 ms, total: 9.79 s
Wall time: 9.79 s
L186_0_A12_1.txt peaks: 219
CPU times: user 9.87 s, sys: 5.62 ms, total: 9.88 s
Wall time: 9.88 s
L187_0_B1_1.txt peaks: 211
CPU times: user 9.81 s, sys: 5.37 ms, total: 9.81 s
Wall time: 9.81 s
L187_0_B2_1.txt peaks: 215
CPU times: user 9.84 s, sys: 4.18 ms, total: 9.85 s
Wall time: 9.85 s
L291_0_A1_1.txt peaks: 215
CPU times: user 9.86 s, sys: 6.12 ms, total: 9.87 s
Wall time: 9.87 s
L291_0_A2_1.txt peaks: 201
CPU times: user 9.84 s, sys: 6.39 ms, total: 9.84 s
Wall time: 9.84 s
L100_0_G7_1.txt peaks: 207
CPU times: user 9.81 s, sys: 5.38 ms, total: 9.81 s
Wall time: 9.82 s
L100_0_G8_1.txt peaks: 224
CPU times: user 9.79 s, sys: 3.85

In [7]:
import importlib

In [8]:
# Leftover interactive lookup: IPython's `?name` syntax only displays help for
# `importlib`; nothing later in the notebook depends on this cell.
?importlib

In [17]:
import similarity as sim
# Re-import the module so edits to it are picked up without restarting the kernel.
importlib.reload(sim)

# Score every unordered pair of spectra exactly once. Because j always starts
# at i + 1, a spectrum is never compared with itself and no i == j guard is
# needed.
n = len(files)
rs = [
    (i, j, similar_to(peaks_extracted[i], peaks_extracted[j], method='rank', rank=2))
    for i in range(n - 1)
    for j in range(i + 1, n)
]

# Report the pairs from most to least similar. sorted() is stable, so pairs
# with equal scores keep their (i, j) generation order.
for i, j, s in sorted(rs, key=lambda rec: rec[2], reverse=True):
    # Tag pairs with the same strain id. The id is taken as the token before
    # the first underscore (e.g. "L185") rather than a fixed 4-character
    # slice, so ids of other lengths are compared correctly.
    same_strain = files[i].split('_', 1)[0] == files[j].split('_', 1)[0]
    tag = '*' if same_strain else ''
    print(f'{files[i]} & {files[j]}\t{s:.4f}{tag}')

L128_0_E7_1.txt & L128_0_E8_1.txt	0.1397*
L185_0_A10_1.txt & L186_0_A11_1.txt	0.1212
L125_0_E2_1.txt & L128_0_E7_1.txt	0.1014
L125_0_E2_1.txt & L128_0_E8_1.txt	0.1011
L103_0_A5_1.txt & L128_0_E8_1.txt	0.0854
L100_0_G7_1.txt & L100_0_G8_1.txt	0.0810*
L185_0_A9_1.txt & L185_0_A10_1.txt	0.0791*
L103_0_A6_1.txt & L125_0_E1_1.txt	0.0777
L185_0_A10_1.txt & L186_0_A12_1.txt	0.0728
L185_0_A9_1.txt & L187_0_B2_1.txt	0.0726
L186_0_A11_1.txt & L186_0_A12_1.txt	0.0696*
L101_0_A2_1.txt & L125_0_E2_1.txt	0.0682
L101_0_A1_1.txt & L125_0_E1_1.txt	0.0677
L103_0_A6_1.txt & L126_0_E4_1.txt	0.0676
L186_0_A11_1.txt & L187_0_B2_1.txt	0.0667
L185_0_A9_1.txt & L186_0_A12_1.txt	0.0662
L101_0_A1_1.txt & L103_0_A6_1.txt	0.0629
L185_0_A10_1.txt & L187_0_B2_1.txt	0.0627
L101_0_A2_1.txt & L128_0_E7_1.txt	0.0618
L185_0_A10_1.txt & L126_0_E3_1.txt	0.0610
L103_0_A6_1.txt & L128_0_E7_1.txt	0.0609
L187_0_B2_1.txt & L291_0_A2_1.txt	0.0605
L101_0_A2_1.txt & L128_0_E8_1.txt	0.0604
L185_0_A9_1.txt & L186_0_A11_1.txt	0.0602
