In [1]:
import os
import pandas as pd
import pickle
import biotype.similarity as sim

In [2]:
# Locations of the raw spectrum exports, the strain metadata table and the
# pickled peak indices.
DATA_DIR_PATH = "datasets/export"
META_FILE_PATH = "datasets/meta.csv"
PEAK_DIR_PATH = "extracted_peaks"

# Collect the spectrum exports (*.txt) in sorted file-name order.
files = sorted(
    entry for entry in os.listdir(DATA_DIR_PATH) if entry.endswith(".txt")
)

# Preview the first five file names.
for file in files[:5]:
    print(file)

L100_0_G7_1.txt
L100_0_G8_1.txt
L101_0_A1_1.txt
L101_0_A2_1.txt
L102_0_A3_1.txt


In [3]:
# Strain metadata; the first CSV column becomes the index, which later cells
# look up by the strain id taken from each file name.
meta = pd.read_csv(
    META_FILE_PATH,
    index_col=0,
    encoding='utf-8',
)

# Preview the strain-name and serotype columns.
preview_columns = ['菌株名', '血清型']
meta.loc[:, preview_columns].head()

Unnamed: 0_level_0,菌株名,血清型
MALDITOFMS Listeria serial No.,Unnamed: 1_level_1,Unnamed: 2_level_1
L1,LM1,1/2a
L2,LM3,1/2a
L3,LM4,4b
L4,LM7,1/2a
L5,LM8,1/2a


In [4]:
# Map each metadata index value (strain id) to its serotype.
# Empty serotype cells are read by pandas as float NaN, which would be printed
# as "nan"; fill them with 'NA' so they render the same as the
# `serotype.get(strain, 'NA')` fallback used for strains missing from the table.
serotype = meta['血清型'].fillna('NA').to_dict()

In [5]:
# For every spectrum file, load the raw (m/z, intensity) table and keep only
# the values at the peak positions stored in the matching pickle file.
# `peaks_extracted[k]` corresponds to `files[k]`.
peaks_extracted = []
n = len(files)
for file_name in files:
    df = pd.read_table(f"{DATA_DIR_PATH}/{file_name}", sep=" ", header=None, names=['m/z', 'intensity'])
    x, y = df['m/z'].to_numpy(), df['intensity'].to_numpy()

    pickle_file = os.path.splitext(file_name)[0]  # drop the '.txt' extension
    # NOTE(security): pickle.load can execute arbitrary code embedded in the
    # file. Only load peak files that were produced locally by a trusted step.
    with open(f'{PEAK_DIR_PATH}/{pickle_file}_peaks.pkl', 'rb') as peak_file:
        peaks = pickle.load(peak_file)

    # The file-name prefix before the first '_' is the strain id used as the
    # metadata key.
    strain = file_name.split('_')[0]
    st = serotype.get(strain, 'NA')
    print(f'{pickle_file} ({st}) peaks: {len(peaks)}')

    # `peaks` is used to index the arrays, selecting the peak m/z and intensity.
    peaks_extracted.append((x[peaks], y[peaks]))
        

L100_0_G7_1 (1/2b) peaks: 208
L100_0_G8_1 (1/2b) peaks: 226
L101_0_A1_1 (1/2a) peaks: 226
L101_0_A2_1 (1/2a) peaks: 195
L102_0_A3_1 (1/2b) peaks: 208
L102_0_A4_1 (1/2b) peaks: 200
L103_0_A5_1 (1/2b) peaks: 192
L103_0_A6_1 (1/2b) peaks: 188
L104_0_A7_1 (1/2a) peaks: 193
L104_0_A8_1 (1/2a) peaks: 221
L105_0_A10_1 (1/2b) peaks: 186
L105_0_A9_1 (1/2b) peaks: 204
L106_0_A11_1 (1/2a) peaks: 208
L106_0_A12_1 (1/2a) peaks: 196
L107_0_B1_1 (1/2b) peaks: 190
L107_0_B2_1 (1/2b) peaks: 195
L108_0_B3_1 (UT) peaks: 213
L108_0_B4_1 (UT) peaks: 186
L109_0_B5_1 (UT) peaks: 214
L109_0_B6_1 (UT) peaks: 205
L10_0_G10_1 (1/2a) peaks: 189
L10_0_G9_1 (1/2a) peaks: 197
L110_0_B7_1 (UT) peaks: 190
L110_0_B8_1 (UT) peaks: 203
L111_0_B10_1 (UT) peaks: 240
L111_0_B9_1 (UT) peaks: 214
L112_0_B11_1 (1/2a) peaks: 226
L112_0_B12_1 (1/2a) peaks: 223
L113_0_C1_1 (UT) peaks: 202
L113_0_C2_1 (UT) peaks: 210
L114_0_C3_1 (UT) peaks: 289
L114_0_C4_1 (UT) peaks: 265
L115_0_C5_1 (3a) peaks: 207
L115_0_C6_1 (3a) peaks: 195
L11

In [6]:
# Score every unordered pair of spectra with the 'jaccard' method, sort the
# pairs from most to least similar, and report where the same-strain pairs
# land in that ranking.
n = len(files)
print('Jaccard_Similarity:')
rs = [
    (i, j, sim.score(peaks_extracted[i], peaks_extracted[j], method='jaccard'))
    for i in range(n - 1)
    for j in range(i + 1, n)
]
rs.sort(key=lambda r: r[2], reverse=True)  # highest similarity first

algo_score = 0  # sum of the 1-based ranks of all same-strain pairs
p = 0           # number of same-strain pairs actually found
for rank, (i, j, s) in enumerate(rs, start=1):
    strain1, strain2 = files[i].split('_')[0], files[j].split('_')[0]
    if strain1 != strain2:
        continue
    tag = '*'  # marks a pair of spectra that share the same strain id
    stype1, stype2 = serotype.get(strain1, 'NA'), serotype.get(strain2, 'NA')
    # Use the 1-based rank (the same number that is printed) so that a perfect
    # ranking sums to 1 + 2 + ... + p = p*(p+1)/2 and normalises to exactly 1.0.
    algo_score += rank
    p += 1
    print(f'#{rank:4d}{tag}:\t{files[i][:-4]}[{stype1}] & {files[j][:-4]}[{stype2}]\t{s:.4f}{tag}')

# Normalise by the best achievable rank sum; 1.0 is the minimum (best) value.
if p:
    print(f'algo_score={2*algo_score/float(p*(p+1)):.2f} (closer to 1.0 is better )')
else:
    print('algo_score=nan (no same-strain pairs found)')

Jaccard_Similarity:
#   6*:	L121_0_D5_1[1/2b] & L121_0_D6_1[1/2b]	0.4802*
#   7*:	L129_0_E10_1[1/2b] & L129_0_E9_1[1/2b]	0.4781*
#  12*:	L128_0_E7_1[1/2a] & L128_0_E8_1[1/2a]	0.4715*
#  16*:	L12_0_H1_1[1/2a] & L12_0_H2_1[1/2a]	0.4683*
#  21*:	L116_0_C7_1[nan] & L116_0_C8_1[nan]	0.4672*
#  23*:	L138_0_G3_1[1/2c] & L138_0_G4_1[1/2c]	0.4662*
#  43*:	L126_0_E3_1[1/2b] & L126_0_E4_1[1/2b]	0.4474*
#  45*:	L122_0_D7_1[1/2b] & L122_0_D8_1[1/2b]	0.4472*
#  49*:	L123_0_D10_1[1/2a] & L123_0_D9_1[1/2a]	0.4457*
#  51*:	L119_0_D1_1[1/2a] & L119_0_D2_1[1/2a]	0.4440*
#  60*:	L139_0_G5_1[1/2c] & L139_0_G6_1[1/2c]	0.4424*
#  65*:	L13_0_H3_1[1/2c] & L13_0_H4_1[1/2c]	0.4407*
#  69*:	L110_0_B7_1[UT] & L110_0_B8_1[UT]	0.4396*
#  80*:	L107_0_B1_1[1/2b] & L107_0_B2_1[1/2b]	0.4366*
#  89*:	L132_0_F3_1[4b] & L132_0_F4_1[4b]	0.4343*
# 108*:	L101_0_A1_1[1/2a] & L101_0_A2_1[1/2a]	0.4320*
# 110*:	L102_0_A3_1[1/2b] & L102_0_A4_1[1/2b]	0.4316*
# 129*:	L105_0_A10_1[1/2b] & L105_0_A9_1[1/2b]	0.4286*
# 162*:	L135_0_F10_

In [18]:
# Score every unordered pair of spectra with the 'rank' method, sort the
# pairs from most to least similar, and report where the same-strain pairs
# land in that ranking.
n = len(files)  # defined here so the cell does not rely on an earlier cell's `n`
print('Rank_Similarity:')
# NOTE(review): rank=30 is passed straight to sim.score; presumably it limits
# the comparison to the top-ranked peaks -- confirm in biotype.similarity.
rs = [
    (i, j, sim.score(peaks_extracted[i], peaks_extracted[j], method='rank', rank=30))
    for i in range(n - 1)
    for j in range(i + 1, n)
]
rs.sort(key=lambda r: r[2], reverse=True)  # highest similarity first

algo_score = 0  # sum of the 1-based ranks of all same-strain pairs
p = 0           # number of same-strain pairs actually found
for rank, (i, j, s) in enumerate(rs, start=1):
    strain1, strain2 = files[i].split('_')[0], files[j].split('_')[0]
    if strain1 != strain2:
        continue
    tag = '*'  # marks a pair of spectra that share the same strain id
    stype1, stype2 = serotype.get(strain1, 'NA'), serotype.get(strain2, 'NA')
    # Use the 1-based rank (the same number that is printed) so that a perfect
    # ranking sums to 1 + 2 + ... + p = p*(p+1)/2 and normalises to exactly 1.0.
    algo_score += rank
    p += 1
    print(f'#{rank:4d}{tag}:\t{files[i][:-4]}[{stype1}] & {files[j][:-4]}[{stype2}]\t{s:.4f}{tag}')

# Normalise by the best achievable rank sum; 1.0 is the minimum (best) value.
if p:
    print(f'algo_score={2*algo_score/float(p*(p+1)):.2f} (closer to 1.0 is better )')
else:
    print('algo_score=nan (no same-strain pairs found)')

Rank_Similarity:
#   3*:	L121_0_D5_1[1/2b] & L121_0_D6_1[1/2b]	0.4802*
#   6*:	L129_0_E10_1[1/2b] & L129_0_E9_1[1/2b]	0.4781*
#   8*:	L128_0_E7_1[1/2a] & L128_0_E8_1[1/2a]	0.4715*
#  10*:	L12_0_H1_1[1/2a] & L12_0_H2_1[1/2a]	0.4683*
#  12*:	L116_0_C7_1[nan] & L116_0_C8_1[nan]	0.4672*
#  14*:	L138_0_G3_1[1/2c] & L138_0_G4_1[1/2c]	0.4662*
#  33*:	L122_0_D7_1[1/2b] & L122_0_D8_1[1/2b]	0.4472*
#  36*:	L119_0_D1_1[1/2a] & L119_0_D2_1[1/2a]	0.4440*
#  42*:	L139_0_G5_1[1/2c] & L139_0_G6_1[1/2c]	0.4424*
#  44*:	L13_0_H3_1[1/2c] & L13_0_H4_1[1/2c]	0.4407*
#  45*:	L110_0_B7_1[UT] & L110_0_B8_1[UT]	0.4396*
#  48*:	L123_0_D10_1[1/2a] & L123_0_D9_1[1/2a]	0.4384*
#  51*:	L107_0_B1_1[1/2b] & L107_0_B2_1[1/2b]	0.4366*
#  61*:	L102_0_A3_1[1/2b] & L102_0_A4_1[1/2b]	0.4316*
#  66*:	L132_0_F3_1[4b] & L132_0_F4_1[4b]	0.4307*
# 109*:	L120_0_D3_1[1/2a] & L120_0_D4_1[1/2a]	0.4212*
# 111*:	L126_0_E3_1[1/2b] & L126_0_E4_1[1/2b]	0.4211*
# 113*:	L115_0_C5_1[3a] & L115_0_C6_1[3a]	0.4205*
# 116*:	L185_0_A10_1[NA] & 

In [16]:
# Score every unordered pair of spectra with the 'weighted' method, sort the
# pairs from most to least similar, and report where the same-strain pairs
# land in that ranking.
n = len(files)  # defined here so the cell does not rely on an earlier cell's `n`
print('Weighted_Similarity:')
# NOTE(review): rank=30 is passed straight to sim.score; presumably it limits
# the comparison to the top-ranked peaks -- confirm in biotype.similarity.
rs = [
    (i, j, sim.score(peaks_extracted[i], peaks_extracted[j], method='weighted', rank=30))
    for i in range(n - 1)
    for j in range(i + 1, n)
]
rs.sort(key=lambda r: r[2], reverse=True)  # sort all results in descending order of similarity score

algo_score = 0  # sum of the 1-based ranks of all same-strain pairs
p = 0           # number of same-strain pairs actually found
for rank, (i, j, s) in enumerate(rs, start=1):
    strain1, strain2 = files[i].split('_')[0], files[j].split('_')[0]
    if strain1 != strain2:
        continue
    tag = '*'  # marks a pair of spectra that share the same strain id
    stype1, stype2 = serotype.get(strain1, 'NA'), serotype.get(strain2, 'NA')
    # Use the 1-based rank (the same number that is printed) so that a perfect
    # ranking sums to 1 + 2 + ... + p = p*(p+1)/2 and normalises to exactly 1.0.
    algo_score += rank
    p += 1
    print(f'#{rank:4d}{tag}:\t{files[i][:-4]}[{stype1}] & {files[j][:-4]}[{stype2}]\t{s:.4f}{tag}')

# Normalise by the best achievable rank sum; 1.0 is the minimum (best) value.
if p:
    print(f'algo_score={2*algo_score/float(p*(p+1)):.2f} (closer to 1.0 is better )')
else:
    print('algo_score=nan (no same-strain pairs found)')

Weighted_Similarity:
#   3*:	L121_0_D5_1[1/2b] & L121_0_D6_1[1/2b]	0.9716*
#   6*:	L129_0_E10_1[1/2b] & L129_0_E9_1[1/2b]	0.9679*
#   8*:	L128_0_E7_1[1/2a] & L128_0_E8_1[1/2a]	0.9548*
#  11*:	L12_0_H1_1[1/2a] & L12_0_H2_1[1/2a]	0.9472*
#  12*:	L116_0_C7_1[nan] & L116_0_C8_1[nan]	0.9468*
#  14*:	L138_0_G3_1[1/2c] & L138_0_G4_1[1/2c]	0.9441*
#  32*:	L122_0_D7_1[1/2b] & L122_0_D8_1[1/2b]	0.9062*
#  35*:	L119_0_D1_1[1/2a] & L119_0_D2_1[1/2a]	0.8992*
#  40*:	L139_0_G5_1[1/2c] & L139_0_G6_1[1/2c]	0.8959*
#  44*:	L13_0_H3_1[1/2c] & L13_0_H4_1[1/2c]	0.8918*
#  45*:	L110_0_B7_1[UT] & L110_0_B8_1[UT]	0.8897*
#  46*:	L123_0_D10_1[1/2a] & L123_0_D9_1[1/2a]	0.8878*
#  51*:	L107_0_B1_1[1/2b] & L107_0_B2_1[1/2b]	0.8828*
#  60*:	L102_0_A3_1[1/2b] & L102_0_A4_1[1/2b]	0.8743*
#  63*:	L132_0_F3_1[4b] & L132_0_F4_1[4b]	0.8720*
# 108*:	L120_0_D3_1[1/2a] & L120_0_D4_1[1/2a]	0.8519*
# 109*:	L126_0_E3_1[1/2b] & L126_0_E4_1[1/2b]	0.8519*
# 112*:	L115_0_C5_1[3a] & L115_0_C6_1[3a]	0.8505*
# 116*:	L185_0_A10_1[NA