In [1]:
import os
import pandas as pd
import pickle
import biotype.similarity as sim

In [2]:
# Directories used by this notebook: exported spectra (*.txt), the metadata
# table, and the pickled peak positions that belong to each spectrum.
DATA_DIR_PATH = "datasets/export"
META_DIR_PATH = "doc"
PEAK_DIR_PATH = "extracted_peaks"

# Names of all '.txt' files in the data directory, in lexicographic order.
files = sorted(fn for fn in os.listdir(DATA_DIR_PATH) if fn.endswith(".txt"))

In [3]:
# Metadata table, indexed by its first column.
meta = pd.read_csv(
    f"{META_DIR_PATH}/meta.csv",
    index_col=0,
    encoding="utf-8",
)
# Lookup {index label -> value of the '血清型' (serotype) column}.
serotype = meta.loc[:, '血清型'].to_dict()

In [4]:
# For every spectrum file, load the previously pickled peak positions and keep
# the (m/z, intensity) values found at those positions.
peaks_extracted = []
n = len(files)  # number of spectra; later cells use it as the hit-rate denominator
for file_name in files:
    df = pd.read_table(
        f"{DATA_DIR_PATH}/{file_name}",
        sep=" ",
        header=None,
        names=['m/z', 'intensity'],
    )
    x, y = df['m/z'].to_numpy(), df['intensity'].to_numpy()

    stem = os.path.splitext(file_name)[0]  # file name without '.txt'
    # NOTE(security): pickle.load can execute arbitrary code while loading;
    # only read peak files that were produced locally by a trusted step.
    with open(f'{PEAK_DIR_PATH}/{stem}_peaks.pkl', 'rb') as peak_file:
        peaks = pickle.load(peak_file)  # used below to index x and y

    # The strain id is the text before the first underscore. A fixed
    # 4-character slice turns a 3-character id such as "L10" into "L10_",
    # which can never match a metadata key, so the serotype showed up as 'NA'.
    # NOTE(review): assumes meta.csv is indexed by the bare strain id - confirm.
    strain_id = file_name.split('_', 1)[0]
    st = serotype.get(strain_id, 'NA')
    print(f'{stem} ({st}) peaks: {len(peaks)}')
    peaks_extracted.append((x[peaks], y[peaks]))

L100_0_G7_1 (1/2b) peaks: 208
L100_0_G8_1 (1/2b) peaks: 226
L101_0_A1_1 (1/2a) peaks: 226
L101_0_A2_1 (1/2a) peaks: 195
L102_0_A3_1 (1/2b) peaks: 208
L102_0_A4_1 (1/2b) peaks: 200
L103_0_A5_1 (1/2b) peaks: 192
L103_0_A6_1 (1/2b) peaks: 188
L104_0_A7_1 (1/2a) peaks: 193
L104_0_A8_1 (1/2a) peaks: 221
L105_0_A10_1 (1/2b) peaks: 186
L105_0_A9_1 (1/2b) peaks: 204
L106_0_A11_1 (1/2a) peaks: 208
L106_0_A12_1 (1/2a) peaks: 196
L107_0_B1_1 (1/2b) peaks: 190
L107_0_B2_1 (1/2b) peaks: 195
L108_0_B3_1 (UT) peaks: 213
L108_0_B4_1 (UT) peaks: 186
L109_0_B5_1 (UT) peaks: 214
L109_0_B6_1 (UT) peaks: 205
L10_0_G10_1 (NA) peaks: 189
L10_0_G9_1 (NA) peaks: 197
L110_0_B7_1 (UT) peaks: 190
L110_0_B8_1 (UT) peaks: 203
L111_0_B10_1 (UT) peaks: 240
L111_0_B9_1 (UT) peaks: 214
L112_0_B11_1 (1/2a) peaks: 226
L112_0_B12_1 (1/2a) peaks: 223
L113_0_C1_1 (UT) peaks: 202
L113_0_C2_1 (UT) peaks: 210
L114_0_C3_1 (UT) peaks: 289
L114_0_C4_1 (UT) peaks: 265
L115_0_C5_1 (3a) peaks: 207
L115_0_C6_1 (3a) peaks: 195
L116_0_

In [19]:
def biotype(peaks, algo="jaccard", topn=1, names=None):
    """Count how often a spectrum's best matches belong to the same strain.

    Every unordered pair of entries in ``peaks`` is scored once with
    ``sim.similar_to`` and that score is used for both directions. For each
    entry, the ``topn`` highest-scoring other entries are inspected, and every
    one whose file name carries the same strain id adds one to the result.

    Parameters
    ----------
    peaks : sequence
        One peak description per spectrum, in the same order as ``names``.
        Items are handed to ``sim.similar_to`` unchanged.
    algo : str
        Forwarded to ``sim.similar_to`` as ``method``.
    topn : int
        How many of the best-scoring candidates are inspected per spectrum.
        If fewer candidates exist, all of them are inspected.
    names : sequence of str, optional
        File name of each entry in ``peaks``. The strain id is the part before
        the first underscore. Defaults to the notebook-level ``files`` list.

    Returns
    -------
    int
        Total number of same-strain matches found. A spectrum with several
        same-strain candidates inside its top ``topn`` contributes more than
        one, so the value may exceed ``len(peaks)`` when a strain has more
        than two spectra.

    Raises
    ------
    ValueError
        If ``names`` has fewer entries than ``peaks``.
    """
    if names is None:
        names = files  # notebook-level list created in the configuration cell
    count = len(peaks)  # derived from the argument, not from the global `n`
    if len(names) < count:
        raise ValueError(
            f"names has {len(names)} entries but peaks has {count}"
        )

    # Compare whole strain ids instead of a fixed 4-character prefix, so ids
    # that are shorter or longer than 4 characters are handled correctly.
    strain_ids = [names[idx].split('_', 1)[0] for idx in range(count)]

    # Score each unordered pair once and store it under both orientations.
    scores = {}
    for i in range(count - 1):
        for j in range(i + 1, count):
            # rank=2 is passed through as-is; its meaning is defined by
            # biotype.similarity and is not visible from this notebook.
            score = sim.similar_to(peaks[i], peaks[j], method=algo, rank=2)
            scores[i, j] = score
            scores[j, i] = score

    limit = max(topn, 0)
    hit = 0
    for i in range(count):
        # Candidate order (i+1.., then 0..i-1) combined with a stable sort
        # decides which of several equal scores is ranked first.
        candidates = [*range(i + 1, count), *range(i)]
        candidates.sort(key=lambda j: scores[i, j], reverse=True)
        for j in candidates[:limit]:
            if strain_ids[i] == strain_ids[j]:
                hit += 1
    return hit

In [20]:
topn = 1  # only the single most similar spectrum counts as a correct answer
algos = {'jaccard':'Jaccard Similarity', 'rank':'Rank Similarity', 'weighted': 'Weighted Rank Similarity'}
for algo, label in algos.items():
    print(label)
    hit = biotype(peaks_extracted, algo, topn)
    # The label is derived from topn so it always names the cutoff actually used.
    print(f"  n={n}, top{topn}={hit}, hit rate:{hit/n: .3f}")

Jaccard Similarity
  n=102, top2=24, hit rate: 0.235
Rank Similarity
  n=102, top2=29, hit rate: 0.284
Weighted Rank Similarity
  n=102, top2=31, hit rate: 0.304


In [21]:
topn = 2  # a match within the 2 most similar spectra counts as a correct answer
algos = {'jaccard':'Jaccard Similarity', 'rank':'Rank Similarity', 'weighted': 'Weighted Rank Similarity'}
for algo, label in algos.items():
    print(label)
    hit = biotype(peaks_extracted, algo, topn)
    # The label is derived from topn so it always names the cutoff actually used.
    print(f"  n={n}, top{topn}={hit}, hit rate:{hit/n: .3f}")

Jaccard Similarity
  n=102, top2=39, hit rate: 0.382
Rank Similarity
  n=102, top2=36, hit rate: 0.353
Weighted Rank Similarity
  n=102, top2=42, hit rate: 0.412


In [22]:
topn = 3  # a match within the 3 most similar spectra counts as a correct answer
algos = {'jaccard':'Jaccard Similarity', 'rank':'Rank Similarity', 'weighted': 'Weighted Rank Similarity'}
for algo, label in algos.items():
    print(label)
    hit = biotype(peaks_extracted, algo, topn)
    # The label is derived from topn so it always names the cutoff actually used.
    print(f"  n={n}, top{topn}={hit}, hit rate:{hit/n: .3f}")

Jaccard Similarity
  n=102, top2=49, hit rate: 0.480
Rank Similarity
  n=102, top2=44, hit rate: 0.431
Weighted Rank Similarity
  n=102, top2=46, hit rate: 0.451


In [23]:
topn = 4  # a match within the 4 most similar spectra counts as a correct answer
algos = {'jaccard':'Jaccard Similarity', 'rank':'Rank Similarity', 'weighted': 'Weighted Rank Similarity'}
for algo, label in algos.items():
    print(label)
    hit = biotype(peaks_extracted, algo, topn)
    # The label is derived from topn so it always names the cutoff actually used.
    print(f"  n={n}, top{topn}={hit}, hit rate:{hit/n: .3f}")

Jaccard Similarity
  n=102, top2=57, hit rate: 0.559
Rank Similarity
  n=102, top2=49, hit rate: 0.480
Weighted Rank Similarity
  n=102, top2=51, hit rate: 0.500


In [24]:
topn = 5  # a match within the 5 most similar spectra counts as a correct answer
algos = {'jaccard':'Jaccard Similarity', 'rank':'Rank Similarity', 'weighted': 'Weighted Rank Similarity'}
for algo, label in algos.items():
    print(label)
    hit = biotype(peaks_extracted, algo, topn)
    # The label is derived from topn so it always names the cutoff actually used.
    print(f"  n={n}, top{topn}={hit}, hit rate:{hit/n: .3f}")

Jaccard Similarity
  n=102, top2=62, hit rate: 0.608
Rank Similarity
  n=102, top2=53, hit rate: 0.520
Weighted Rank Similarity
  n=102, top2=58, hit rate: 0.569
