# TP/FP Classification dataset

In [1]:
import re
import shutil
import multiprocessing as mp

import cv2
import pandas as pd
import tokenizers
import Levenshtein
from dataclasses import dataclass
from pathlib import Path
from rdkit import Chem
from tqdm.auto import tqdm

tqdm.pandas()

  from pandas import Panel


In [2]:
%load_ext autoreload
%autoreload 2
from nncomp_molecule import constants

## Config

In [3]:
# Identifier of this notebook; also used as the name of the output directory.
NOTEBOOK_ID = "bms-preprocess-tpfp-v1"
KFOLD_PATH = constants.INPUTDIR / "kfujikawa" / "bms-kfold" / "10fold.csv"
OUTDIR = constants.INPUTDIR / "kfujikawa" / NOTEBOOK_ID
OUTDIR.mkdir(parents=True, exist_ok=True)

# Report whether the main output pickles are already present in OUTDIR.
for pkl_name in ("train.pkl", "test.pkl"):
    pkl_path = OUTDIR / pkl_name
    print(f"{pkl_path}: {pkl_path.exists()}")

/work/input/kfujikawa/bms-preprocess-tpfp-v1/train.pkl: True
/work/input/kfujikawa/bms-preprocess-tpfp-v1/test.pkl: True


## Preprocess datasets

In [4]:
%%time
def assign_exact_match(is_gt, is_hn):
    """Map a pair of flags to one of the labels "TP", "FP", "FN" or "TN".

    Args:
        is_gt: Truthy when the record is identical to the ground truth.
        is_hn: Truthy when the record's image is flagged as a hard negative.

    Returns:
        "FN" (is_gt) or "FP" (not is_gt) when ``is_hn`` is truthy,
        otherwise "TP" (is_gt) or "TN" (not is_gt).
    """
    if is_hn:
        return "FN" if is_gt else "FP"
    return "TP" if is_gt else "TN"

COLUMNS = ["image_id", "normed_InChI", "InChI_GT", "is_valid"]

# Load the prediction CSVs (train and valid) and stack them into one frame.
prediction_csvs = [
    constants.OUTPUTDIR / "1102_vtnt_bert_224-448_denoise-5/train_beam=4.csv",
    constants.OUTPUTDIR / "1102_vtnt_bert_224-448_denoise-5/valid_beam=4.csv",
]
df = pd.concat(
    [pd.read_csv(csv_path, usecols=COLUMNS) for csv_path in prediction_csvs],
    ignore_index=True,
)
# pop + assign moves the normalized prediction to the last column as "InChI".
df["InChI"] = df.pop("normed_InChI")

# Hard-negative (HN) flag per image: True when the first record of the image
# is not an exact match with the ground truth.
first_per_image = df.groupby("image_id").first()
is_HN = first_per_image["InChI"] != first_per_image["InChI_GT"]
df["is_HN"] = df["image_id"].map(is_HN)

# Add ground-truth records: copies of every row whose InChI is replaced by the
# GT string and marked valid. They are placed first so that drop_duplicates
# keeps them over identical predicted rows.
gt_df = df.assign(InChI=df["InChI_GT"], is_valid=True)
df = pd.concat([gt_df, df], ignore_index=True).drop_duplicates()

# Levenshtein distance between each record and its ground truth.
inchi_pairs = df[["InChI", "InChI_GT"]].values
df["levenshtein"] = [
    Levenshtein.distance(candidate, truth)
    for candidate, truth in tqdm(inchi_pairs)
]

# A record is the ground truth exactly when its distance is zero.
df["is_GT"] = df["levenshtein"].eq(0)

# Assign the TP/FP/FN/TN label from the two flags.
flag_pairs = df[["is_GT", "is_HN"]].values
df["exact_match"] = [
    assign_exact_match(gt_flag, hn_flag)
    for gt_flag, hn_flag in tqdm(flag_pairs)
]

# Within each image, order records from closest to farthest from the GT.
df = df.sort_values(["image_id", "levenshtein"]).reset_index(drop=True)
# The image path is built from the first three characters of the image id.
df["image_path"] = df["image_id"].progress_apply(
    lambda x: str(constants.COMPETITION_DATADIR / f"train/{x[0]}/{x[1]}/{x[2]}/{x}.png")
)
df["InChI_length"] = df["InChI"].map(len)
# Distance relative to the record's own length, capped at 1.
df["levenshtein_rate"] = (df["levenshtein"] / df["InChI_length"]).clip(upper=1)
df.head()

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=9563602.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=9563602.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=9563602.0), HTML(value='')))


CPU times: user 5min 25s, sys: 9.03 s, total: 5min 34s
Wall time: 5min 31s


Unnamed: 0,image_id,InChI_GT,is_valid,InChI,is_HN,...,is_GT,exact_match,image_path,InChI_length,levenshtein_rate
0,000011a64c74,InChI=1S/C13H20OS/c1-9(2)8-15-13-6-5-10(3)7-12...,True,InChI=1S/C13H20OS/c1-9(2)8-15-13-6-5-10(3)7-12...,False,...,True,TP,/work/input/bms-molecular-translation/train/0/...,81,0.0
1,000011a64c74,InChI=1S/C13H20OS/c1-9(2)8-15-13-6-5-10(3)7-12...,True,InChI=1S/C13H20OSi/c1-9(2)8-15-13-6-5-10(3)7-1...,False,...,False,TN,/work/input/bms-molecular-translation/train/0/...,82,0.012195
2,000011a64c74,InChI=1S/C13H20OS/c1-9(2)8-15-13-6-5-10(3)7-12...,False,InChI=1S/C13H20OS/c1-9(2)8-15-13-6-5-10(3)7-12...,False,...,False,TN,/work/input/bms-molecular-translation/train/0/...,82,0.012195
3,000011a64c74,InChI=1S/C13H20OS/c1-9(2)8-15-13-6-5-10(3)7-12...,False,InChI=1S/C14H22OS/c1-9(2)10(3)16-14-7-6-11(4)8...,False,...,False,TN,/work/input/bms-molecular-translation/train/0/...,79,0.379747
4,000019cc0cd2,InChI=1S/C21H30O4/c1-12(22)25-14-6-8-20(2)13(1...,True,InChI=1S/C21H30O4/c1-12(22)25-14-6-8-20(2)13(1...,True,...,True,FN,/work/input/bms-molecular-translation/train/0/...,155,0.0


In [5]:
# Number of records per TP/FP/FN/TN label.
df["exact_match"].value_counts()

TN    6186594
TP    2126612
FP     952822
FN     297574
Name: exact_match, dtype: int64

## Load KFold

In [6]:
kfold_df = pd.read_csv(KFOLD_PATH)
# Image ids assigned to folds 0, 1 and 2 make up the benchmark subset.
in_benchmark_folds = kfold_df["fold"].isin([0, 1, 2])
benchmark_ids = kfold_df.loc[in_benchmark_folds, "image_id"]
benchmark_ids

0          000011a64c74
2          0000252b6d2b
4          000026fc6c36
6          000029a61c01
12         000061d37e54
               ...     
2424159    ffff58ad694c
2424165    ffff8397e15e
2424172    ffffac645f33
2424174    ffffb8682cf9
2424181    ffffe824f539
Name: image_id, Length: 727257, dtype: object

## Save datasets

In [7]:
OUTDIR.mkdir(parents=True, exist_ok=True)

# Debug subset: the first 1000 rows.
df.iloc[:1000].to_pickle(OUTDIR / "train.debug.pkl")

# Benchmark subset: rows whose image id is in benchmark_ids, reindexed from 0.
benchmark_rows = df["image_id"].isin(benchmark_ids)
df[benchmark_rows].reset_index(drop=True).to_pickle(OUTDIR / "train.bench.pkl")

# Full dataset.
df.to_pickle(OUTDIR / "train.pkl")
list(OUTDIR.iterdir())

[PosixPath('/work/input/kfujikawa/bms-preprocess-tpfp-v1/dataset-metadata.json'),
 PosixPath('/work/input/kfujikawa/bms-preprocess-tpfp-v1/train.debug.pkl'),
 PosixPath('/work/input/kfujikawa/bms-preprocess-tpfp-v1/test.bench.pkl'),
 PosixPath('/work/input/kfujikawa/bms-preprocess-tpfp-v1/train.bench.pkl'),
 PosixPath('/work/input/kfujikawa/bms-preprocess-tpfp-v1/test.debug.pkl'),
 PosixPath('/work/input/kfujikawa/bms-preprocess-tpfp-v1/train.pkl'),
 PosixPath('/work/input/kfujikawa/bms-preprocess-tpfp-v1/test.pkl')]

In [8]:
PREPROCESS_DIR = constants.INPUTDIR / "kfujikawa/bms-preprocess-v2"

# Copy the test pickles from PREPROCESS_DIR into OUTDIR under the same names.
for filename in tqdm(["test.debug.pkl", "test.bench.pkl", "test.pkl"]):
    source_path = str(PREPROCESS_DIR / filename)
    target_path = str(OUTDIR / filename)
    shutil.copy(source_path, target_path)
list(OUTDIR.iterdir())

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=3.0), HTML(value='')))




[PosixPath('/work/input/kfujikawa/bms-preprocess-tpfp-v1/dataset-metadata.json'),
 PosixPath('/work/input/kfujikawa/bms-preprocess-tpfp-v1/train.debug.pkl'),
 PosixPath('/work/input/kfujikawa/bms-preprocess-tpfp-v1/test.bench.pkl'),
 PosixPath('/work/input/kfujikawa/bms-preprocess-tpfp-v1/train.bench.pkl'),
 PosixPath('/work/input/kfujikawa/bms-preprocess-tpfp-v1/test.debug.pkl'),
 PosixPath('/work/input/kfujikawa/bms-preprocess-tpfp-v1/train.pkl'),
 PosixPath('/work/input/kfujikawa/bms-preprocess-tpfp-v1/test.pkl')]