# Hard negative dataset

In [1]:
import re
import shutil
import multiprocessing as mp

import cv2
import pandas as pd
import tokenizers
import Levenshtein
from dataclasses import dataclass
from pathlib import Path
from rdkit import Chem
from tqdm.auto import tqdm

tqdm.pandas()

  from pandas import Panel


In [2]:
%load_ext autoreload
%autoreload 2
from nncomp_molecule import constants

## Config

In [3]:
# Identifier of this notebook; it is also used as the output directory name.
NOTEBOOK_ID = "bms-preprocess-hn-v1"

# CSV holding the fold assignment read in the "Load KFold" section.
KFOLD_PATH = constants.INPUTDIR.joinpath("kfujikawa", "bms-kfold", "10fold.csv")

# Every artifact produced by this notebook is written below this directory.
OUTDIR = constants.INPUTDIR.joinpath("kfujikawa", NOTEBOOK_ID)
OUTDIR.mkdir(parents=True, exist_ok=True)

# Show whether the main output pickles already exist before anything is written.
for pickle_name in ("train.pkl", "test.pkl"):
    pickle_path = OUTDIR / pickle_name
    print(f"{pickle_path}: {pickle_path.exists()}")

/work/input/kfujikawa/bms-preprocess-hn-v1/train.pkl: False
/work/input/kfujikawa/bms-preprocess-hn-v1/test.pkl: False


## Preprocess datasets

In [4]:
%%time
COLUMNS = ["image_id", "normed_InChI", "InChI_GT", "is_valid"]

train_df = pd.read_pickle("/work/input/kfujikawa/bms-preprocess-v2/train.pkl").set_index("image_id")

# データ読み込み
df = pd.concat([
    pd.read_csv(constants.OUTPUTDIR / "1106_swin_bert_384/train_beam=2.csv", usecols=COLUMNS),
    pd.read_csv(constants.OUTPUTDIR / "1106_swin_bert_384/valid_beam=2.csv", usecols=COLUMNS),
], ignore_index=True).groupby("image_id").first()

train_df["is_HN"] = df.eval("InChI_GT != normed_InChI")
train_df = train_df.reset_index()
train_df.head()

CPU times: user 29.6 s, sys: 1.77 s, total: 31.3 s
Wall time: 31.3 s


Unnamed: 0,image_id,InChI,image_path,InChI_length,w,h,w/h,w*h,is_flipped,is_HN
0,4435736fd10b,InChI=1S/C65H110O6/c1-4-7-10-13-16-19-22-25-28...,/work/input/bms-molecular-translation/train/4/...,403,1472,788,1.86802,1159936,False,True
1,8c0e35ce3f1f,InChI=1S/C60H98O6/c1-4-7-10-13-16-19-22-25-27-...,/work/input/bms-molecular-translation/train/8/...,398,939,309,3.038835,290151,False,True
2,934593ad3cae,InChI=1S/C63H106O6/c1-4-7-10-13-16-19-22-25-27...,/work/input/bms-molecular-translation/train/9/...,397,826,661,1.249622,545986,False,True
3,c963808e309d,InChI=1S/C59H92O6/c1-4-7-10-13-16-19-22-25-28-...,/work/input/bms-molecular-translation/train/c/...,393,1268,883,1.436014,1119644,False,True
4,0aa425d5f5ac,InChI=1S/C62H106O6/c1-4-7-10-13-16-19-22-25-27...,/work/input/bms-molecular-translation/train/0/...,393,918,543,1.690608,498474,False,True


## Load KFold

In [5]:
# Read the fold assignment table and keep the image ids that fall into
# folds 0, 1 and 2; they are used later to build the "train.bench.pkl" subset.
kfold_df = pd.read_csv(KFOLD_PATH)
benchmark_ids = kfold_df.loc[kfold_df["fold"].isin([0, 1, 2]), "image_id"]
benchmark_ids

0          000011a64c74
2          0000252b6d2b
4          000026fc6c36
6          000029a61c01
12         000061d37e54
               ...     
2424159    ffff58ad694c
2424165    ffff8397e15e
2424172    ffffac645f33
2424174    ffffb8682cf9
2424181    ffffe824f539
Name: image_id, Length: 727257, dtype: object

## Save datasets

In [6]:
# Write three variants of the training table, in this order:
#   train.debug.pkl - the first 1000 rows
#   train.bench.pkl - rows whose image_id is in benchmark_ids, re-indexed from 0
#   train.pkl       - the full table
OUTDIR.mkdir(parents=True, exist_ok=True)
train_df.iloc[:1000].to_pickle(OUTDIR / "train.debug.pkl")
(
    train_df
    .loc[train_df["image_id"].isin(benchmark_ids)]
    .reset_index(drop=True)
    .to_pickle(OUTDIR / "train.bench.pkl")
)
train_df.to_pickle(OUTDIR / "train.pkl")

# Display the contents of the output directory as a sanity check.
[*OUTDIR.iterdir()]

[PosixPath('/work/input/kfujikawa/bms-preprocess-hn-v1/train.debug.pkl'),
 PosixPath('/work/input/kfujikawa/bms-preprocess-hn-v1/train.bench.pkl'),
 PosixPath('/work/input/kfujikawa/bms-preprocess-hn-v1/train.pkl')]

In [7]:
# The test pickles are copied unchanged from the bms-preprocess-v2 directory
# into this notebook's output directory.
PREPROCESS_DIR = constants.INPUTDIR / "kfujikawa/bms-preprocess-v2"
for filename in tqdm(["test.debug.pkl", "test.bench.pkl", "test.pkl"]):
    shutil.copy(PREPROCESS_DIR.joinpath(filename), OUTDIR.joinpath(filename))

# Display the contents of the output directory, now including the test files.
[*OUTDIR.iterdir()]

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=3.0), HTML(value='')))




[PosixPath('/work/input/kfujikawa/bms-preprocess-hn-v1/train.debug.pkl'),
 PosixPath('/work/input/kfujikawa/bms-preprocess-hn-v1/test.bench.pkl'),
 PosixPath('/work/input/kfujikawa/bms-preprocess-hn-v1/train.bench.pkl'),
 PosixPath('/work/input/kfujikawa/bms-preprocess-hn-v1/test.debug.pkl'),
 PosixPath('/work/input/kfujikawa/bms-preprocess-hn-v1/train.pkl'),
 PosixPath('/work/input/kfujikawa/bms-preprocess-hn-v1/test.pkl')]